26 Commits

Author SHA1 Message Date
lxy
a0ceb934ce 修复1在WebSocket回调线程内执行stop/start竞争条件,'socket already closed'循环出现,2陈旧结果5秒复用窗口旧识别结果污染新请求,意图混乱 2026-03-06 17:29:55 +08:00
NuoDaJia02
ed861a9fb1 fix run issues 2026-01-30 10:53:07 +08:00
lxy
aaa17c10f2 修复rclpy.spin() 单线程执行器导致异步回调死锁,增加ASR WebSocket 自动重连机制 2026-01-29 17:24:49 +08:00
NuoDaJia02
c65395c50f merge remote 2026-01-28 14:45:42 +08:00
lxy
9c8bd017e1 分出asr和tts节点 2026-01-27 20:53:43 +08:00
NuoDaJia02
856c07715c Update voice configuration and skill bridge logic
- Update voice.yaml to use default audio devices and 48kHz sample rate.
- Update voice.yaml paths for voice model and interfaces.
- Improve skill_bridge_node.py JSON parsing and skill parameter handling.
- Update audio_pipeline.py warning message for device detection.
2026-01-22 17:28:28 +08:00
lxy
e8a9821ce4 配置文件增加没有图像skill_sequence/chat_camera是否推理的button,扩充kb_qa的回复,减少闲聊模式的回复长度 2026-01-21 18:04:26 +08:00
lxy
ab1fb4f3f8 修改声纹验证失败仍然执行,增加接口解析提示词 2026-01-21 15:13:31 +08:00
lxy
dd6ccf77bb 修改技能序列历史管理--不接入历史上下文 2026-01-21 11:22:25 +08:00
lxy
7324630458 修改声纹注册选择第一句话完整片段。去掉注册时多余的阈值信息,修改llm技能序列输出格式 2026-01-20 21:39:15 +08:00
NuoDaJia02
04ca80c3f9 add rebuild service to skill bridge 2026-01-20 15:20:48 +08:00
lxy
98c0eb5ca5 refactor: 删除回声消除相关代码,支持从hivecore_robot_drivers/img_dev获取图片 2026-01-20 09:28:57 +08:00
lxy
71062701e1 Merge branch 'feature-deploy' into develop
# Conflicts:
#	config/voice.yaml
#	robot_speaker/core/robot_speaker_node.py
2026-01-19 15:16:11 +08:00
lxy
0409ce0de4 修正声纹验证音频长度计算 2026-01-19 14:21:06 +08:00
NuoDaJia02
ce0d581770 fix torch issue 2026-01-19 13:31:49 +08:00
NuoDaJia02
a1b91ed52f disable echo cancellation 2026-01-19 11:35:01 +08:00
lxy
6d101b9d9e 添加与行为树的桥接节点 2026-01-19 09:58:40 +08:00
NuoDaJia02
c282f9b4de fix deploy issues 2026-01-19 09:09:28 +08:00
lxy
9fd658990c datasets==3.6.0 2026-01-16 10:49:16 +08:00
lxy
0c118412ec 代码重构,区分声纹注册和主节点 2026-01-16 10:40:40 +08:00
lxy
eb91e2f139 增加AEC 2026-01-13 22:14:46 +08:00
lxy
838a4a357c 增加声纹验证 2026-01-12 20:39:47 +08:00
lxy
9c775cff5c 增加中断词 2026-01-12 17:40:08 +08:00
lxy
63a21999bb 增加相机调用,修复对话历史管理,修复asr停止识别逻辑 2026-01-08 20:59:58 +08:00
lxy
8fffd4ab42 chore: add .gitignore and stop tracking build/install/log outputs 2026-01-07 14:30:16 +08:00
b90d84c325 feat(robot_speaker): 创建语音包
包含唤醒词,asr,llm,tts等。
2026-01-07 14:14:29 +08:00
30 changed files with 4585 additions and 67 deletions

9
.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
build/
install/
log/
__pycache__/
*.pyc
*.egg-info/
dist/
lib/
installed_files.txt

116
CMakeLists.txt Normal file
View File

@@ -0,0 +1,116 @@
# Minimum CMake version required by the ament build tooling.
cmake_minimum_required(VERSION 3.8)
project(robot_speaker)
# Enable a strict warning set on GCC and Clang builds.
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_compile_options(-Wall -Wextra -Wpedantic)
endif()
# Build-system and message-interface dependencies.
find_package(ament_cmake REQUIRED)
find_package(ament_cmake_python REQUIRED)
find_package(interfaces REQUIRED)
# Make sure the system Python is used, not a conda/miniconda Python:
# search only the system locations first, then fall back to a normal PATH search.
find_program(PYTHON3_CMD python3 PATHS /usr/bin /usr/local/bin NO_DEFAULT_PATH)
if(NOT PYTHON3_CMD)
find_program(PYTHON3_CMD python3)
endif()
# NOTE(review): FORCE stomps any user-provided cache value for the Python
# executable. Intentional here (pin the interpreter away from conda), but a
# heavy-handed pattern — confirm users never need to override it.
if(PYTHON3_CMD)
set(Python3_EXECUTABLE ${PYTHON3_CMD} CACHE FILEPATH "Python 3 executable" FORCE)
set(PYTHON_EXECUTABLE ${PYTHON3_CMD} CACHE FILEPATH "Python executable" FORCE)
endif()
# Install-time hook: pip-install this package into the install prefix, then
# relocate the installed Python module and the *_node entry-point scripts to
# the layout ROS 2 expects (lib/pythonX.Y/site-packages and lib/robot_speaker/).
# NOTE(review): ${PYTHON3_CMD} / ${CMAKE_INSTALL_PREFIX} / source-dir variables
# are expanded at *configure* time into the script string below; the script
# itself runs at install time (`cmake --install` / colcon's install step).
# The embedded Python (and its comments) is part of a string literal and is
# kept byte-for-byte unchanged.
install(CODE "
execute_process(
COMMAND ${PYTHON3_CMD} -m pip install --prefix=${CMAKE_INSTALL_PREFIX} --no-deps ${CMAKE_CURRENT_SOURCE_DIR}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE install_result
OUTPUT_VARIABLE install_output
ERROR_VARIABLE install_error
)
if(NOT install_result EQUAL 0)
message(FATAL_ERROR \"Failed to install Python package. Output: ${install_output} Error: ${install_error}\")
endif()
execute_process(
COMMAND ${PYTHON3_CMD} -c \"
import os
import shutil
import glob
import sysconfig
install_prefix = '${CMAKE_INSTALL_PREFIX}'
build_dir = '${CMAKE_CURRENT_BINARY_DIR}'
python_version = f'{sysconfig.get_python_version()}'
# ROS2 期望的 Python 包位置
ros2_site_packages = os.path.join(install_prefix, 'lib', f'python{python_version}', 'site-packages')
os.makedirs(ros2_site_packages, exist_ok=True)
# pip install --prefix 可能将包安装到不同位置(系统环境通常是 local/lib/pythonX/dist-packages
pip_locations = [
os.path.join(install_prefix, 'local', 'lib', f'python{python_version}', 'dist-packages'),
os.path.join(install_prefix, 'lib', f'python{python_version}', 'site-packages'),
os.path.join(install_prefix, 'local', 'lib', f'python{python_version}', 'site-packages'),
]
# 查找并复制 robot_speaker 包到 ROS2 期望的位置
robot_speaker_src = None
for location in pip_locations:
candidate = os.path.join(location, 'robot_speaker')
if os.path.exists(candidate) and os.path.isdir(candidate):
robot_speaker_src = candidate
break
if robot_speaker_src:
robot_speaker_dest = os.path.join(ros2_site_packages, 'robot_speaker')
if os.path.exists(robot_speaker_dest):
shutil.rmtree(robot_speaker_dest)
if robot_speaker_src != robot_speaker_dest:
shutil.copytree(robot_speaker_src, robot_speaker_dest)
print(f'Copied robot_speaker from {robot_speaker_src} to {ros2_site_packages}')
else:
print(f'robot_speaker already in correct location')
# 处理 entry_points 脚本
lib_dir = os.path.join(install_prefix, 'lib', 'robot_speaker')
os.makedirs(lib_dir, exist_ok=True)
# 脚本可能在 local/bin 或 bin 中
for bin_dir in [os.path.join(install_prefix, 'local', 'bin'), os.path.join(install_prefix, 'bin')]:
if os.path.exists(bin_dir):
scripts = glob.glob(os.path.join(bin_dir, '*_node'))
for script in scripts:
script_name = os.path.basename(script)
dest = os.path.join(lib_dir, script_name)
if script != dest:
shutil.copy2(script, dest)
os.chmod(dest, 0o755)
print(f'Copied {script_name} to {lib_dir}')
\"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE python_result
OUTPUT_VARIABLE python_output
)
if(python_result EQUAL 0)
message(STATUS \"${python_output}\")
else()
message(WARNING \"Failed to setup Python package: ${python_output}\")
endif()
")
# Install launch files into the package share directory.
install(
  DIRECTORY launch/
  DESTINATION "share/${PROJECT_NAME}/launch"
  FILES_MATCHING
    PATTERN "*.launch.py"
)

# Install YAML/JSON configuration files into the package share directory.
install(
  DIRECTORY config/
  DESTINATION "share/${PROJECT_NAME}/config"
  FILES_MATCHING
    PATTERN "*.yaml"
    PATTERN "*.json"
)

# Register the standard ament linters when testing is enabled.
if(BUILD_TESTING)
  find_package(ament_lint_auto REQUIRED)
  ament_lint_auto_find_test_dependencies()
endif()

# Must be the last call: exports the ament package metadata.
ament_package()

102
README.md
View File

@@ -1,2 +1,102 @@
# hivecore_robot_voice
# ROS 语音包 (robot_speaker)
## 注册阿里云百炼获取api_key
https://bailian.console.aliyun.com/?tab=model#/api-key
->密钥管理
放到config/voice.yaml
## 安装依赖
1. 系统依赖
```bash
sudo apt-get update
sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg swig meson ninja-build build-essential pkg-config libwebrtc-audio-processing-dev
```
2. Python依赖
```bash
cd ~/ros_learn/hivecore_robot_voice
# 在 Python 3.10 环境下,需要单独安装 aec-audio-processing 以跳过版本检查
pip3 install aec-audio-processing --no-binary :all: --ignore-requires-python --break-system-packages
pip3 install -r requirements.txt --break-system-packages
```
## 编译启动
1. 注册声纹
- 启动节点后可以说er gou我现在正在注册声纹这是一段很长的测试语音请把我的声音录进去。
- 正确的注册姿势包含唤醒词二狗不要停顿的尽量说完3秒
- 现在的逻辑只要识别到二狗就注册,然后退出节点,识别不到二狗继续等待
- 多注册几段,换方向距离注册,可以提高识别相似度,注册方向对声纹相似性影响很大
```bash
cd ~/ros_learn/hivecore_robot_voice
colcon build
source install/setup.bash
```
```bash
# 终端1: 启动ASR节点
ros2 run robot_speaker asr_audio_node
# 终端2: 注册声纹
ros2 run robot_speaker register_speaker_node
```
2. 主节点
- 启动节点后每句交互包含唤醒词,唤醒词和语句之间不要有停顿
- 二狗拍照看看开启图文交互
- 支持已注册声纹用户打断
```bash
cd ~/ros_learn/hivecore_robot_voice
colcon build
source install/setup.bash
ros2 launch robot_speaker voice.launch.py
```
3. ASR节点
```bash
ros2 run robot_speaker asr_audio_node
```
4. TTS节点
```bash
# 终端1: 启动TTS节点
ros2 run robot_speaker tts_audio_node
# 终端2: 启动播放
source install/setup.bash
ros2 service call /tts/synthesize robot_speaker/srv/TTSSynthesize \
"{command: 'synthesize', text: '这是一段很长的测试文本用于测试TTS中断功能。我需要说很多很多内容这样你才有足够的时间来测试中断命令。让我继续说下去这是一段很长的测试文本用于测试TTS中断功能。我需要说很多很多内容这样你才有足够的时间来测试中断命令。让我继续说下去这是一段很长的测试文本用于测试TTS中断功能。我需要说很多很多内容这样你才有足够的时间来测试中断命令。', voice: ''}"
# 终端3: 立即执行中断
source install/setup.bash
ros2 service call /tts/synthesize robot_speaker/srv/TTSSynthesize \
"{command: 'interrupt', text: '', voice: ''}"
```
5. 完整运行
```bash
# 终端1启动 brain 节点
# 终端2启动 voice 节点
# 终端3启动 bridge 节点
# 终端4订阅相机
```
## 用到的命令
1. 音频设备
```bash
# 1. 查看所有音频设备
cat /proc/asound/cards
# 2. 查看 card(1)的流信息(设备参数)
cat /proc/asound/card1/stream0
```
2. 相机设备
```bash
# 1. 查看相机所有基础信息(型号、固件版本、序列号等)
rs-enumerate-devices -c
```
3. 模型下载
```bash
modelscope download --model iic/speech_campplus_sv_zh-cn_16k-common --local_dir [指定路径]
```

46
config/knowledge.json Normal file
View File

@@ -0,0 +1,46 @@
{
"entries": [
{
"id": "robot_identity_1",
"patterns": [
"ni shi shui"
],
"answer": "我叫二狗,是蜂核科技的机器人,很高兴为你服务"
},
{
"id": "robot_identity_2",
"patterns": [
"ni jiao sha"
],
"answer": "我叫二狗呀,我是你的好帮手"
},
{
"id": "wake_word",
"patterns": [
"ni de ming zi"
],
"answer": "我的名字是二狗"
},
{
"id": "skill_1",
"patterns": [
"tiao ge wu"
],
"answer": "这个我真不会,我怕跳起来吓到你"
},
{
"id": "skill_2",
"patterns": [
"ni neng gan"
],
"answer": "我可以陪你聊天,也能帮你干活"
},
{
"id": "skill_3",
"patterns": [
"ni hui gan"
],
"answer": "我可以陪你聊天,你也可以发布具体的指令让我干活"
}
]
}

596
config/speakers.json Normal file
View File

@@ -0,0 +1,596 @@
{
"user_1769589229": {
"embedding": [
0.018443606793880463,
0.12385621666908264,
0.42172902822494507,
1.3724409341812134,
-0.4492957293987274,
-0.6218937635421753,
-0.9678031802177429,
0.678302526473999,
1.744055151939392,
-1.8670854568481445,
-1.9064403772354126,
0.5380862951278687,
0.16627110540866852,
-0.6322636008262634,
-1.7715388536453247,
-0.2003282904624939,
-2.1722018718719482,
0.5719940662384033,
-0.6866416931152344,
1.5751206874847412,
0.27836838364601135,
-0.03192685544490814,
-0.486663818359375,
1.6337751150131226,
-1.0401458740234375,
0.0581182986497879,
0.9309709072113037,
-0.00908487569540739,
-0.05825135484337807,
1.042805552482605,
0.95391845703125,
0.5708717107772827,
-1.3427493572235107,
-0.46104469895362854,
-0.4387856423854828,
-2.2000691890716553,
-1.2598334550857544,
-0.34516626596450806,
-1.5205646753311157,
-1.3810551166534424,
-0.9685532450675964,
0.33360639214515686,
0.7115882039070129,
-0.6262675523757935,
-1.831620216369629,
-1.0514777898788452,
0.677291750907898,
1.6341345310211182,
1.0802626609802246,
0.2750645875930786,
2.517354726791382,
-0.5022090077400208,
-0.512808084487915,
-1.0913103818893433,
-0.5228419899940491,
0.7334955334663391,
-0.04904095083475113,
0.5420397520065308,
0.76543128490448,
-0.28510582447052,
-0.015149342827498913,
-0.38553595542907715,
-0.8873414993286133,
-0.7940725684165955,
2.0196990966796875,
1.079050064086914,
-0.3385912775993347,
0.687140703201294,
0.8218201994895935,
-0.8151140809059143,
-0.12016838788986206,
-0.5360821485519409,
1.5735585689544678,
2.2081315517425537,
-0.8545964956283569,
-0.7184719443321228,
1.0227694511413574,
1.004757285118103,
1.279994010925293,
1.0615602731704712,
-0.026518817991018295,
-0.12089776247739792,
1.9652493000030518,
-2.219129800796509,
1.3730603456497192,
-0.2324638068675995,
1.1085208654403687,
0.38454243540763855,
-0.7640709280967712,
1.8690227270126343,
-2.371783971786499,
0.4353397786617279,
0.6538525223731995,
-1.0312976837158203,
-0.06995117664337158,
2.4163870811462402,
0.16073228418827057,
-0.6870989799499512,
-1.6179540157318115,
-1.3476271629333496,
0.20239552855491638,
-0.050261445343494415,
-0.038828205317258835,
0.4753866195678711,
0.6126185059547424,
0.8918412923812866,
-0.3909176290035248,
0.2147030234336853,
0.39352068305015564,
-0.6788452863693237,
-2.1740481853485107,
1.1571974754333496,
-0.4064839482307434,
1.2412688732147217,
0.7256757616996765,
1.7226027250289917,
-0.0026558407116681337,
-0.5800378918647766,
-0.15300726890563965,
-0.7650083899497986,
-2.0132904052734375,
-1.0595450401306152,
-0.49976038932800293,
0.9254617094993591,
-1.2378792762756348,
1.6656403541564941,
-0.7135428786277771,
-0.9382724761962891,
0.9358375668525696,
0.3685700595378876,
-0.10180468112230301,
-0.1037834882736206,
-0.23670005798339844,
1.75762140750885,
-0.17887072265148163,
0.046728529036045074,
-0.8897371888160706,
-1.3732428550720215,
-1.258161187171936,
-1.8424062728881836,
-0.20653045177459717,
1.2090659141540527,
-2.8419432640075684,
-0.21915671229362488,
0.9777458310127258,
-0.4830246567726135,
-1.0184019804000854,
-1.981907606124878,
-0.9043097496032715,
1.2316601276397705,
0.4337644577026367,
-1.4176150560379028,
-0.0775287076830864,
1.9701248407363892,
-0.49479153752326965,
-0.8893828988075256,
-1.4819709062576294,
1.7628812789916992,
-1.1569868326187134,
-0.5023629069328308,
1.0665892362594604,
0.380581796169281,
0.8616085052490234,
1.566547155380249,
-0.08466020226478577,
-6.428647611755878e-05,
-0.4506562650203705,
1.4498881101608276,
-0.8292654752731323,
-1.5012402534484863,
-2.3441176414489746,
-0.1354956328868866,
0.9400366544723511,
-2.566408157348633,
-0.6355810761451721,
0.6913732290267944,
-1.6313157081604004,
-0.7377245426177979,
-0.6275296807289124,
1.2654041051864624,
-1.2346998453140259,
-0.9682437181472778,
1.750296950340271,
0.145521342754364,
0.3888598680496216,
-0.10642947256565094,
0.534409761428833,
-0.07756417989730835,
-0.36027759313583374,
0.45393145084381104,
0.48670390248298645,
-0.41557130217552185
],
"env": "",
"registered_at": 1769589229.8906083
},
"user_1769589397": {
"embedding": [
-0.4532654285430908,
0.9910935163497925,
0.7677441835403442,
0.6021982431411743,
-0.15526464581489563,
0.07699152082204819,
-0.20115968585014343,
1.1546334028244019,
1.3028098344802856,
-1.102020263671875,
-1.785357117652893,
1.0002834796905518,
0.29556989669799805,
-1.1847732067108154,
-1.6235555410385132,
-0.37263453006744385,
-1.0660096406936646,
1.1186366081237793,
-0.2739306390285492,
1.2053704261779785,
-0.4484007656574249,
-0.036067165434360504,
-0.22930052876472473,
0.7094787359237671,
-1.289236307144165,
0.6730620265007019,
0.139224573969841,
0.9508735537528992,
0.19451767206192017,
0.09167198091745377,
0.6681411266326904,
0.5114644169807434,
-0.41296282410621643,
-0.3286001980304718,
-0.13978855311870575,
-1.4886829853057861,
-1.125450849533081,
-0.5365853309631348,
-1.491755723953247,
-0.9122400879859924,
-0.336325466632843,
0.4180590510368347,
0.28993961215019226,
-0.18810254335403442,
-0.8575659990310669,
-0.7043600082397461,
0.1335042417049408,
0.7772237658500671,
0.5636520385742188,
-0.7948008179664612,
1.7150989770889282,
-0.13010169565677643,
-0.17901964485645294,
0.049516208469867706,
-0.3525894284248352,
0.47636479139328003,
0.4723852276802063,
0.21579991281032562,
0.4706135094165802,
-0.7862219214439392,
0.3285289406776428,
0.06317808479070663,
-0.44086384773254395,
-0.48760634660720825,
0.5548083782196045,
0.9824976921081543,
0.002366408007219434,
0.9341856837272644,
0.7644594311714172,
-0.4781777560710907,
0.140120267868042,
-0.27633413672447205,
0.2346642166376114,
1.050230860710144,
-1.269995927810669,
-0.05720380321145058,
1.291229248046875,
0.9839679002761841,
0.8129491209983826,
1.5021783113479614,
-0.3042735457420349,
-0.5572257041931152,
0.9156222343444824,
-1.9603447914123535,
0.43610018491744995,
0.4057847559452057,
0.7319568395614624,
0.20832139253616333,
-0.3430367410182953,
1.1169347763061523,
-1.3572204113006592,
-0.338941365480423,
0.68513023853302,
-0.5876723527908325,
0.028429267928004265,
1.647197961807251,
0.16790558397769928,
-0.39321064949035645,
-0.6376479864120483,
-0.8013231754302979,
0.2443818897008896,
-0.4631305932998657,
0.22423194348812103,
1.2424927949905396,
-0.29924842715263367,
0.8623120784759521,
-0.1876244992017746,
0.4357032775878906,
-0.1294589787721634,
-0.6075098514556885,
-0.13139747083187103,
0.7296662330627441,
-0.535290539264679,
0.36691513657569885,
0.7906659841537476,
1.353682279586792,
-0.09513506293296814,
-0.25815069675445557,
0.49696165323257446,
-0.8457471132278442,
-1.6415969133377075,
-1.4221503734588623,
-0.8390084505081177,
0.78926020860672,
-0.6399183869361877,
1.2397722005844116,
-0.4215489625930786,
-1.6843048334121704,
0.2801710367202759,
0.14025956392288208,
-0.07066306471824646,
0.6200811862945557,
0.06813270598649979,
1.0460718870162964,
-0.10868484526872635,
-0.4543164074420929,
-0.2009115219116211,
-1.5997940301895142,
-0.901277482509613,
-0.6989807486534119,
-0.6416334509849548,
0.6334083676338196,
-1.9596667289733887,
0.5712984204292297,
0.46919143199920654,
-0.29728618264198303,
-1.1560853719711304,
-1.0001498460769653,
-0.514187753200531,
0.5281404256820679,
-0.30581149458885193,
-0.509894073009491,
-0.5975268483161926,
1.3572251796722412,
-0.6662765145301819,
-0.42911258339881897,
-1.1632274389266968,
1.3836815357208252,
-0.3148840367794037,
-0.4249371290206909,
0.7550786733627319,
-0.05023616552352905,
0.4652675986289978,
0.5009594559669495,
-0.539340615272522,
0.5251657366752625,
-0.3844148814678192,
1.1907575130462646,
-0.05959271639585495,
-1.3751143217086792,
-1.4880049228668213,
0.07974031567573547,
1.0876556634902954,
-1.8819210529327393,
-0.33337870240211487,
0.8860157132148743,
-0.7781083583831787,
-0.18586120009422302,
0.36383724212646484,
-0.05233919247984886,
-1.4240131378173828,
-0.6472991704940796,
0.9354408383369446,
-0.22309261560440063,
0.8367215991020203,
-0.20836658775806427,
0.7580796480178833,
-0.06159410998225212,
-0.1761341243982315,
-0.4837302267551422,
-0.1933494508266449,
-0.23003722727298737
],
"env": "",
"registered_at": 1769589397.5840247
},
"user_1769589494": {
"embedding": [
0.23541471362113953,
0.667961597442627,
0.38707974553108215,
0.6673084497451782,
-1.869005560874939,
-0.4901138246059418,
-0.9352726936340332,
0.49656397104263306,
0.004735413007438183,
1.1503483057022095,
-0.7223904728889465,
1.1780078411102295,
-1.1934415102005005,
0.5933876633644104,
-0.047901105135679245,
-0.6350924372673035,
0.9101377725601196,
0.9945328235626221,
-0.6955628395080566,
-1.4766680002212524,
0.14297445118427277,
1.0183905363082886,
-0.5544767379760742,
0.7108471989631653,
0.12324491143226624,
0.8664625287055969,
-1.0339009761810303,
0.6388123035430908,
-0.3606623709201813,
1.1092636585235596,
-0.2134912759065628,
-1.0129042863845825,
1.1676888465881348,
-0.25849631428718567,
0.21622547507286072,
-0.21850265562534332,
-2.146343469619751,
0.9746832251548767,
-1.0417606830596924,
-1.118934988975525,
0.45158135890960693,
-0.12440077215433121,
0.9278182983398438,
0.673552393913269,
-1.4133691787719727,
-0.9833011031150818,
1.7980570793151855,
1.1249372959136963,
0.6850293278694153,
-0.4094180762767792,
1.3220067024230957,
-0.5562354922294617,
0.35797858238220215,
0.7082096338272095,
0.38267695903778076,
-0.3067215085029602,
-0.12430296093225479,
-1.3622304201126099,
-1.2127659320831299,
-0.14369715750217438,
0.744861900806427,
0.35735955834388733,
0.30824899673461914,
-0.3879246413707733,
0.332281231880188,
0.31966903805732727,
-0.014374539256095886,
0.37477824091911316,
1.2712546586990356,
-0.1365097314119339,
0.5229204893112183,
0.47963225841522217,
0.8237362504005432,
0.7043209671974182,
-1.673892855644226,
0.13583803176879883,
0.5652695298194885,
0.40299320220947266,
0.08790996670722961,
0.2492693066596985,
-0.4379039406776428,
-1.14923894405365,
-0.5844811797142029,
-1.132568359375,
0.49928411841392517,
-0.4650140404701233,
1.1566886901855469,
-0.07155625522136688,
0.36949872970581055,
0.31576940417289734,
-0.4941798746585846,
0.8808521628379822,
0.12892158329486847,
-0.3473222255706787,
-0.1342766135931015,
0.6350370645523071,
-1.524943470954895,
0.11389171332120895,
-0.14301487803459167,
-1.9267250299453735,
-1.5791492462158203,
-0.19560043513774872,
1.5311495065689087,
1.9668593406677246,
-0.964552104473114,
-1.3139442205429077,
-0.9792137145996094,
0.4413124918937683,
-0.18592560291290283,
-0.5387620329856873,
-0.7066377997398376,
0.9972496032714844,
-0.12376223504543304,
-0.6737706661224365,
0.7983350157737732,
0.5444274544715881,
-1.3038272857666016,
1.101620078086853,
-1.5507662296295166,
0.02854086272418499,
-0.6057300567626953,
-0.782597005367279,
0.3482932448387146,
-0.055229704827070236,
0.38987356424331665,
-0.35090646147727966,
-0.190815731883049,
-0.5883421301841736,
0.6471948027610779,
0.5951821804046631,
0.4943574070930481,
-0.1316496580839157,
-0.8007314205169678,
-0.13866537809371948,
-0.012848706915974617,
1.1189842224121094,
-1.1396784782409668,
-0.33659735321998596,
-0.27989667654037476,
0.15101654827594757,
-0.44554460048675537,
0.4468748867511749,
0.4023851454257965,
-0.37321993708610535,
-0.4136735200881958,
-0.22391735017299652,
-0.3109915256500244,
0.9604361057281494,
-0.6297188401222229,
1.3016139268875122,
0.36373990774154663,
-1.05316162109375,
0.41111207008361816,
1.8767585754394531,
-0.754970133304596,
0.16698729991912842,
-0.2632003128528595,
-0.4256270229816437,
1.7379480600357056,
1.2178281545639038,
-0.0028167024720460176,
0.42778730392456055,
-0.12732906639575958,
-0.3295230567455292,
0.36760953068733215,
0.057388786226511,
-0.4098236858844757,
0.9829326868057251,
-0.34538817405700684,
-1.3545023202896118,
-0.4676443040370941,
0.7782469987869263,
0.14342212677001953,
-1.7002856731414795,
0.4266798794269562,
-0.33054685592651367,
0.9089714884757996,
0.5873302221298218,
-0.9908685088157654,
-0.6938693523406982,
-1.5290637016296387,
-0.0892898365855217,
0.5326513648033142,
-0.07912395894527435,
0.4673354923725128,
-1.0052272081375122,
0.13853217661380768,
-0.08604929596185684,
0.3112524449825287,
-1.377512812614441,
-0.05614912137389183,
0.2633572220802307
],
"env": "",
"registered_at": 1769589494.0118024
}
}

67
config/voice.yaml Normal file
View File

@@ -0,0 +1,67 @@
# ROS 语音包配置文件
dashscope:
api_key: "sk-7215a5ab7a00469db4072e1672a0661e"  # SECURITY(review): live API key committed to version control — rotate this key and load it from an environment variable or a git-ignored file instead
asr:
model: "qwen3-asr-flash-realtime"
url: "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
llm:
model: "qwen3-vl-flash"
base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"
temperature: 0.7
max_tokens: 4096
max_history: 10
summary_trigger: 3
tts:
model: "cosyvoice-v3-flash"
voice: "longanyang"
audio:
microphone:
device_index: -1 # 使用系统默认输入设备
sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz避免重采样可能导致的问题
channels: 1 # 输入声道数单声道MONO适合语音采集
chunk: 1024
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
soundcard:
card_index: -1 # 使用默认声卡
device_index: -1 # 使用默认输出设备
sample_rate: 48000 # 输出采样率:默认 44100
channels: 2 # 输出声道数立体声2声道FL+FR
volume: 1.0 # 音量比例0.0-1.00.2表示20%音量)
tts:
source_sample_rate: 22050 # TTS服务固定输出采样率DashScope服务固定值不可修改
source_channels: 1 # TTS服务固定输出声道数DashScope服务固定值不可修改
ffmpeg_thread_queue_size: 4096 # ffmpeg输入线程队列大小增大以减少卡顿
force_stop_delay: 0.1 # 强制停止时的延迟(秒)
cleanup_timeout: 30.0 # 清理超时(秒)
terminate_timeout: 1.0 # 终止超时(秒)
interrupt_wait: 0.1 # 中断等待时间(秒)
vad:
vad_mode: 3 # VAD模式0-33最严格
silence_duration_ms: 1000 # 静音持续时长(毫秒)
min_energy_threshold: 300 # 最小能量阈值
system:
use_wake_word: true # 是否启用唤醒词检测
wake_word: "er gou" # 唤醒词(拼音)
session_timeout: 3.0 # 会话超时时间(秒)
shutup_keywords: "bi zui" # 闭嘴指令关键词(拼音,逗号分隔)
interrupt_command_queue_depth: 10 # 中断命令订阅的队列深度QoS
sv_enabled: false # 是否启用声纹识别
# sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
sv_threshold: 0.65 # 声纹识别阈值0.0-1.0,值越小越宽松,值越大越严格)
# sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json" # 声纹数据库保存路径JSON格式相对于ROS2包share目录
sv_speaker_db_path: "~/ros_learn/hivecore_robot_voice/config/speakers.json" # 声纹数据库保存路径JSON格式相对于ROS2包share目录
sv_buffer_size: 96000 # 声纹验证录音缓冲区大小样本数48kHz下2秒=96000
continue_without_image: true # 多模态意图skill_sequence/chat_camera未获取到图片时是否继续推理
camera:
image:
jpeg_quality: 85 # JPEG压缩质量0-10085是质量和大小平衡点
interfaces:
# root_path: "~/hivecore_robot_os1/hivecore_robot_interfaces/src" # 接口文件根目录,支持 ~ 展开和相对路径
root_path: "~/ros_learn/hivecore_robot_interfaces/src" # 接口文件根目录,支持 ~ 展开和相对路径

View File

@@ -0,0 +1,54 @@
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import SetEnvironmentVariable, RegisterEventHandler
from launch.event_handlers import OnProcessExit
from launch.actions import EmitEvent
from launch.events import Shutdown
import os
def generate_launch_description():
    """Launch the ASR node plus the voiceprint-registration node (which needs the
    ASR service), and shut the whole launch down once registration exits.

    Returns:
        LaunchDescription with the environment override, both nodes, and the
        exit handler.
    """
    # Install prefix of the interfaces package (custom msg/srv/action types).
    interfaces_install_path = os.path.expanduser('~/ros_learn/hivecore_robot_interfaces/install')
    # Ensure AMENT_PREFIX_PATH contains that prefix so the interface types resolve.
    # BUGFIX: compare against the colon-separated entries instead of a raw
    # substring test, which would wrongly match a prefix that merely appears
    # inside a longer path entry.
    ament_prefix_path = os.environ.get('AMENT_PREFIX_PATH', '')
    entries = ament_prefix_path.split(':') if ament_prefix_path else []
    if interfaces_install_path not in entries:
        if ament_prefix_path:
            ament_prefix_path = f'{ament_prefix_path}:{interfaces_install_path}'
        else:
            ament_prefix_path = interfaces_install_path
    # ASR + audio-input device node: provides the ASR and AudioData services.
    asr_audio_node = Node(
        package='robot_speaker',
        executable='asr_audio_node',
        name='asr_audio_node',
        output='screen'
    )
    # Voiceprint registration node.
    register_speaker_node = Node(
        package='robot_speaker',
        executable='register_speaker_node',
        name='register_speaker_node',
        output='screen'
    )
    # When the registration node exits, shut down the entire launch.
    register_exit_handler = RegisterEventHandler(
        OnProcessExit(
            target_action=register_speaker_node,
            on_exit=[
                EmitEvent(event=Shutdown(reason='注册完成,关闭所有节点'))
            ]
        )
    )
    return LaunchDescription([
        SetEnvironmentVariable('AMENT_PREFIX_PATH', ament_prefix_path),
        asr_audio_node,
        register_speaker_node,
        register_exit_handler,
    ])

46
launch/voice.launch.py Normal file
View File

@@ -0,0 +1,46 @@
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import SetEnvironmentVariable
import os
def generate_launch_description():
    """Launch the full voice-interaction stack (ASR, TTS, and main logic nodes);
    all node parameters are read from voice.yaml by the nodes themselves.

    Returns:
        LaunchDescription with the environment override and the three nodes.
    """
    # Install prefix of the interfaces package (custom msg/srv/action types).
    interfaces_install_path = os.path.expanduser('~/ros_learn/hivecore_robot_interfaces/install')
    # Ensure AMENT_PREFIX_PATH contains that prefix so the interface types resolve.
    # BUGFIX: compare against the colon-separated entries instead of a raw
    # substring test, which would wrongly match a prefix that merely appears
    # inside a longer path entry.
    ament_prefix_path = os.environ.get('AMENT_PREFIX_PATH', '')
    entries = ament_prefix_path.split(':') if ament_prefix_path else []
    if interfaces_install_path not in entries:
        if ament_prefix_path:
            ament_prefix_path = f'{ament_prefix_path}:{interfaces_install_path}'
        else:
            ament_prefix_path = interfaces_install_path
    return LaunchDescription([
        SetEnvironmentVariable('AMENT_PREFIX_PATH', ament_prefix_path),
        # ASR + audio-input device node (also serves VAD events via the cloud ASR's VAD).
        Node(
            package='robot_speaker',
            executable='asr_audio_node',
            name='asr_audio_node',
            output='screen'
        ),
        # TTS + audio-output device node.
        Node(
            package='robot_speaker',
            executable='tts_audio_node',
            name='tts_audio_node',
            output='screen'
        ),
        # Main business-logic node.
        Node(
            package='robot_speaker',
            executable='robot_speaker_node',
            name='robot_speaker_node',
            output='screen'
        ),
    ])

View File

@@ -2,13 +2,26 @@
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
<package format="3">
<name>robot_speaker</name>
<version>0.0.0</version>
<description>TODO: Package description</description>
<version>0.0.1</version>
<description>语音识别和合成ROS2包</description>
<maintainer email="mzebra@foxmail.com">mzebra</maintainer>
<license>Apache-2.0</license>
<depend>rclpy</depend>
<depend>example_interfaces</depend>
<depend>std_msgs</depend>
<depend>sensor_msgs</depend>
<depend>cv_bridge</depend>
<depend>ament_index_python</depend>
<depend>interfaces</depend>
<buildtool_depend>ament_cmake</buildtool_depend>
<buildtool_depend>ament_cmake_python</buildtool_depend>
<exec_depend>python3-pyaudio</exec_depend>
<exec_depend>python3-requests</exec_depend>
<exec_depend>python3-edge-tts</exec_depend>
<exec_depend>python3-webrtcvad</exec_depend>
<exec_depend>python3-yaml</exec_depend>
<exec_depend>python3-pypinyin</exec_depend>
<test_depend>ament_copyright</test_depend>
<test_depend>ament_flake8</test_depend>
@@ -16,6 +29,6 @@
<test_depend>python3-pytest</test_depend>
<export>
<build_type>ament_python</build_type>
<build_type>ament_cmake</build_type>
</export>
</package>

9
requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
dashscope>=1.20.0
openai>=1.0.0
pyaudio>=0.2.11
pypinyin>=0.49.0
rclpy>=3.0.0
Pillow>=10.0.0
numpy>=1.24.0,<2.0.0 # cv_bridge需要NumPy 1.xNumPy 2.x会导致段错误
PyYAML>=6.0
funasr>=1.0.0

View File

@@ -0,0 +1,6 @@
# robot_speaker package

View File

@@ -0,0 +1,24 @@
# Bridge package for connecting LLM outputs to brain execution.

View File

@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
桥接LLM技能序列到小脑ExecuteBtAction并转发反馈/结果。
"""
import json
import os
import re
import rclpy
from rclpy.node import Node
from rclpy.action import ActionClient
from std_msgs.msg import String
from ament_index_python.packages import get_package_share_directory
from interfaces.action import ExecuteBtAction
from interfaces.srv import BtRebuild
class SkillBridgeNode(Node):
def __init__(self):
    """Set up the action client, rebuild service client, topic wiring, and the skill whitelist."""
    super().__init__('skill_bridge_node')
    # Action client towards the behavior-tree executor.
    self._action_client = ActionClient(self, ExecuteBtAction, '/execute_bt_action')
    # Monotonically increasing epoch stamped onto each dispatched goal.
    self._current_epoch = 1
    # Service client used to ask the cerebrum to rebuild its behavior tree now.
    self.run_trigger_ = self.create_client(BtRebuild, '/cerebrum/rebuild_now')
    # Count of rebuild requests issued so far (logging only).
    self.rebuild_requests = 0
    # Whitelist of skill names loaded from the brain package's robot_skills.yaml.
    self._allowed_skills = self._load_allowed_skills()
    # LLM-produced skill sequences arrive as JSON strings on this topic.
    self.skill_seq_sub = self.create_subscription(
        String, '/llm_skill_sequence', self._on_skill_sequence_received, 10
    )
    # Execution feedback and final results are re-published as JSON strings.
    self.feedback_pub = self.create_publisher(String, '/skill_execution_feedback', 10)
    self.result_pub = self.create_publisher(String, '/skill_execution_result', 10)
    self.get_logger().info('SkillBridgeNode started')
def _on_skill_sequence_received(self, msg: String):
    """Validate an LLM skill sequence (JSON on /llm_skill_sequence) and dispatch it.

    Sequences containing vision/arm/gripper skills trigger the dedicated
    dual-arm grasp tree; everything else is forwarded as a generic rebuild
    carrying the skill names and per-skill parameter strings.

    Args:
        msg: String message whose ``data`` is a JSON document of the form
            ``{"sequence": [{"skill": ..., "execution": ..., ...}]}``.
    """
    raw = (msg.data or "").strip()
    if not raw:
        return
    if not self._allowed_skills:
        self.get_logger().warning("No skill whitelist loaded; reject all sequences")
        return
    # Parse the payload. BUGFIX: the previous version fell through with
    # sequence_list=None after a JSONDecodeError and then crashed iterating
    # None inside the dispatch block (TypeError swallowed by the broad
    # except); reject non-JSON input explicitly up front instead.
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, ValueError) as e:
        self.get_logger().error(f"Skill sequence is not valid JSON; ignoring: {e}")
        return
    sequence_list = self._parse_json_sequence(data)
    if sequence_list is None:
        self.get_logger().error("Invalid skill sequence format; must be JSON or plain text")
        return
    # Dispatch the validated sequence.
    try:
        skill_names = [item["skill"] for item in sequence_list]
        if any(skill in skill_names for skill in ["VisionObjectRecognition", "Arm", "GripperCmd0"]):
            # Vision-grasp skills are routed to a dedicated behavior tree.
            self.get_logger().info(f"Skill sequence contains special skills, triggering rebuild: {skill_names}")
            self.rebuild_now("Trigger", "bt_vision_grasp_dual_arm", "")
        else:
            # Flatten each item's parameter dict into "key: value" lines.
            skill_params = []
            for item in sequence_list:
                p = item.get("parameters")
                params = ""
                if isinstance(p, dict):
                    lines = []
                    for k, v in p.items():
                        lines.append(f"{k}: {v}")
                    if lines:
                        params = "\n".join(lines) + "\n"
                skill_params.append(params)
            self.get_logger().info(f"Sending skill sequence: {skill_names}")
            self.get_logger().info(f"Sending skill parameters: {skill_params}")
            names_str = ", ".join(skill_names)
            params_str = ", ".join(skill_params)
            self.rebuild_now("Remote", names_str, params_str)
    except Exception as e:
        self.get_logger().error(f"Error processing skill sequence: {e}")
def _load_allowed_skills(self) -> set[str]:
    """Read the skill whitelist from the brain package's robot_skills.yaml.

    Returns:
        Set of skill names; empty when the file is missing or unreadable
        (in which case every incoming sequence is rejected upstream).
    """
    try:
        skill_path = os.path.join(
            get_package_share_directory("brain"), "config", "robot_skills.yaml"
        )
        if not os.path.exists(skill_path):
            return set()
        import yaml
        with open(skill_path, "r", encoding="utf-8") as f:
            entries = yaml.safe_load(f) or []
        # Keep only dict entries that carry a non-empty "name" field.
        names = set()
        for entry in entries:
            if isinstance(entry, dict) and entry.get("name"):
                names.add(str(entry["name"]))
        return names
    except Exception as e:
        self.get_logger().warning(f"Load skills failed: {e}")
        return set()
def _extract_skill_sequence(self, text: str) -> tuple[str, list[str]]:
    """Extract CamelCase skill tokens from free text.

    Tokens are split on commas, semicolons, or whitespace; only tokens that
    look like CamelCase identifiers count as skills.

    Returns:
        (comma-joined skill string, tokens absent from the whitelist);
        ("", []) when nothing resembles a skill name.
    """
    camel_case = re.compile(r'^[A-Z][A-Za-z0-9]*$')
    skills = [
        token
        for token in re.split(r'[,\s;]+', text.strip())
        if camel_case.match(token)
    ]
    if not skills:
        return "", []
    unknown = [name for name in skills if name not in self._allowed_skills]
    return ",".join(skills), unknown
def _parse_json_sequence(self, data: dict) -> list[dict] | None:
    """Validate a decoded ``{"sequence": [...]}`` payload against the whitelist.

    Non-dict entries and unknown skills are silently dropped; "execution"
    falls back to "serial" and "body_id" to None when invalid.

    Returns:
        Normalized list of item dicts, or None when the payload is malformed
        or nothing valid remains.
    """
    if not isinstance(data, dict):
        return None
    sequence = data.get("sequence")
    if not isinstance(sequence, list):
        return None
    normalized = []
    for entry in sequence:
        if not isinstance(entry, dict):
            continue
        skill = entry.get("skill")
        if not skill or skill not in self._allowed_skills:
            continue
        execution = entry.get("execution", "serial")
        if execution not in ("serial", "parallel"):
            execution = "serial"
        # Only numeric body ids 0/1/2 (or null) are accepted, matching intent routing.
        body_id = entry.get("body_id")
        if body_id not in (0, 1, 2, None):
            body_id = None
        normalized.append({
            "skill": skill,
            "execution": execution,
            "body_id": body_id,
            "parameters": entry.get("parameters"),
        })
    return normalized or None
def _send_skill_sequence(self, skill_sequence: str):
    """Send a comma-joined skill sequence as an ExecuteBtAction goal and wait for the result.

    NOTE(review): rclpy.spin_until_future_complete() spins this node on the
    calling thread; if this method is ever called from an executor callback it
    can deadlock — confirm it is only invoked from a dedicated worker thread.
    """
    # Bail out early when the action server is not up.
    if not self._action_client.wait_for_server(timeout_sec=2.0):
        self.get_logger().error('ExecuteBtAction server unavailable')
        return
    goal = ExecuteBtAction.Goal()
    # Monotonically increasing epoch lets the server discard stale goals.
    goal.epoch = self._current_epoch
    self._current_epoch += 1
    goal.action_name = skill_sequence
    goal.calls = []
    self.get_logger().info(f"Dispatch skill sequence: {skill_sequence}")
    send_future = self._action_client.send_goal_async(goal, feedback_callback=self._feedback_callback)
    rclpy.spin_until_future_complete(self, send_future, timeout_sec=5.0)
    if not send_future.done():
        self.get_logger().warning("Send goal timed out")
        return
    goal_handle = send_future.result()
    if not goal_handle or not goal_handle.accepted:
        self.get_logger().error("Goal rejected")
        return
    # Block without a timeout until the behaviour tree finishes executing.
    result_future = goal_handle.get_result_async()
    rclpy.spin_until_future_complete(self, result_future)
    if result_future.done():
        self._handle_result(result_future.result())
def _feedback_callback(self, feedback_msg):
    """Republish ExecuteBtAction feedback as an ASCII-JSON String message."""
    feedback = feedback_msg.feedback
    summary = {
        "stage": feedback.stage,
        "current_skill": feedback.current_skill,
        "progress": float(feedback.progress),
        "detail": feedback.detail,
        "epoch": int(feedback.epoch),
    }
    out = String()
    out.data = json.dumps(summary, ensure_ascii=True)
    self.feedback_pub.publish(out)
def _handle_result(self, result_wrapper):
    """Publish the final action result as an ASCII-JSON String; ignore empty results."""
    result = result_wrapper.result
    if not result:
        return
    out = String()
    out.data = json.dumps({
        "success": bool(result.success),
        "message": result.message,
        "total_skills": int(result.total_skills),
        "succeeded_skills": int(result.succeeded_skills),
    }, ensure_ascii=True)
    self.result_pub.publish(out)
def rebuild_now(self, type: str, config: str, param: str) -> None:
    """Ask the behaviour-tree node to rebuild itself via the BtRebuild service.

    The call is asynchronous; _rebuild_done_callback handles the response.
    Drops the request (with an error log) when the service is not ready.
    """
    client = self.run_trigger_
    if not client.service_is_ready():
        self.get_logger().error('Rebuild service not ready')
        return
    self.rebuild_requests += 1
    log = self.get_logger()
    log.info(f'Rebuild BehaviorTree now. Total requests: {self.rebuild_requests}')
    request = BtRebuild.Request()
    request.type = type
    request.config = config
    request.param = param
    log.info(f'Calling rebuild service... request info: {request}')
    client.call_async(request).add_done_callback(self._rebuild_done_callback)
def _rebuild_done_callback(self, future):
try:
response = future.result()
if response.success:
self.get_logger().info('Rebuild request successful')
else:
self.get_logger().warning(f'Rebuild request failed: {response.message}')
except Exception as e:
self.get_logger().error(f'Rebuild request exception: {str(e)}')
self.get_logger().info(f"Rebuild requested. Total rebuild requests: {str(self.rebuild_requests)}")
def main(args=None):
    """Entry point: spin SkillBridgeNode until shutdown, always releasing resources.

    FIX: the previous version skipped destroy_node()/shutdown() when spin()
    raised (e.g. Ctrl-C), leaking the node and the rclpy context. The spin is
    now wrapped in try/finally so cleanup runs on every exit path.
    """
    rclpy.init(args=args)
    node = SkillBridgeNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass  # Ctrl-C is a normal way to stop the node.
    finally:
        node.destroy_node()
        rclpy.shutdown()
# Allow direct execution (`python3 skill_bridge_node.py`) in addition to the
# ros2-run console-script entry point.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,28 @@
"""核心模块"""

View File

@@ -0,0 +1,130 @@
"""
对话历史管理模块
"""
from dataclasses import dataclass
import threading
@dataclass
class LLMMessage:
    """A single chat message exchanged with the LLM."""
    role: str  # "user", "assistant", "system"
    content: str  # message text


class ConversationHistory:
    """Conversation history manager for real-time voice chat.

    Keeps a rolling window of user/assistant turns, compresses overflow into a
    plain-text summary, and supports a two-phase "pending turn" protocol so a
    user message only enters history after the assistant reply is confirmed.
    All public methods are thread-safe via a single non-reentrant lock.
    """
    def __init__(self, max_history: int, summary_trigger: int):
        self.max_history = max_history          # turns kept verbatim (<=0 disables history)
        self.summary_trigger = summary_trigger  # turn count that triggers compression
        self.conversation_history: list[LLMMessage] = []
        self.summary: str | None = None
        # Pending-confirmation mechanism
        self._pending_user_message: LLMMessage | None = None  # user message awaiting commit
        self._lock = threading.Lock()  # guards all state above (NOT reentrant)

    def start_turn(self, user_content: str):
        """Begin a turn: stash the user message until the LLM reply is committed."""
        with self._lock:
            self._pending_user_message = LLMMessage(role="user", content=user_content)

    def commit_turn(self, assistant_content: str) -> bool:
        """Finish the pending turn, appending both messages to history.

        Returns False (and drops any pending user message) when there is no
        pending turn or the assistant reply is empty/whitespace.
        """
        with self._lock:
            if self._pending_user_message is None:
                return False
            if not assistant_content or not assistant_content.strip():
                self._pending_user_message = None
                return False
            self.conversation_history.append(self._pending_user_message)
            self.conversation_history.append(
                LLMMessage(role="assistant", content=assistant_content.strip())
            )
            self._pending_user_message = None
            self._maybe_compress()
            return True

    def cancel_turn(self):
        """Discard the pending user message (e.g. on interruption) so partial turns never pollute history."""
        with self._lock:
            self._pending_user_message = None

    def add_message(self, role: str, content: str):
        """Append a message directly, discarding any pending turn first.

        BUGFIX: this previously called self.cancel_turn() while already holding
        self._lock; threading.Lock is non-reentrant, so every call deadlocked.
        The pending message is now cleared inline under one lock acquisition.
        """
        with self._lock:
            self._pending_user_message = None
            self.conversation_history.append(LLMMessage(role=role, content=content))
            self._maybe_compress()

    def get_messages(self) -> list[LLMMessage]:
        """Return summary (as a system message), the recent window, and any pending user message."""
        with self._lock:
            messages = []
            if self.summary:
                messages.append(LLMMessage(role="system", content=self.summary))
            if self.max_history > 0:
                # Two entries (user + assistant) per retained turn.
                messages.extend(self.conversation_history[-self.max_history * 2:])
            if self._pending_user_message is not None:
                messages.append(self._pending_user_message)
            return messages

    def has_pending_turn(self) -> bool:
        """True when a user message is awaiting commit."""
        with self._lock:
            return self._pending_user_message is not None

    def _maybe_compress(self):
        """Fold overflow history into a text summary. Caller must hold self._lock."""
        if self.max_history <= 0:
            self.conversation_history.clear()
            return
        max_len = self.summary_trigger * 2
        if len(self.conversation_history) <= max_len:
            return
        old = self.conversation_history[:-max_len]
        self.conversation_history = self.conversation_history[-max_len:]
        summary_text = []
        for msg in old:
            summary_text.append(f"{msg.role}: {msg.content}")
        # Keep only the 10 most recent overflow lines to bound summary growth.
        compressed = "对话摘要:\n" + "\n".join(summary_text[-10:])
        if self.summary:
            self.summary += "\n" + compressed
        else:
            self.summary = compressed

    def clear(self):
        """Reset history, summary and the pending message."""
        with self._lock:
            self.conversation_history.clear()
            self.summary = None
            self._pending_user_message = None

View File

@@ -0,0 +1,272 @@
from dataclasses import dataclass
from typing import Optional
import os
import yaml
import json
from ament_index_python.packages import get_package_share_directory
from pypinyin import pinyin, Style
from robot_speaker.core.skill_interface_parser import SkillInterfaceParser
@dataclass
class IntentResult:
    """Outcome of routing a single user utterance."""
    intent: str  # "skill_sequence" | "kb_qa" | "chat_text" | "chat_camera"
    text: str  # the utterance that was routed
    need_camera: bool  # whether an image must be captured before the LLM call
    camera_mode: Optional[str]  # "top" | "left" | "right" | "hand_r" | None
    system_prompt: Optional[str]  # LLM system prompt; None for kb_qa (no LLM involved)
class IntentRouter:
    """Routes a recognised utterance to an intent (skill_sequence / kb_qa / chat_*).

    Matching is done on the pinyin transliteration of the Chinese text so that
    ASR homophone errors still hit the keyword lists.
    """
    def __init__(self):
        # Photo-capture keywords (pinyin): 拍照 / 拍个照 / 拍张照
        self.camera_capture_keywords = [
            "pai zhao", "pai ge zhao", "pai zhang zhao"
        ]
        # Action-verb list (pinyin) — used to detect skill-sequence intent
        self.action_verbs = [
            "zou", "zou liang bu", "zou ji bu",  # walk / walk a couple of steps
            "na", "na qi", "na zhu",  # take / pick up / hold
            "ban", "ban yun",  # carry / transport
            "zhua", "zhua qu",  # grab / grasp
            "tui", "tui dong",  # push
            "la", "la dong",  # pull
            "yi dong", "qian jin", "hou tui",  # move / forward / backward
            "kong zhi", "cao zuo",  # control / operate
            "fang xia", "fang zhi",  # put down / place
            "ju qi", "sheng qi",  # lift / raise
            "jia zhua", "jia qi", "jia",  # gripper / clamp up / clamp
            "shen you bi", "shen zuo bi", "shen chu", "shen shou",  # extend right/left arm, reach out
            "zhuan quan", "zhuan yi quan", "zhuan",  # spin around / turn
        ]
        # Knowledge-base trigger phrases (pinyin): who are you / your name / dance / what can you do
        self.kb_keywords = [
            "ni shi shui", "ni de ming zi", "tiao ge wu", "ni jiao sha", "ni hui gan", "ni neng gan"
        ]
        self._cached_skill_names: list[str] | None = None  # lazy cache for skill names
        self._cached_kb_data: list[dict] | None = None  # lazy cache for knowledge base
        interfaces_root = self._get_interfaces_root()
        self.interface_parser = SkillInterfaceParser(interfaces_root)

    def _get_interfaces_root(self) -> str:
        """Read the skill-interface root directory from voice.yaml; raises ValueError on any problem."""
        try:
            robot_speaker_share = get_package_share_directory("robot_speaker")
            config_path = os.path.join(robot_speaker_share, "config", "voice.yaml")
            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f) or {}
            interfaces_config = config.get("interfaces", {})
            root_path = interfaces_config.get("root_path", "")
            if not root_path:
                raise ValueError("interfaces.root_path 未在配置文件中配置")
            if root_path.startswith("~"):
                root_path = os.path.expanduser(root_path)
            if not os.path.isabs(root_path):
                # Relative paths are resolved against the share directory's parent.
                config_dir = os.path.dirname(robot_speaker_share)
                root_path = os.path.join(config_dir, root_path)
            abs_path = os.path.abspath(root_path)
            if not os.path.exists(abs_path):
                raise ValueError(f"接口文件根目录不存在: {abs_path}")
            return abs_path
        except Exception as e:
            # Re-raised as ValueError so the caller fails fast at construction time.
            raise ValueError(f"读取接口文件根目录失败: {e}")

    def _load_brain_skill_names(self) -> list[str]:
        """Return skill names via the interface parser, cached after the first call."""
        if self._cached_skill_names is not None:
            return self._cached_skill_names
        skill_names = self.interface_parser.get_skill_names()
        self._cached_skill_names = skill_names
        return skill_names

    def to_pinyin(self, text: str) -> str:
        """Transliterate the CJK characters of *text* to space-separated lowercase pinyin."""
        # Keep only characters in the CJK Unified Ideographs range.
        chars = [c for c in text if '\u4e00' <= c <= '\u9fa5']
        if not chars:
            return ""
        py_list = pinyin(''.join(chars), style=Style.NORMAL)
        return ' '.join([item[0] for item in py_list]).lower().strip()

    def is_skill_sequence_intent(self, text: str, text_pinyin: str | None = None) -> bool:
        """True when any action verb appears as a contiguous word sequence in the pinyin."""
        if text_pinyin is None:
            text_pinyin = self.to_pinyin(text)
        # Exact match: the action verb must appear as a complete word sequence.
        text_words = text_pinyin.split()
        for action in self.action_verbs:
            action_words = action.split()
            # Check whether the verb's word sequence is a contiguous subsequence.
            for i in range(len(text_words) - len(action_words) + 1):
                if text_words[i:i+len(action_words)] == action_words:
                    return True
        return False

    def check_camera_command(self, text: str, text_pinyin: str | None = None) -> tuple[bool, Optional[str]]:
        """Check for a photo command; returns (needs camera, camera mode)."""
        if not text:
            return False, None
        if text_pinyin is None:
            text_pinyin = self.to_pinyin(text)
        # NOTE(review): this is a substring containment check on the pinyin
        # string, not a strict word-boundary match.
        if any(keyword in text_pinyin for keyword in self.camera_capture_keywords):
            return True, self.detect_camera_mode(text, text_pinyin)
        return False, None

    def detect_camera_mode(self, text: str, text_pinyin: str | None = None) -> str:
        """Detect the camera position matching the driver's position values: left/right/top/hand_r.

        Matching order matters: left/right arm phrases are checked before the
        bare "shou" (hand) substring, and the default is "top".
        """
        if text_pinyin is None:
            text_pinyin = self.to_pinyin(text)
        if any(kw in text_pinyin for kw in ["zuo shou", "zuo bi", "zuo bian", "zuo shou bi"]):
            return "left"
        if any(kw in text_pinyin for kw in ["you shou", "you bi", "you bian", "you shou bi"]):
            return "right"
        if any(kw in text_pinyin for kw in ["shou bu", "shou", "shou xiang ji", "shou bi xiang ji"]):
            return "hand_r"
        if any(kw in text_pinyin for kw in ["tou", "nao dai", "ding bu", "shang fang"]):
            return "top"
        return "top"

    def build_skill_prompt(self, execution_status: Optional[str] = None) -> str:
        """Build the planner system prompt: skill whitelist + last-run status + JSON output spec."""
        skills = self._load_brain_skill_names()
        skills_text = ", ".join(skills) if skills else ""
        # Guard clause appended to the prompt so the LLM stays within the whitelist.
        skill_guard = (
            "【技能限制】只能使用以下技能名称:" + skills_text
            if skills_text
            else "【技能限制】技能列表不可用,请不要输出任何技能名称。"
        )
        execution_hint = ""
        if execution_status:
            execution_hint = f"【上一轮执行状态】{execution_status}\n请参考上述执行状态,根据成功/失败信息调整本次技能序列。\n"
        else:
            execution_hint = "【注意】这是首次执行或没有上一轮执行状态,请根据当前图片和用户请求规划技能序列。\n"
        skill_params_doc = self.interface_parser.generate_params_documentation()
        return (
            "你是机器人任务规划器。\n"
            "本任务必须拍照。请根据用户请求选择使用哪个相机拍照,并结合当前环境信息生成简洁、可执行的技能序列。\n"
            "如果用户明确要求或者任务明显需要双手/双臂协作(如扶稳+操作、抓取大体积的物体),必须规划双手技能。\n"
            + execution_hint
            + "\n"
            "【规划要求】\n"
            "1. execution规划判断技能之间的执行关系\n"
            " - serial串行技能必须按顺序执行前一个完成后再执行下一个\n"
            " - parallel并行技能可以同时执行\n"
            "2. parameters规划根据目标物距离和任务需求规划具体参数值\n"
            " - parameters字典必须包含该技能接口文件目标字段的所有字段\n"
            "【输出格式要求】\n"
            "必须输出JSON格式包含sequence数组。每个技能对象包含3个一级字段\n"
            "1. skill: 技能名称(字符串)\n"
            "2. execution: 执行方式serial串行或 parallel并行\n"
            "3. parameters: 参数字典包含该技能接口文件目标字段的所有字段并填入合理的预测值。如果技能无参数使用null。\n"
            "\n"
            "注意一级字段skill, execution, parameters是固定结构。\n"
            "\n"
            "【技能参数说明】\n"
            + skill_params_doc +
            "\n"
            "示例格式:\n"
            "{\n"
            ' "sequence": [\n'
            ' {"skill": "MoveWheel", "execution": "serial", "parameters": {"move_distance": 1.5, "move_angle": 0.0}},\n'
            ' {"skill": "Arm", "execution": "serial", "parameters": {"body_id": 0, "data_type": 1, "data_length": 6, "command_id": 0, "frame_time_stamp": 0, "data_array": [0.1, 0.2, 0.3, 0.0, 0.0, 0.0]}},\n'
            ' {"skill": "GripperCmd0", "execution": "parallel", "parameters": {"loc": 128, "speed": 100, "torque": 80, "mode": 1}}\n'
            " ]\n"
            "}\n"
            + skill_guard
        )

    def build_chat_prompt(self, need_camera: bool) -> str:
        """Return the chat system prompt; the camera variant focuses on spatial relations."""
        if need_camera:
            return (
                "你是一个机器人视觉助理,擅长分析图片中物体的相对位置和空间关系。\n"
                "请结合图片内容,重点描述物体之间的相对位置(如左右、前后、上下、远近),仅基于可观察信息回答。\n"
                "回答应简短、客观不要超过100个token。"
            )
        return (
            "你是一个表达清晰、语气自然的真人助理。\n"
            "请简短地与用户对话不要超过100个token。"
        )

    def _load_kb_data(self) -> list[dict]:
        """Load knowledge-base entries from knowledge.json (cached; empty list on failure)."""
        if self._cached_kb_data is not None:
            return self._cached_kb_data
        kb_data = []
        try:
            robot_speaker_share = get_package_share_directory("robot_speaker")
            kb_path = os.path.join(robot_speaker_share, "config", "knowledge.json")
            with open(kb_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            kb_data = data["entries"]
        except Exception as e:
            # Best-effort: a missing/broken KB just disables kb_qa answers.
            kb_data = []
        self._cached_kb_data = kb_data
        return kb_data

    def search_kb(self, text: str) -> Optional[str]:
        """Look up the knowledge base; returns the first matching answer, else None."""
        if not text:
            return None
        text_pinyin = self.to_pinyin(text)
        kb_data = self._load_kb_data()
        for entry in kb_data:
            patterns = entry["patterns"]
            for pattern in patterns:
                # Substring match against the utterance's pinyin.
                if pattern in text_pinyin:
                    answer = entry["answer"]
                    if answer:
                        return answer
        return None

    def build_default_system_prompt(self) -> str:
        """Fallback system prompt used when no intent-specific prompt applies."""
        return (
            "你是一个工厂专业的助手。\n"
            "- 当用户发送图片时,请仔细观察图片内容,结合用户的问题或描述,提供简短、专业的回答。\n"
            "- 当用户没有发送图片时,请自然、友好地与用户对话。\n"
            "请根据对话模式调整你的回答风格。"
        )

    def route(self, text: str) -> IntentResult:
        """Classify *text* into skill_sequence / kb_qa / chat_camera / chat_text."""
        text_pinyin = self.to_pinyin(text)
        need_camera, camera_mode = self.check_camera_command(text, text_pinyin)
        if self.is_skill_sequence_intent(text, text_pinyin):
            # Skill-sequence intent always needs a camera; reuse detect_camera_mode
            # (user-specified camera if any, otherwise default "top").
            skill_camera_mode = self.detect_camera_mode(text, text_pinyin)
            return IntentResult(
                intent="skill_sequence",
                text=text,
                need_camera=True,
                camera_mode=skill_camera_mode,
                system_prompt=self.build_skill_prompt()
            )
        # NOTE(review): substring containment on the pinyin string, not a
        # strict word-boundary match.
        if any(keyword in text_pinyin for keyword in self.kb_keywords):
            return IntentResult(
                intent="kb_qa",
                text=text,
                need_camera=False,
                camera_mode=None,
                system_prompt=None  # kb_qa bypasses the LLM, so no system prompt
            )
        return IntentResult(
            intent="chat_camera" if need_camera else "chat_text",
            text=text,
            need_camera=need_camera,
            camera_mode=camera_mode,
            system_prompt=self.build_chat_prompt(need_camera)
        )

View File

@@ -0,0 +1,236 @@
"""声纹注册独立节点:运行完成后退出"""
import os
import time
import yaml
import numpy as np
import threading
import queue
import rclpy
from rclpy.node import Node
from ament_index_python.packages import get_package_share_directory
from interfaces.srv import ASRRecognize, AudioData, VADEvent
from robot_speaker.core.speaker_verifier import SpeakerVerificationClient
from pypinyin import pinyin, Style
class RegisterSpeakerNode(Node):
    """One-shot voiceprint registration node: records a wake-word utterance,
    extracts a speaker embedding, saves it, then shuts the process down.

    Threading model: a background thread polls the VAD service and pushes
    events into a queue; a 0.1 s ROS timer (_main_loop) drains the queue and
    drives a small state machine so no service future is awaited off-thread.
    """
    def __init__(self):
        super().__init__('register_speaker_node')
        self._load_config()
        # Service clients for ASR, raw-audio capture and VAD events.
        self.asr_client = self.create_client(ASRRecognize, '/asr/recognize')
        self.audio_data_client = self.create_client(AudioData, '/asr/audio_data')
        self.vad_client = self.create_client(VADEvent, '/vad/event')
        self.get_logger().info('等待服务启动...')
        self.asr_client.wait_for_service(timeout_sec=10.0)
        self.audio_data_client.wait_for_service(timeout_sec=10.0)
        self.vad_client.wait_for_service(timeout_sec=10.0)
        self.get_logger().info('所有服务已就绪')
        self.sv_client = SpeakerVerificationClient(
            model_path=self.sv_model_path,
            threshold=self.sv_threshold,
            speaker_db_path=self.sv_speaker_db_path,
            logger=self.get_logger()
        )
        self.registered = False      # set once an embedding is stored
        self.shutting_down = False   # stops the VAD thread
        self.get_logger().info("声纹注册节点启动,请说唤醒词开始注册(例如:'二狗我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去'")
        # Queue hands VAD events between threads, avoiding
        # spin_until_future_complete calls from a worker thread.
        self.vad_event_queue = queue.Queue()
        self.recording = False               # audio-capture state flag
        self.pending_asr_future = None       # in-flight ASR request
        self.pending_audio_future = None     # in-flight AudioData request
        self.state = "waiting_speech"        # state machine: waiting_speech / waiting_asr / waiting_audio
        self.vad_thread = threading.Thread(target=self._vad_event_worker, daemon=True)
        self.vad_thread.start()
        self.timer = self.create_timer(0.1, self._main_loop)

    def _load_config(self):
        """Load speaker-verification paths and the wake word from voice.yaml."""
        config_file = os.path.join(
            get_package_share_directory('robot_speaker'),
            'config',
            'voice.yaml'
        )
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        system = config['system']
        self.sv_model_path = os.path.expanduser(system['sv_model_path'])
        self.sv_threshold = system['sv_threshold']
        self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
        self.wake_word = system['wake_word']

    def _vad_event_worker(self):
        """VAD listener thread: receives events and enqueues them; never calls
        spin_until_future_complete (busy-waits on the future instead)."""
        while not self.registered and not self.shutting_down:
            try:
                request = VADEvent.Request()
                request.command = "wait"
                request.timeout_ms = 1000
                future = self.vad_client.call_async(request)
                # Poll the future with a small margin over the service timeout.
                start_time = time.time()
                while not future.done() and (time.time() - start_time) < 1.5:
                    time.sleep(0.01)
                if not future.done() or self.registered or self.shutting_down:
                    continue
                response = future.result()
                if response.success and response.event in ["speech_started", "speech_stopped"]:
                    # Hand the event to the main thread via the queue.
                    try:
                        self.vad_event_queue.put(response.event, timeout=0.1)
                    except queue.Full:
                        self.get_logger().warn(f"[VAD] 事件队列已满,丢弃事件: {response.event}")
            except Exception as e:
                if not self.shutting_down:
                    self.get_logger().error(f"[VAD] 线程异常: {e}")
                break

    def _start_recording(self):
        """Issue an async start-capture request; the future is handled by _main_loop."""
        request = AudioData.Request()
        request.command = "start"
        return self.audio_data_client.call_async(request)

    def _to_pinyin(self, text: str) -> str:
        """Transliterate the CJK characters of *text* to space-separated lowercase pinyin."""
        chars = [c for c in text if '\u4e00' <= c <= '\u9fa5']
        if not chars:
            return ""
        py_list = pinyin(chars, style=Style.NORMAL)
        return ' '.join([item[0] for item in py_list]).lower().strip()

    def _check_wake_word(self, text: str):
        """If the ASR transcript contains the wake word (as a contiguous pinyin
        word sequence), stop recording and fetch the buffered audio."""
        text_pinyin = self._to_pinyin(text)
        wake_word_pinyin = self.wake_word.lower().strip()
        if not wake_word_pinyin:
            return
        text_pinyin_parts = text_pinyin.split() if text_pinyin else []
        wake_word_parts = wake_word_pinyin.split()
        has_wake_word = False
        for i in range(len(text_pinyin_parts) - len(wake_word_parts) + 1):
            if text_pinyin_parts[i:i + len(wake_word_parts)] == wake_word_parts:
                has_wake_word = True
                break
        if has_wake_word:
            self.get_logger().info(f"[注册唤醒词] 检测到唤醒词 '{self.wake_word}',停止录音并获取音频")
            request = AudioData.Request()
            request.command = "stop"
            future = self.audio_data_client.call_async(request)
            # Tag the future so _main_loop knows it carries the captured audio.
            future._future_type = "stop"
            self.pending_audio_future = future

    def _process_voiceprint_audio(self, response):
        """Extract an embedding from captured audio and register it — no extra
        VAD filtering, since the AudioData service already returns speech-only
        segments (DashScope VAD)."""
        if not response or not response.success or response.samples == 0:
            self.get_logger().error(f"[注册录音] 获取音频数据失败: {response.message if response else '无响应'}")
            return
        audio_array = np.frombuffer(response.audio_data, dtype=np.int16)
        buffer_sec = response.samples / response.sample_rate
        self.get_logger().info(f"[注册录音] 音频长度: {buffer_sec:.2f}")
        embedding, success = self.sv_client.extract_embedding(
            audio_array,
            sample_rate=response.sample_rate
        )
        if not success or embedding is None:
            self.get_logger().error("[注册录音] 提取embedding失败")
            return
        # Speaker id derived from the current unix time.
        speaker_id = f"user_{int(time.time())}"
        if self.sv_client.register_speaker(speaker_id, embedding):
            # Persist immediately so a crash after this point loses nothing.
            self.sv_client.save_speakers()
            self.get_logger().info(f"[注册录音] 注册成功用户ID: {speaker_id},已保存到文件,准备退出")
            self.registered = True
        else:
            self.get_logger().error("[注册录音] 注册失败")

    def _main_loop(self):
        """Timer callback: drives the whole state machine on the executor thread."""
        # Done? Tear everything down and stop ROS.
        if self.registered:
            self.get_logger().info("注册完成,节点退出")
            self.shutting_down = True
            self.timer.cancel()
            rclpy.shutdown()
            return
        # Completed ASR request → check transcript for the wake word.
        if self.pending_asr_future and self.pending_asr_future.done():
            response = self.pending_asr_future.result()
            self.pending_asr_future = None
            if response.success and response.text:
                text = response.text.strip()
                if text:
                    self._check_wake_word(text)
            self.state = "waiting_speech"
        # Completed AudioData request → either capture started or audio arrived.
        if self.pending_audio_future and self.pending_audio_future.done():
            response = self.pending_audio_future.result()
            future_type = getattr(self.pending_audio_future, '_future_type', None)
            self.pending_audio_future = None
            if future_type == "start":
                if response.success:
                    self.get_logger().info("[注册录音] 已开始录音")
                    self.recording = True
                else:
                    self.get_logger().warn(f"[注册录音] 启动录音失败: {response.message}")
                    self.state = "waiting_speech"
            elif future_type == "stop":
                self.recording = False
                self._process_voiceprint_audio(response)
        # Drain one VAD event per tick.
        try:
            event = self.vad_event_queue.get_nowait()
            if event == "speech_started" and self.state == "waiting_speech" and not self.recording:
                self.get_logger().info("[VAD] 检测到语音开始,启动录音")
                future = self._start_recording()
                future._future_type = "start"
                self.pending_audio_future = future
            elif event == "speech_stopped" and self.recording and self.state == "waiting_speech":
                self.get_logger().info("[VAD] 检测到语音结束,请求 ASR 识别")
                self.state = "waiting_asr"
                request = ASRRecognize.Request()
                request.command = "start"
                self.pending_asr_future = self.asr_client.call_async(request)
        except queue.Empty:
            pass
def main(args=None):
    """Entry point for the voiceprint-registration node.

    FIX: the node calls rclpy.shutdown() from its own timer once registration
    completes, which makes rclpy.spin() raise (ExternalShutdownException on
    recent rclpy) and previously skipped destroy_node(). The spin is now
    wrapped so Ctrl-C and internal shutdown both reach the cleanup path.
    """
    rclpy.init(args=args)
    node = RegisterSpeakerNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    except Exception:
        # spin() raises when shutdown() is invoked from inside the node.
        pass
    finally:
        node.destroy_node()
        try:
            rclpy.shutdown()
        except Exception:
            # Already shut down by _main_loop; shutdown is not idempotent here.
            pass
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,802 @@
import rclpy
from rclpy.node import Node
from std_msgs.msg import String
import threading
import queue
import time
import base64
import io
import numpy as np
from PIL import Image
from cv_bridge import CvBridge
from interfaces.msg import ImgMsg
import collections
import os
import yaml
from typing import Optional
from ament_index_python.packages import get_package_share_directory
from interfaces.srv import VADEvent, ASRRecognize, TTSSynthesize, AudioData
from openai import OpenAI
from robot_speaker.core.context_manager import ConversationHistory
from robot_speaker.core.speaker_verifier import SpeakerVerificationClient, SpeakerState
from robot_speaker.core.intent_router import IntentRouter, IntentResult
from enum import Enum
class ConversationState(Enum):
    """Authorization state machine for the speaker node's dialogue loop."""
    IDLE = "idle"  # waiting for speech
    CHECK_VOICE = "check_voice"  # speech detected, verifying the speaker's voiceprint
    AUTHORIZED = "authorized"  # speaker verified (or verification disabled)
class RobotSpeakerNode(Node):
def __init__(self):
    """Wire up queues, state, SV buffers, ROS pub/sub and the worker threads."""
    super().__init__('robot_speaker_node')
    self._load_config()
    self.text_queue = queue.Queue()   # ASR transcripts awaiting processing
    self.tts_queue = queue.Queue()    # text chunks awaiting synthesis
    self.interrupt_event = threading.Event()  # set to abort the current turn
    self.stop_event = threading.Event()       # set on shutdown to stop workers
    self.conversation_state = ConversationState.IDLE
    self.state_lock = threading.Lock()  # guards conversation_state
    # Latest speaker-verification outcome
    self.current_speaker_id = None
    self.current_speaker_state = SpeakerState.UNKNOWN
    self.current_speaker_score = 0.0
    self.current_speaker_threshold = 0.0
    self.sv_lock = threading.Lock()                 # guards sv_audio_buffer
    self.sv_speech_end_event = threading.Event()    # set when SV audio is captured
    self.sv_result_ready_event = threading.Event()  # set when verification finishes
    self.sv_audio_buffer = None   # replaced with a bounded deque in _init_components
    self.sv_recording = False
    self._init_components()
    if self.sv_enabled and self.sv_client:
        speaker_count = self.sv_client.get_speaker_count()
        if speaker_count == 0:
            self.get_logger().info("[Speaker] 声纹数据库为空,请注册声纹")
    # Publishes LLM-planned skill sequences; subscribes to execution results.
    self.skill_sequence_pub = self.create_publisher(String, '/llm_skill_sequence', 10)
    self.last_execution_status: Optional[str] = None  # fed back into the next skill prompt
    self.execution_status_lock = threading.Lock()
    self.skill_result_sub = self.create_subscription(
        String, '/skill_execution_result', self._on_skill_result_received, 10
    )
    self._start_threads()
    self.get_logger().info("[Speaker] 语音节点已启动")
def _load_config(self):
    """Flatten voice.yaml into instance attributes (audio, VAD, DashScope, system, camera)."""
    config_file = os.path.join(
        get_package_share_directory('robot_speaker'),
        'config',
        'voice.yaml'
    )
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)
    # --- audio devices ---
    audio = config['audio']
    mic = audio['microphone']
    soundcard = audio['soundcard']
    tts_audio = audio['tts']
    self.input_device_index = mic['device_index']
    self.output_card_index = soundcard['card_index']
    self.output_device_index = soundcard['device_index']
    self.sample_rate = mic['sample_rate']
    self.channels = mic['channels']
    self.chunk = mic['chunk']
    self.audio_microphone_heartbeat_interval = mic['heartbeat_interval']
    self.output_sample_rate = soundcard['sample_rate']
    self.output_channels = soundcard['channels']
    self.output_volume = soundcard['volume']
    self.audio_tts_source_sample_rate = tts_audio['source_sample_rate']
    self.audio_tts_source_channels = tts_audio['source_channels']
    self.audio_tts_ffmpeg_thread_queue_size = tts_audio['ffmpeg_thread_queue_size']
    # --- voice activity detection ---
    vad = config['vad']
    self.vad_mode = vad['vad_mode']
    self.silence_duration_ms = vad['silence_duration_ms']
    self.min_energy_threshold = vad['min_energy_threshold']
    # --- DashScope ASR / LLM / TTS ---
    dashscope = config['dashscope']
    self.dashscope_api_key = dashscope['api_key']
    self.asr_model = dashscope['asr']['model']
    self.asr_url = dashscope['asr']['url']
    self.llm_model = dashscope['llm']['model']
    self.llm_base_url = dashscope['llm']['base_url']
    self.llm_temperature = dashscope['llm']['temperature']
    self.llm_max_tokens = dashscope['llm']['max_tokens']
    self.llm_max_history = dashscope['llm']['max_history']
    self.llm_summary_trigger = dashscope['llm']['summary_trigger']
    self.tts_model = dashscope['tts']['model']
    self.tts_voice = dashscope['tts']['voice']
    # --- system behaviour (wake word, speaker verification) ---
    system = config['system']
    self.use_wake_word = system['use_wake_word']
    self.wake_word = system['wake_word']
    self.system_shutup_keywords = system['shutup_keywords']
    self.sv_enabled = system['sv_enabled']
    self.sv_model_path = os.path.expanduser(system['sv_model_path'])
    self.sv_threshold = system['sv_threshold']
    self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
    self.sv_buffer_size = system['sv_buffer_size']
    self.continue_without_image = system['continue_without_image']
    # --- camera ---
    camera = config['camera']
    self.camera_image_jpeg_quality = camera['image']['jpeg_quality']
def _init_components(self):
    """Create keyword lists, the intent router, ROS service clients, the LLM
    client, conversation history, the image cache and the SV client."""
    self.shutup_keywords = [k.strip() for k in self.system_shutup_keywords.split(',') if k.strip()]
    self.intent_router = IntentRouter()
    # Bounded ring buffer for speaker-verification audio samples.
    self.sv_audio_buffer = collections.deque(maxlen=self.sv_buffer_size)
    self.vad_client = self.create_client(VADEvent, '/vad/event')
    self.asr_client = self.create_client(ASRRecognize, '/asr/recognize')
    self.tts_client = self.create_client(TTSSynthesize, '/tts/synthesize')
    self.audio_data_client = self.create_client(AudioData, '/asr/audio_data')
    self.get_logger().info("[Speaker] 等待service节点启动...")
    self.vad_client.wait_for_service(timeout_sec=5.0)
    self.asr_client.wait_for_service(timeout_sec=5.0)
    self.tts_client.wait_for_service(timeout_sec=5.0)
    self.audio_data_client.wait_for_service(timeout_sec=5.0)
    self.get_logger().info("[Speaker] 所有service节点已就绪")
    self.llm_client = OpenAI(api_key=self.dashscope_api_key, base_url=self.llm_base_url)
    self.history = ConversationHistory(
        max_history=self.llm_max_history,
        summary_trigger=self.llm_summary_trigger
    )
    self.cv_bridge = CvBridge()
    # Latest image per camera position, guarded by img_msg_lock.
    self.img_msg_cache = {}
    self.img_msg_lock = threading.Lock()

    def _cache_img(msg):
        # BUGFIX: the previous tuple-lambda did manual acquire()/release();
        # an exception in the dict update would leave the lock held forever.
        # `with` releases it on every path.
        with self.img_msg_lock:
            self.img_msg_cache[msg.position] = msg

    self.img_sub = self.create_subscription(ImgMsg, '/img_msg', _cache_img, 10)
    if self.sv_enabled and self.sv_model_path:
        try:
            self.sv_client = SpeakerVerificationClient(
                model_path=self.sv_model_path,
                threshold=self.sv_threshold,
                speaker_db_path=self.sv_speaker_db_path,
                logger=self.get_logger()
            )
        except Exception as e:
            # SV is optional: fall back to unauthenticated operation.
            self.get_logger().warning(f"[Speaker] 声纹识别初始化失败: {e},声纹功能将不可用")
            self.sv_client = None
            self.sv_enabled = False
    else:
        self.sv_client = None
def _start_threads(self):
self.vad_thread = threading.Thread(
target=self._vad_event_worker,
name="VADEventThread",
daemon=True
)
self.vad_thread.start()
self.process_thread = threading.Thread(
target=self._process_worker,
name="ProcessThread",
daemon=True
)
self.process_thread.start()
self.tts_thread = threading.Thread(
target=self._tts_worker,
name="TTSThread",
daemon=True
)
self.tts_thread.start()
if self.sv_enabled and self.sv_client:
self.sv_thread = threading.Thread(
target=self._sv_worker,
name="SVThread",
daemon=True
)
self.sv_thread.start()
else:
self.sv_thread = None
def _change_state(self, new_state: ConversationState, reason: str):
with self.state_lock:
old_state = self.conversation_state
self.conversation_state = new_state
self.get_logger().info(f"[Speaker-State] {old_state.value} -> {new_state.value}: {reason}")
def _on_speech_started(self):
    """VAD speech-start handler: start SV capture and move to CHECK_VOICE.

    NOTE(review): indentation reconstructed — the lock is assumed to wrap only
    the state read, since _change_state re-acquires state_lock and would
    deadlock if called while it is held; confirm against the original file.
    """
    self.get_logger().info("[Speaker-VAD] 检测到人声开始")
    with self.state_lock:
        state = self.conversation_state
    if state == ConversationState.AUTHORIZED:
        # A new command while authorized triggers re-verification.
        if self.sv_enabled and self.sv_client:
            self._start_sv_recording()
            self._change_state(ConversationState.CHECK_VOICE, "新指令,重新验证声纹")
    if state == ConversationState.IDLE:
        if self.sv_enabled and self.sv_client:
            self._start_sv_recording()
            self._change_state(ConversationState.CHECK_VOICE, "检测到语音,开始检查声纹")
        else:
            # No voiceprint check configured: grant access immediately.
            self._change_state(ConversationState.AUTHORIZED, "未启用声纹,直接授权")
    elif state == ConversationState.CHECK_VOICE:
        self._start_sv_recording()
def _on_speech_stopped(self):
    """VAD speech-stop handler: flush the SV recording and request ASR.

    NOTE(review): indentation reconstructed — _call_asr_service is assumed to
    run unconditionally (otherwise ASR would never fire with SV disabled);
    confirm against the original file.
    """
    import threading
    self.get_logger().debug(f"[Speaker-VAD] speech_stopped 被调用 | 线程:{threading.current_thread().name} | 当前状态:{self.conversation_state.value}")
    with self.state_lock:
        state = self.conversation_state
    self.get_logger().debug(f"[Speaker-VAD] 准备停止声纹录音 | sv_enabled:{self.sv_enabled} | state:{state}")
    if self.sv_enabled and state in [ConversationState.CHECK_VOICE, ConversationState.AUTHORIZED]:
        # Reset the "audio ready" latch before requesting the capture flush.
        self.sv_speech_end_event.clear()
        self._stop_sv_recording()
    self._call_asr_service()
def _start_sv_recording(self):
if not self.sv_enabled:
return
request = AudioData.Request()
request.command = "start"
request.duration_ms = 0
self.audio_data_client.call_async(request)
def _stop_sv_recording(self):
    """Stop SV audio capture; _on_sv_audio_ready receives the buffered samples asynchronously."""
    import threading
    log = self.get_logger()
    log.debug(f"[Speaker-SV] _stop_sv_recording 开始 | 线程:{threading.current_thread().name} | 时间:{time.time()}")
    req = AudioData.Request()
    req.command = "stop"
    pending = self.audio_data_client.call_async(req)
    pending.add_done_callback(self._on_sv_audio_ready)
    log.debug(f"[Speaker-SV] _stop_sv_recording 已发送异步请求 | future_id:{id(pending)}")
def _on_sv_audio_ready(self, future):
    """Receive captured SV audio: replace the ring buffer and signal speech-end.

    NOTE(review): indentation reconstructed — the event set is placed outside
    the success branch (so the SV worker never hangs on a failed capture);
    confirm this nesting against the original file.
    """
    import threading
    self.get_logger().debug(f"[Speaker-SV] _on_sv_audio_ready 回调触发 | 线程:{threading.current_thread().name} | future_id:{id(future)} | 时间:{time.time()}")
    try:
        response = future.result()
        self.get_logger().debug(f"[Speaker-SV] 收到响应 | success:{response.success} | samples:{response.samples}")
        if response.success and response.samples > 0:
            audio_array = np.frombuffer(response.audio_data, dtype=np.int16)
            with self.sv_lock:
                self.get_logger().debug(f"[Speaker-SV] 准备写入buffer | 旧大小:{len(self.sv_audio_buffer)} | 新数据:{len(audio_array)}")
                # Replace (not append) so the buffer only ever holds the latest utterance.
                self.sv_audio_buffer.clear()
                self.sv_audio_buffer.extend(audio_array)
                self.get_logger().debug(f"[Speaker-SV] buffer已更新 | 新大小:{len(self.sv_audio_buffer)}")
        self.get_logger().debug(f"[Speaker-SV] 准备设置 sv_speech_end_event")
        self.sv_speech_end_event.set()
    except Exception as e:
        self.get_logger().error(f"[Speaker-SV] _on_sv_audio_ready 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _call_asr_service(self):
    """Request the final ASR transcript asynchronously; the result lands in _asr_service_callback."""
    self.get_logger().info("[Speaker] 调用ASR服务获取识别结果")
    req = ASRRecognize.Request()
    req.command = "start"
    self.asr_client.call_async(req).add_done_callback(self._asr_service_callback)
def _asr_service_callback(self, future):
import threading
self.get_logger().debug(f"[Speaker-ASR] ASR回调触发 | 线程:{threading.current_thread().name} | 时间:{time.time()}")
try:
response = future.result()
self.get_logger().debug(f"[Speaker-ASR] 收到响应 | success:{response.success} | text:{response.text if response.success else 'N/A'}")
if response.success and response.text:
self.text_queue.put(response.text)
self.get_logger().debug(f"[Speaker-ASR] 文本已放入队列 | queue_size:{self.text_queue.qsize()}")
else:
self.get_logger().warn(f"[Speaker-ASR] 识别失败或为空: success={response.success}, message={response.message}")
except Exception as e:
self.get_logger().error(f"[Speaker-ASR] 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _vad_event_worker(self):
import threading
self.get_logger().info(f"[Speaker-VAD] 启动 | 线程ID:{threading.current_thread().ident}")
while not self.stop_event.is_set():
request = VADEvent.Request()
request.command = "wait"
request.timeout_ms = 500
future = self.vad_client.call_async(request)
future.add_done_callback(self._on_vad_event_response)
time.sleep(0.05)
def _on_vad_event_response(self, future):
import threading
self.get_logger().debug(f"[Speaker-VAD] 回调触发 | 线程:{threading.current_thread().name}")
try:
response = future.result()
if not response.success or response.event == "none":
return
self.get_logger().debug(f"[Speaker-VAD] 收到事件 | event:{response.event} | 线程:{threading.current_thread().name} | 时间:{time.time()}")
if response.event == "speech_started":
self._on_speech_started()
elif response.event == "speech_stopped":
self._on_speech_stopped()
except Exception as e:
self.get_logger().error(f"[Speaker-VAD] 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _process_worker(self):
    """Main dialogue loop: ASR text -> state transition -> wake-word handling ->
    "shut up" command check -> intent routing -> request handling.

    Runs on its own thread until stop_event is set.
    """
    self.get_logger().info("[Speaker] 主线程启动")
    while not self.stop_event.is_set():
        try:
            # Short timeout so stop_event is re-checked regularly.
            text = self.text_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        # A fresh utterance cancels any pending interrupt from a previous turn.
        self.interrupt_event.clear()
        with self.state_lock:
            current_state = self.conversation_state
        previous_state = current_state
        current_state = self._handle_state_transition(current_state, text)
        if current_state is None:
            # Transition dropped the utterance (failed SV, no wake word, IDLE gate).
            continue
        if current_state == ConversationState.AUTHORIZED and previous_state == ConversationState.CHECK_VOICE:
            # Just became authorized: cut off any TTS still playing.
            self._interrupt_tts()
        processed_text = self._handle_wake_word(text, current_state)
        if not processed_text:
            continue
        if self._check_shutup_command(processed_text):
            self._handle_shutup_command()
            continue
        intent_result = self.intent_router.route(processed_text)
        self.get_logger().info(f"[Speaker-Intent] intent={intent_result.intent}, need_camera={intent_result.need_camera}, camera_mode={intent_result.camera_mode}")
        if intent_result.intent == "kb_qa":
            self.interrupt_event.clear()
            if self._handle_kb_qa(processed_text):
                continue
            # Knowledge-base miss: fall back to a canned apology.
            self._put_tts_text("抱歉,我没有找到相关信息")
            continue
        self.interrupt_event.clear()
        self._handle_llm_request(intent_result, processed_text)
def _handle_state_transition(self, current_state: ConversationState, text: str) -> ConversationState | None:
    """Drive the conversation state machine for one recognized utterance.

    Returns the conversation state as updated by _change_state, or None when
    the utterance must be dropped (failed speaker verification, missing wake
    word, or IDLE while speaker verification is enabled).
    """
    if current_state == ConversationState.CHECK_VOICE:
        if self.sv_enabled and self.sv_client:
            # Block until the SV worker publishes a verdict for this utterance.
            if not self._handle_speaker_verification():
                return None
        else:
            self._change_state(ConversationState.AUTHORIZED, "未启用声纹")
        if self.use_wake_word:
            # Existence check only; the caller strips the wake word afterwards.
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    elif current_state == ConversationState.AUTHORIZED:
        if self.sv_enabled and self.sv_client:
            # Re-verify the speaker on every utterance while authorized.
            if not self._handle_speaker_verification():
                return None
        if self.use_wake_word:
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    elif current_state == ConversationState.IDLE:
        if self.sv_enabled and self.sv_client:
            # With SV enabled, IDLE text is ignored until verification re-authorizes.
            return None
        else:
            self._change_state(ConversationState.AUTHORIZED, "收到文本但状态为IDLE未启用声纹直接授权")
        if self.use_wake_word:
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    # Re-read under the lock: _change_state calls above may have updated it.
    with self.state_lock:
        return self.conversation_state
def _handle_speaker_verification(self) -> bool:
    """Wait for the SV worker's verdict on the latest utterance.

    Synchronizes with _sv_worker via two events: sv_speech_end_event
    (recording finished) and sv_result_ready_event (verdict available).
    Returns True when the speaker is VERIFIED; otherwise transitions the
    conversation back to IDLE and returns False.
    """
    import threading
    self.get_logger().debug(f"[Speaker-SV] 开始声纹验证 | 线程:{threading.current_thread().name} | result_ready:{self.sv_result_ready_event.is_set()}")
    if self.sv_result_ready_event.is_set():
        # Verdict is already available; skip waiting for the recording to end.
        self.get_logger().debug(f"[Speaker-SV] 结果已ready跳过等待")
        pass
    elif not self.sv_speech_end_event.wait(timeout=2.0):
        self.get_logger().warn(f"[Speaker-SV] speech_end_event 等待超时")
        self._change_state(ConversationState.IDLE, "没有录音数据,无法验证")
        return False
    self.get_logger().debug(f"[Speaker-SV] speech_end_event 已触发等待result_ready_event")
    if not self.sv_result_ready_event.wait(timeout=3.0):
        self.get_logger().warn(f"[Speaker-SV] result_ready_event 等待超时")
        with self.sv_lock:
            # Drop stale audio so it cannot pollute the next verification.
            self.sv_audio_buffer.clear()
        self._change_state(ConversationState.IDLE, "声纹结果未ready")
        return False
    self.get_logger().debug(f"[Speaker-SV] result_ready_event 已触发,读取结果")
    # Consume the event so the next utterance waits for a fresh verdict.
    self.sv_result_ready_event.clear()
    with self.sv_lock:
        speaker_id = self.current_speaker_id
        speaker_state = self.current_speaker_state
        score = self.current_speaker_score
    self.get_logger().debug(f"[Speaker-SV] 验证结果 | speaker_id:{speaker_id} | state:{speaker_state.value} | score:{score:.4f}")
    if not (speaker_id and speaker_state == SpeakerState.VERIFIED):
        if self.sv_client.get_speaker_count() == 0:
            self._change_state(ConversationState.IDLE, "声纹数据库为空")
        else:
            self._change_state(ConversationState.IDLE, f"声纹验证失败, 得分: {score:.4f}")
        return False
    self._change_state(ConversationState.AUTHORIZED, f"声纹验证成功: {speaker_id}, 得分: {score:.4f}")
    return True
def _handle_shutup_command(self):
    """Honor a "shut up" command: stop TTS and cancel the pending history turn,
    but only when authorized or when speaker verification is disabled."""
    with self.state_lock:
        state_snapshot = self.conversation_state
    allowed = (
        state_snapshot == ConversationState.AUTHORIZED
        or not self.sv_enabled
        or not self.sv_client
    )
    if not allowed:
        return
    self._interrupt_tts()
    if self.history:
        self.history.cancel_turn()
def _handle_kb_qa(self, text: str) -> bool:
    """Look up the knowledge base; speak the answer and return True on a hit,
    return False on a miss (caller emits the fallback reply)."""
    answer = self.intent_router.search_kb(text)
    if not answer:
        return False
    self._put_tts_text(answer)
    return True
def _handle_llm_request(self, intent_result, processed_text: str):
    """Send the routed request to the LLM and dispatch the reply.

    For "skill_sequence" intents the reply is published as a skill plan and
    kept out of chat history; for other intents the reply is committed to
    history and spoken via TTS (inside _llm_process_stream_with_camera).
    Without an LLM client the recognized text is simply echoed to TTS.
    """
    # Local import, consistent with the sibling callbacks in this class;
    # guards against a NameError if no module-level `import threading` exists.
    import threading
    is_skill_sequence = intent_result.intent == "skill_sequence"
    if self.history and not is_skill_sequence:
        self.history.start_turn(intent_result.text)
    if not self.llm_client:
        # No LLM configured: just echo the recognized text.
        self._put_tts_text(processed_text)
        return
    if is_skill_sequence:
        self.get_logger().info(f"[Speaker-Skill] 任务: {processed_text}")
        with self.execution_status_lock:
            last_status = self.last_execution_status
        self.get_logger().debug(f"[Speaker-Skill] 读取执行状态 | 线程:{threading.current_thread().name} | 时间:{time.time()} | 状态:{last_status}")
        # Fold the last skill-execution outcome into the system prompt.
        system_prompt_with_status = self.intent_router.build_skill_prompt(execution_status=last_status)
    else:
        system_prompt_with_status = intent_result.system_prompt
    self.get_logger().debug(f"[Speaker-LLM] intent={intent_result.intent} | system_prompt前100字符: {system_prompt_with_status[:100] if system_prompt_with_status else 'None'}")
    reply = self._llm_process_stream_with_camera(
        intent_result.text,
        intent_result.need_camera,
        intent_result.camera_mode,
        system_prompt_with_status,
        intent_result.intent
    )
    if not reply or not reply.strip():
        # Empty/interrupted reply: roll back the pending history turn.
        if self.history and not is_skill_sequence:
            self.history.cancel_turn()
        return
    if self.history and not is_skill_sequence:
        self.history.commit_turn(reply)
    if is_skill_sequence and reply.strip():
        msg = String()
        msg.data = reply.strip()
        self.skill_sequence_pub.publish(msg)
        self.get_logger().info(f"[Speaker-Skill] 开始新任务: {processed_text}")
def _check_shutup_command(self, text: str) -> bool:
    """Return True when the text's pinyin contains any configured "shut up" keyword."""
    text_pinyin = self.intent_router.to_pinyin(text).lower().strip()
    matched = next(
        (kw for kw in self.shutup_keywords if kw.lower().strip() in text_pinyin),
        None,
    )
    if matched is None:
        return False
    self.get_logger().info(f"[Speaker-Intent] 闭嘴指令匹配到关键词: {matched} (文本拼音: {text_pinyin})")
    return True
def _interrupt_tts(self):
    """Flag an interrupt, drain all pending TTS texts, and ask the TTS node to stop."""
    self.interrupt_event.set()
    # Drain without blocking; get_nowait handles concurrent consumers safely.
    while True:
        try:
            self.tts_queue.get_nowait()
        except queue.Empty:
            break
    req = TTSSynthesize.Request()
    req.command = "interrupt"
    req.text = ""
    req.voice = ""
    fut = self.tts_client.call_async(req)
    fut.add_done_callback(lambda f: self.get_logger().info("[Speaker-TTS] interrupt sent"))
def _on_skill_result_received(self, msg: String):
    """Parse a skill-execution result (JSON payload) and cache it as a status
    string for the next skill-sequence system prompt."""
    import json
    try:
        data = json.loads(msg.data)
        success = data.get("success", False)
        message = data.get("message", "")
        total_skills = data.get("total_skills", 0)
        succeeded_skills = data.get("succeeded_skills", 0)
        parts = [f"执行结果: {'成功' if success else '失败'}"]
        if message:
            parts.append(f", 详情: {message}")
        if total_skills > 0:
            parts.append(f", 总技能数: {total_skills}, 成功: {succeeded_skills}, 失败: {total_skills - succeeded_skills}")
        status_text = "".join(parts)
        with self.execution_status_lock:
            self.last_execution_status = status_text
        self.get_logger().info(f"[Speaker-Skill] 执行状态已更新: {status_text}")
    except Exception as e:
        self.get_logger().warning(f"[Speaker-Skill] 解析执行结果失败: {e}")
def _capture_image_from_img_dev(self, camera_mode: Optional[str] = None) -> Optional[np.ndarray]:
    """Fetch a cached camera frame as an RGB ndarray, waiting up to 1 second.

    When camera_mode is truthy only that camera position is accepted; when it
    is None the first cached frame is used. Returns None on timeout.
    """
    deadline = time.time() + 1.0
    while time.time() < deadline:
        with self.img_msg_lock:
            if camera_mode and camera_mode in self.img_msg_cache:
                cached = self.img_msg_cache[camera_mode]
            elif camera_mode is None and self.img_msg_cache:
                cached = next(iter(self.img_msg_cache.values()))
            else:
                cached = None
            if cached is not None:
                frame = self.cv_bridge.imgmsg_to_cv2(cached.image_color, desired_encoding='rgb8')
                if camera_mode:
                    self.get_logger().info(f"[Speaker-Camera] 使用{camera_mode}相机获取图像成功 (position={cached.position})")
                else:
                    self.get_logger().info(f"[Speaker-Camera] 未指定相机位置,使用{cached.position}相机获取图像成功")
                return frame
        time.sleep(0.1)
    with self.img_msg_lock:
        available_positions = list(self.img_msg_cache.keys()) if self.img_msg_cache else []
    self.get_logger().warning(f"[Speaker-Camera] 等待图像超时 (期望位置={camera_mode}, 可用位置={available_positions})")
    return None
def _encode_image_to_base64(self, image_data: np.ndarray, quality: int = 85) -> str:
    """JPEG-encode an image array and return it base64-encoded; '' on failure."""
    try:
        if image_data.shape[2] == 3:
            pil_image = Image.fromarray(image_data, 'RGB')
        else:
            pil_image = Image.fromarray(image_data)
        buffer = io.BytesIO()
        pil_image.save(buffer, format='JPEG', quality=quality)
        encoded = base64.b64encode(buffer.getvalue())
        return encoded.decode('utf-8')
    except Exception as e:
        self.get_logger().error(f"[Speaker-Camera] 图像编码失败: {e}")
        return ""
def _llm_process_stream_with_camera(self, user_text: str, need_camera: bool, camera_mode: Optional[str] = None, system_prompt: Optional[str] = None, intent: str = "chat_text") -> str:
    """Stream a chat completion for the utterance, optionally attaching a camera frame.

    For "skill_sequence" the message list is built fresh (no chat history);
    otherwise it starts from the chat history. Returns the full reply text,
    or "" on failure/interruption. Non-skill replies are also queued for TTS.
    """
    if not self.llm_client:
        return ""
    if intent == "skill_sequence":
        # Skill planning is stateless: no chat history is attached.
        messages = []
    else:
        if not self.history:
            return ""
        messages = [{"role": msg.role, "content": msg.content} for msg in self.history.get_messages()]
    has_system_msg = any(msg.get("role") == "system" for msg in messages)
    if not has_system_msg:
        if system_prompt is None:
            system_prompt = self.intent_router.build_default_system_prompt()
        messages.insert(0, {"role": "system", "content": system_prompt})
    image_base64_list = []
    if need_camera:
        image_data = self._capture_image_from_img_dev(camera_mode)
        if image_data is not None:
            image_base64 = self._encode_image_to_base64(image_data, quality=self.camera_image_jpeg_quality)
            if image_base64:
                image_base64_list.append(image_base64)
        if not image_base64_list and not self.continue_without_image:
            # Configured to require an image: abort rather than answer blind.
            self.get_logger().warning(f"[Speaker-LLM] 需要相机但未获取到图片,且配置为不继续推理,放弃请求")
            return ""
    if image_base64_list:
        content_list = [{"type": "text", "text": user_text}]
        for img_b64 in image_base64_list:
            content_list.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
            })
        if intent == "skill_sequence":
            messages.append({"role": "user", "content": content_list})
        else:
            # Replace the last history message's content with the multimodal payload.
            # NOTE(review): assumes history.get_messages() ends with the current
            # user turn (start_turn was called by the caller) — confirm.
            messages[-1]["content"] = content_list
    else:
        if intent == "skill_sequence":
            messages.append({"role": "user", "content": user_text})
    full_reply = ""
    interrupted = False
    try:
        # Stream so an interrupt can abort generation mid-way.
        stream = self.llm_client.chat.completions.create(
            model=self.llm_model,
            messages=messages,
            temperature=self.llm_temperature,
            max_tokens=self.llm_max_tokens,
            stream=True
        )
        for chunk in stream:
            if self.interrupt_event.is_set():
                interrupted = True
                break
            if chunk.choices and chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_reply += content
    except Exception as e:
        self.get_logger().error(f"[Speaker-LLM] 调用失败: {e}")
        return ""
    if interrupted:
        self.get_logger().info("[Speaker-LLM] 流式处理被中断")
        return ""
    reply = full_reply.strip() if full_reply else ""
    self.get_logger().info(f"[Speaker-LLM] 生成回复: {reply}")
    if reply and intent != "skill_sequence" and not self.interrupt_event.is_set():
        self._put_tts_text(reply)
    return reply
def _tts_worker(self):
    """Consume queued texts and forward them to the TTS service until stopped.
    Texts arriving while an interrupt is pending are dropped."""
    self.get_logger().info("[Speaker-TTS] TTS播放线程启动")
    while not self.stop_event.is_set():
        try:
            item = self.tts_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        if self.interrupt_event.is_set():
            continue
        speech_text = str(item).strip()
        if not speech_text:
            continue
        req = TTSSynthesize.Request()
        req.command = "synthesize"
        req.text = speech_text
        req.voice = ""
        self.tts_client.call_async(req).add_done_callback(self._on_tts_done)
def _on_tts_done(self, future):
    """Log the outcome of a TTS synthesize call; never raises."""
    try:
        result = future.result()
        if not result.success:
            self.get_logger().warn(f"[Speaker-TTS] 播放失败: {result.message}")
    except Exception as e:
        self.get_logger().error(f"[Speaker-TTS] error: {e}")
def _sv_worker(self):
    """Speaker-verification worker: on each end-of-speech event, extract an
    embedding from the buffered audio and match it against the speaker DB.

    Publishes the verdict via the current_speaker_* fields (under sv_lock)
    and signals sv_result_ready_event; _handle_speaker_verification consumes it.
    """
    self.get_logger().info("[Speaker-SV] 启动")
    # Require at least 0.5 s of audio before attempting an embedding.
    min_audio_samples = int(self.sample_rate * 0.5)
    while not self.stop_event.is_set():
        try:
            self.get_logger().debug(f"[Speaker-SV] 等待 sv_speech_end_event...")
            if not self.sv_speech_end_event.wait(timeout=0.1):
                continue
            self.get_logger().debug(f"[Speaker-SV] sv_speech_end_event 触发 | 时间:{time.time()}")
            self.sv_speech_end_event.clear()
            if not (self.sv_enabled and self.sv_client):
                continue
            self.sv_result_ready_event.clear()
            speaker_count = self.sv_client.get_speaker_count()
            if speaker_count == 0:
                # Empty DB: publish UNKNOWN without running the model.
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.UNKNOWN
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                self.get_logger().info("[Speaker-SV] 数据库为空跳过验证直接设置UNKNOWN状态")
                continue
            with self.sv_lock:
                # Snapshot and clear the buffer so the next utterance starts fresh.
                audio_list = list(self.sv_audio_buffer)
                buffer_size = len(audio_list)
                self.get_logger().debug(f"[Speaker-SV] 读取buffer | 大小:{buffer_size} | 时间:{time.time()}")
                self.sv_audio_buffer.clear()
            self.get_logger().info(f"[Speaker-SV] 收到speech_end事件录音长度: {buffer_size} 样本({buffer_size/self.sample_rate:.2f}秒)")
            if buffer_size < min_audio_samples:
                # Too little audio for a reliable embedding: publish UNKNOWN.
                self.get_logger().debug(f"[Speaker-SV] 录音太短: {buffer_size} < {min_audio_samples},跳过处理")
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.UNKNOWN
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                continue
            audio_array = np.array(audio_list, dtype=np.int16)
            embedding, success = self.sv_client.extract_embedding(
                audio_array,
                sample_rate=self.sample_rate
            )
            if not success or embedding is None:
                self.get_logger().debug("[Speaker-SV] 提取embedding失败")
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.ERROR
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                continue
            speaker_id, match_state, score, threshold = self.sv_client.match_speaker(embedding)
            with self.sv_lock:
                self.current_speaker_id = speaker_id
                self.current_speaker_state = match_state
                self.current_speaker_score = score
                self.current_speaker_threshold = threshold
            if match_state == SpeakerState.VERIFIED:
                self.get_logger().info(f"[Speaker-SV] 识别到说话人: {speaker_id}, 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            elif match_state == SpeakerState.REJECTED:
                self.get_logger().info(f"[Speaker-SV] 未匹配到已知说话人(相似度不足), 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            else:
                self.get_logger().info(f"[Speaker-SV] 状态: {match_state.value}, 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            self.sv_result_ready_event.set()
        except Exception as e:
            self.get_logger().error(f"[Speaker-SV] 错误: {e}")
            time.sleep(0.1)
def _put_tts_text(self, text: str):
    """Enqueue text for TTS playback; warn and drop it when the queue stays full
    beyond a short timeout."""
    try:
        self.tts_queue.put(text, timeout=0.2)
    except queue.Full:
        preview = text[:50]
        self.get_logger().warning(f"[Speaker-TTS] 队列已满,无法发送文本: {preview}")
def _handle_wake_word(self, text: str, current_state: ConversationState = None) -> str:
    """Locate the wake word (compared in pinyin) within the text.

    Behavior by state: in CHECK_VOICE the original text is returned untouched
    (existence check only); in other states the wake-word characters are
    removed. Returns "" when the wake word is absent or configured empty.
    When use_wake_word is off, the stripped text passes through unchanged.
    """
    if not self.use_wake_word:
        return text.strip()
    text_pinyin = self.intent_router.to_pinyin(text).lower().strip()
    wake_word_pinyin = self.wake_word.lower().strip()
    if not wake_word_pinyin:
        return ""
    # Match the wake word as a contiguous run of pinyin syllables.
    text_pinyin_parts = text_pinyin.split()
    wake_word_parts = wake_word_pinyin.split()
    start_idx = -1
    for i in range(len(text_pinyin_parts) - len(wake_word_parts) + 1):
        if text_pinyin_parts[i:i+len(wake_word_parts)] == wake_word_parts:
            start_idx = i
            break
    if start_idx == -1:
        return ""
    if current_state == ConversationState.CHECK_VOICE:
        return text
    # Remove the matched span from the original text by counting CJK characters.
    # Assumes one pinyin syllable per Chinese character and that non-CJK chars
    # produce no pinyin syllables — TODO confirm against to_pinyin's behavior.
    hanzi_count = 0
    new_text = ""
    for c in text:
        if '\u4e00' <= c <= '\u9fa5':
            if hanzi_count < start_idx or hanzi_count >= start_idx + len(wake_word_parts):
                new_text += c
            hanzi_count += 1
        else:
            new_text += c
    return new_text.strip()
def destroy_node(self):
    """Shut down worker threads, stop TTS, persist the speaker DB, then destroy the node."""
    self.get_logger().info("[Speaker] 语音节点正在关闭...")
    self.stop_event.set()
    self.interrupt_event.set()
    self.get_logger().info("[Speaker] 强制停止TTS播放...")
    self._interrupt_tts()
    workers = [self.vad_thread, self.process_thread, self.tts_thread]
    if self.sv_thread:
        workers.append(self.sv_thread)
    for worker in workers:
        if worker and worker.is_alive():
            worker.join(timeout=1.0)
    # Second interrupt: a worker may have queued more TTS while joining.
    self._interrupt_tts()
    sv_client = getattr(self, 'sv_client', None)
    if sv_client:
        try:
            sv_client.save_speakers()
            sv_client.cleanup()
        except Exception as e:
            self.get_logger().warning(f"[Speaker] 清理声纹识别资源时出错: {e}")
    super().destroy_node()
def main(args=None):
    """Entry point: spin the speaker node on a multi-threaded executor so
    async service callbacks cannot deadlock the main loop."""
    from rclpy.executors import MultiThreadedExecutor
    rclpy.init(args=args)
    node = RobotSpeakerNode()
    executor = MultiThreadedExecutor(num_threads=4)
    executor.add_node(node)
    try:
        executor.spin()
    except KeyboardInterrupt:
        node.get_logger().info("[Speaker] 收到中断信号,正在关闭节点")
    finally:
        node.destroy_node()
        rclpy.shutdown()
# Standard script entry point.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,185 @@
"""技能接口文件解析器"""
import json
import os
from typing import Any, Optional

import yaml

from ament_index_python.packages import get_package_share_directory
class SkillInterfaceParser:
    """Parses ROS interface (.action/.srv) files referenced by robot_skills.yaml
    and renders a parameter-documentation string for LLM skill prompts."""

    def __init__(self, interfaces_root: str):
        """interfaces_root: directory containing 'action/' and 'srv/' subdirectories."""
        self.interfaces_root = interfaces_root
        # Lazily-populated caches; the config and interface files are read at most once.
        self._cached_skill_config: list[dict] | None = None
        self._cached_skill_interfaces: dict[str, dict] | None = None

    def get_skill_names(self) -> list[str]:
        """Return all skill names declared in robot_skills.yaml (single source of truth)."""
        skill_config = self._load_skill_config()
        return [entry["name"] for entry in skill_config if isinstance(entry, dict) and entry.get("name")]

    def _load_skill_config(self) -> list[dict]:
        """Load robot_skills.yaml from the 'brain' package share directory (cached).

        Returns [] when the file is missing/unreadable or not a YAML list.
        """
        if self._cached_skill_config is not None:
            return self._cached_skill_config
        try:
            brain_share = get_package_share_directory("brain")
            skill_path = os.path.join(brain_share, "config", "robot_skills.yaml")
            with open(skill_path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or []
            self._cached_skill_config = data if isinstance(data, list) else []
            return self._cached_skill_config
        except Exception:
            # Best-effort: a missing/broken config simply yields no skills.
            self._cached_skill_config = []
            return []

    def parse_skill_interfaces(self) -> dict[str, dict]:
        """Map each skill name to its interface type and goal fields (cached).

        For each skill, the first listed interface whose file exists wins.
        """
        if self._cached_skill_interfaces is not None:
            return self._cached_skill_interfaces
        result = {}
        for skill_entry in self._load_skill_config():
            skill_name = skill_entry.get("name")
            if not skill_name:
                continue
            for iface in skill_entry.get("interfaces", []):
                iface_name = iface.get("name", "") if isinstance(iface, dict) else str(iface)
                if ".action" in iface_name:
                    iface_type = "action"
                elif ".srv" in iface_name:
                    iface_type = "srv"
                else:
                    continue
                # Interface files live under <root>/action or <root>/srv.
                file_path = os.path.join(self.interfaces_root, iface_type, iface_name)
                if os.path.exists(file_path):
                    result[skill_name] = {
                        "type": iface_type,
                        "goal_fields": self._parse_goal_fields(file_path)
                    }
                    break
        self._cached_skill_interfaces = result
        return result

    def _parse_goal_fields(self, file_path: str) -> list[dict]:
        """Parse the goal section (lines before the first '---') of an interface file.

        Returns a list of {"name", "type", "comment"} dicts; [] on read errors.
        """
        goal_fields = []
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            for line in lines:
                line = line.strip()
                if line.startswith("---"):
                    break  # end of the goal section
                if not line or line.startswith("#"):
                    continue
                parts = line.split()
                if len(parts) >= 2:
                    field_type = parts[0]
                    field_name = parts[1]
                    comment = line.split("#", 1)[1].strip() if "#" in line else ""
                    goal_fields.append({
                        "name": field_name,
                        "type": field_type,
                        "comment": comment
                    })
        except Exception:
            return []
        return goal_fields

    def generate_params_documentation(self) -> str:
        """Render a human-readable parameters document (with JSON examples) for all skills."""
        skill_interfaces = self.parse_skill_interfaces()
        doc_lines = []
        for skill_name, skill_info in skill_interfaces.items():
            doc_lines.append(f"{skill_name}技能的parameters字段")
            goal_fields = skill_info.get("goal_fields", [])
            if not goal_fields:
                doc_lines.append(" - 无参数,使用 null")
            else:
                doc_lines.append(" parameters字典必须包含以下字段")
                for field in goal_fields:
                    field_name = field["name"]
                    field_type = field["type"]
                    comment = field.get("comment", "")
                    if field_name == "body_id":
                        # body_id gets a hand-written description tied to robot layout.
                        doc_lines.append(
                            f" - {field_name} ({field_type}): 身体部位ID0=左臂1=右臂2=头部。"
                            f"根据目标物在图片中的方位选择左侧用0右侧用1中央用2。"
                        )
                    else:
                        type_desc = self._get_type_description(field_type)
                        doc_lines.append(f" - {field_name} ({field_type}): {type_desc} {comment}")
                example_params = {
                    field["name"]: self._get_example_value(field["name"], field["type"])
                    for field in goal_fields
                }
                doc_lines.append(f" 示例:{json.dumps(example_params, ensure_ascii=False)}")
            doc_lines.append("")
        return "\n".join(doc_lines)

    def _get_type_description(self, field_type: str) -> str:
        """Return a Chinese description for a ROS primitive type; arrays share
        the element type's description; unknown types echo the raw name."""
        type_map = {
            "int8": "整数,范围-128到127",
            "int16": "整数,范围-32768到32767",
            "int32": "整数",
            "int64": "整数",
            "uint8": "无符号整数范围0到255",
            "float32": "浮点数",
            "float64": "浮点数",
            "string": "字符串",
        }
        # Strip the array suffix so e.g. "float64[]" maps like "float64".
        base_type = field_type.replace("[]", "")
        return type_map.get(base_type, field_type)

    def _get_example_value(self, field_name: str, field_type: str) -> Any:
        """Generate a JSON-serializable example value for a goal field.

        Array types are checked BEFORE scalar substring checks — previously
        "int"/"float" matched first, so array fields like "int32[]" produced
        scalar examples and "string[]" produced "".
        """
        if field_name == "body_id":
            return 0
        if field_name == "data_array" and "float64[]" in field_type:
            # Canonical 6-DOF pose example used by arm-motion skills.
            return [0.1, 0.2, 0.3, 0.0, 0.0, 0.0]
        if "[]" in field_type:
            if "int" in field_type:
                return [0, 0, 0]
            if "float" in field_type:
                return [0.0, 0.0, 0.0]
            return []
        if "int" in field_type:
            return 0
        if "float" in field_type:
            return 0.0
        if "string" in field_type:
            return ""
        return None

View File

@@ -0,0 +1,199 @@
"""
声纹识别模块
"""
import numpy as np
import threading
import os
import time
import json
from enum import Enum
class SpeakerState(Enum):
    """Speaker-recognition verdict states."""
    UNKNOWN = "unknown"    # no verdict yet (empty DB, too little audio, not run)
    VERIFIED = "verified"  # best similarity reached the threshold
    REJECTED = "rejected"  # best similarity fell below the threshold
    ERROR = "error"        # embedding extraction or matching raised
class SpeakerVerificationClient:
    """Speaker verification client — non-realtime, low-frequency processing.

    Wraps a FunASR speaker-embedding model plus a small JSON-backed database
    of registered speakers. Database access is guarded by an internal lock so
    it can be shared between the SV worker thread and service callbacks.
    """
    def __init__(self, model_path: str, threshold: float, speaker_db_path: str = None, logger=None):
        """Load the embedding model and, when configured, the speaker database.

        model_path: path to the FunASR model directory ('~' is expanded).
        threshold: cosine-similarity threshold for a VERIFIED verdict.
        speaker_db_path: optional JSON file persisting registered speakers.
        logger: optional ROS-style logger (info/warning/error/debug methods).
        """
        self.model_path = model_path
        self.threshold = threshold
        self.speaker_db_path = speaker_db_path
        self.logger = logger
        self.speaker_db = {}  # {speaker_id: {"embedding": np.ndarray, "env": str, "registered_at": float}}
        self._lock = threading.Lock()
        # CPU optimization: cap Torch at one thread — multi-thread contention
        # here caused severe slowdowns.
        import torch
        torch.set_num_threads(1)
        from funasr import AutoModel
        model_path = os.path.expanduser(self.model_path)
        # disable_update=True prevents a network update-check on every init.
        self.model = AutoModel(model=model_path, device="cpu", disable_update=True)
        if self.logger:
            self.logger.info(f"声纹模型已加载: {model_path}, 阈值: {self.threshold}")
        if self.speaker_db_path:
            self.load_speakers()
    def _log(self, level: str, msg: str):
        """Log via the injected logger, swallowing errors — works around ROS2
        logger issues when called from non-executor threads."""
        if self.logger:
            try:
                if level == "info":
                    self.logger.info(msg)
                elif level == "warning":
                    self.logger.warning(msg)
                elif level == "error":
                    self.logger.error(msg)
                elif level == "debug":
                    self.logger.debug(msg)
            except Exception:
                pass
    def load_speakers(self):
        """Load the speaker DB from its JSON file; a missing file is not an error."""
        if not self.speaker_db_path:
            return
        db_path = os.path.expanduser(self.speaker_db_path)
        if not os.path.exists(db_path):
            self._log("info", f"声纹数据库文件不存在: {db_path},将创建新文件")
            return
        try:
            with open(db_path, 'rb') as f:
                data = json.load(f)
            with self._lock:
                self.speaker_db = {}
                for speaker_id, info in data.items():
                    embedding_array = np.array(info["embedding"], dtype=np.float32)
                    if embedding_array.ndim > 1:
                        # Stored embeddings may be nested lists; normalize to 1-D.
                        embedding_array = embedding_array.flatten()
                    self.speaker_db[speaker_id] = {
                        "embedding": embedding_array,
                        "env": info.get("env", ""),
                        "registered_at": info.get("registered_at", 0.0)
                    }
            self._log("info", f"已加载 {len(self.speaker_db)} 个已注册说话人")
        except Exception as e:
            self._log("error", f"加载声纹数据库失败: {e}")
    def save_speakers(self):
        """Persist the speaker DB to JSON (embeddings serialized as lists)."""
        if not self.speaker_db_path:
            return
        db_path = os.path.expanduser(self.speaker_db_path)
        try:
            os.makedirs(os.path.dirname(db_path), exist_ok=True)
            with self._lock:
                # Snapshot under the lock; write to disk outside it.
                data = {}
                for speaker_id, info in self.speaker_db.items():
                    data[speaker_id] = {
                        "embedding": info["embedding"].tolist(),
                        "env": info.get("env", ""),
                        "registered_at": info.get("registered_at", 0.0)
                    }
            with open(db_path, 'w') as f:
                json.dump(data, f, indent=2)
            self._log("info", f"已保存 {len(data)} 个已注册说话人到: {db_path}")
        except Exception as e:
            self._log("error", f"保存声纹数据库失败: {e}")
    def extract_embedding(self, audio_array: np.ndarray, sample_rate: int = 16000) -> tuple[np.ndarray | None, bool]:
        """Extract a 1-D speaker embedding from int16 PCM audio.

        Returns (embedding, True) on success, (None, False) otherwise.
        NOTE(review): sample_rate is not passed to the model call — the model
        presumably assumes 16 kHz input; confirm before feeding other rates.
        """
        try:
            if len(audio_array) == 0:
                return None, False
            # Ensure int16 before normalizing.
            if audio_array.dtype != np.int16:
                audio_array = audio_array.astype(np.int16)
            # Convert to float32 normalized to [-1, 1].
            audio_float = audio_array.astype(np.float32) / 32768.0
            # Run the model to extract the embedding.
            result = self.model.generate(input=audio_float, cache={})
            if result and len(result) > 0 and "spk_embedding" in result[0]:
                embedding = result[0]["spk_embedding"]
                if embedding is not None and len(embedding) > 0:
                    embedding_array = np.array(embedding, dtype=np.float32)
                    if embedding_array.ndim > 1:
                        embedding_array = embedding_array.flatten()
                    return embedding_array, True
            return None, False
        except Exception as e:
            self._log("error", f"提取声纹特征失败: {e}")
            return None, False
    def match_speaker(self, embedding: np.ndarray) -> tuple[str | None, SpeakerState, float, float]:
        """Match an embedding against the DB by cosine similarity.

        Returns (best_speaker_id, state, score, threshold). State is VERIFIED
        when the best score reaches the threshold, REJECTED otherwise,
        UNKNOWN for empty input/DB, ERROR on exception.
        """
        if embedding is None or len(embedding) == 0:
            return None, SpeakerState.UNKNOWN, 0.0, float(self.threshold)
        with self._lock:
            if len(self.speaker_db) == 0:
                return None, SpeakerState.UNKNOWN, 0.0, float(self.threshold)
        try:
            best_speaker_id = None
            best_score = 0.0
            with self._lock:
                for speaker_id, info in self.speaker_db.items():
                    stored_embedding = info["embedding"]
                    # Cosine similarity against each registered embedding.
                    dot_product = np.dot(embedding, stored_embedding)
                    norm_embedding = np.linalg.norm(embedding)
                    norm_stored = np.linalg.norm(stored_embedding)
                    if norm_embedding > 0 and norm_stored > 0:
                        score = dot_product / (norm_embedding * norm_stored)
                        if score > best_score:
                            best_score = score
                            best_speaker_id = speaker_id
            state = SpeakerState.VERIFIED if best_score >= self.threshold else SpeakerState.REJECTED
            return best_speaker_id, state, float(best_score), float(self.threshold)
        except Exception as e:
            self._log("error", f"匹配说话人失败: {e}")
            return None, SpeakerState.ERROR, 0.0, float(self.threshold)
    def register_speaker(self, speaker_id: str, embedding: np.ndarray, env: str = "") -> bool:
        """Register (or overwrite) a speaker's embedding; returns success."""
        if embedding is None or len(embedding) == 0:
            return False
        try:
            with self._lock:
                self.speaker_db[speaker_id] = {
                    "embedding": np.array(embedding, dtype=np.float32),
                    "env": env,
                    "registered_at": time.time()
                }
            self._log("info", f"已注册说话人: {speaker_id}")
            return True
        except Exception as e:
            self._log("error", f"注册说话人失败: {e}")
            return False
    def get_speaker_count(self) -> int:
        """Number of registered speakers."""
        with self._lock:
            return len(self.speaker_db)
    def get_speaker_list(self) -> list[str]:
        """Registered speaker IDs (snapshot)."""
        with self._lock:
            return list(self.speaker_db.keys())
    def remove_speaker(self, speaker_id: str) -> bool:
        """Delete a speaker from the in-memory DB; returns True when found."""
        with self._lock:
            if speaker_id in self.speaker_db:
                del self.speaker_db[speaker_id]
                self._log("info", f"已删除说话人: {speaker_id}")
                return True
            return False
    def cleanup(self):
        """Persist the DB and release the model."""
        try:
            self.save_speakers()
            if hasattr(self, 'model') and self.model:
                del self.model
        except Exception as e:
            self._log("error", f"清理资源失败: {e}")

View File

@@ -0,0 +1,256 @@
"""
音频处理模块:录音 + VAD
"""
import time
import pyaudio
import webrtcvad
import struct
import queue
class VADDetector:
    """Thin wrapper bundling a webrtcvad.Vad instance with its sample rate."""

    def __init__(self, mode: int, sample_rate: int):
        # mode is the webrtcvad aggressiveness level (0-3).
        self.sample_rate = sample_rate
        self.vad = webrtcvad.Vad(mode)
class AudioRecorder:
"""音频录音器 - 录音线程"""
def __init__(self, device_index: int, sample_rate: int, channels: int,
chunk: int, vad_detector: VADDetector,
audio_queue: queue.Queue, # 音频队列:录音线程 → ASR线程
silence_duration_ms: int = 1000,
min_energy_threshold: int = 300, # 音频能量 > 300有语音
heartbeat_interval: float = 2.0,
on_heartbeat=None,
is_playing=None,
on_new_segment=None, # 检测到新的人声段
on_speech_start=None, # 检测到人声开始
on_speech_end=None, # 检测到静音结束(说话结束)
stop_flag=None,
on_audio_chunk=None, # 音频chunk回调用于声纹录音等可选
should_put_to_queue=None, # 检查是否应该将音频放入队列用于阻止ASR可选
get_silence_threshold=None, # 获取动态静音阈值(毫秒,可选)
logger=None):
self.device_index = device_index
self.sample_rate = sample_rate
self.channels = channels
self.chunk = chunk
self.vad_detector = vad_detector
self.audio_queue = audio_queue
self.silence_duration_ms = int(silence_duration_ms)
self.min_energy_threshold = int(min_energy_threshold)
self.heartbeat_interval = heartbeat_interval
self.on_heartbeat = on_heartbeat
self.is_playing = is_playing or (lambda: False)
self.on_new_segment = on_new_segment
self.on_speech_start = on_speech_start
self.on_speech_end = on_speech_end
self.stop_flag = stop_flag or (lambda: False)
self.on_audio_chunk = on_audio_chunk # 音频chunk回调用于声纹录音等
self.should_put_to_queue = should_put_to_queue or (lambda: True) # 默认允许放入队列
self.get_silence_threshold = get_silence_threshold # 动态静音阈值回调
self.logger = logger
self.audio = pyaudio.PyAudio()
# 自动查找 iFLYTEK 麦克风设备
try:
count = self.audio.get_device_count()
found_index = -1
if self.logger:
self.logger.info(f"开始扫描音频设备 (总数: {count})...")
for i in range(count):
device_info = self.audio.get_device_info_by_index(i)
device_name = device_info.get('name', '')
max_input_channels = device_info.get('maxInputChannels', 0)
if self.logger:
try:
self.logger.info(f"扫描设备 [{i}]: Name='{device_name}', MaxInput={max_input_channels}, Rate={int(device_info.get('defaultSampleRate'))}")
except:
pass
# 检查是否包含 iFLYTEK 且支持录音(输入通道 > 0
if 'iFLYTEK' in device_name and max_input_channels > 0:
found_index = i
if self.logger:
self.logger.info(f"已自动定位到麦克风设备: {device_name} (Index: {i})")
break
if found_index != -1:
self.device_index = found_index
else:
if self.logger:
self.logger.warning(f"未自动检测到 iFLYTEK 设备请检查USB连接或执行 'arecord -l' 确认系统是否识别到录音设备,将继续使用配置的索引: {self.device_index}")
except Exception as e:
if self.logger:
self.logger.error(f"设备自动检测过程出错: {e}")
self.format = pyaudio.paInt16
self._debug_counter = 0
def record_with_vad(self):
"""录音线程VAD + 能量检测"""
if self.on_heartbeat:
self.on_heartbeat()
try:
stream = self.audio.open(
format=self.format,
channels=self.channels,
rate=self.sample_rate,
input=True,
input_device_index=self.device_index if self.device_index >= 0 else None,
frames_per_buffer=self.chunk
)
except Exception as e:
raise RuntimeError(f"无法打开音频输入设备: {e}")
# VAD检测窗口, 最快 0.5s 内发现说话
window_sec = 0.5
# 连续 1s 没有检测到语音,就判定为静音状态
no_speech_threshold = max(self.silence_duration_ms / 1000.0, 0.1)
last_heartbeat_time = time.time()
audio_buffer = [] # VAD 滑动窗口
last_active_time = time.time() # 静音计时基准
in_speech_segment = False # 是否处于语音段中(从检测到人声开始,直到静音超时结束)
try:
while not self.stop_flag():
# exception_on_overflow=False, 宁可丢帧,也不阻塞
data = stream.read(self.chunk, exception_on_overflow=False)
processed_data = data
# 检查是否应该将音频放入队列用于阻止ASR例如无声纹文件时需要注册
if self.should_put_to_queue():
# 队列满时丢弃最旧的数据ASR 跟不上时系统仍然听得见
if self.audio_queue.full():
self.audio_queue.get_nowait()
# 使用处理后的音频数据(经过回声消除)
self.audio_queue.put_nowait(processed_data)
# 音频chunk回调用于声纹录音等仅在需要时调用
if self.on_audio_chunk:
# 回调使用处理后的音频数据
self.on_audio_chunk(processed_data)
# VAD检测使用处理后的音频经过回声消除
audio_buffer.append(processed_data) # 只用于 VAD不用于 ASR
# VAD检测窗口
now = time.time()
if len(audio_buffer) * self.chunk / self.sample_rate >= window_sec:
raw_audio = b''.join(audio_buffer)
energy = self._calculate_energy(raw_audio)
vad_result = self._check_activity(raw_audio)
self._debug_counter += 1
if self._debug_counter >= 10:
if self.logger:
self.logger.info(f"[VAD调试] 能量={energy:.1f}, 阈值={self.min_energy_threshold}, VAD结果={vad_result}")
self._debug_counter = 0
if vad_result:
last_active_time = now
if not in_speech_segment: # 上一轮没说话,本轮开始说话
in_speech_segment = True
if self.on_speech_start:
self.on_speech_start()
# 检测当前 TTS 是否在播放
if self.is_playing() and self.on_new_segment:
self.on_new_segment() # 打断 TTS的回调
else:
if in_speech_segment:
# 处于语音段中,但当前帧为静音,检查静音时长
silence_duration = now - last_active_time
# 动态获取静音阈值(如果提供回调函数)
if self.get_silence_threshold:
current_silence_ms = self.get_silence_threshold()
current_no_speech_threshold = max(current_silence_ms / 1000.0, 0.1)
else:
current_no_speech_threshold = no_speech_threshold
# 添加调试日志
if self.logger and silence_duration < current_no_speech_threshold:
self.logger.debug(f"[VAD] 静音中: {silence_duration:.3f}秒 < {current_no_speech_threshold:.3f}秒阈值")
if silence_duration >= current_no_speech_threshold:
if self.on_speech_end:
if self.logger:
self.logger.debug(f"[VAD] 触发speech_end: 静音持续时间 {silence_duration:.3f}秒 >= 阈值 {current_no_speech_threshold:.3f}")
self.on_speech_end() # 通知系统用户停止说话
in_speech_segment = False
if self.on_heartbeat and now - last_heartbeat_time >= self.heartbeat_interval:
self.on_heartbeat()
last_heartbeat_time = now
audio_buffer = []
finally:
if stream.is_active():
stream.stop_stream()
stream.close()
@staticmethod
def _calculate_energy(audio_chunk: bytes) -> float:
"""计算音频能量RMS"""
if not audio_chunk:
return 0.0
# 计算样本数:音频字节数 // 2因为是16位PCM1个样本=2字节
n = len(audio_chunk) // 2
if n <= 0:
return 0.0
# 把字节数据解包为16位有符号整数小端序
samples = struct.unpack(f'<{n}h', audio_chunk[: n * 2])
if not samples:
return 0.0
return (sum(s * s for s in samples) / len(samples)) ** 0.5
def _check_activity(self, audio_data: bytes) -> bool:
    """Decide whether `audio_data` contains speech.

    Combines frame-wise VAD voting with an RMS-energy floor: a minimum
    fraction of 20 ms frames must be flagged as speech, and very
    low-energy "speech" is rejected as a likely false positive.
    """
    energy = self._calculate_energy(audio_data)
    rate = 0.4  # empirical fraction of voiced frames for continuous speech
    num = 0
    # At 16000 Hz a 20 ms frame holds 16000 * 0.02 = 320 samples,
    # i.e. 320 * 2 = 640 bytes of 16-bit PCM.
    bytes_per_sample = 2  # paInt16
    frame_samples = int(self.sample_rate * 0.02)
    frame_bytes = frame_samples * bytes_per_sample
    # Too little data for even one frame: report no activity.
    if frame_bytes <= 0 or len(audio_data) < frame_bytes:
        return False
    total_frames = len(audio_data) // frame_bytes
    required = max(1, int(total_frames * rate))
    for i in range(0, len(audio_data), frame_bytes):
        chunk = audio_data[i:i + frame_bytes]
        if len(chunk) == frame_bytes:  # skip a short trailing remainder
            if self.vad_detector.vad.is_speech(chunk, sample_rate=self.sample_rate):
                num += 1
    # Speech onsets carry high energy while trailing sounds decay, so an
    # energy gate at half the threshold filters VAD false positives.
    vad_result = num >= required
    if vad_result and energy < self.min_energy_threshold * 0.5:
        return False
    return vad_result
def cleanup(self):
    """Release the PyAudio handle, if one was ever created."""
    audio_handle = getattr(self, 'audio', None)
    if audio_handle:
        audio_handle.terminate()

View File

@@ -1,55 +0,0 @@
import threading
import time
from queue import Empty, Queue

import espeakng
import pyttsx3
import rclpy
from example_interfaces.msg import String
from rclpy.node import Node
class RobotSpeakerNode(Node):
    """ROS2 node that queues incoming text messages and speaks them via pyttsx3."""

    def __init__(self, node_name):
        super().__init__(node_name)
        # FIFO of pending utterances fed by the subscription callback.
        self.novels_queue_ = Queue()
        self.novel_subscriber_ = self.create_subscription(
            String, 'robot_msg', self.novel_callback, 10)
        # daemon=True so a blocked speech thread cannot keep the process
        # alive after rclpy shuts down (the original thread was non-daemon).
        self.speech_thread_ = threading.Thread(target=self.speak_thread, daemon=True)
        self.speech_thread_.start()

    def novel_callback(self, msg):
        """Subscription callback: enqueue received text for speaking."""
        self.novels_queue_.put(msg.data)

    def speak_thread(self):
        """Worker loop: set up the TTS engine, pick a Chinese voice, speak queued text."""
        engine = pyttsx3.init()
        engine.setProperty('rate', 150)    # speaking rate; 150 sounds natural
        engine.setProperty('volume', 1.0)  # volume in [0.0, 1.0]
        # Pick a voice whose supported-language list mentions Chinese
        # ('zh', 'zh-CN', ...). voice.languages entries may be str or bytes
        # depending on the pyttsx3 backend, so normalize before matching —
        # the original `'zh' in lang` raises TypeError on bytes entries.
        voices = engine.getProperty('voices')
        for voice in voices:
            langs = [
                lang.decode('utf-8', 'ignore') if isinstance(lang, bytes) else str(lang)
                for lang in (voice.languages or [])
            ]
            if any('zh' in lang for lang in langs):
                engine.setProperty('voice', voice.id)
                self.get_logger().info(f'已选择中文语音:{voice.id}')
                break
        else:
            self.get_logger().warning('未找到中文语音库,将使用默认语音')
        while rclpy.ok():
            try:
                # Blocking get with a timeout instead of qsize()+sleep(0.5)
                # polling: speaks immediately when text arrives and still
                # re-checks rclpy.ok() twice a second.
                text = self.novels_queue_.get(timeout=0.5)
            except Empty:
                continue
            engine.say(text)
            engine.runAndWait()  # block until playback finishes
def main(args=None):
    """Entry point: spin the speaker node until ROS shuts down."""
    rclpy.init(args=args)
    speaker_node = RobotSpeakerNode("robot_speaker_node")
    rclpy.spin(speaker_node)
    rclpy.shutdown()

View File

@@ -0,0 +1,22 @@
"""
Service节点模块
"""

View File

@@ -0,0 +1,703 @@
import rclpy
from rclpy.node import Node
from interfaces.srv import ASRRecognize, AudioData, VADEvent
import threading
import queue
import time
import pyaudio
import yaml
import os
import collections
import numpy as np
import base64
import dashscope
from dashscope.audio.qwen_omni import OmniRealtimeConversation, OmniRealtimeCallback
from dashscope.audio.qwen_omni.omni_realtime import TranscriptionParams, MultiModality
from ament_index_python.packages import get_package_share_directory
class AudioRecorder:
    """Captures microphone audio with PyAudio and feeds raw chunks into a queue.

    Prefers an iFLYTEK input device when one is present; otherwise keeps the
    configured index, falling back to the system default when the index is -1.
    """

    def __init__(self, device_index: int, sample_rate: int, channels: int,
                 chunk: int, audio_queue: queue.Queue, stop_event, logger=None):
        self.device_index = device_index
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk = chunk          # frames per read
        self.audio_queue = audio_queue
        self.stop_event = stop_event
        self.logger = logger
        self.audio = pyaudio.PyAudio()
        original_index = self.device_index
        try:
            # Scan all devices and prefer an iFLYTEK microphone if one exists.
            for i in range(self.audio.get_device_count()):
                device_info = self.audio.get_device_info_by_index(i)
                if 'iFLYTEK' in device_info['name'] and device_info['maxInputChannels'] > 0:
                    self.device_index = i
                    if self.logger:
                        self.logger.info(f"[ASR-Recorder] 已自动定位到麦克风设备: {device_info['name']} (Index: {i})")
                    break
        except Exception as e:
            if self.logger:
                self.logger.error(f"[ASR-Recorder] 设备自动检测过程出错: {e}")
        # No iFLYTEK device found and no explicit index configured (-1):
        # use device 0 (treated as the system default input here).
        if self.device_index == original_index and original_index == -1:
            self.device_index = 0
            if self.logger:
                self.logger.info("[ASR-Recorder] 未找到 iFLYTEK 设备,使用系统默认输入设备")
        self.format = pyaudio.paInt16

    def record(self):
        """Blocking capture loop; runs until stop_event is set or the device fails.

        When the queue is full the oldest chunk is dropped so capture never
        stalls (ASR may simply fall behind instead).
        """
        if self.logger:
            self.logger.info(f"[ASR-Recorder] 录音线程启动,设备索引: {self.device_index}")
        stream = None
        try:
            stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.sample_rate,
                input=True,
                # -1 means "let PyAudio choose": pass None in that case.
                input_device_index=self.device_index if self.device_index >= 0 else None,
                frames_per_buffer=self.chunk
            )
            if self.logger:
                self.logger.info("[ASR-Recorder] 音频输入设备已打开")
        except Exception as e:
            if self.logger:
                self.logger.error(f"[ASR-Recorder] 无法打开音频输入设备: {e}")
            return
        try:
            while not self.stop_event.is_set():
                try:
                    data = stream.read(self.chunk, exception_on_overflow=False)
                    # Drop the oldest chunk rather than block when full.
                    if self.audio_queue.full():
                        self.audio_queue.get_nowait()
                    self.audio_queue.put_nowait(data)
                except OSError as e:
                    # Device-level failure: exit the loop and clean up.
                    if self.logger:
                        self.logger.debug(f"[ASR-Recorder] 录音设备错误: {e}")
                    break
        except KeyboardInterrupt:
            if self.logger:
                self.logger.info("[ASR-Recorder] 录音线程收到中断信号")
        finally:
            if stream is not None:
                try:
                    if stream.is_active():
                        stream.stop_stream()
                    stream.close()
                except Exception as e:
                    pass  # best-effort teardown; nothing useful to do on failure
            if self.logger:
                self.logger.info("[ASR-Recorder] 录音线程已退出")
class DashScopeASR:
    """Streaming ASR client over DashScope's Omni realtime WebSocket API.

    Owns a single OmniRealtimeConversation and proactively rebuilds it when
    it ages out, idles too long, has served many recognitions, or keeps
    failing to accept audio — working around server-side WebSocket timeouts.
    """

    def __init__(self, api_key: str, sample_rate: int, model: str, url: str, logger=None):
        dashscope.api_key = api_key
        self.sample_rate = sample_rate
        self.model = model
        self.url = url
        self.logger = logger
        self.conversation = None  # active OmniRealtimeConversation, or None
        self.running = False
        # Hooks wired up by the owning node.
        self.on_sentence_end = None
        self.on_speech_started = None
        self.on_speech_stopped = None
        self._stop_lock = threading.Lock()            # serializes stop paths
        self._final_result_event = threading.Event()  # set when final transcript arrives
        self._pending_commit = False                  # commit() awaiting its result
        # ========== Connection lifecycle management: mitigates DashScope ASR
        # WebSocket timeouts that caused unstable recognition ==========
        self._connection_start_time = None  # when current connection was created
        self._last_audio_time = None        # last successful audio send
        self._recognition_count = 0         # recognitions served on this connection
        self._audio_send_count = 0          # audio chunks sent on this connection
        self._last_audio_send_success = True
        self._consecutive_send_failures = 0
        # Reconnect policy knobs.
        self.MAX_CONNECTION_AGE = 300  # max connection lifetime: 5 minutes
        self.MAX_IDLE_TIME = 180       # max idle time: 3 minutes
        self.MAX_RECOGNITIONS = 30     # rebuild after 30 recognitions
        self.MAX_CONSECUTIVE_FAILURES = 3  # max consecutive send failures

    def _log(self, level: str, msg: str):
        """Best-effort logging through the injected ROS logger; never raises."""
        if not self.logger:
            return
        try:
            if level == "debug":
                self.logger.debug(msg)
            elif level == "warning":
                self.logger.warn(msg)
            elif level == "error":
                self.logger.error(msg)
            elif level == "info":
                self.logger.info(msg)
        except Exception:
            pass

    def _should_reconnect(self) -> tuple[bool, str]:
        """Return (True, human-readable reason) when the connection should be rebuilt."""
        if not self.running or not self.conversation:
            return False, ""
        current_time = time.time()
        # Check 1: connection age
        if self._connection_start_time:
            connection_age = current_time - self._connection_start_time
            if connection_age > self.MAX_CONNECTION_AGE:
                return True, f"连接已存活{connection_age:.0f}秒,超过{self.MAX_CONNECTION_AGE}秒阈值"
        # Check 2: idle time
        if self._last_audio_time:
            idle_time = current_time - self._last_audio_time
            if idle_time > self.MAX_IDLE_TIME:
                return True, f"连接已空闲{idle_time:.0f}秒,超过{self.MAX_IDLE_TIME}秒阈值"
        # Check 3: recognition count
        if self._recognition_count >= self.MAX_RECOGNITIONS:
            return True, f"已完成{self._recognition_count}次识别,达到重连阈值"
        # Check 4: consecutive send failures
        if self._consecutive_send_failures >= self.MAX_CONSECUTIVE_FAILURES:
            return True, f"连续{self._consecutive_send_failures}次音频发送失败"
        return False, ""

    def _reset_connection_stats(self):
        """Reset per-connection counters after a (re)connect."""
        self._connection_start_time = time.time()
        self._last_audio_time = time.time()
        self._recognition_count = 0
        self._audio_send_count = 0
        self._last_audio_send_success = True
        self._consecutive_send_failures = 0

    def start(self):
        """Open a new realtime conversation and configure transcription.

        Returns True on success; on failure the partially opened connection
        is closed and False is returned. No-op (False) if already running.
        """
        if self.running:
            return False
        try:
            callback = _ASRCallback(self)
            self.conversation = OmniRealtimeConversation(
                model=self.model,
                url=self.url,
                callback=callback
            )
            # Give the callback a backref before connecting.
            callback.conversation = self.conversation
            self.conversation.connect()
            transcription_params = TranscriptionParams(
                language='zh',
                sample_rate=self.sample_rate,
                input_audio_format="pcm",
            )
            # Text-only output with server-side VAD turn detection.
            self.conversation.update_session(
                output_modalities=[MultiModality.TEXT],
                enable_input_audio_transcription=True,
                transcription_params=transcription_params,
                enable_turn_detection=True,
                turn_detection_type='server_vad',
                prefix_padding_ms=1000,
                turn_detection_threshold=0.3,
                turn_detection_silence_duration_ms=800,
            )
            self.running = True
            self._reset_connection_stats()
            self._log("info", f"[ASR] 已启动 | 连接ID:{id(self.conversation)}")
            return True
        except Exception as e:
            self.running = False
            self._log("error", f"[ASR] 启动失败: {e}")
            if self.conversation:
                try:
                    self.conversation.close()
                except Exception:
                    pass
            self.conversation = None
            return False

    def send_audio(self, audio_chunk: bytes):
        """Send one PCM chunk (base64-encoded) to the server.

        Transparently reconnects when the lifecycle policy requires it, and
        marks the connection dead on WebSocket-level errors. Returns True
        only when the chunk was accepted.
        """
        should_reconnect, reason = self._should_reconnect()
        if should_reconnect:
            self._log("warning", f"[ASR] 检测到需要重连: {reason}")
            self.running = False
            try:
                if self.conversation:
                    self.conversation.close()
            except:
                pass
            self.conversation = None
            time.sleep(1.0)  # brief backoff before reconnecting
            if not self.start():
                self._log("error", "[ASR] 自动重连失败")
                return False
            self._log("info", "[ASR] 自动重连成功")
        import threading  # local import retained; used only for thread-name logging
        self._log("debug", f"[ASR] send_audio 被调用 | 线程:{threading.current_thread().name} | running:{self.running} | conversation:{self.conversation is not None}")
        if not self.running or not self.conversation:
            self._log("debug", f"[ASR] send_audio 跳过 | running:{self.running} | conversation:{self.conversation is not None}")
            return False
        try:
            audio_b64 = base64.b64encode(audio_chunk).decode('ascii')
            self.conversation.append_audio(audio_b64)
            self._last_audio_time = time.time()
            self._audio_send_count += 1
            self._last_audio_send_success = True
            self._consecutive_send_failures = 0
            self._log("debug", f"[ASR] 音频发送成功 | 总计:{self._audio_send_count} | 连接年龄:{time.time() - self._connection_start_time:.1f}")
            return True
        except Exception as e:
            self._last_audio_send_success = False
            self._consecutive_send_failures += 1
            error_msg = str(e)
            error_type = type(e).__name__
            # Heuristic: classify as a dead WebSocket and drop the connection
            # so the next send (or the worker) re-establishes it.
            if "Connection is already closed" in error_msg or "WebSocketConnectionClosedException" in error_type or "ConnectionClosed" in error_type or "websocket" in error_msg.lower():
                self._log("warning", f"[ASR] WebSocket 连接已断开 | 错误:{error_msg} | 连续失败:{self._consecutive_send_failures}")
                self.running = False
                try:
                    if self.conversation:
                        self.conversation.close()
                except:
                    pass
                self.conversation = None
            else:
                self._log("error", f"[ASR] send_audio 异常 | 错误:{error_msg} | 类型:{error_type} | 连续失败:{self._consecutive_send_failures}")
            return False

    def stop_current_recognition(self):
        """Commit the current audio, wait briefly for the final transcript,
        then close the connection (it is rebuilt lazily on the next audio).

        Non-blocking with respect to concurrent callers: if another thread
        holds the stop lock this returns False immediately. Returns True
        when the connection was committed and closed.
        """
        import threading  # local import retained; used only for thread-name logging
        self._log("debug", f"[ASR] stop_current_recognition 被调用 | 线程:{threading.current_thread().name} | running:{self.running}")
        # Non-blocking acquire: avoid stacking stop requests from callback threads.
        if not self._stop_lock.acquire(blocking=False):
            self._log("debug", f"[ASR] 锁获取失败,有其他线程正在执行 stop_current_recognition")
            return False
        self._final_result_event.clear()
        self._pending_commit = True
        try:
            self._log("debug", f"[ASR] 获得锁,开始停止识别 | conversation:{self.conversation is not None}")
            if not self.running or not self.conversation:
                self._log("debug", f"[ASR] 无法停止 | running:{self.running} | conversation:{self.conversation is not None}")
                return False
            self._recognition_count += 1
            should_reconnect, reason = self._should_reconnect()
            if should_reconnect:
                self._log("info", f"[ASR] 识别完成后检测到需要重连: {reason}")
            # Commit buffered audio and wait (bounded) for the final transcript,
            # which the event callback signals via _final_result_event.
            self._final_result_event.clear()
            self._pending_commit = True
            try:
                self.conversation.commit()
                self._final_result_event.wait(timeout=3.0)
            except Exception as e:
                self._log("debug", f"[ASR] commit 异常: {e}")
            self._log("debug", f"[ASR] 准备关闭旧连接 | conversation_id:{id(self.conversation)}")
            # Detach before closing so no other thread sends into a closing socket.
            self.running = False
            old_conversation = self.conversation
            self.conversation = None
            self._log("debug", f"[ASR] conversation已设为None,准备关闭旧连接")
            try:
                old_conversation.close()
                self._log("debug", f"[ASR] 旧连接已关闭")
            except Exception as e:
                self._log("warning", f"[ASR] 关闭连接异常: {e}")
            self._log("debug", f"[ASR] 连接已关闭,等待下次语音活动时重连")
            return True
        finally:
            self._pending_commit = False
            self._stop_lock.release()
            self._log("debug", f"[ASR] stop_current_recognition 完成,锁已释放")

    def stop(self):
        """Fully stop the client and release the connection (used at shutdown)."""
        with self._stop_lock:
            self.running = False
            self._final_result_event.set()  # unblock any waiter
            if self.conversation:
                try:
                    self.conversation.close()
                except Exception:
                    pass
                self.conversation = None
            self._log("info", "[ASR] 已完全停止")
class _ASRCallback(OmniRealtimeCallback):
    """Bridges DashScope realtime server events to DashScopeASR's hooks."""

    def __init__(self, asr_client: DashScopeASR):
        self.asr_client = asr_client
        self.conversation = None  # backref assigned by DashScopeASR.start()

    def on_event(self, response):
        """Dispatch one server event; every failure is swallowed silently."""
        client = self.asr_client
        try:
            event_type = response['type']
            if event_type == 'conversation.item.input_audio_transcription.completed':
                text = response['transcript'].strip()
                if text and client.on_sentence_end:
                    client.on_sentence_end(text)
                # Release stop_current_recognition() waiting on the final result.
                if client._pending_commit:
                    client._final_result_event.set()
            elif event_type == 'input_audio_buffer.speech_started':
                if client.logger:
                    client.logger.info("[ASR] 检测到语音开始")
                if client.on_speech_started:
                    client.on_speech_started()
            elif event_type == 'input_audio_buffer.speech_stopped':
                if client.logger:
                    client.logger.info("[ASR] 检测到语音结束")
                if client.on_speech_stopped:
                    client.on_speech_stopped()
        except Exception:
            pass
class ASRAudioNode(Node):
    """ROS2 node: captures microphone audio, streams it to DashScope ASR,
    and exposes recognition, raw-audio, and VAD-event services."""

    def __init__(self):
        super().__init__('asr_audio_node')
        self._load_config()
        self.audio_queue = queue.Queue(maxsize=100)  # mic chunks -> ASR worker
        self.stop_event = threading.Event()
        self._shutdown_in_progress = False
        self._init_components()
        self.recognize_service = self.create_service(
            ASRRecognize, '/asr/recognize', self._recognize_callback
        )
        self.audio_data_service = self.create_service(
            AudioData, '/asr/audio_data', self._audio_data_callback
        )
        self.vad_event_service = self.create_service(
            VADEvent, '/vad/event', self._vad_event_callback
        )
        self._last_result = None          # most recent final transcript
        self._result_event = threading.Event()
        self._last_result_time = None
        self.vad_event_queue = queue.Queue()
        # int16 sample buffer for voiceprint recording
        # (240000 samples ~= 15 s at 16 kHz — TODO confirm configured rate).
        self.audio_buffer = collections.deque(maxlen=240000)
        self.audio_recording = False
        self.audio_lock = threading.Lock()  # guards audio_buffer / audio_recording
        # ========== Abnormal-recognition detection ==========
        self._abnormal_results = ["嗯。", "", "啊。", "哦。"]  # known junk transcripts
        self._consecutive_abnormal_count = 0
        self.MAX_CONSECUTIVE_ABNORMAL = 5  # force an ASR reconnect after this many in a row
        self.recording_thread = threading.Thread(
            target=self.audio_recorder.record, name="RecordingThread", daemon=True
        )
        self.recording_thread.start()
        self.asr_thread = threading.Thread(
            target=self._asr_worker, name="ASRThread", daemon=True
        )
        self.asr_thread.start()
        self.get_logger().info("ASR Audio节点已启动")

    def _load_config(self):
        """Load microphone and DashScope settings from the packaged voice.yaml."""
        config_file = os.path.join(
            get_package_share_directory('robot_speaker'),
            'config',
            'voice.yaml'
        )
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        mic = config['audio']['microphone']
        self.input_device_index = mic['device_index']
        self.sample_rate = mic['sample_rate']
        self.channels = mic['channels']
        self.chunk = mic['chunk']
        dashscope = config['dashscope']  # NOTE: local name shadows the dashscope module here
        self.dashscope_api_key = dashscope['api_key']
        self.asr_model = dashscope['asr']['model']
        self.asr_url = dashscope['asr']['url']

    def _init_components(self):
        """Construct the recorder and ASR client and wire up ASR callbacks."""
        self.audio_recorder = AudioRecorder(
            device_index=self.input_device_index,
            sample_rate=self.sample_rate,
            channels=self.channels,
            chunk=self.chunk,
            audio_queue=self.audio_queue,
            stop_event=self.stop_event,
            logger=self.get_logger()
        )
        self.asr_client = DashScopeASR(
            api_key=self.dashscope_api_key,
            sample_rate=self.sample_rate,
            model=self.asr_model,
            url=self.asr_url,
            logger=self.get_logger()
        )
        self.asr_client.on_sentence_end = self._on_asr_result
        self.asr_client.on_speech_started = lambda: self._put_vad_event("speech_started")
        # On speech stop, clear any stale result first so old transcripts
        # cannot be served for the new utterance, then publish the event.
        self.asr_client.on_speech_stopped = lambda: (self._clear_result(), self._put_vad_event("speech_stopped"))
        self.asr_client.start()

    def _on_asr_result(self, text: str):
        """ASR final-transcript hook: store the result and track junk outputs."""
        if not text or not text.strip():
            return
        self._last_result = text.strip()
        self._last_result_time = time.time()
        self._result_event.set()
        # Very short transcripts from the junk list indicate a degraded session.
        is_abnormal = self._last_result in self._abnormal_results and len(self._last_result) <= 2
        if is_abnormal:
            self._consecutive_abnormal_count += 1
            self.get_logger().warn(f"[ASR] 检测到异常识别结果: '{self._last_result}' | 连续异常:{self._consecutive_abnormal_count}")
            # After repeated junk results, force the client over its failure
            # threshold so its lifecycle policy rebuilds the connection.
            if self._consecutive_abnormal_count >= self.MAX_CONSECUTIVE_ABNORMAL:
                self.get_logger().error(f"[ASR] 连续{self._consecutive_abnormal_count}次异常识别,标记需要重连")
                self.asr_client._consecutive_send_failures = self.asr_client.MAX_CONSECUTIVE_FAILURES
                self._consecutive_abnormal_count = 0
        else:
            # Normal result: reset the abnormal streak.
            self._consecutive_abnormal_count = 0
        try:
            self.get_logger().info(f"[ASR] 识别结果: {self._last_result}")
        except Exception:
            pass

    def _put_vad_event(self, event_type):
        """Queue a VAD event for /vad/event consumers; drop it if the queue is full."""
        try:
            self.vad_event_queue.put(event_type, timeout=0.1)
        except queue.Full:
            try:
                self.get_logger().warn(f"[ASR] VAD事件队列已满,丢弃{event_type}事件")
            except Exception:
                pass

    def _audio_data_callback(self, request, response):
        """/asr/audio_data service: start/stop/get raw PCM for voiceprint capture."""
        import threading  # local import retained; used only for thread-name logging
        self.get_logger().debug(f"[ASR-AudioData] 回调触发 | command:{request.command} | 线程:{threading.current_thread().name}")
        response.sample_rate = self.sample_rate
        response.channels = self.channels
        if request.command == "start":
            with self.audio_lock:
                self.get_logger().debug(f"[ASR-AudioData] start命令 | 旧buffer大小:{len(self.audio_buffer)} | recording:{self.audio_recording}")
                self.audio_buffer.clear()
                self.audio_recording = True
                self.get_logger().debug(f"[ASR-AudioData] buffer已清空,recording=True")
            response.success = True
            response.message = "开始录音"
            response.samples = 0
            return response
        if request.command == "stop":
            self.get_logger().debug(f"[ASR-AudioData] stop命令 | recording:{self.audio_recording}")
            with self.audio_lock:
                self.audio_recording = False
                audio_list = list(self.audio_buffer)
                self.get_logger().debug(f"[ASR-AudioData] 读取buffer | 大小:{len(audio_list)}")
                self.audio_buffer.clear()
            if len(audio_list) > 0:
                audio_array = np.array(audio_list, dtype=np.int16)
                response.success = True
                response.audio_data = audio_array.tobytes()
                response.samples = len(audio_list)
                response.message = f"录音完成,{len(audio_list)}样本"
                self.get_logger().debug(f"[ASR-AudioData] 返回音频 | samples:{len(audio_list)}")
            else:
                response.success = False
                response.message = "缓冲区为空"
                response.samples = 0
                self.get_logger().debug(f"[ASR-AudioData] buffer为空")
            return response
        if request.command == "get":
            # Snapshot the buffer without stopping the recording.
            with self.audio_lock:
                audio_list = list(self.audio_buffer)
            if len(audio_list) > 0:
                audio_array = np.array(audio_list, dtype=np.int16)
                response.success = True
                response.audio_data = audio_array.tobytes()
                response.samples = len(audio_list)
                response.message = f"获取到{len(audio_list)}样本"
            else:
                response.success = False
                response.message = "缓冲区为空"
                response.samples = 0
            return response

    def _vad_event_callback(self, request, response):
        """/vad/event service: block (up to timeout_ms) for the next VAD event."""
        timeout = request.timeout_ms / 1000.0 if request.timeout_ms > 0 else None
        try:
            event = self.vad_event_queue.get(timeout=timeout)
            response.success = True
            response.event = event
            response.message = "收到VAD事件"
        except queue.Empty:
            response.success = False
            response.event = "none"
            response.message = "等待超时"
        except KeyboardInterrupt:
            try:
                self.get_logger().info("[ASR-VAD] 收到中断信号,正在关闭")
            except Exception:
                pass
            response.success = False
            response.event = "none"
            response.message = "节点正在关闭"
            self.stop_event.set()
        return response

    def _clear_result(self):
        """Drop any cached transcript so stale results cannot be re-served."""
        self._last_result = None
        self._last_result_time = None
        self._result_event.clear()

    def _return_result(self, response, text, message):
        """Fill a success response with `text` and clear the cached result."""
        response.success = True
        response.text = text
        response.message = message
        self._clear_result()
        return response

    def _recognize_callback(self, request, response):
        """/asr/recognize service: return a transcript, or stop/reset the recognizer.

        Serving order: a fresh cached result (< 0.3 s old or already
        signalled), then a short 2 s wait, then a full stop/restart cycle
        with a 5 s wait.
        """
        if request.command == "stop":
            if self.asr_client.running:
                self.asr_client.stop_current_recognition()
            response.success = True
            response.text = ""
            response.message = "识别已停止"
            return response
        if request.command == "reset":
            self.asr_client.stop_current_recognition()
            time.sleep(0.1)  # small gap before reopening the connection
            self.asr_client.start()
            response.success = True
            response.text = ""
            response.message = "识别器已重置"
            return response
        if self.asr_client.running:
            current_time = time.time()
            # Serve a very recent cached result directly (0.3 s reuse window).
            if (self._last_result and self._last_result_time and
                    (current_time - self._last_result_time) < 0.3) or (self._result_event.is_set() and self._last_result):
                return self._return_result(response, self._last_result, "返回最近识别结果")
            # Otherwise wait briefly in case a transcript is about to land.
            if self._result_event.wait(timeout=2.0) and self._last_result:
                return self._return_result(response, self._last_result, "识别成功(等待中)")
            # Still nothing: finish the current recognition cycle.
            self.asr_client.stop_current_recognition()
            time.sleep(0.2)
        self._clear_result()
        if not self.asr_client.running and not self.asr_client.start():
            response.success = False
            response.text = ""
            response.message = "ASR启动失败"
            return response
        if self._result_event.wait(timeout=5.0) and self._last_result:
            response.success = True
            response.text = self._last_result
            response.message = "识别成功"
        else:
            response.success = False
            response.text = ""
            response.message = "识别超时" if not self._result_event.is_set() else "识别结果为空"
        self._clear_result()
        return response

    def _asr_worker(self):
        """Background loop: drain mic chunks, mirror them into the voiceprint
        buffer while recording, and stream them to the ASR client."""
        while not self.stop_event.is_set():
            try:
                audio_chunk = self.audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            except KeyboardInterrupt:
                try:
                    self.get_logger().info("[ASR-Worker] 收到中断信号")
                except Exception:
                    pass
                break
            if self.audio_recording:
                self.get_logger().debug(f"[ASR-Worker] 收到音频chunk | recording:{self.audio_recording} | buffer_size:{len(self.audio_buffer)}")
                try:
                    audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
                    with self.audio_lock:
                        self.audio_buffer.extend(audio_array)
                except Exception as e:
                    self.get_logger().error(f"[ASR-Worker] buffer写入异常 | 错误:{e}")
                    pass
            if self.asr_client.running:
                self.asr_client.send_audio(audio_chunk)
            else:
                # Client is down: try to revive it, backing off on failure.
                if not self.asr_client.start():
                    time.sleep(1.0)

    def destroy_node(self):
        """Idempotent shutdown: stop worker threads, release audio and ASR resources."""
        if self._shutdown_in_progress:
            return
        self._shutdown_in_progress = True
        try:
            self.get_logger().info("ASR Audio节点正在关闭...")
        except Exception:
            pass
        self.stop_event.set()
        if hasattr(self, 'recording_thread') and self.recording_thread.is_alive():
            self.recording_thread.join(timeout=1.0)
        if hasattr(self, 'asr_thread') and self.asr_thread.is_alive():
            self.asr_thread.join(timeout=1.0)
        try:
            if hasattr(self, 'audio_recorder'):
                self.audio_recorder.audio.terminate()
        except Exception:
            pass
        try:
            if hasattr(self, 'asr_client'):
                self.asr_client.stop()
        except Exception:
            pass
        try:
            super().destroy_node()
        except Exception:
            pass
def main(args=None):
    """Entry point: spin the ASR node; always release ROS resources on exit."""
    rclpy.init(args=args)
    node = ASRAudioNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        try:
            node.get_logger().info("收到中断信号,正在关闭节点")
        except Exception:
            pass
    finally:
        # Best-effort teardown: each step is independent of the others.
        for shutdown_step in (node.destroy_node, rclpy.shutdown):
            try:
                shutdown_step()
            except Exception:
                pass
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,341 @@
import rclpy
from rclpy.node import Node
from rclpy.callback_groups import ReentrantCallbackGroup
from interfaces.srv import TTSSynthesize
import threading
import yaml
import os
import signal
import subprocess
import time
import dashscope
from dashscope.audio.tts_v2 import SpeechSynthesizer, ResultCallback, AudioFormat
from ament_index_python.packages import get_package_share_directory
class DashScopeTTSClient:
    """Synthesizes speech via DashScope TTS and plays it through ffmpeg/ALSA."""

    def __init__(self, api_key: str,
                 model: str,
                 voice: str,
                 card_index: int,
                 device_index: int,
                 output_sample_rate: int,
                 output_channels: int,
                 output_volume: float,
                 tts_source_sample_rate: int,
                 tts_source_channels: int,
                 tts_ffmpeg_thread_queue_size: int,
                 force_stop_delay: float,
                 cleanup_timeout: float,
                 terminate_timeout: float,
                 logger):
        dashscope.api_key = api_key
        self.model = model
        self.voice = voice  # default voice when a request specifies none
        self.card_index = card_index
        self.device_index = device_index
        self.output_sample_rate = output_sample_rate
        self.output_channels = output_channels
        self.output_volume = output_volume
        self.tts_source_sample_rate = tts_source_sample_rate
        self.tts_source_channels = tts_source_channels
        self.tts_ffmpeg_thread_queue_size = tts_ffmpeg_thread_queue_size
        self.force_stop_delay = force_stop_delay    # grace period between SIGTERM and SIGKILL
        self.cleanup_timeout = cleanup_timeout
        self.terminate_timeout = terminate_timeout
        self.logger = logger
        self.current_ffmpeg_pid = None   # pid of the ffmpeg player while playing
        self._current_callback = None    # active _TTSCallback, used by force_stop
        # Explicit ALSA device when both indices are configured, else "default".
        self.alsa_device = f"plughw:{card_index},{device_index}" if (
            card_index >= 0 and device_index >= 0
        ) else "default"

    def force_stop(self):
        """Abort playback: flag the callback interrupted and kill ffmpeg.

        Escalates SIGTERM -> SIGKILL if the process survives force_stop_delay.
        """
        if self._current_callback:
            self._current_callback._interrupted = True
        if not self.current_ffmpeg_pid:
            if self.logger:
                self.logger.warn("[TTS] force_stop: current_ffmpeg_pid is None")
            return
        pid = self.current_ffmpeg_pid
        try:
            if self.logger:
                self.logger.info(f"[TTS] force_stop: 正在kill进程 {pid}")
            os.kill(pid, signal.SIGTERM)
            time.sleep(self.force_stop_delay)
            try:
                os.kill(pid, 0)  # signal 0 = existence probe only
                os.kill(pid, signal.SIGKILL)
                if self.logger:
                    self.logger.info(f"[TTS] force_stop: 已发送SIGKILL到进程 {pid}")
            except ProcessLookupError:
                if self.logger:
                    self.logger.info(f"[TTS] force_stop: 进程 {pid} 已退出")
        except (ProcessLookupError, OSError) as e:
            if self.logger:
                self.logger.warn(f"[TTS] force_stop: kill进程失败 {pid}: {e}")
        finally:
            self.current_ffmpeg_pid = None
            self._current_callback = None

    def synthesize(self, text: str, voice: str = None,
                   on_chunk=None,
                   interrupt_check=None) -> bool:
        """Synthesize `text` and stream it to the audio player.

        `voice` overrides the configured default when non-empty; `on_chunk`
        receives each PCM chunk; `interrupt_check` is polled to abort
        playback. Returns True when playback completed uninterrupted,
        False on interruption or an invalid voice.
        """
        callback = _TTSCallback(self, interrupt_check, on_chunk)
        self._current_callback = callback
        voice_to_use = voice if voice and voice.strip() else self.voice
        if not voice_to_use or not voice_to_use.strip():
            if self.logger:
                self.logger.error(f"[TTS] Voice参数无效: '{voice_to_use}'")
            self._current_callback = None
            return False
        synthesizer = SpeechSynthesizer(
            model=self.model,
            voice=voice_to_use,
            format=AudioFormat.PCM_22050HZ_MONO_16BIT,
            callback=callback,
        )
        try:
            synthesizer.streaming_call(text)
            synthesizer.streaming_complete()
        finally:
            # Always reap the ffmpeg process, even if synthesis raised.
            callback.cleanup()
            self._current_callback = None
        return not callback._interrupted
class _TTSCallback(ResultCallback):
    """Streams DashScope TTS PCM chunks into an ffmpeg process playing to ALSA."""

    def __init__(self, tts_client: DashScopeTTSClient,
                 interrupt_check=None,
                 on_chunk=None):
        self.tts_client = tts_client
        self.interrupt_check = interrupt_check  # callable -> True to abort playback
        self.on_chunk = on_chunk                # optional tap for synthesized PCM
        self._proc = None
        self._interrupted = False
        self._cleaned_up = False

    def on_open(self):
        """Spawn the ffmpeg player reading s16le PCM from stdin."""
        ffmpeg_cmd = [
            'ffmpeg',
            '-f', 's16le',
            '-ar', str(self.tts_client.tts_source_sample_rate),
            '-ac', str(self.tts_client.tts_source_channels),
            '-i', 'pipe:0',
            '-f', 'alsa',
            '-ar', str(self.tts_client.output_sample_rate),
            '-ac', str(self.tts_client.output_channels),
            '-acodec', 'pcm_s16le',
            '-fflags', 'nobuffer',
            '-flags', 'low_delay',
            '-avioflags', 'direct',
            self.tts_client.alsa_device
        ]
        # Input-side options must precede '-i'.
        insert_pos = ffmpeg_cmd.index('-i')
        ffmpeg_cmd.insert(insert_pos, str(self.tts_client.tts_ffmpeg_thread_queue_size))
        ffmpeg_cmd.insert(insert_pos, '-thread_queue_size')
        if self.tts_client.output_volume != 1.0:
            # Volume filter goes before the codec option.
            acodec_idx = ffmpeg_cmd.index('-acodec')
            ffmpeg_cmd.insert(acodec_idx, f'volume={self.tts_client.output_volume}')
            ffmpeg_cmd.insert(acodec_idx, '-af')
        self._proc = subprocess.Popen(
            ffmpeg_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE
        )
        self.tts_client.current_ffmpeg_pid = self._proc.pid

    def on_data(self, data: bytes) -> None:
        """Forward one PCM chunk to ffmpeg unless playback was interrupted."""
        if self._interrupted:
            return
        if self.interrupt_check and self.interrupt_check():
            self._interrupted = True
            if self._proc:
                self._proc.terminate()
            return
        if self._proc and self._proc.stdin and not self._interrupted:
            try:
                self._proc.stdin.write(data)
                self._proc.stdin.flush()
            except (BrokenPipeError, OSError):
                # ffmpeg died or its pipe closed; stop streaming further chunks.
                self._interrupted = True
        if self.on_chunk and not self._interrupted:
            self.on_chunk(data)

    def cleanup(self):
        """Close ffmpeg's stdin and make sure the process exits (idempotent).

        Fix: Popen.wait(timeout=...) raises subprocess.TimeoutExpired on
        timeout; the original let it propagate, which skipped the
        terminate()/kill() escalation below and leaked the exception into
        synthesize()'s finally block. The timeouts are now caught so a
        stuck ffmpeg is reliably terminated and, if needed, killed.
        """
        if self._cleaned_up or not self._proc:
            return
        self._cleaned_up = True
        if self._proc.stdin and not self._proc.stdin.closed:
            try:
                self._proc.stdin.close()  # EOF lets ffmpeg drain and exit normally
            except OSError:
                pass
        if self._proc.poll() is None:
            try:
                self._proc.wait(timeout=self.tts_client.cleanup_timeout)
            except subprocess.TimeoutExpired:
                pass
        if self._proc.poll() is None:
            self._proc.terminate()
            try:
                self._proc.wait(timeout=self.tts_client.terminate_timeout)
            except subprocess.TimeoutExpired:
                pass
        if self._proc.poll() is None:
            self._proc.kill()
        if self.tts_client.current_ffmpeg_pid == self._proc.pid:
            self.tts_client.current_ffmpeg_pid = None
class TTSAudioNode(Node):
    """ROS2 service node exposing /tts/synthesize for speech synthesis and interrupt."""

    def __init__(self):
        super().__init__('tts_audio_node')
        self._load_config()
        self._init_tts_client()
        # Reentrant group so an "interrupt" request can be served while a
        # "synthesize" request is still in flight.
        self.callback_group = ReentrantCallbackGroup()
        self.synthesize_service = self.create_service(
            TTSSynthesize, '/tts/synthesize', self._synthesize_callback,
            callback_group=self.callback_group
        )
        self.interrupt_event = threading.Event()  # polled by the playback worker
        self.playing_lock = threading.Lock()      # guards is_playing
        self.is_playing = False
        self.get_logger().info("[TTS] TTS Audio节点已启动")

    def _load_config(self):
        """Load soundcard/TTS settings from the packaged voice.yaml."""
        config_file = os.path.join(
            get_package_share_directory('robot_speaker'),
            'config',
            'voice.yaml'
        )
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        audio = config['audio']
        soundcard = audio['soundcard']
        tts_audio = audio['tts']
        dashscope = config['dashscope']  # NOTE: local name shadows the dashscope module here
        self.output_card_index = soundcard['card_index']
        self.output_device_index = soundcard['device_index']
        self.output_sample_rate = soundcard['sample_rate']
        self.output_channels = soundcard['channels']
        self.output_volume = soundcard['volume']
        self.tts_source_sample_rate = tts_audio['source_sample_rate']
        self.tts_source_channels = tts_audio['source_channels']
        self.tts_ffmpeg_thread_queue_size = tts_audio['ffmpeg_thread_queue_size']
        self.force_stop_delay = tts_audio['force_stop_delay']
        self.cleanup_timeout = tts_audio['cleanup_timeout']
        self.terminate_timeout = tts_audio['terminate_timeout']
        self.interrupt_wait = tts_audio['interrupt_wait']
        self.dashscope_api_key = dashscope['api_key']
        self.tts_model = dashscope['tts']['model']
        self.tts_voice = dashscope['tts']['voice']

    def _init_tts_client(self):
        """Build the DashScope TTS client from the loaded configuration."""
        self.tts_client = DashScopeTTSClient(
            api_key=self.dashscope_api_key,
            model=self.tts_model,
            voice=self.tts_voice,
            card_index=self.output_card_index,
            device_index=self.output_device_index,
            output_sample_rate=self.output_sample_rate,
            output_channels=self.output_channels,
            output_volume=self.output_volume,
            tts_source_sample_rate=self.tts_source_sample_rate,
            tts_source_channels=self.tts_source_channels,
            tts_ffmpeg_thread_queue_size=self.tts_ffmpeg_thread_queue_size,
            force_stop_delay=self.force_stop_delay,
            cleanup_timeout=self.cleanup_timeout,
            terminate_timeout=self.terminate_timeout,
            logger=self.get_logger()
        )

    def _synthesize_callback(self, request, response):
        """/tts/synthesize service: start asynchronous playback or interrupt it.

        Synthesis runs in a daemon worker thread so the service returns
        immediately with status "playing"; "interrupt" stops any active
        playback via the interrupt event and a hard stop of ffmpeg.
        """
        command = request.command if request.command else "synthesize"
        if command == "interrupt":
            with self.playing_lock:
                was_playing = self.is_playing
                has_pid = self.tts_client.current_ffmpeg_pid is not None
                if was_playing or has_pid:
                    self.interrupt_event.set()
                    self.tts_client.force_stop()
                    self.is_playing = False
                    response.success = True
                    response.message = "已中断播放"
                    response.status = "interrupted"
                else:
                    response.success = False
                    response.message = "没有正在播放的内容"
                    response.status = "none"
            return response
        if not request.text or not request.text.strip():
            response.success = False
            response.message = "文本为空"
            response.status = "error"
            return response
        with self.playing_lock:
            if self.is_playing:
                # Preempt the current playback before starting the new one.
                self.tts_client.force_stop()
                time.sleep(self.interrupt_wait)
            self.is_playing = True
            self.interrupt_event.clear()

        def synthesize_worker():
            # Runs off the executor thread; clears is_playing when done.
            try:
                success = self.tts_client.synthesize(
                    request.text.strip(),
                    voice=request.voice if request.voice else None,
                    interrupt_check=lambda: self.interrupt_event.is_set()
                )
                with self.playing_lock:
                    self.is_playing = False
                if self.get_logger():
                    if success:
                        self.get_logger().info("[TTS] 合成并播放成功")
                    else:
                        self.get_logger().info("[TTS] 播放被中断")
            except Exception as e:
                with self.playing_lock:
                    self.is_playing = False
                if self.get_logger():
                    self.get_logger().error(f"[TTS] 合成失败: {e}")

        thread = threading.Thread(target=synthesize_worker, daemon=True)
        thread.start()
        response.success = True
        response.message = "合成任务已启动"
        response.status = "playing"
        return response
def main(args=None):
    """Entry point: spin the TTS node; always release ROS resources on exit.

    Fix: the original called destroy_node()/shutdown() only on a clean
    return from spin(), so Ctrl-C (KeyboardInterrupt) skipped cleanup.
    Mirrors the shutdown handling of the ASR node's main().
    """
    rclpy.init(args=args)
    node = TTSAudioNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop the node; fall through to cleanup.
        pass
    finally:
        try:
            node.destroy_node()
        except Exception:
            pass
        try:
            rclpy.shutdown()
        except Exception:
            pass
if __name__ == '__main__':
main()

View File

@@ -1,26 +1,38 @@
from setuptools import find_packages, setup
from setuptools import setup, find_packages
import os
from glob import glob
package_name = 'robot_speaker'
setup(
name=package_name,
version='0.0.0',
packages=[package_name],
version='0.0.1',
packages=find_packages(where='.'),
package_dir={'': '.'},
data_files=[
('share/ament_index/resource_index/packages',
['resource/' + package_name]),
('share/' + package_name, ['package.xml']),
(os.path.join('share', package_name, 'launch'), glob('launch/*.launch.py')),
(os.path.join('share', package_name, 'config'), glob('config/*.yaml') + glob('config/*.json')),
(os.path.join('share', package_name, 'srv'), glob('srv/*.srv')),
],
install_requires=[
'setuptools',
'pypinyin',
],
install_requires=['setuptools'],
zip_safe=True,
maintainer='mzebra',
maintainer_email='mzebra@foxmail.com',
description='TODO: Package description',
description='语音识别和合成ROS2包',
license='Apache-2.0',
tests_require=['pytest'],
entry_points={
'console_scripts': [
'robot_speaker_node=robot_speaker.robot_speaker_node:main'
'robot_speaker_node = robot_speaker.core.robot_speaker_node:main',
'register_speaker_node = robot_speaker.core.register_speaker_node:main',
'skill_bridge_node = robot_speaker.bridge.skill_bridge_node:main',
'asr_audio_node = robot_speaker.services.asr_audio_node:main',
'tts_audio_node = robot_speaker.services.tts_audio_node:main',
],
},
)

10
srv/ASRRecognize.srv Normal file
View File

@@ -0,0 +1,10 @@
# 请求:启动识别
string command # "start" (默认), "stop", "reset"
---
# 响应:识别结果
bool success
string text # 识别文本(空字符串表示未识别到)
string message # 状态消息

27
srv/AudioData.srv Normal file
View File

@@ -0,0 +1,27 @@
# 请求:获取音频数据
string command # "start" (开始录音), "stop" (停止并返回), "get" (获取当前缓冲区)
int32 duration_ms # 录音时长毫秒仅用于start命令
---
# 响应:音频数据
bool success
uint8[] audio_data # PCM音频数据int16格式
int32 sample_rate
int32 channels
int32 samples # 样本数
string message

14
srv/TTSSynthesize.srv Normal file
View File

@@ -0,0 +1,14 @@
# 请求:合成文本或中断命令
string command # "synthesize" (默认), "interrupt"
string text
string voice # 可选,默认使用配置
---
# 响应:合成状态
bool success
string message
string status # "playing", "completed", "interrupted"

11
srv/VADEvent.srv Normal file
View File

@@ -0,0 +1,11 @@
# 请求等待VAD事件
string command # "wait" (等待下一个事件)
int32 timeout_ms # 超时时间毫秒0表示无限等待
---
# 响应VAD事件
bool success
string event # "speech_started", "speech_stopped", "none"
string message