26 Commits

Author SHA1 Message Date
lxy
a0ceb934ce 修复1在WebSocket回调线程内执行stop/start竞争条件,'socket already closed'循环出现,2陈旧结果5秒复用窗口旧识别结果污染新请求,意图混乱 2026-03-06 17:29:55 +08:00
NuoDaJia02
ed861a9fb1 fix run issues 2026-01-30 10:53:07 +08:00
lxy
aaa17c10f2 修复rclpy.spin() 单线程执行器导致异步回调死锁,增加ASR WebSocket 自动重连机制 2026-01-29 17:24:49 +08:00
NuoDaJia02
c65395c50f merge remote 2026-01-28 14:45:42 +08:00
lxy
9c8bd017e1 分出asr和tts节点 2026-01-27 20:53:43 +08:00
NuoDaJia02
856c07715c Update voice configuration and skill bridge logic
- Update voice.yaml to use default audio devices and 48kHz sample rate.
- Update voice.yaml paths for voice model and interfaces.
- Improve skill_bridge_node.py JSON parsing and skill parameter handling.
- Update audio_pipeline.py warning message for device detection.
2026-01-22 17:28:28 +08:00
lxy
e8a9821ce4 配置文件增加没有图像skill_sequence/chat_camera是否推理的button,扩充kb_qa的回复,减少闲聊模式的回复长度 2026-01-21 18:04:26 +08:00
lxy
ab1fb4f3f8 修改声纹验证失败仍然执行,增加接口解析提示词 2026-01-21 15:13:31 +08:00
lxy
dd6ccf77bb 修改技能序列历史管理--不接入历史上下文 2026-01-21 11:22:25 +08:00
lxy
7324630458 修改声纹注册选择第一句话完整片段。去掉注册时多余的阈值信息,修改llm技能序列输出格式 2026-01-20 21:39:15 +08:00
NuoDaJia02
04ca80c3f9 add rebuild service to skill bridge 2026-01-20 15:20:48 +08:00
lxy
98c0eb5ca5 refactor: 删除回声消除相关代码,支持从hivecore_robot_drivers/img_dev获取图片 2026-01-20 09:28:57 +08:00
lxy
71062701e1 Merge branch 'feature-deploy' into develop
# Conflicts:
#	config/voice.yaml
#	robot_speaker/core/robot_speaker_node.py
2026-01-19 15:16:11 +08:00
lxy
0409ce0de4 修正声纹验证音频长度计算 2026-01-19 14:21:06 +08:00
NuoDaJia02
ce0d581770 fix torch issue 2026-01-19 13:31:49 +08:00
NuoDaJia02
a1b91ed52f disable echo cancellation 2026-01-19 11:35:01 +08:00
lxy
6d101b9d9e 添加与行为树的桥接节点 2026-01-19 09:58:40 +08:00
NuoDaJia02
c282f9b4de fix deploy issues 2026-01-19 09:09:28 +08:00
lxy
9fd658990c datasets==3.6.0 2026-01-16 10:49:16 +08:00
lxy
0c118412ec 代码重构,区分声纹注册和主节点 2026-01-16 10:40:40 +08:00
lxy
eb91e2f139 增加AEC 2026-01-13 22:14:46 +08:00
lxy
838a4a357c 增加声纹验证 2026-01-12 20:39:47 +08:00
lxy
9c775cff5c 增加中断词 2026-01-12 17:40:08 +08:00
lxy
63a21999bb 增加相机调用,修复对话历史管理,修复asr停止识别逻辑 2026-01-08 20:59:58 +08:00
lxy
8fffd4ab42 chore: add .gitignore and stop tracking build/install/log outputs 2026-01-07 14:30:16 +08:00
b90d84c325 feat(robot_speaker): 创建语音包
包含唤醒词,asr,llm,tts等。
2026-01-07 14:14:29 +08:00
30 changed files with 4585 additions and 67 deletions

9
.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
build/
install/
log/
__pycache__/
*.pyc
*.egg-info/
dist/
lib/
installed_files.txt

116
CMakeLists.txt Normal file
View File

@@ -0,0 +1,116 @@
# Minimum CMake version required by the ament build tooling.
cmake_minimum_required(VERSION 3.8)
project(robot_speaker)
# Enable a strict warning set on GCC and Clang builds.
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_compile_options(-Wall -Wextra -Wpedantic)
endif()
# Build-system and message-interface dependencies.
find_package(ament_cmake REQUIRED)
find_package(ament_cmake_python REQUIRED)
find_package(interfaces REQUIRED)
# Make sure the system Python is used, not a conda/miniconda Python:
# search only the system locations first, then fall back to a normal PATH search.
find_program(PYTHON3_CMD python3 PATHS /usr/bin /usr/local/bin NO_DEFAULT_PATH)
if(NOT PYTHON3_CMD)
find_program(PYTHON3_CMD python3)
endif()
# NOTE(review): FORCE stomps any user-provided cache value for the Python
# executable. Intentional here (pin the interpreter away from conda), but a
# heavy-handed pattern — confirm users never need to override it.
if(PYTHON3_CMD)
set(Python3_EXECUTABLE ${PYTHON3_CMD} CACHE FILEPATH "Python 3 executable" FORCE)
set(PYTHON_EXECUTABLE ${PYTHON3_CMD} CACHE FILEPATH "Python executable" FORCE)
endif()
# Install-time hook: pip-install this package into the install prefix, then
# relocate the installed Python module and the *_node entry-point scripts to
# the layout ROS 2 expects (lib/pythonX.Y/site-packages and lib/robot_speaker/).
# NOTE(review): ${PYTHON3_CMD} / ${CMAKE_INSTALL_PREFIX} / source-dir variables
# are expanded at *configure* time into the script string below; the script
# itself runs at install time (`cmake --install` / colcon's install step).
# The embedded Python (and its comments) is part of a string literal and is
# kept byte-for-byte unchanged.
install(CODE "
execute_process(
COMMAND ${PYTHON3_CMD} -m pip install --prefix=${CMAKE_INSTALL_PREFIX} --no-deps ${CMAKE_CURRENT_SOURCE_DIR}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE install_result
OUTPUT_VARIABLE install_output
ERROR_VARIABLE install_error
)
if(NOT install_result EQUAL 0)
message(FATAL_ERROR \"Failed to install Python package. Output: ${install_output} Error: ${install_error}\")
endif()
execute_process(
COMMAND ${PYTHON3_CMD} -c \"
import os
import shutil
import glob
import sysconfig
install_prefix = '${CMAKE_INSTALL_PREFIX}'
build_dir = '${CMAKE_CURRENT_BINARY_DIR}'
python_version = f'{sysconfig.get_python_version()}'
# ROS2 期望的 Python 包位置
ros2_site_packages = os.path.join(install_prefix, 'lib', f'python{python_version}', 'site-packages')
os.makedirs(ros2_site_packages, exist_ok=True)
# pip install --prefix 可能将包安装到不同位置(系统环境通常是 local/lib/pythonX/dist-packages
pip_locations = [
os.path.join(install_prefix, 'local', 'lib', f'python{python_version}', 'dist-packages'),
os.path.join(install_prefix, 'lib', f'python{python_version}', 'site-packages'),
os.path.join(install_prefix, 'local', 'lib', f'python{python_version}', 'site-packages'),
]
# 查找并复制 robot_speaker 包到 ROS2 期望的位置
robot_speaker_src = None
for location in pip_locations:
candidate = os.path.join(location, 'robot_speaker')
if os.path.exists(candidate) and os.path.isdir(candidate):
robot_speaker_src = candidate
break
if robot_speaker_src:
robot_speaker_dest = os.path.join(ros2_site_packages, 'robot_speaker')
if os.path.exists(robot_speaker_dest):
shutil.rmtree(robot_speaker_dest)
if robot_speaker_src != robot_speaker_dest:
shutil.copytree(robot_speaker_src, robot_speaker_dest)
print(f'Copied robot_speaker from {robot_speaker_src} to {ros2_site_packages}')
else:
print(f'robot_speaker already in correct location')
# 处理 entry_points 脚本
lib_dir = os.path.join(install_prefix, 'lib', 'robot_speaker')
os.makedirs(lib_dir, exist_ok=True)
# 脚本可能在 local/bin 或 bin 中
for bin_dir in [os.path.join(install_prefix, 'local', 'bin'), os.path.join(install_prefix, 'bin')]:
if os.path.exists(bin_dir):
scripts = glob.glob(os.path.join(bin_dir, '*_node'))
for script in scripts:
script_name = os.path.basename(script)
dest = os.path.join(lib_dir, script_name)
if script != dest:
shutil.copy2(script, dest)
os.chmod(dest, 0o755)
print(f'Copied {script_name} to {lib_dir}')
\"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE python_result
OUTPUT_VARIABLE python_output
)
if(python_result EQUAL 0)
message(STATUS \"${python_output}\")
else()
message(WARNING \"Failed to setup Python package: ${python_output}\")
endif()
")
# Install launch files into the package share directory.
install(
  DIRECTORY launch/
  DESTINATION "share/${PROJECT_NAME}/launch"
  FILES_MATCHING
    PATTERN "*.launch.py"
)

# Install YAML/JSON configuration files into the package share directory.
install(
  DIRECTORY config/
  DESTINATION "share/${PROJECT_NAME}/config"
  FILES_MATCHING
    PATTERN "*.yaml"
    PATTERN "*.json"
)

# Register the standard ament linters when testing is enabled.
if(BUILD_TESTING)
  find_package(ament_lint_auto REQUIRED)
  ament_lint_auto_find_test_dependencies()
endif()

# Must be the last call: exports the ament package metadata.
ament_package()

102
README.md
View File

@@ -1,2 +1,102 @@
# hivecore_robot_voice
# ROS 语音包 (robot_speaker)
## 注册阿里云百炼获取api_key
https://bailian.console.aliyun.com/?tab=model#/api-key
->密钥管理
放到config/voice.yaml
## 安装依赖
1. 系统依赖
```bash
sudo apt-get update
sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg swig meson ninja-build build-essential pkg-config libwebrtc-audio-processing-dev
```
2. Python依赖
```bash
cd ~/ros_learn/hivecore_robot_voice
# 在 Python 3.10 环境下,需要单独安装 aec-audio-processing 以跳过版本检查
pip3 install aec-audio-processing --no-binary :all: --ignore-requires-python --break-system-packages
pip3 install -r requirements.txt --break-system-packages
```
## 编译启动
1. 注册声纹
- 启动节点后可以说er gou我现在正在注册声纹这是一段很长的测试语音请把我的声音录进去。
- 正确的注册姿势包含唤醒词二狗不要停顿的尽量说完3秒
- 现在的逻辑只要识别到二狗就注册,然后退出节点,识别不到二狗继续等待
- 多注册几段,换方向距离注册,可以提高识别相似度,注册方向对声纹相似性影响很大
```bash
cd ~/ros_learn/hivecore_robot_voice
colcon build
source install/setup.bash
```
```bash
# 终端1: 启动ASR节点
ros2 run robot_speaker asr_audio_node
# 终端2: 注册声纹
ros2 run robot_speaker register_speaker_node
```
2. 主节点
- 启动节点后每句交互包含唤醒词,唤醒词和语句之间不要有停顿
- 二狗拍照看看开启图文交互
- 支持已注册声纹用户打断
```bash
cd ~/ros_learn/hivecore_robot_voice
colcon build
source install/setup.bash
ros2 launch robot_speaker voice.launch.py
```
3. ASR节点
```bash
ros2 run robot_speaker asr_audio_node
```
4. TTS节点
```bash
# 终端1: 启动TTS节点
ros2 run robot_speaker tts_audio_node
# 终端2: 启动播放
source install/setup.bash
ros2 service call /tts/synthesize robot_speaker/srv/TTSSynthesize \
"{command: 'synthesize', text: '这是一段很长的测试文本用于测试TTS中断功能。我需要说很多很多内容这样你才有足够的时间来测试中断命令。让我继续说下去这是一段很长的测试文本用于测试TTS中断功能。我需要说很多很多内容这样你才有足够的时间来测试中断命令。让我继续说下去这是一段很长的测试文本用于测试TTS中断功能。我需要说很多很多内容这样你才有足够的时间来测试中断命令。', voice: ''}"
# 终端3: 立即执行中断
source install/setup.bash
ros2 service call /tts/synthesize robot_speaker/srv/TTSSynthesize \
"{command: 'interrupt', text: '', voice: ''}"
```
5. 完整运行
```bash
# 终端1启动 brain 节点
# 终端2启动 voice 节点
# 终端3启动 bridge 节点
# 终端4订阅相机
```
## 用到的命令
1. 音频设备
```bash
# 1. 查看所有音频设备
cat /proc/asound/cards
# 2. 查看 card(1)的流信息(设备参数)
cat /proc/asound/card1/stream0
```
2. 相机设备
```bash
# 1. 查看相机所有基础信息(型号、固件版本、序列号等)
rs-enumerate-devices -c
```
3. 模型下载
```bash
modelscope download --model iic/speech_campplus_sv_zh-cn_16k-common --local_dir [指定路径]
```

46
config/knowledge.json Normal file
View File

@@ -0,0 +1,46 @@
{
"entries": [
{
"id": "robot_identity_1",
"patterns": [
"ni shi shui"
],
"answer": "我叫二狗,是蜂核科技的机器人,很高兴为你服务"
},
{
"id": "robot_identity_2",
"patterns": [
"ni jiao sha"
],
"answer": "我叫二狗呀,我是你的好帮手"
},
{
"id": "wake_word",
"patterns": [
"ni de ming zi"
],
"answer": "我的名字是二狗"
},
{
"id": "skill_1",
"patterns": [
"tiao ge wu"
],
"answer": "这个我真不会,我怕跳起来吓到你"
},
{
"id": "skill_2",
"patterns": [
"ni neng gan"
],
"answer": "我可以陪你聊天,也能帮你干活"
},
{
"id": "skill_3",
"patterns": [
"ni hui gan"
],
"answer": "我可以陪你聊天,你也可以发布具体的指令让我干活"
}
]
}

596
config/speakers.json Normal file
View File

@@ -0,0 +1,596 @@
{
"user_1769589229": {
"embedding": [
0.018443606793880463,
0.12385621666908264,
0.42172902822494507,
1.3724409341812134,
-0.4492957293987274,
-0.6218937635421753,
-0.9678031802177429,
0.678302526473999,
1.744055151939392,
-1.8670854568481445,
-1.9064403772354126,
0.5380862951278687,
0.16627110540866852,
-0.6322636008262634,
-1.7715388536453247,
-0.2003282904624939,
-2.1722018718719482,
0.5719940662384033,
-0.6866416931152344,
1.5751206874847412,
0.27836838364601135,
-0.03192685544490814,
-0.486663818359375,
1.6337751150131226,
-1.0401458740234375,
0.0581182986497879,
0.9309709072113037,
-0.00908487569540739,
-0.05825135484337807,
1.042805552482605,
0.95391845703125,
0.5708717107772827,
-1.3427493572235107,
-0.46104469895362854,
-0.4387856423854828,
-2.2000691890716553,
-1.2598334550857544,
-0.34516626596450806,
-1.5205646753311157,
-1.3810551166534424,
-0.9685532450675964,
0.33360639214515686,
0.7115882039070129,
-0.6262675523757935,
-1.831620216369629,
-1.0514777898788452,
0.677291750907898,
1.6341345310211182,
1.0802626609802246,
0.2750645875930786,
2.517354726791382,
-0.5022090077400208,
-0.512808084487915,
-1.0913103818893433,
-0.5228419899940491,
0.7334955334663391,
-0.04904095083475113,
0.5420397520065308,
0.76543128490448,
-0.28510582447052,
-0.015149342827498913,
-0.38553595542907715,
-0.8873414993286133,
-0.7940725684165955,
2.0196990966796875,
1.079050064086914,
-0.3385912775993347,
0.687140703201294,
0.8218201994895935,
-0.8151140809059143,
-0.12016838788986206,
-0.5360821485519409,
1.5735585689544678,
2.2081315517425537,
-0.8545964956283569,
-0.7184719443321228,
1.0227694511413574,
1.004757285118103,
1.279994010925293,
1.0615602731704712,
-0.026518817991018295,
-0.12089776247739792,
1.9652493000030518,
-2.219129800796509,
1.3730603456497192,
-0.2324638068675995,
1.1085208654403687,
0.38454243540763855,
-0.7640709280967712,
1.8690227270126343,
-2.371783971786499,
0.4353397786617279,
0.6538525223731995,
-1.0312976837158203,
-0.06995117664337158,
2.4163870811462402,
0.16073228418827057,
-0.6870989799499512,
-1.6179540157318115,
-1.3476271629333496,
0.20239552855491638,
-0.050261445343494415,
-0.038828205317258835,
0.4753866195678711,
0.6126185059547424,
0.8918412923812866,
-0.3909176290035248,
0.2147030234336853,
0.39352068305015564,
-0.6788452863693237,
-2.1740481853485107,
1.1571974754333496,
-0.4064839482307434,
1.2412688732147217,
0.7256757616996765,
1.7226027250289917,
-0.0026558407116681337,
-0.5800378918647766,
-0.15300726890563965,
-0.7650083899497986,
-2.0132904052734375,
-1.0595450401306152,
-0.49976038932800293,
0.9254617094993591,
-1.2378792762756348,
1.6656403541564941,
-0.7135428786277771,
-0.9382724761962891,
0.9358375668525696,
0.3685700595378876,
-0.10180468112230301,
-0.1037834882736206,
-0.23670005798339844,
1.75762140750885,
-0.17887072265148163,
0.046728529036045074,
-0.8897371888160706,
-1.3732428550720215,
-1.258161187171936,
-1.8424062728881836,
-0.20653045177459717,
1.2090659141540527,
-2.8419432640075684,
-0.21915671229362488,
0.9777458310127258,
-0.4830246567726135,
-1.0184019804000854,
-1.981907606124878,
-0.9043097496032715,
1.2316601276397705,
0.4337644577026367,
-1.4176150560379028,
-0.0775287076830864,
1.9701248407363892,
-0.49479153752326965,
-0.8893828988075256,
-1.4819709062576294,
1.7628812789916992,
-1.1569868326187134,
-0.5023629069328308,
1.0665892362594604,
0.380581796169281,
0.8616085052490234,
1.566547155380249,
-0.08466020226478577,
-6.428647611755878e-05,
-0.4506562650203705,
1.4498881101608276,
-0.8292654752731323,
-1.5012402534484863,
-2.3441176414489746,
-0.1354956328868866,
0.9400366544723511,
-2.566408157348633,
-0.6355810761451721,
0.6913732290267944,
-1.6313157081604004,
-0.7377245426177979,
-0.6275296807289124,
1.2654041051864624,
-1.2346998453140259,
-0.9682437181472778,
1.750296950340271,
0.145521342754364,
0.3888598680496216,
-0.10642947256565094,
0.534409761428833,
-0.07756417989730835,
-0.36027759313583374,
0.45393145084381104,
0.48670390248298645,
-0.41557130217552185
],
"env": "",
"registered_at": 1769589229.8906083
},
"user_1769589397": {
"embedding": [
-0.4532654285430908,
0.9910935163497925,
0.7677441835403442,
0.6021982431411743,
-0.15526464581489563,
0.07699152082204819,
-0.20115968585014343,
1.1546334028244019,
1.3028098344802856,
-1.102020263671875,
-1.785357117652893,
1.0002834796905518,
0.29556989669799805,
-1.1847732067108154,
-1.6235555410385132,
-0.37263453006744385,
-1.0660096406936646,
1.1186366081237793,
-0.2739306390285492,
1.2053704261779785,
-0.4484007656574249,
-0.036067165434360504,
-0.22930052876472473,
0.7094787359237671,
-1.289236307144165,
0.6730620265007019,
0.139224573969841,
0.9508735537528992,
0.19451767206192017,
0.09167198091745377,
0.6681411266326904,
0.5114644169807434,
-0.41296282410621643,
-0.3286001980304718,
-0.13978855311870575,
-1.4886829853057861,
-1.125450849533081,
-0.5365853309631348,
-1.491755723953247,
-0.9122400879859924,
-0.336325466632843,
0.4180590510368347,
0.28993961215019226,
-0.18810254335403442,
-0.8575659990310669,
-0.7043600082397461,
0.1335042417049408,
0.7772237658500671,
0.5636520385742188,
-0.7948008179664612,
1.7150989770889282,
-0.13010169565677643,
-0.17901964485645294,
0.049516208469867706,
-0.3525894284248352,
0.47636479139328003,
0.4723852276802063,
0.21579991281032562,
0.4706135094165802,
-0.7862219214439392,
0.3285289406776428,
0.06317808479070663,
-0.44086384773254395,
-0.48760634660720825,
0.5548083782196045,
0.9824976921081543,
0.002366408007219434,
0.9341856837272644,
0.7644594311714172,
-0.4781777560710907,
0.140120267868042,
-0.27633413672447205,
0.2346642166376114,
1.050230860710144,
-1.269995927810669,
-0.05720380321145058,
1.291229248046875,
0.9839679002761841,
0.8129491209983826,
1.5021783113479614,
-0.3042735457420349,
-0.5572257041931152,
0.9156222343444824,
-1.9603447914123535,
0.43610018491744995,
0.4057847559452057,
0.7319568395614624,
0.20832139253616333,
-0.3430367410182953,
1.1169347763061523,
-1.3572204113006592,
-0.338941365480423,
0.68513023853302,
-0.5876723527908325,
0.028429267928004265,
1.647197961807251,
0.16790558397769928,
-0.39321064949035645,
-0.6376479864120483,
-0.8013231754302979,
0.2443818897008896,
-0.4631305932998657,
0.22423194348812103,
1.2424927949905396,
-0.29924842715263367,
0.8623120784759521,
-0.1876244992017746,
0.4357032775878906,
-0.1294589787721634,
-0.6075098514556885,
-0.13139747083187103,
0.7296662330627441,
-0.535290539264679,
0.36691513657569885,
0.7906659841537476,
1.353682279586792,
-0.09513506293296814,
-0.25815069675445557,
0.49696165323257446,
-0.8457471132278442,
-1.6415969133377075,
-1.4221503734588623,
-0.8390084505081177,
0.78926020860672,
-0.6399183869361877,
1.2397722005844116,
-0.4215489625930786,
-1.6843048334121704,
0.2801710367202759,
0.14025956392288208,
-0.07066306471824646,
0.6200811862945557,
0.06813270598649979,
1.0460718870162964,
-0.10868484526872635,
-0.4543164074420929,
-0.2009115219116211,
-1.5997940301895142,
-0.901277482509613,
-0.6989807486534119,
-0.6416334509849548,
0.6334083676338196,
-1.9596667289733887,
0.5712984204292297,
0.46919143199920654,
-0.29728618264198303,
-1.1560853719711304,
-1.0001498460769653,
-0.514187753200531,
0.5281404256820679,
-0.30581149458885193,
-0.509894073009491,
-0.5975268483161926,
1.3572251796722412,
-0.6662765145301819,
-0.42911258339881897,
-1.1632274389266968,
1.3836815357208252,
-0.3148840367794037,
-0.4249371290206909,
0.7550786733627319,
-0.05023616552352905,
0.4652675986289978,
0.5009594559669495,
-0.539340615272522,
0.5251657366752625,
-0.3844148814678192,
1.1907575130462646,
-0.05959271639585495,
-1.3751143217086792,
-1.4880049228668213,
0.07974031567573547,
1.0876556634902954,
-1.8819210529327393,
-0.33337870240211487,
0.8860157132148743,
-0.7781083583831787,
-0.18586120009422302,
0.36383724212646484,
-0.05233919247984886,
-1.4240131378173828,
-0.6472991704940796,
0.9354408383369446,
-0.22309261560440063,
0.8367215991020203,
-0.20836658775806427,
0.7580796480178833,
-0.06159410998225212,
-0.1761341243982315,
-0.4837302267551422,
-0.1933494508266449,
-0.23003722727298737
],
"env": "",
"registered_at": 1769589397.5840247
},
"user_1769589494": {
"embedding": [
0.23541471362113953,
0.667961597442627,
0.38707974553108215,
0.6673084497451782,
-1.869005560874939,
-0.4901138246059418,
-0.9352726936340332,
0.49656397104263306,
0.004735413007438183,
1.1503483057022095,
-0.7223904728889465,
1.1780078411102295,
-1.1934415102005005,
0.5933876633644104,
-0.047901105135679245,
-0.6350924372673035,
0.9101377725601196,
0.9945328235626221,
-0.6955628395080566,
-1.4766680002212524,
0.14297445118427277,
1.0183905363082886,
-0.5544767379760742,
0.7108471989631653,
0.12324491143226624,
0.8664625287055969,
-1.0339009761810303,
0.6388123035430908,
-0.3606623709201813,
1.1092636585235596,
-0.2134912759065628,
-1.0129042863845825,
1.1676888465881348,
-0.25849631428718567,
0.21622547507286072,
-0.21850265562534332,
-2.146343469619751,
0.9746832251548767,
-1.0417606830596924,
-1.118934988975525,
0.45158135890960693,
-0.12440077215433121,
0.9278182983398438,
0.673552393913269,
-1.4133691787719727,
-0.9833011031150818,
1.7980570793151855,
1.1249372959136963,
0.6850293278694153,
-0.4094180762767792,
1.3220067024230957,
-0.5562354922294617,
0.35797858238220215,
0.7082096338272095,
0.38267695903778076,
-0.3067215085029602,
-0.12430296093225479,
-1.3622304201126099,
-1.2127659320831299,
-0.14369715750217438,
0.744861900806427,
0.35735955834388733,
0.30824899673461914,
-0.3879246413707733,
0.332281231880188,
0.31966903805732727,
-0.014374539256095886,
0.37477824091911316,
1.2712546586990356,
-0.1365097314119339,
0.5229204893112183,
0.47963225841522217,
0.8237362504005432,
0.7043209671974182,
-1.673892855644226,
0.13583803176879883,
0.5652695298194885,
0.40299320220947266,
0.08790996670722961,
0.2492693066596985,
-0.4379039406776428,
-1.14923894405365,
-0.5844811797142029,
-1.132568359375,
0.49928411841392517,
-0.4650140404701233,
1.1566886901855469,
-0.07155625522136688,
0.36949872970581055,
0.31576940417289734,
-0.4941798746585846,
0.8808521628379822,
0.12892158329486847,
-0.3473222255706787,
-0.1342766135931015,
0.6350370645523071,
-1.524943470954895,
0.11389171332120895,
-0.14301487803459167,
-1.9267250299453735,
-1.5791492462158203,
-0.19560043513774872,
1.5311495065689087,
1.9668593406677246,
-0.964552104473114,
-1.3139442205429077,
-0.9792137145996094,
0.4413124918937683,
-0.18592560291290283,
-0.5387620329856873,
-0.7066377997398376,
0.9972496032714844,
-0.12376223504543304,
-0.6737706661224365,
0.7983350157737732,
0.5444274544715881,
-1.3038272857666016,
1.101620078086853,
-1.5507662296295166,
0.02854086272418499,
-0.6057300567626953,
-0.782597005367279,
0.3482932448387146,
-0.055229704827070236,
0.38987356424331665,
-0.35090646147727966,
-0.190815731883049,
-0.5883421301841736,
0.6471948027610779,
0.5951821804046631,
0.4943574070930481,
-0.1316496580839157,
-0.8007314205169678,
-0.13866537809371948,
-0.012848706915974617,
1.1189842224121094,
-1.1396784782409668,
-0.33659735321998596,
-0.27989667654037476,
0.15101654827594757,
-0.44554460048675537,
0.4468748867511749,
0.4023851454257965,
-0.37321993708610535,
-0.4136735200881958,
-0.22391735017299652,
-0.3109915256500244,
0.9604361057281494,
-0.6297188401222229,
1.3016139268875122,
0.36373990774154663,
-1.05316162109375,
0.41111207008361816,
1.8767585754394531,
-0.754970133304596,
0.16698729991912842,
-0.2632003128528595,
-0.4256270229816437,
1.7379480600357056,
1.2178281545639038,
-0.0028167024720460176,
0.42778730392456055,
-0.12732906639575958,
-0.3295230567455292,
0.36760953068733215,
0.057388786226511,
-0.4098236858844757,
0.9829326868057251,
-0.34538817405700684,
-1.3545023202896118,
-0.4676443040370941,
0.7782469987869263,
0.14342212677001953,
-1.7002856731414795,
0.4266798794269562,
-0.33054685592651367,
0.9089714884757996,
0.5873302221298218,
-0.9908685088157654,
-0.6938693523406982,
-1.5290637016296387,
-0.0892898365855217,
0.5326513648033142,
-0.07912395894527435,
0.4673354923725128,
-1.0052272081375122,
0.13853217661380768,
-0.08604929596185684,
0.3112524449825287,
-1.377512812614441,
-0.05614912137389183,
0.2633572220802307
],
"env": "",
"registered_at": 1769589494.0118024
}
}

67
config/voice.yaml Normal file
View File

@@ -0,0 +1,67 @@
# ROS 语音包配置文件
dashscope:
api_key: "sk-7215a5ab7a00469db4072e1672a0661e"  # SECURITY(review): live API key committed to version control — rotate this key and load it from an environment variable or a git-ignored file instead
asr:
model: "qwen3-asr-flash-realtime"
url: "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
llm:
model: "qwen3-vl-flash"
base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"
temperature: 0.7
max_tokens: 4096
max_history: 10
summary_trigger: 3
tts:
model: "cosyvoice-v3-flash"
voice: "longanyang"
audio:
microphone:
device_index: -1 # 使用系统默认输入设备
sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz避免重采样可能导致的问题
channels: 1 # 输入声道数单声道MONO适合语音采集
chunk: 1024
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
soundcard:
card_index: -1 # 使用默认声卡
device_index: -1 # 使用默认输出设备
sample_rate: 48000 # 输出采样率:默认 44100
channels: 2 # 输出声道数立体声2声道FL+FR
volume: 1.0 # 音量比例0.0-1.00.2表示20%音量)
tts:
source_sample_rate: 22050 # TTS服务固定输出采样率DashScope服务固定值不可修改
source_channels: 1 # TTS服务固定输出声道数DashScope服务固定值不可修改
ffmpeg_thread_queue_size: 4096 # ffmpeg输入线程队列大小增大以减少卡顿
force_stop_delay: 0.1 # 强制停止时的延迟(秒)
cleanup_timeout: 30.0 # 清理超时(秒)
terminate_timeout: 1.0 # 终止超时(秒)
interrupt_wait: 0.1 # 中断等待时间(秒)
vad:
vad_mode: 3 # VAD模式0-33最严格
silence_duration_ms: 1000 # 静音持续时长(毫秒)
min_energy_threshold: 300 # 最小能量阈值
system:
use_wake_word: true # 是否启用唤醒词检测
wake_word: "er gou" # 唤醒词(拼音)
session_timeout: 3.0 # 会话超时时间(秒)
shutup_keywords: "bi zui" # 闭嘴指令关键词(拼音,逗号分隔)
interrupt_command_queue_depth: 10 # 中断命令订阅的队列深度QoS
sv_enabled: false # 是否启用声纹识别
# sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
sv_threshold: 0.65 # 声纹识别阈值0.0-1.0,值越小越宽松,值越大越严格)
# sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json" # 声纹数据库保存路径JSON格式相对于ROS2包share目录
sv_speaker_db_path: "~/ros_learn/hivecore_robot_voice/config/speakers.json" # 声纹数据库保存路径JSON格式相对于ROS2包share目录
sv_buffer_size: 96000 # 声纹验证录音缓冲区大小样本数48kHz下2秒=96000
continue_without_image: true # 多模态意图skill_sequence/chat_camera未获取到图片时是否继续推理
camera:
image:
jpeg_quality: 85 # JPEG压缩质量0-10085是质量和大小平衡点
interfaces:
# root_path: "~/hivecore_robot_os1/hivecore_robot_interfaces/src" # 接口文件根目录,支持 ~ 展开和相对路径
root_path: "~/ros_learn/hivecore_robot_interfaces/src" # 接口文件根目录,支持 ~ 展开和相对路径

View File

@@ -0,0 +1,54 @@
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import SetEnvironmentVariable, RegisterEventHandler
from launch.event_handlers import OnProcessExit
from launch.actions import EmitEvent
from launch.events import Shutdown
import os
def generate_launch_description():
    """Launch the ASR node plus the voiceprint-registration node (which needs the
    ASR service), and shut the whole launch down once registration exits.

    Returns:
        LaunchDescription with the environment override, both nodes, and the
        exit handler.
    """
    # Install prefix of the interfaces package (custom msg/srv/action types).
    interfaces_install_path = os.path.expanduser('~/ros_learn/hivecore_robot_interfaces/install')
    # Ensure AMENT_PREFIX_PATH contains that prefix so the interface types resolve.
    # BUGFIX: compare against the colon-separated entries instead of a raw
    # substring test, which would wrongly match a prefix that merely appears
    # inside a longer path entry.
    ament_prefix_path = os.environ.get('AMENT_PREFIX_PATH', '')
    entries = ament_prefix_path.split(':') if ament_prefix_path else []
    if interfaces_install_path not in entries:
        if ament_prefix_path:
            ament_prefix_path = f'{ament_prefix_path}:{interfaces_install_path}'
        else:
            ament_prefix_path = interfaces_install_path
    # ASR + audio-input device node: provides the ASR and AudioData services.
    asr_audio_node = Node(
        package='robot_speaker',
        executable='asr_audio_node',
        name='asr_audio_node',
        output='screen'
    )
    # Voiceprint registration node.
    register_speaker_node = Node(
        package='robot_speaker',
        executable='register_speaker_node',
        name='register_speaker_node',
        output='screen'
    )
    # When the registration node exits, shut down the entire launch.
    register_exit_handler = RegisterEventHandler(
        OnProcessExit(
            target_action=register_speaker_node,
            on_exit=[
                EmitEvent(event=Shutdown(reason='注册完成,关闭所有节点'))
            ]
        )
    )
    return LaunchDescription([
        SetEnvironmentVariable('AMENT_PREFIX_PATH', ament_prefix_path),
        asr_audio_node,
        register_speaker_node,
        register_exit_handler,
    ])

46
launch/voice.launch.py Normal file
View File

@@ -0,0 +1,46 @@
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import SetEnvironmentVariable
import os
def generate_launch_description():
    """Launch the full voice-interaction stack (ASR, TTS, and main logic nodes);
    all node parameters are read from voice.yaml by the nodes themselves.

    Returns:
        LaunchDescription with the environment override and the three nodes.
    """
    # Install prefix of the interfaces package (custom msg/srv/action types).
    interfaces_install_path = os.path.expanduser('~/ros_learn/hivecore_robot_interfaces/install')
    # Ensure AMENT_PREFIX_PATH contains that prefix so the interface types resolve.
    # BUGFIX: compare against the colon-separated entries instead of a raw
    # substring test, which would wrongly match a prefix that merely appears
    # inside a longer path entry.
    ament_prefix_path = os.environ.get('AMENT_PREFIX_PATH', '')
    entries = ament_prefix_path.split(':') if ament_prefix_path else []
    if interfaces_install_path not in entries:
        if ament_prefix_path:
            ament_prefix_path = f'{ament_prefix_path}:{interfaces_install_path}'
        else:
            ament_prefix_path = interfaces_install_path
    return LaunchDescription([
        SetEnvironmentVariable('AMENT_PREFIX_PATH', ament_prefix_path),
        # ASR + audio-input device node (also serves VAD events via the cloud ASR's VAD).
        Node(
            package='robot_speaker',
            executable='asr_audio_node',
            name='asr_audio_node',
            output='screen'
        ),
        # TTS + audio-output device node.
        Node(
            package='robot_speaker',
            executable='tts_audio_node',
            name='tts_audio_node',
            output='screen'
        ),
        # Main business-logic node.
        Node(
            package='robot_speaker',
            executable='robot_speaker_node',
            name='robot_speaker_node',
            output='screen'
        ),
    ])

View File

@@ -2,13 +2,26 @@
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
<package format="3">
<name>robot_speaker</name>
<version>0.0.0</version>
<description>TODO: Package description</description>
<version>0.0.1</version>
<description>语音识别和合成ROS2包</description>
<maintainer email="mzebra@foxmail.com">mzebra</maintainer>
<license>Apache-2.0</license>
<depend>rclpy</depend>
<depend>example_interfaces</depend>
<depend>std_msgs</depend>
<depend>sensor_msgs</depend>
<depend>cv_bridge</depend>
<depend>ament_index_python</depend>
<depend>interfaces</depend>
<buildtool_depend>ament_cmake</buildtool_depend>
<buildtool_depend>ament_cmake_python</buildtool_depend>
<exec_depend>python3-pyaudio</exec_depend>
<exec_depend>python3-requests</exec_depend>
<exec_depend>python3-edge-tts</exec_depend>
<exec_depend>python3-webrtcvad</exec_depend>
<exec_depend>python3-yaml</exec_depend>
<exec_depend>python3-pypinyin</exec_depend>
<test_depend>ament_copyright</test_depend>
<test_depend>ament_flake8</test_depend>
@@ -16,6 +29,6 @@
<test_depend>python3-pytest</test_depend>
<export>
<build_type>ament_python</build_type>
<build_type>ament_cmake</build_type>
</export>
</package>

9
requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
dashscope>=1.20.0
openai>=1.0.0
pyaudio>=0.2.11
pypinyin>=0.49.0
rclpy>=3.0.0
Pillow>=10.0.0
numpy>=1.24.0,<2.0.0 # cv_bridge需要NumPy 1.xNumPy 2.x会导致段错误
PyYAML>=6.0
funasr>=1.0.0

View File

@@ -0,0 +1,6 @@
# robot_speaker package

View File

@@ -0,0 +1,24 @@
# Bridge package for connecting LLM outputs to brain execution.

View File

@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
桥接LLM技能序列到小脑ExecuteBtAction并转发反馈/结果。
"""
import json
import os
import re
import rclpy
from rclpy.node import Node
from rclpy.action import ActionClient
from std_msgs.msg import String
from ament_index_python.packages import get_package_share_directory
from interfaces.action import ExecuteBtAction
from interfaces.srv import BtRebuild
class SkillBridgeNode(Node):
def __init__(self):
    """Set up the action client, rebuild service client, topic wiring, and the skill whitelist."""
    super().__init__('skill_bridge_node')
    # Action client towards the behavior-tree executor.
    self._action_client = ActionClient(self, ExecuteBtAction, '/execute_bt_action')
    # Monotonically increasing epoch stamped onto each dispatched goal.
    self._current_epoch = 1
    # Service client used to ask the cerebrum to rebuild its behavior tree now.
    self.run_trigger_ = self.create_client(BtRebuild, '/cerebrum/rebuild_now')
    # Count of rebuild requests issued so far (logging only).
    self.rebuild_requests = 0
    # Whitelist of skill names loaded from the brain package's robot_skills.yaml.
    self._allowed_skills = self._load_allowed_skills()
    # LLM-produced skill sequences arrive as JSON strings on this topic.
    self.skill_seq_sub = self.create_subscription(
        String, '/llm_skill_sequence', self._on_skill_sequence_received, 10
    )
    # Execution feedback and final results are re-published as JSON strings.
    self.feedback_pub = self.create_publisher(String, '/skill_execution_feedback', 10)
    self.result_pub = self.create_publisher(String, '/skill_execution_result', 10)
    self.get_logger().info('SkillBridgeNode started')
def _on_skill_sequence_received(self, msg: String):
    """Validate an LLM skill sequence (JSON on /llm_skill_sequence) and dispatch it.

    Sequences containing vision/arm/gripper skills trigger the dedicated
    dual-arm grasp tree; everything else is forwarded as a generic rebuild
    carrying the skill names and per-skill parameter strings.

    Args:
        msg: String message whose ``data`` is a JSON document of the form
            ``{"sequence": [{"skill": ..., "execution": ..., ...}]}``.
    """
    raw = (msg.data or "").strip()
    if not raw:
        return
    if not self._allowed_skills:
        self.get_logger().warning("No skill whitelist loaded; reject all sequences")
        return
    # Parse the payload. BUGFIX: the previous version fell through with
    # sequence_list=None after a JSONDecodeError and then crashed iterating
    # None inside the dispatch block (TypeError swallowed by the broad
    # except); reject non-JSON input explicitly up front instead.
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, ValueError) as e:
        self.get_logger().error(f"Skill sequence is not valid JSON; ignoring: {e}")
        return
    sequence_list = self._parse_json_sequence(data)
    if sequence_list is None:
        self.get_logger().error("Invalid skill sequence format; must be JSON or plain text")
        return
    # Dispatch the validated sequence.
    try:
        skill_names = [item["skill"] for item in sequence_list]
        if any(skill in skill_names for skill in ["VisionObjectRecognition", "Arm", "GripperCmd0"]):
            # Vision-grasp skills are routed to a dedicated behavior tree.
            self.get_logger().info(f"Skill sequence contains special skills, triggering rebuild: {skill_names}")
            self.rebuild_now("Trigger", "bt_vision_grasp_dual_arm", "")
        else:
            # Flatten each item's parameter dict into "key: value" lines.
            skill_params = []
            for item in sequence_list:
                p = item.get("parameters")
                params = ""
                if isinstance(p, dict):
                    lines = []
                    for k, v in p.items():
                        lines.append(f"{k}: {v}")
                    if lines:
                        params = "\n".join(lines) + "\n"
                skill_params.append(params)
            self.get_logger().info(f"Sending skill sequence: {skill_names}")
            self.get_logger().info(f"Sending skill parameters: {skill_params}")
            names_str = ", ".join(skill_names)
            params_str = ", ".join(skill_params)
            self.rebuild_now("Remote", names_str, params_str)
    except Exception as e:
        self.get_logger().error(f"Error processing skill sequence: {e}")
def _load_allowed_skills(self) -> set[str]:
    """Read the skill whitelist from the brain package's robot_skills.yaml.

    Returns:
        Set of skill names; empty when the file is missing or unreadable
        (in which case every incoming sequence is rejected upstream).
    """
    try:
        skill_path = os.path.join(
            get_package_share_directory("brain"), "config", "robot_skills.yaml"
        )
        if not os.path.exists(skill_path):
            return set()
        import yaml
        with open(skill_path, "r", encoding="utf-8") as f:
            entries = yaml.safe_load(f) or []
        # Keep only dict entries that carry a non-empty "name" field.
        names = set()
        for entry in entries:
            if isinstance(entry, dict) and entry.get("name"):
                names.add(str(entry["name"]))
        return names
    except Exception as e:
        self.get_logger().warning(f"Load skills failed: {e}")
        return set()
def _extract_skill_sequence(self, text: str) -> tuple[str, list[str]]:
    """Extract CamelCase skill tokens from free text.

    Tokens are split on commas, semicolons, or whitespace; only tokens that
    look like CamelCase identifiers count as skills.

    Returns:
        (comma-joined skill string, tokens absent from the whitelist);
        ("", []) when nothing resembles a skill name.
    """
    camel_case = re.compile(r'^[A-Z][A-Za-z0-9]*$')
    skills = [
        token
        for token in re.split(r'[,\s;]+', text.strip())
        if camel_case.match(token)
    ]
    if not skills:
        return "", []
    unknown = [name for name in skills if name not in self._allowed_skills]
    return ",".join(skills), unknown
def _parse_json_sequence(self, data: dict) -> list[dict] | None:
    """Validate a decoded ``{"sequence": [...]}`` payload against the whitelist.

    Non-dict entries and unknown skills are silently dropped; "execution"
    falls back to "serial" and "body_id" to None when invalid.

    Returns:
        Normalized list of item dicts, or None when the payload is malformed
        or nothing valid remains.
    """
    if not isinstance(data, dict):
        return None
    sequence = data.get("sequence")
    if not isinstance(sequence, list):
        return None
    normalized = []
    for entry in sequence:
        if not isinstance(entry, dict):
            continue
        skill = entry.get("skill")
        if not skill or skill not in self._allowed_skills:
            continue
        execution = entry.get("execution", "serial")
        if execution not in ("serial", "parallel"):
            execution = "serial"
        # Only numeric body ids 0/1/2 (or null) are accepted, matching intent routing.
        body_id = entry.get("body_id")
        if body_id not in (0, 1, 2, None):
            body_id = None
        normalized.append({
            "skill": skill,
            "execution": execution,
            "body_id": body_id,
            "parameters": entry.get("parameters"),
        })
    return normalized or None
def _send_skill_sequence(self, skill_sequence: str):
    """Send a comma-joined skill sequence as an ExecuteBtAction goal and wait for the result.

    NOTE(review): rclpy.spin_until_future_complete() spins this node on the
    calling thread; if this method is ever called from an executor callback it
    can deadlock — confirm it is only invoked from a dedicated worker thread.
    """
    # Bail out early when the action server is not up.
    if not self._action_client.wait_for_server(timeout_sec=2.0):
        self.get_logger().error('ExecuteBtAction server unavailable')
        return
    goal = ExecuteBtAction.Goal()
    # Monotonically increasing epoch lets the server discard stale goals.
    goal.epoch = self._current_epoch
    self._current_epoch += 1
    goal.action_name = skill_sequence
    goal.calls = []
    self.get_logger().info(f"Dispatch skill sequence: {skill_sequence}")
    send_future = self._action_client.send_goal_async(goal, feedback_callback=self._feedback_callback)
    rclpy.spin_until_future_complete(self, send_future, timeout_sec=5.0)
    if not send_future.done():
        self.get_logger().warning("Send goal timed out")
        return
    goal_handle = send_future.result()
    if not goal_handle or not goal_handle.accepted:
        self.get_logger().error("Goal rejected")
        return
    # Block without a timeout until the behaviour tree finishes executing.
    result_future = goal_handle.get_result_async()
    rclpy.spin_until_future_complete(self, result_future)
    if result_future.done():
        self._handle_result(result_future.result())
def _feedback_callback(self, feedback_msg):
    """Republish ExecuteBtAction feedback as an ASCII-JSON String message."""
    feedback = feedback_msg.feedback
    summary = {
        "stage": feedback.stage,
        "current_skill": feedback.current_skill,
        "progress": float(feedback.progress),
        "detail": feedback.detail,
        "epoch": int(feedback.epoch),
    }
    out = String()
    out.data = json.dumps(summary, ensure_ascii=True)
    self.feedback_pub.publish(out)
def _handle_result(self, result_wrapper):
    """Publish the final action result as an ASCII-JSON String; ignore empty results."""
    result = result_wrapper.result
    if not result:
        return
    out = String()
    out.data = json.dumps({
        "success": bool(result.success),
        "message": result.message,
        "total_skills": int(result.total_skills),
        "succeeded_skills": int(result.succeeded_skills),
    }, ensure_ascii=True)
    self.result_pub.publish(out)
def rebuild_now(self, type: str, config: str, param: str) -> None:
    """Ask the behaviour-tree node to rebuild itself via the BtRebuild service.

    The call is asynchronous; _rebuild_done_callback handles the response.
    Drops the request (with an error log) when the service is not ready.
    """
    client = self.run_trigger_
    if not client.service_is_ready():
        self.get_logger().error('Rebuild service not ready')
        return
    self.rebuild_requests += 1
    log = self.get_logger()
    log.info(f'Rebuild BehaviorTree now. Total requests: {self.rebuild_requests}')
    request = BtRebuild.Request()
    request.type = type
    request.config = config
    request.param = param
    log.info(f'Calling rebuild service... request info: {request}')
    client.call_async(request).add_done_callback(self._rebuild_done_callback)
def _rebuild_done_callback(self, future):
try:
response = future.result()
if response.success:
self.get_logger().info('Rebuild request successful')
else:
self.get_logger().warning(f'Rebuild request failed: {response.message}')
except Exception as e:
self.get_logger().error(f'Rebuild request exception: {str(e)}')
self.get_logger().info(f"Rebuild requested. Total rebuild requests: {str(self.rebuild_requests)}")
def main(args=None):
    """Entry point: spin SkillBridgeNode until shutdown, always releasing resources.

    FIX: the previous version skipped destroy_node()/shutdown() when spin()
    raised (e.g. Ctrl-C), leaking the node and the rclpy context. The spin is
    now wrapped in try/finally so cleanup runs on every exit path.
    """
    rclpy.init(args=args)
    node = SkillBridgeNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass  # Ctrl-C is a normal way to stop the node.
    finally:
        node.destroy_node()
        rclpy.shutdown()
# Allow direct execution (`python3 skill_bridge_node.py`) in addition to the
# ros2-run console-script entry point.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,28 @@
"""核心模块"""

View File

@@ -0,0 +1,130 @@
"""
对话历史管理模块
"""
from dataclasses import dataclass
import threading
@dataclass
class LLMMessage:
    """A single chat message exchanged with the LLM."""
    role: str  # "user", "assistant", "system"
    content: str  # message text


class ConversationHistory:
    """Conversation history manager for real-time voice chat.

    Keeps a rolling window of user/assistant turns, compresses overflow into a
    plain-text summary, and supports a two-phase "pending turn" protocol so a
    user message only enters history after the assistant reply is confirmed.
    All public methods are thread-safe via a single non-reentrant lock.
    """
    def __init__(self, max_history: int, summary_trigger: int):
        self.max_history = max_history          # turns kept verbatim (<=0 disables history)
        self.summary_trigger = summary_trigger  # turn count that triggers compression
        self.conversation_history: list[LLMMessage] = []
        self.summary: str | None = None
        # Pending-confirmation mechanism
        self._pending_user_message: LLMMessage | None = None  # user message awaiting commit
        self._lock = threading.Lock()  # guards all state above (NOT reentrant)

    def start_turn(self, user_content: str):
        """Begin a turn: stash the user message until the LLM reply is committed."""
        with self._lock:
            self._pending_user_message = LLMMessage(role="user", content=user_content)

    def commit_turn(self, assistant_content: str) -> bool:
        """Finish the pending turn, appending both messages to history.

        Returns False (and drops any pending user message) when there is no
        pending turn or the assistant reply is empty/whitespace.
        """
        with self._lock:
            if self._pending_user_message is None:
                return False
            if not assistant_content or not assistant_content.strip():
                self._pending_user_message = None
                return False
            self.conversation_history.append(self._pending_user_message)
            self.conversation_history.append(
                LLMMessage(role="assistant", content=assistant_content.strip())
            )
            self._pending_user_message = None
            self._maybe_compress()
            return True

    def cancel_turn(self):
        """Discard the pending user message (e.g. on interruption) so partial turns never pollute history."""
        with self._lock:
            self._pending_user_message = None

    def add_message(self, role: str, content: str):
        """Append a message directly, discarding any pending turn first.

        BUGFIX: this previously called self.cancel_turn() while already holding
        self._lock; threading.Lock is non-reentrant, so every call deadlocked.
        The pending message is now cleared inline under one lock acquisition.
        """
        with self._lock:
            self._pending_user_message = None
            self.conversation_history.append(LLMMessage(role=role, content=content))
            self._maybe_compress()

    def get_messages(self) -> list[LLMMessage]:
        """Return summary (as a system message), the recent window, and any pending user message."""
        with self._lock:
            messages = []
            if self.summary:
                messages.append(LLMMessage(role="system", content=self.summary))
            if self.max_history > 0:
                # Two entries (user + assistant) per retained turn.
                messages.extend(self.conversation_history[-self.max_history * 2:])
            if self._pending_user_message is not None:
                messages.append(self._pending_user_message)
            return messages

    def has_pending_turn(self) -> bool:
        """True when a user message is awaiting commit."""
        with self._lock:
            return self._pending_user_message is not None

    def _maybe_compress(self):
        """Fold overflow history into a text summary. Caller must hold self._lock."""
        if self.max_history <= 0:
            self.conversation_history.clear()
            return
        max_len = self.summary_trigger * 2
        if len(self.conversation_history) <= max_len:
            return
        old = self.conversation_history[:-max_len]
        self.conversation_history = self.conversation_history[-max_len:]
        summary_text = []
        for msg in old:
            summary_text.append(f"{msg.role}: {msg.content}")
        # Keep only the 10 most recent overflow lines to bound summary growth.
        compressed = "对话摘要:\n" + "\n".join(summary_text[-10:])
        if self.summary:
            self.summary += "\n" + compressed
        else:
            self.summary = compressed

    def clear(self):
        """Reset history, summary and the pending message."""
        with self._lock:
            self.conversation_history.clear()
            self.summary = None
            self._pending_user_message = None

View File

@@ -0,0 +1,272 @@
from dataclasses import dataclass
from typing import Optional
import os
import yaml
import json
from ament_index_python.packages import get_package_share_directory
from pypinyin import pinyin, Style
from robot_speaker.core.skill_interface_parser import SkillInterfaceParser
@dataclass
class IntentResult:
    """Outcome of routing a single user utterance."""
    intent: str  # "skill_sequence" | "kb_qa" | "chat_text" | "chat_camera"
    text: str  # the utterance that was routed
    need_camera: bool  # whether an image must be captured before the LLM call
    camera_mode: Optional[str]  # "top" | "left" | "right" | "hand_r" | None
    system_prompt: Optional[str]  # LLM system prompt; None for kb_qa (no LLM involved)
class IntentRouter:
    """Routes a recognised utterance to an intent (skill_sequence / kb_qa / chat_*).

    Matching is done on the pinyin transliteration of the Chinese text so that
    ASR homophone errors still hit the keyword lists.
    """
    def __init__(self):
        # Photo-capture keywords (pinyin): 拍照 / 拍个照 / 拍张照
        self.camera_capture_keywords = [
            "pai zhao", "pai ge zhao", "pai zhang zhao"
        ]
        # Action-verb list (pinyin) — used to detect skill-sequence intent
        self.action_verbs = [
            "zou", "zou liang bu", "zou ji bu",  # walk / walk a couple of steps
            "na", "na qi", "na zhu",  # take / pick up / hold
            "ban", "ban yun",  # carry / transport
            "zhua", "zhua qu",  # grab / grasp
            "tui", "tui dong",  # push
            "la", "la dong",  # pull
            "yi dong", "qian jin", "hou tui",  # move / forward / backward
            "kong zhi", "cao zuo",  # control / operate
            "fang xia", "fang zhi",  # put down / place
            "ju qi", "sheng qi",  # lift / raise
            "jia zhua", "jia qi", "jia",  # gripper / clamp up / clamp
            "shen you bi", "shen zuo bi", "shen chu", "shen shou",  # extend right/left arm, reach out
            "zhuan quan", "zhuan yi quan", "zhuan",  # spin around / turn
        ]
        # Knowledge-base trigger phrases (pinyin): who are you / your name / dance / what can you do
        self.kb_keywords = [
            "ni shi shui", "ni de ming zi", "tiao ge wu", "ni jiao sha", "ni hui gan", "ni neng gan"
        ]
        self._cached_skill_names: list[str] | None = None  # lazy cache for skill names
        self._cached_kb_data: list[dict] | None = None  # lazy cache for knowledge base
        interfaces_root = self._get_interfaces_root()
        self.interface_parser = SkillInterfaceParser(interfaces_root)

    def _get_interfaces_root(self) -> str:
        """Read the skill-interface root directory from voice.yaml; raises ValueError on any problem."""
        try:
            robot_speaker_share = get_package_share_directory("robot_speaker")
            config_path = os.path.join(robot_speaker_share, "config", "voice.yaml")
            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f) or {}
            interfaces_config = config.get("interfaces", {})
            root_path = interfaces_config.get("root_path", "")
            if not root_path:
                raise ValueError("interfaces.root_path 未在配置文件中配置")
            if root_path.startswith("~"):
                root_path = os.path.expanduser(root_path)
            if not os.path.isabs(root_path):
                # Relative paths are resolved against the share directory's parent.
                config_dir = os.path.dirname(robot_speaker_share)
                root_path = os.path.join(config_dir, root_path)
            abs_path = os.path.abspath(root_path)
            if not os.path.exists(abs_path):
                raise ValueError(f"接口文件根目录不存在: {abs_path}")
            return abs_path
        except Exception as e:
            # Re-raised as ValueError so the caller fails fast at construction time.
            raise ValueError(f"读取接口文件根目录失败: {e}")

    def _load_brain_skill_names(self) -> list[str]:
        """Return skill names via the interface parser, cached after the first call."""
        if self._cached_skill_names is not None:
            return self._cached_skill_names
        skill_names = self.interface_parser.get_skill_names()
        self._cached_skill_names = skill_names
        return skill_names

    def to_pinyin(self, text: str) -> str:
        """Transliterate the CJK characters of *text* to space-separated lowercase pinyin."""
        # Keep only characters in the CJK Unified Ideographs range.
        chars = [c for c in text if '\u4e00' <= c <= '\u9fa5']
        if not chars:
            return ""
        py_list = pinyin(''.join(chars), style=Style.NORMAL)
        return ' '.join([item[0] for item in py_list]).lower().strip()

    def is_skill_sequence_intent(self, text: str, text_pinyin: str | None = None) -> bool:
        """True when any action verb appears as a contiguous word sequence in the pinyin."""
        if text_pinyin is None:
            text_pinyin = self.to_pinyin(text)
        # Exact match: the action verb must appear as a complete word sequence.
        text_words = text_pinyin.split()
        for action in self.action_verbs:
            action_words = action.split()
            # Check whether the verb's word sequence is a contiguous subsequence.
            for i in range(len(text_words) - len(action_words) + 1):
                if text_words[i:i+len(action_words)] == action_words:
                    return True
        return False

    def check_camera_command(self, text: str, text_pinyin: str | None = None) -> tuple[bool, Optional[str]]:
        """Check for a photo command; returns (needs camera, camera mode)."""
        if not text:
            return False, None
        if text_pinyin is None:
            text_pinyin = self.to_pinyin(text)
        # NOTE(review): this is a substring containment check on the pinyin
        # string, not a strict word-boundary match.
        if any(keyword in text_pinyin for keyword in self.camera_capture_keywords):
            return True, self.detect_camera_mode(text, text_pinyin)
        return False, None

    def detect_camera_mode(self, text: str, text_pinyin: str | None = None) -> str:
        """Detect the camera position matching the driver's position values: left/right/top/hand_r.

        Matching order matters: left/right arm phrases are checked before the
        bare "shou" (hand) substring, and the default is "top".
        """
        if text_pinyin is None:
            text_pinyin = self.to_pinyin(text)
        if any(kw in text_pinyin for kw in ["zuo shou", "zuo bi", "zuo bian", "zuo shou bi"]):
            return "left"
        if any(kw in text_pinyin for kw in ["you shou", "you bi", "you bian", "you shou bi"]):
            return "right"
        if any(kw in text_pinyin for kw in ["shou bu", "shou", "shou xiang ji", "shou bi xiang ji"]):
            return "hand_r"
        if any(kw in text_pinyin for kw in ["tou", "nao dai", "ding bu", "shang fang"]):
            return "top"
        return "top"

    def build_skill_prompt(self, execution_status: Optional[str] = None) -> str:
        """Build the planner system prompt: skill whitelist + last-run status + JSON output spec."""
        skills = self._load_brain_skill_names()
        skills_text = ", ".join(skills) if skills else ""
        # Guard clause appended to the prompt so the LLM stays within the whitelist.
        skill_guard = (
            "【技能限制】只能使用以下技能名称:" + skills_text
            if skills_text
            else "【技能限制】技能列表不可用,请不要输出任何技能名称。"
        )
        execution_hint = ""
        if execution_status:
            execution_hint = f"【上一轮执行状态】{execution_status}\n请参考上述执行状态,根据成功/失败信息调整本次技能序列。\n"
        else:
            execution_hint = "【注意】这是首次执行或没有上一轮执行状态,请根据当前图片和用户请求规划技能序列。\n"
        skill_params_doc = self.interface_parser.generate_params_documentation()
        return (
            "你是机器人任务规划器。\n"
            "本任务必须拍照。请根据用户请求选择使用哪个相机拍照,并结合当前环境信息生成简洁、可执行的技能序列。\n"
            "如果用户明确要求或者任务明显需要双手/双臂协作(如扶稳+操作、抓取大体积的物体),必须规划双手技能。\n"
            + execution_hint
            + "\n"
            "【规划要求】\n"
            "1. execution规划判断技能之间的执行关系\n"
            " - serial串行技能必须按顺序执行前一个完成后再执行下一个\n"
            " - parallel并行技能可以同时执行\n"
            "2. parameters规划根据目标物距离和任务需求规划具体参数值\n"
            " - parameters字典必须包含该技能接口文件目标字段的所有字段\n"
            "【输出格式要求】\n"
            "必须输出JSON格式包含sequence数组。每个技能对象包含3个一级字段\n"
            "1. skill: 技能名称(字符串)\n"
            "2. execution: 执行方式serial串行或 parallel并行\n"
            "3. parameters: 参数字典包含该技能接口文件目标字段的所有字段并填入合理的预测值。如果技能无参数使用null。\n"
            "\n"
            "注意一级字段skill, execution, parameters是固定结构。\n"
            "\n"
            "【技能参数说明】\n"
            + skill_params_doc +
            "\n"
            "示例格式:\n"
            "{\n"
            ' "sequence": [\n'
            ' {"skill": "MoveWheel", "execution": "serial", "parameters": {"move_distance": 1.5, "move_angle": 0.0}},\n'
            ' {"skill": "Arm", "execution": "serial", "parameters": {"body_id": 0, "data_type": 1, "data_length": 6, "command_id": 0, "frame_time_stamp": 0, "data_array": [0.1, 0.2, 0.3, 0.0, 0.0, 0.0]}},\n'
            ' {"skill": "GripperCmd0", "execution": "parallel", "parameters": {"loc": 128, "speed": 100, "torque": 80, "mode": 1}}\n'
            " ]\n"
            "}\n"
            + skill_guard
        )

    def build_chat_prompt(self, need_camera: bool) -> str:
        """Return the chat system prompt; the camera variant focuses on spatial relations."""
        if need_camera:
            return (
                "你是一个机器人视觉助理,擅长分析图片中物体的相对位置和空间关系。\n"
                "请结合图片内容,重点描述物体之间的相对位置(如左右、前后、上下、远近),仅基于可观察信息回答。\n"
                "回答应简短、客观不要超过100个token。"
            )
        return (
            "你是一个表达清晰、语气自然的真人助理。\n"
            "请简短地与用户对话不要超过100个token。"
        )

    def _load_kb_data(self) -> list[dict]:
        """Load knowledge-base entries from knowledge.json (cached; empty list on failure)."""
        if self._cached_kb_data is not None:
            return self._cached_kb_data
        kb_data = []
        try:
            robot_speaker_share = get_package_share_directory("robot_speaker")
            kb_path = os.path.join(robot_speaker_share, "config", "knowledge.json")
            with open(kb_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            kb_data = data["entries"]
        except Exception as e:
            # Best-effort: a missing/broken KB just disables kb_qa answers.
            kb_data = []
        self._cached_kb_data = kb_data
        return kb_data

    def search_kb(self, text: str) -> Optional[str]:
        """Look up the knowledge base; returns the first matching answer, else None."""
        if not text:
            return None
        text_pinyin = self.to_pinyin(text)
        kb_data = self._load_kb_data()
        for entry in kb_data:
            patterns = entry["patterns"]
            for pattern in patterns:
                # Substring match against the utterance's pinyin.
                if pattern in text_pinyin:
                    answer = entry["answer"]
                    if answer:
                        return answer
        return None

    def build_default_system_prompt(self) -> str:
        """Fallback system prompt used when no intent-specific prompt applies."""
        return (
            "你是一个工厂专业的助手。\n"
            "- 当用户发送图片时,请仔细观察图片内容,结合用户的问题或描述,提供简短、专业的回答。\n"
            "- 当用户没有发送图片时,请自然、友好地与用户对话。\n"
            "请根据对话模式调整你的回答风格。"
        )

    def route(self, text: str) -> IntentResult:
        """Classify *text* into skill_sequence / kb_qa / chat_camera / chat_text."""
        text_pinyin = self.to_pinyin(text)
        need_camera, camera_mode = self.check_camera_command(text, text_pinyin)
        if self.is_skill_sequence_intent(text, text_pinyin):
            # Skill-sequence intent always needs a camera; reuse detect_camera_mode
            # (user-specified camera if any, otherwise default "top").
            skill_camera_mode = self.detect_camera_mode(text, text_pinyin)
            return IntentResult(
                intent="skill_sequence",
                text=text,
                need_camera=True,
                camera_mode=skill_camera_mode,
                system_prompt=self.build_skill_prompt()
            )
        # NOTE(review): substring containment on the pinyin string, not a
        # strict word-boundary match.
        if any(keyword in text_pinyin for keyword in self.kb_keywords):
            return IntentResult(
                intent="kb_qa",
                text=text,
                need_camera=False,
                camera_mode=None,
                system_prompt=None  # kb_qa bypasses the LLM, so no system prompt
            )
        return IntentResult(
            intent="chat_camera" if need_camera else "chat_text",
            text=text,
            need_camera=need_camera,
            camera_mode=camera_mode,
            system_prompt=self.build_chat_prompt(need_camera)
        )

View File

@@ -0,0 +1,236 @@
"""声纹注册独立节点:运行完成后退出"""
import os
import time
import yaml
import numpy as np
import threading
import queue
import rclpy
from rclpy.node import Node
from ament_index_python.packages import get_package_share_directory
from interfaces.srv import ASRRecognize, AudioData, VADEvent
from robot_speaker.core.speaker_verifier import SpeakerVerificationClient
from pypinyin import pinyin, Style
class RegisterSpeakerNode(Node):
    """One-shot voiceprint registration node: records a wake-word utterance,
    extracts a speaker embedding, saves it, then shuts the process down.

    Threading model: a background thread polls the VAD service and pushes
    events into a queue; a 0.1 s ROS timer (_main_loop) drains the queue and
    drives a small state machine so no service future is awaited off-thread.
    """
    def __init__(self):
        super().__init__('register_speaker_node')
        self._load_config()
        # Service clients for ASR, raw-audio capture and VAD events.
        self.asr_client = self.create_client(ASRRecognize, '/asr/recognize')
        self.audio_data_client = self.create_client(AudioData, '/asr/audio_data')
        self.vad_client = self.create_client(VADEvent, '/vad/event')
        self.get_logger().info('等待服务启动...')
        self.asr_client.wait_for_service(timeout_sec=10.0)
        self.audio_data_client.wait_for_service(timeout_sec=10.0)
        self.vad_client.wait_for_service(timeout_sec=10.0)
        self.get_logger().info('所有服务已就绪')
        self.sv_client = SpeakerVerificationClient(
            model_path=self.sv_model_path,
            threshold=self.sv_threshold,
            speaker_db_path=self.sv_speaker_db_path,
            logger=self.get_logger()
        )
        self.registered = False      # set once an embedding is stored
        self.shutting_down = False   # stops the VAD thread
        self.get_logger().info("声纹注册节点启动,请说唤醒词开始注册(例如:'二狗我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去'")
        # Queue hands VAD events between threads, avoiding
        # spin_until_future_complete calls from a worker thread.
        self.vad_event_queue = queue.Queue()
        self.recording = False               # audio-capture state flag
        self.pending_asr_future = None       # in-flight ASR request
        self.pending_audio_future = None     # in-flight AudioData request
        self.state = "waiting_speech"        # state machine: waiting_speech / waiting_asr / waiting_audio
        self.vad_thread = threading.Thread(target=self._vad_event_worker, daemon=True)
        self.vad_thread.start()
        self.timer = self.create_timer(0.1, self._main_loop)

    def _load_config(self):
        """Load speaker-verification paths and the wake word from voice.yaml."""
        config_file = os.path.join(
            get_package_share_directory('robot_speaker'),
            'config',
            'voice.yaml'
        )
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        system = config['system']
        self.sv_model_path = os.path.expanduser(system['sv_model_path'])
        self.sv_threshold = system['sv_threshold']
        self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
        self.wake_word = system['wake_word']

    def _vad_event_worker(self):
        """VAD listener thread: receives events and enqueues them; never calls
        spin_until_future_complete (busy-waits on the future instead)."""
        while not self.registered and not self.shutting_down:
            try:
                request = VADEvent.Request()
                request.command = "wait"
                request.timeout_ms = 1000
                future = self.vad_client.call_async(request)
                # Poll the future with a small margin over the service timeout.
                start_time = time.time()
                while not future.done() and (time.time() - start_time) < 1.5:
                    time.sleep(0.01)
                if not future.done() or self.registered or self.shutting_down:
                    continue
                response = future.result()
                if response.success and response.event in ["speech_started", "speech_stopped"]:
                    # Hand the event to the main thread via the queue.
                    try:
                        self.vad_event_queue.put(response.event, timeout=0.1)
                    except queue.Full:
                        self.get_logger().warn(f"[VAD] 事件队列已满,丢弃事件: {response.event}")
            except Exception as e:
                if not self.shutting_down:
                    self.get_logger().error(f"[VAD] 线程异常: {e}")
                break

    def _start_recording(self):
        """Issue an async start-capture request; the future is handled by _main_loop."""
        request = AudioData.Request()
        request.command = "start"
        return self.audio_data_client.call_async(request)

    def _to_pinyin(self, text: str) -> str:
        """Transliterate the CJK characters of *text* to space-separated lowercase pinyin."""
        chars = [c for c in text if '\u4e00' <= c <= '\u9fa5']
        if not chars:
            return ""
        py_list = pinyin(chars, style=Style.NORMAL)
        return ' '.join([item[0] for item in py_list]).lower().strip()

    def _check_wake_word(self, text: str):
        """If the ASR transcript contains the wake word (as a contiguous pinyin
        word sequence), stop recording and fetch the buffered audio."""
        text_pinyin = self._to_pinyin(text)
        wake_word_pinyin = self.wake_word.lower().strip()
        if not wake_word_pinyin:
            return
        text_pinyin_parts = text_pinyin.split() if text_pinyin else []
        wake_word_parts = wake_word_pinyin.split()
        has_wake_word = False
        for i in range(len(text_pinyin_parts) - len(wake_word_parts) + 1):
            if text_pinyin_parts[i:i + len(wake_word_parts)] == wake_word_parts:
                has_wake_word = True
                break
        if has_wake_word:
            self.get_logger().info(f"[注册唤醒词] 检测到唤醒词 '{self.wake_word}',停止录音并获取音频")
            request = AudioData.Request()
            request.command = "stop"
            future = self.audio_data_client.call_async(request)
            # Tag the future so _main_loop knows it carries the captured audio.
            future._future_type = "stop"
            self.pending_audio_future = future

    def _process_voiceprint_audio(self, response):
        """Extract an embedding from captured audio and register it — no extra
        VAD filtering, since the AudioData service already returns speech-only
        segments (DashScope VAD)."""
        if not response or not response.success or response.samples == 0:
            self.get_logger().error(f"[注册录音] 获取音频数据失败: {response.message if response else '无响应'}")
            return
        audio_array = np.frombuffer(response.audio_data, dtype=np.int16)
        buffer_sec = response.samples / response.sample_rate
        self.get_logger().info(f"[注册录音] 音频长度: {buffer_sec:.2f}")
        embedding, success = self.sv_client.extract_embedding(
            audio_array,
            sample_rate=response.sample_rate
        )
        if not success or embedding is None:
            self.get_logger().error("[注册录音] 提取embedding失败")
            return
        # Speaker id derived from the current unix time.
        speaker_id = f"user_{int(time.time())}"
        if self.sv_client.register_speaker(speaker_id, embedding):
            # Persist immediately so a crash after this point loses nothing.
            self.sv_client.save_speakers()
            self.get_logger().info(f"[注册录音] 注册成功用户ID: {speaker_id},已保存到文件,准备退出")
            self.registered = True
        else:
            self.get_logger().error("[注册录音] 注册失败")

    def _main_loop(self):
        """Timer callback: drives the whole state machine on the executor thread."""
        # Done? Tear everything down and stop ROS.
        if self.registered:
            self.get_logger().info("注册完成,节点退出")
            self.shutting_down = True
            self.timer.cancel()
            rclpy.shutdown()
            return
        # Completed ASR request → check transcript for the wake word.
        if self.pending_asr_future and self.pending_asr_future.done():
            response = self.pending_asr_future.result()
            self.pending_asr_future = None
            if response.success and response.text:
                text = response.text.strip()
                if text:
                    self._check_wake_word(text)
            self.state = "waiting_speech"
        # Completed AudioData request → either capture started or audio arrived.
        if self.pending_audio_future and self.pending_audio_future.done():
            response = self.pending_audio_future.result()
            future_type = getattr(self.pending_audio_future, '_future_type', None)
            self.pending_audio_future = None
            if future_type == "start":
                if response.success:
                    self.get_logger().info("[注册录音] 已开始录音")
                    self.recording = True
                else:
                    self.get_logger().warn(f"[注册录音] 启动录音失败: {response.message}")
                    self.state = "waiting_speech"
            elif future_type == "stop":
                self.recording = False
                self._process_voiceprint_audio(response)
        # Drain one VAD event per tick.
        try:
            event = self.vad_event_queue.get_nowait()
            if event == "speech_started" and self.state == "waiting_speech" and not self.recording:
                self.get_logger().info("[VAD] 检测到语音开始,启动录音")
                future = self._start_recording()
                future._future_type = "start"
                self.pending_audio_future = future
            elif event == "speech_stopped" and self.recording and self.state == "waiting_speech":
                self.get_logger().info("[VAD] 检测到语音结束,请求 ASR 识别")
                self.state = "waiting_asr"
                request = ASRRecognize.Request()
                request.command = "start"
                self.pending_asr_future = self.asr_client.call_async(request)
        except queue.Empty:
            pass
def main(args=None):
    """Entry point for the voiceprint-registration node.

    FIX: the node calls rclpy.shutdown() from its own timer once registration
    completes, which makes rclpy.spin() raise (ExternalShutdownException on
    recent rclpy) and previously skipped destroy_node(). The spin is now
    wrapped so Ctrl-C and internal shutdown both reach the cleanup path.
    """
    rclpy.init(args=args)
    node = RegisterSpeakerNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    except Exception:
        # spin() raises when shutdown() is invoked from inside the node.
        pass
    finally:
        node.destroy_node()
        try:
            rclpy.shutdown()
        except Exception:
            # Already shut down by _main_loop; shutdown is not idempotent here.
            pass
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,802 @@
import rclpy
from rclpy.node import Node
from std_msgs.msg import String
import threading
import queue
import time
import base64
import io
import numpy as np
from PIL import Image
from cv_bridge import CvBridge
from interfaces.msg import ImgMsg
import collections
import os
import yaml
from typing import Optional
from ament_index_python.packages import get_package_share_directory
from interfaces.srv import VADEvent, ASRRecognize, TTSSynthesize, AudioData
from openai import OpenAI
from robot_speaker.core.context_manager import ConversationHistory
from robot_speaker.core.speaker_verifier import SpeakerVerificationClient, SpeakerState
from robot_speaker.core.intent_router import IntentRouter, IntentResult
from enum import Enum
class ConversationState(Enum):
    """Authorization state machine for the speaker node's dialogue loop."""
    IDLE = "idle"  # waiting for speech
    CHECK_VOICE = "check_voice"  # speech detected, verifying the speaker's voiceprint
    AUTHORIZED = "authorized"  # speaker verified (or verification disabled)
class RobotSpeakerNode(Node):
def __init__(self):
    """Wire up queues, state, SV buffers, ROS pub/sub and the worker threads."""
    super().__init__('robot_speaker_node')
    self._load_config()
    self.text_queue = queue.Queue()   # ASR transcripts awaiting processing
    self.tts_queue = queue.Queue()    # text chunks awaiting synthesis
    self.interrupt_event = threading.Event()  # set to abort the current turn
    self.stop_event = threading.Event()       # set on shutdown to stop workers
    self.conversation_state = ConversationState.IDLE
    self.state_lock = threading.Lock()  # guards conversation_state
    # Latest speaker-verification outcome
    self.current_speaker_id = None
    self.current_speaker_state = SpeakerState.UNKNOWN
    self.current_speaker_score = 0.0
    self.current_speaker_threshold = 0.0
    self.sv_lock = threading.Lock()                 # guards sv_audio_buffer
    self.sv_speech_end_event = threading.Event()    # set when SV audio is captured
    self.sv_result_ready_event = threading.Event()  # set when verification finishes
    self.sv_audio_buffer = None   # replaced with a bounded deque in _init_components
    self.sv_recording = False
    self._init_components()
    if self.sv_enabled and self.sv_client:
        speaker_count = self.sv_client.get_speaker_count()
        if speaker_count == 0:
            self.get_logger().info("[Speaker] 声纹数据库为空,请注册声纹")
    # Publishes LLM-planned skill sequences; subscribes to execution results.
    self.skill_sequence_pub = self.create_publisher(String, '/llm_skill_sequence', 10)
    self.last_execution_status: Optional[str] = None  # fed back into the next skill prompt
    self.execution_status_lock = threading.Lock()
    self.skill_result_sub = self.create_subscription(
        String, '/skill_execution_result', self._on_skill_result_received, 10
    )
    self._start_threads()
    self.get_logger().info("[Speaker] 语音节点已启动")
def _load_config(self):
    """Flatten voice.yaml into instance attributes (audio, VAD, DashScope, system, camera)."""
    config_file = os.path.join(
        get_package_share_directory('robot_speaker'),
        'config',
        'voice.yaml'
    )
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)
    # --- audio devices ---
    audio = config['audio']
    mic = audio['microphone']
    soundcard = audio['soundcard']
    tts_audio = audio['tts']
    self.input_device_index = mic['device_index']
    self.output_card_index = soundcard['card_index']
    self.output_device_index = soundcard['device_index']
    self.sample_rate = mic['sample_rate']
    self.channels = mic['channels']
    self.chunk = mic['chunk']
    self.audio_microphone_heartbeat_interval = mic['heartbeat_interval']
    self.output_sample_rate = soundcard['sample_rate']
    self.output_channels = soundcard['channels']
    self.output_volume = soundcard['volume']
    self.audio_tts_source_sample_rate = tts_audio['source_sample_rate']
    self.audio_tts_source_channels = tts_audio['source_channels']
    self.audio_tts_ffmpeg_thread_queue_size = tts_audio['ffmpeg_thread_queue_size']
    # --- voice activity detection ---
    vad = config['vad']
    self.vad_mode = vad['vad_mode']
    self.silence_duration_ms = vad['silence_duration_ms']
    self.min_energy_threshold = vad['min_energy_threshold']
    # --- DashScope ASR / LLM / TTS ---
    dashscope = config['dashscope']
    self.dashscope_api_key = dashscope['api_key']
    self.asr_model = dashscope['asr']['model']
    self.asr_url = dashscope['asr']['url']
    self.llm_model = dashscope['llm']['model']
    self.llm_base_url = dashscope['llm']['base_url']
    self.llm_temperature = dashscope['llm']['temperature']
    self.llm_max_tokens = dashscope['llm']['max_tokens']
    self.llm_max_history = dashscope['llm']['max_history']
    self.llm_summary_trigger = dashscope['llm']['summary_trigger']
    self.tts_model = dashscope['tts']['model']
    self.tts_voice = dashscope['tts']['voice']
    # --- system behaviour (wake word, speaker verification) ---
    system = config['system']
    self.use_wake_word = system['use_wake_word']
    self.wake_word = system['wake_word']
    self.system_shutup_keywords = system['shutup_keywords']
    self.sv_enabled = system['sv_enabled']
    self.sv_model_path = os.path.expanduser(system['sv_model_path'])
    self.sv_threshold = system['sv_threshold']
    self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
    self.sv_buffer_size = system['sv_buffer_size']
    self.continue_without_image = system['continue_without_image']
    # --- camera ---
    camera = config['camera']
    self.camera_image_jpeg_quality = camera['image']['jpeg_quality']
def _init_components(self):
    """Create keyword lists, the intent router, ROS service clients, the LLM
    client, conversation history, the image cache and the SV client."""
    self.shutup_keywords = [k.strip() for k in self.system_shutup_keywords.split(',') if k.strip()]
    self.intent_router = IntentRouter()
    # Bounded ring buffer for speaker-verification audio samples.
    self.sv_audio_buffer = collections.deque(maxlen=self.sv_buffer_size)
    self.vad_client = self.create_client(VADEvent, '/vad/event')
    self.asr_client = self.create_client(ASRRecognize, '/asr/recognize')
    self.tts_client = self.create_client(TTSSynthesize, '/tts/synthesize')
    self.audio_data_client = self.create_client(AudioData, '/asr/audio_data')
    self.get_logger().info("[Speaker] 等待service节点启动...")
    self.vad_client.wait_for_service(timeout_sec=5.0)
    self.asr_client.wait_for_service(timeout_sec=5.0)
    self.tts_client.wait_for_service(timeout_sec=5.0)
    self.audio_data_client.wait_for_service(timeout_sec=5.0)
    self.get_logger().info("[Speaker] 所有service节点已就绪")
    self.llm_client = OpenAI(api_key=self.dashscope_api_key, base_url=self.llm_base_url)
    self.history = ConversationHistory(
        max_history=self.llm_max_history,
        summary_trigger=self.llm_summary_trigger
    )
    self.cv_bridge = CvBridge()
    # Latest image per camera position, guarded by img_msg_lock.
    self.img_msg_cache = {}
    self.img_msg_lock = threading.Lock()

    def _cache_img(msg):
        # BUGFIX: the previous tuple-lambda did manual acquire()/release();
        # an exception in the dict update would leave the lock held forever.
        # `with` releases it on every path.
        with self.img_msg_lock:
            self.img_msg_cache[msg.position] = msg

    self.img_sub = self.create_subscription(ImgMsg, '/img_msg', _cache_img, 10)
    if self.sv_enabled and self.sv_model_path:
        try:
            self.sv_client = SpeakerVerificationClient(
                model_path=self.sv_model_path,
                threshold=self.sv_threshold,
                speaker_db_path=self.sv_speaker_db_path,
                logger=self.get_logger()
            )
        except Exception as e:
            # SV is optional: fall back to unauthenticated operation.
            self.get_logger().warning(f"[Speaker] 声纹识别初始化失败: {e},声纹功能将不可用")
            self.sv_client = None
            self.sv_enabled = False
    else:
        self.sv_client = None
def _start_threads(self):
self.vad_thread = threading.Thread(
target=self._vad_event_worker,
name="VADEventThread",
daemon=True
)
self.vad_thread.start()
self.process_thread = threading.Thread(
target=self._process_worker,
name="ProcessThread",
daemon=True
)
self.process_thread.start()
self.tts_thread = threading.Thread(
target=self._tts_worker,
name="TTSThread",
daemon=True
)
self.tts_thread.start()
if self.sv_enabled and self.sv_client:
self.sv_thread = threading.Thread(
target=self._sv_worker,
name="SVThread",
daemon=True
)
self.sv_thread.start()
else:
self.sv_thread = None
def _change_state(self, new_state: ConversationState, reason: str):
with self.state_lock:
old_state = self.conversation_state
self.conversation_state = new_state
self.get_logger().info(f"[Speaker-State] {old_state.value} -> {new_state.value}: {reason}")
def _on_speech_started(self):
    """VAD speech-start handler: start SV capture and move to CHECK_VOICE.

    NOTE(review): indentation reconstructed — the lock is assumed to wrap only
    the state read, since _change_state re-acquires state_lock and would
    deadlock if called while it is held; confirm against the original file.
    """
    self.get_logger().info("[Speaker-VAD] 检测到人声开始")
    with self.state_lock:
        state = self.conversation_state
    if state == ConversationState.AUTHORIZED:
        # A new command while authorized triggers re-verification.
        if self.sv_enabled and self.sv_client:
            self._start_sv_recording()
            self._change_state(ConversationState.CHECK_VOICE, "新指令,重新验证声纹")
    if state == ConversationState.IDLE:
        if self.sv_enabled and self.sv_client:
            self._start_sv_recording()
            self._change_state(ConversationState.CHECK_VOICE, "检测到语音,开始检查声纹")
        else:
            # No voiceprint check configured: grant access immediately.
            self._change_state(ConversationState.AUTHORIZED, "未启用声纹,直接授权")
    elif state == ConversationState.CHECK_VOICE:
        self._start_sv_recording()
def _on_speech_stopped(self):
    """VAD speech-stop handler: flush the SV recording and request ASR.

    NOTE(review): indentation reconstructed — _call_asr_service is assumed to
    run unconditionally (otherwise ASR would never fire with SV disabled);
    confirm against the original file.
    """
    import threading
    self.get_logger().debug(f"[Speaker-VAD] speech_stopped 被调用 | 线程:{threading.current_thread().name} | 当前状态:{self.conversation_state.value}")
    with self.state_lock:
        state = self.conversation_state
    self.get_logger().debug(f"[Speaker-VAD] 准备停止声纹录音 | sv_enabled:{self.sv_enabled} | state:{state}")
    if self.sv_enabled and state in [ConversationState.CHECK_VOICE, ConversationState.AUTHORIZED]:
        # Reset the "audio ready" latch before requesting the capture flush.
        self.sv_speech_end_event.clear()
        self._stop_sv_recording()
    self._call_asr_service()
def _start_sv_recording(self):
if not self.sv_enabled:
return
request = AudioData.Request()
request.command = "start"
request.duration_ms = 0
self.audio_data_client.call_async(request)
def _stop_sv_recording(self):
    """Stop SV audio capture; _on_sv_audio_ready receives the buffered samples asynchronously."""
    import threading
    log = self.get_logger()
    log.debug(f"[Speaker-SV] _stop_sv_recording 开始 | 线程:{threading.current_thread().name} | 时间:{time.time()}")
    req = AudioData.Request()
    req.command = "stop"
    pending = self.audio_data_client.call_async(req)
    pending.add_done_callback(self._on_sv_audio_ready)
    log.debug(f"[Speaker-SV] _stop_sv_recording 已发送异步请求 | future_id:{id(pending)}")
def _on_sv_audio_ready(self, future):
    """Receive captured SV audio: replace the ring buffer and signal speech-end.

    NOTE(review): indentation reconstructed — the event set is placed outside
    the success branch (so the SV worker never hangs on a failed capture);
    confirm this nesting against the original file.
    """
    import threading
    self.get_logger().debug(f"[Speaker-SV] _on_sv_audio_ready 回调触发 | 线程:{threading.current_thread().name} | future_id:{id(future)} | 时间:{time.time()}")
    try:
        response = future.result()
        self.get_logger().debug(f"[Speaker-SV] 收到响应 | success:{response.success} | samples:{response.samples}")
        if response.success and response.samples > 0:
            audio_array = np.frombuffer(response.audio_data, dtype=np.int16)
            with self.sv_lock:
                self.get_logger().debug(f"[Speaker-SV] 准备写入buffer | 旧大小:{len(self.sv_audio_buffer)} | 新数据:{len(audio_array)}")
                # Replace (not append) so the buffer only ever holds the latest utterance.
                self.sv_audio_buffer.clear()
                self.sv_audio_buffer.extend(audio_array)
                self.get_logger().debug(f"[Speaker-SV] buffer已更新 | 新大小:{len(self.sv_audio_buffer)}")
        self.get_logger().debug(f"[Speaker-SV] 准备设置 sv_speech_end_event")
        self.sv_speech_end_event.set()
    except Exception as e:
        self.get_logger().error(f"[Speaker-SV] _on_sv_audio_ready 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _call_asr_service(self):
    """Request the final ASR transcript asynchronously; the result lands in _asr_service_callback."""
    self.get_logger().info("[Speaker] 调用ASR服务获取识别结果")
    req = ASRRecognize.Request()
    req.command = "start"
    self.asr_client.call_async(req).add_done_callback(self._asr_service_callback)
def _asr_service_callback(self, future):
import threading
self.get_logger().debug(f"[Speaker-ASR] ASR回调触发 | 线程:{threading.current_thread().name} | 时间:{time.time()}")
try:
response = future.result()
self.get_logger().debug(f"[Speaker-ASR] 收到响应 | success:{response.success} | text:{response.text if response.success else 'N/A'}")
if response.success and response.text:
self.text_queue.put(response.text)
self.get_logger().debug(f"[Speaker-ASR] 文本已放入队列 | queue_size:{self.text_queue.qsize()}")
else:
self.get_logger().warn(f"[Speaker-ASR] 识别失败或为空: success={response.success}, message={response.message}")
except Exception as e:
self.get_logger().error(f"[Speaker-ASR] 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _vad_event_worker(self):
import threading
self.get_logger().info(f"[Speaker-VAD] 启动 | 线程ID:{threading.current_thread().ident}")
while not self.stop_event.is_set():
request = VADEvent.Request()
request.command = "wait"
request.timeout_ms = 500
future = self.vad_client.call_async(request)
future.add_done_callback(self._on_vad_event_response)
time.sleep(0.05)
def _on_vad_event_response(self, future):
import threading
self.get_logger().debug(f"[Speaker-VAD] 回调触发 | 线程:{threading.current_thread().name}")
try:
response = future.result()
if not response.success or response.event == "none":
return
self.get_logger().debug(f"[Speaker-VAD] 收到事件 | event:{response.event} | 线程:{threading.current_thread().name} | 时间:{time.time()}")
if response.event == "speech_started":
self._on_speech_started()
elif response.event == "speech_stopped":
self._on_speech_stopped()
except Exception as e:
self.get_logger().error(f"[Speaker-VAD] 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _process_worker(self):
    """Main dialogue loop: ASR text -> state transition -> wake-word handling ->
    "shut up" command check -> intent routing -> request handling.

    Runs on its own thread until stop_event is set.
    """
    self.get_logger().info("[Speaker] 主线程启动")
    while not self.stop_event.is_set():
        try:
            # Short timeout so stop_event is re-checked regularly.
            text = self.text_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        # A fresh utterance cancels any pending interrupt from a previous turn.
        self.interrupt_event.clear()
        with self.state_lock:
            current_state = self.conversation_state
        previous_state = current_state
        current_state = self._handle_state_transition(current_state, text)
        if current_state is None:
            # Transition dropped the utterance (failed SV, no wake word, IDLE gate).
            continue
        if current_state == ConversationState.AUTHORIZED and previous_state == ConversationState.CHECK_VOICE:
            # Just became authorized: cut off any TTS still playing.
            self._interrupt_tts()
        processed_text = self._handle_wake_word(text, current_state)
        if not processed_text:
            continue
        if self._check_shutup_command(processed_text):
            self._handle_shutup_command()
            continue
        intent_result = self.intent_router.route(processed_text)
        self.get_logger().info(f"[Speaker-Intent] intent={intent_result.intent}, need_camera={intent_result.need_camera}, camera_mode={intent_result.camera_mode}")
        if intent_result.intent == "kb_qa":
            self.interrupt_event.clear()
            if self._handle_kb_qa(processed_text):
                continue
            # Knowledge-base miss: fall back to a canned apology.
            self._put_tts_text("抱歉,我没有找到相关信息")
            continue
        self.interrupt_event.clear()
        self._handle_llm_request(intent_result, processed_text)
def _handle_state_transition(self, current_state: ConversationState, text: str) -> ConversationState | None:
    """Drive the conversation state machine for one recognized utterance.

    Returns the conversation state as updated by _change_state, or None when
    the utterance must be dropped (failed speaker verification, missing wake
    word, or IDLE while speaker verification is enabled).
    """
    if current_state == ConversationState.CHECK_VOICE:
        if self.sv_enabled and self.sv_client:
            # Block until the SV worker publishes a verdict for this utterance.
            if not self._handle_speaker_verification():
                return None
        else:
            self._change_state(ConversationState.AUTHORIZED, "未启用声纹")
        if self.use_wake_word:
            # Existence check only; the caller strips the wake word afterwards.
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    elif current_state == ConversationState.AUTHORIZED:
        if self.sv_enabled and self.sv_client:
            # Re-verify the speaker on every utterance while authorized.
            if not self._handle_speaker_verification():
                return None
        if self.use_wake_word:
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    elif current_state == ConversationState.IDLE:
        if self.sv_enabled and self.sv_client:
            # With SV enabled, IDLE text is ignored until verification re-authorizes.
            return None
        else:
            self._change_state(ConversationState.AUTHORIZED, "收到文本但状态为IDLE未启用声纹直接授权")
        if self.use_wake_word:
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    # Re-read under the lock: _change_state calls above may have updated it.
    with self.state_lock:
        return self.conversation_state
def _handle_speaker_verification(self) -> bool:
    """Wait for the SV worker's verdict on the latest utterance.

    Synchronizes with _sv_worker via two events: sv_speech_end_event
    (recording finished) and sv_result_ready_event (verdict available).
    Returns True when the speaker is VERIFIED; otherwise transitions the
    conversation back to IDLE and returns False.
    """
    import threading
    self.get_logger().debug(f"[Speaker-SV] 开始声纹验证 | 线程:{threading.current_thread().name} | result_ready:{self.sv_result_ready_event.is_set()}")
    if self.sv_result_ready_event.is_set():
        # Verdict is already available; skip waiting for the recording to end.
        self.get_logger().debug(f"[Speaker-SV] 结果已ready跳过等待")
        pass
    elif not self.sv_speech_end_event.wait(timeout=2.0):
        self.get_logger().warn(f"[Speaker-SV] speech_end_event 等待超时")
        self._change_state(ConversationState.IDLE, "没有录音数据,无法验证")
        return False
    self.get_logger().debug(f"[Speaker-SV] speech_end_event 已触发等待result_ready_event")
    if not self.sv_result_ready_event.wait(timeout=3.0):
        self.get_logger().warn(f"[Speaker-SV] result_ready_event 等待超时")
        with self.sv_lock:
            # Drop stale audio so it cannot pollute the next verification.
            self.sv_audio_buffer.clear()
        self._change_state(ConversationState.IDLE, "声纹结果未ready")
        return False
    self.get_logger().debug(f"[Speaker-SV] result_ready_event 已触发,读取结果")
    # Consume the event so the next utterance waits for a fresh verdict.
    self.sv_result_ready_event.clear()
    with self.sv_lock:
        speaker_id = self.current_speaker_id
        speaker_state = self.current_speaker_state
        score = self.current_speaker_score
    self.get_logger().debug(f"[Speaker-SV] 验证结果 | speaker_id:{speaker_id} | state:{speaker_state.value} | score:{score:.4f}")
    if not (speaker_id and speaker_state == SpeakerState.VERIFIED):
        if self.sv_client.get_speaker_count() == 0:
            self._change_state(ConversationState.IDLE, "声纹数据库为空")
        else:
            self._change_state(ConversationState.IDLE, f"声纹验证失败, 得分: {score:.4f}")
        return False
    self._change_state(ConversationState.AUTHORIZED, f"声纹验证成功: {speaker_id}, 得分: {score:.4f}")
    return True
def _handle_shutup_command(self):
    """Honor a "shut up" command: stop TTS and cancel the pending history turn,
    but only when authorized or when speaker verification is disabled."""
    with self.state_lock:
        state_snapshot = self.conversation_state
    allowed = (
        state_snapshot == ConversationState.AUTHORIZED
        or not self.sv_enabled
        or not self.sv_client
    )
    if not allowed:
        return
    self._interrupt_tts()
    if self.history:
        self.history.cancel_turn()
def _handle_kb_qa(self, text: str) -> bool:
    """Look up the knowledge base; speak the answer and return True on a hit,
    return False on a miss (caller emits the fallback reply)."""
    answer = self.intent_router.search_kb(text)
    if not answer:
        return False
    self._put_tts_text(answer)
    return True
def _handle_llm_request(self, intent_result, processed_text: str):
    """Send the routed request to the LLM and dispatch the reply.

    For "skill_sequence" intents the reply is published as a skill plan and
    kept out of chat history; for other intents the reply is committed to
    history and spoken via TTS (inside _llm_process_stream_with_camera).
    Without an LLM client the recognized text is simply echoed to TTS.
    """
    # Local import, consistent with the sibling callbacks in this class;
    # guards against a NameError if no module-level `import threading` exists.
    import threading
    is_skill_sequence = intent_result.intent == "skill_sequence"
    if self.history and not is_skill_sequence:
        self.history.start_turn(intent_result.text)
    if not self.llm_client:
        # No LLM configured: just echo the recognized text.
        self._put_tts_text(processed_text)
        return
    if is_skill_sequence:
        self.get_logger().info(f"[Speaker-Skill] 任务: {processed_text}")
        with self.execution_status_lock:
            last_status = self.last_execution_status
        self.get_logger().debug(f"[Speaker-Skill] 读取执行状态 | 线程:{threading.current_thread().name} | 时间:{time.time()} | 状态:{last_status}")
        # Fold the last skill-execution outcome into the system prompt.
        system_prompt_with_status = self.intent_router.build_skill_prompt(execution_status=last_status)
    else:
        system_prompt_with_status = intent_result.system_prompt
    self.get_logger().debug(f"[Speaker-LLM] intent={intent_result.intent} | system_prompt前100字符: {system_prompt_with_status[:100] if system_prompt_with_status else 'None'}")
    reply = self._llm_process_stream_with_camera(
        intent_result.text,
        intent_result.need_camera,
        intent_result.camera_mode,
        system_prompt_with_status,
        intent_result.intent
    )
    if not reply or not reply.strip():
        # Empty/interrupted reply: roll back the pending history turn.
        if self.history and not is_skill_sequence:
            self.history.cancel_turn()
        return
    if self.history and not is_skill_sequence:
        self.history.commit_turn(reply)
    if is_skill_sequence and reply.strip():
        msg = String()
        msg.data = reply.strip()
        self.skill_sequence_pub.publish(msg)
        self.get_logger().info(f"[Speaker-Skill] 开始新任务: {processed_text}")
def _check_shutup_command(self, text: str) -> bool:
    """Return True when the text's pinyin contains any configured "shut up" keyword."""
    text_pinyin = self.intent_router.to_pinyin(text).lower().strip()
    matched = next(
        (kw for kw in self.shutup_keywords if kw.lower().strip() in text_pinyin),
        None,
    )
    if matched is None:
        return False
    self.get_logger().info(f"[Speaker-Intent] 闭嘴指令匹配到关键词: {matched} (文本拼音: {text_pinyin})")
    return True
def _interrupt_tts(self):
    """Flag an interrupt, drain all pending TTS texts, and ask the TTS node to stop."""
    self.interrupt_event.set()
    # Drain without blocking; get_nowait handles concurrent consumers safely.
    while True:
        try:
            self.tts_queue.get_nowait()
        except queue.Empty:
            break
    req = TTSSynthesize.Request()
    req.command = "interrupt"
    req.text = ""
    req.voice = ""
    fut = self.tts_client.call_async(req)
    fut.add_done_callback(lambda f: self.get_logger().info("[Speaker-TTS] interrupt sent"))
def _on_skill_result_received(self, msg: String):
    """Parse a skill-execution result (JSON payload) and cache it as a status
    string for the next skill-sequence system prompt."""
    import json
    try:
        data = json.loads(msg.data)
        success = data.get("success", False)
        message = data.get("message", "")
        total_skills = data.get("total_skills", 0)
        succeeded_skills = data.get("succeeded_skills", 0)
        parts = [f"执行结果: {'成功' if success else '失败'}"]
        if message:
            parts.append(f", 详情: {message}")
        if total_skills > 0:
            parts.append(f", 总技能数: {total_skills}, 成功: {succeeded_skills}, 失败: {total_skills - succeeded_skills}")
        status_text = "".join(parts)
        with self.execution_status_lock:
            self.last_execution_status = status_text
        self.get_logger().info(f"[Speaker-Skill] 执行状态已更新: {status_text}")
    except Exception as e:
        self.get_logger().warning(f"[Speaker-Skill] 解析执行结果失败: {e}")
def _capture_image_from_img_dev(self, camera_mode: Optional[str] = None) -> Optional[np.ndarray]:
    """Fetch a cached camera frame as an RGB ndarray, waiting up to 1 second.

    When camera_mode is truthy only that camera position is accepted; when it
    is None the first cached frame is used. Returns None on timeout.
    """
    deadline = time.time() + 1.0
    while time.time() < deadline:
        with self.img_msg_lock:
            if camera_mode and camera_mode in self.img_msg_cache:
                cached = self.img_msg_cache[camera_mode]
            elif camera_mode is None and self.img_msg_cache:
                cached = next(iter(self.img_msg_cache.values()))
            else:
                cached = None
            if cached is not None:
                frame = self.cv_bridge.imgmsg_to_cv2(cached.image_color, desired_encoding='rgb8')
                if camera_mode:
                    self.get_logger().info(f"[Speaker-Camera] 使用{camera_mode}相机获取图像成功 (position={cached.position})")
                else:
                    self.get_logger().info(f"[Speaker-Camera] 未指定相机位置,使用{cached.position}相机获取图像成功")
                return frame
        time.sleep(0.1)
    with self.img_msg_lock:
        available_positions = list(self.img_msg_cache.keys()) if self.img_msg_cache else []
    self.get_logger().warning(f"[Speaker-Camera] 等待图像超时 (期望位置={camera_mode}, 可用位置={available_positions})")
    return None
def _encode_image_to_base64(self, image_data: np.ndarray, quality: int = 85) -> str:
    """JPEG-encode an image array and return it base64-encoded; '' on failure."""
    try:
        if image_data.shape[2] == 3:
            pil_image = Image.fromarray(image_data, 'RGB')
        else:
            pil_image = Image.fromarray(image_data)
        buffer = io.BytesIO()
        pil_image.save(buffer, format='JPEG', quality=quality)
        encoded = base64.b64encode(buffer.getvalue())
        return encoded.decode('utf-8')
    except Exception as e:
        self.get_logger().error(f"[Speaker-Camera] 图像编码失败: {e}")
        return ""
def _llm_process_stream_with_camera(self, user_text: str, need_camera: bool, camera_mode: Optional[str] = None, system_prompt: Optional[str] = None, intent: str = "chat_text") -> str:
    """Stream a chat completion for the utterance, optionally attaching a camera frame.

    For "skill_sequence" the message list is built fresh (no chat history);
    otherwise it starts from the chat history. Returns the full reply text,
    or "" on failure/interruption. Non-skill replies are also queued for TTS.
    """
    if not self.llm_client:
        return ""
    if intent == "skill_sequence":
        # Skill planning is stateless: no chat history is attached.
        messages = []
    else:
        if not self.history:
            return ""
        messages = [{"role": msg.role, "content": msg.content} for msg in self.history.get_messages()]
    has_system_msg = any(msg.get("role") == "system" for msg in messages)
    if not has_system_msg:
        if system_prompt is None:
            system_prompt = self.intent_router.build_default_system_prompt()
        messages.insert(0, {"role": "system", "content": system_prompt})
    image_base64_list = []
    if need_camera:
        image_data = self._capture_image_from_img_dev(camera_mode)
        if image_data is not None:
            image_base64 = self._encode_image_to_base64(image_data, quality=self.camera_image_jpeg_quality)
            if image_base64:
                image_base64_list.append(image_base64)
        if not image_base64_list and not self.continue_without_image:
            # Configured to require an image: abort rather than answer blind.
            self.get_logger().warning(f"[Speaker-LLM] 需要相机但未获取到图片,且配置为不继续推理,放弃请求")
            return ""
    if image_base64_list:
        content_list = [{"type": "text", "text": user_text}]
        for img_b64 in image_base64_list:
            content_list.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
            })
        if intent == "skill_sequence":
            messages.append({"role": "user", "content": content_list})
        else:
            # Replace the last history message's content with the multimodal payload.
            # NOTE(review): assumes history.get_messages() ends with the current
            # user turn (start_turn was called by the caller) — confirm.
            messages[-1]["content"] = content_list
    else:
        if intent == "skill_sequence":
            messages.append({"role": "user", "content": user_text})
    full_reply = ""
    interrupted = False
    try:
        # Stream so an interrupt can abort generation mid-way.
        stream = self.llm_client.chat.completions.create(
            model=self.llm_model,
            messages=messages,
            temperature=self.llm_temperature,
            max_tokens=self.llm_max_tokens,
            stream=True
        )
        for chunk in stream:
            if self.interrupt_event.is_set():
                interrupted = True
                break
            if chunk.choices and chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_reply += content
    except Exception as e:
        self.get_logger().error(f"[Speaker-LLM] 调用失败: {e}")
        return ""
    if interrupted:
        self.get_logger().info("[Speaker-LLM] 流式处理被中断")
        return ""
    reply = full_reply.strip() if full_reply else ""
    self.get_logger().info(f"[Speaker-LLM] 生成回复: {reply}")
    if reply and intent != "skill_sequence" and not self.interrupt_event.is_set():
        self._put_tts_text(reply)
    return reply
def _tts_worker(self):
    """Consume queued texts and forward them to the TTS service until stopped.
    Texts arriving while an interrupt is pending are dropped."""
    self.get_logger().info("[Speaker-TTS] TTS播放线程启动")
    while not self.stop_event.is_set():
        try:
            item = self.tts_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        if self.interrupt_event.is_set():
            continue
        speech_text = str(item).strip()
        if not speech_text:
            continue
        req = TTSSynthesize.Request()
        req.command = "synthesize"
        req.text = speech_text
        req.voice = ""
        self.tts_client.call_async(req).add_done_callback(self._on_tts_done)
def _on_tts_done(self, future):
    """Log the outcome of a TTS synthesize call; never raises."""
    try:
        result = future.result()
        if not result.success:
            self.get_logger().warn(f"[Speaker-TTS] 播放失败: {result.message}")
    except Exception as e:
        self.get_logger().error(f"[Speaker-TTS] error: {e}")
def _sv_worker(self):
    """Speaker-verification worker: on each end-of-speech event, extract an
    embedding from the buffered audio and match it against the speaker DB.

    Publishes the verdict via the current_speaker_* fields (under sv_lock)
    and signals sv_result_ready_event; _handle_speaker_verification consumes it.
    """
    self.get_logger().info("[Speaker-SV] 启动")
    # Require at least 0.5 s of audio before attempting an embedding.
    min_audio_samples = int(self.sample_rate * 0.5)
    while not self.stop_event.is_set():
        try:
            self.get_logger().debug(f"[Speaker-SV] 等待 sv_speech_end_event...")
            if not self.sv_speech_end_event.wait(timeout=0.1):
                continue
            self.get_logger().debug(f"[Speaker-SV] sv_speech_end_event 触发 | 时间:{time.time()}")
            self.sv_speech_end_event.clear()
            if not (self.sv_enabled and self.sv_client):
                continue
            self.sv_result_ready_event.clear()
            speaker_count = self.sv_client.get_speaker_count()
            if speaker_count == 0:
                # Empty DB: publish UNKNOWN without running the model.
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.UNKNOWN
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                self.get_logger().info("[Speaker-SV] 数据库为空跳过验证直接设置UNKNOWN状态")
                continue
            with self.sv_lock:
                # Snapshot and clear the buffer so the next utterance starts fresh.
                audio_list = list(self.sv_audio_buffer)
                buffer_size = len(audio_list)
                self.get_logger().debug(f"[Speaker-SV] 读取buffer | 大小:{buffer_size} | 时间:{time.time()}")
                self.sv_audio_buffer.clear()
            self.get_logger().info(f"[Speaker-SV] 收到speech_end事件录音长度: {buffer_size} 样本({buffer_size/self.sample_rate:.2f}秒)")
            if buffer_size < min_audio_samples:
                # Too little audio for a reliable embedding: publish UNKNOWN.
                self.get_logger().debug(f"[Speaker-SV] 录音太短: {buffer_size} < {min_audio_samples},跳过处理")
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.UNKNOWN
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                continue
            audio_array = np.array(audio_list, dtype=np.int16)
            embedding, success = self.sv_client.extract_embedding(
                audio_array,
                sample_rate=self.sample_rate
            )
            if not success or embedding is None:
                self.get_logger().debug("[Speaker-SV] 提取embedding失败")
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.ERROR
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                continue
            speaker_id, match_state, score, threshold = self.sv_client.match_speaker(embedding)
            with self.sv_lock:
                self.current_speaker_id = speaker_id
                self.current_speaker_state = match_state
                self.current_speaker_score = score
                self.current_speaker_threshold = threshold
            if match_state == SpeakerState.VERIFIED:
                self.get_logger().info(f"[Speaker-SV] 识别到说话人: {speaker_id}, 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            elif match_state == SpeakerState.REJECTED:
                self.get_logger().info(f"[Speaker-SV] 未匹配到已知说话人(相似度不足), 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            else:
                self.get_logger().info(f"[Speaker-SV] 状态: {match_state.value}, 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            self.sv_result_ready_event.set()
        except Exception as e:
            self.get_logger().error(f"[Speaker-SV] 错误: {e}")
            time.sleep(0.1)
def _put_tts_text(self, text: str):
    """Enqueue text for TTS playback; warn and drop it when the queue stays full
    beyond a short timeout."""
    try:
        self.tts_queue.put(text, timeout=0.2)
    except queue.Full:
        preview = text[:50]
        self.get_logger().warning(f"[Speaker-TTS] 队列已满,无法发送文本: {preview}")
def _handle_wake_word(self, text: str, current_state: ConversationState = None) -> str:
    """Locate the wake word (compared in pinyin) within the text.

    Behavior by state: in CHECK_VOICE the original text is returned untouched
    (existence check only); in other states the wake-word characters are
    removed. Returns "" when the wake word is absent or configured empty.
    When use_wake_word is off, the stripped text passes through unchanged.
    """
    if not self.use_wake_word:
        return text.strip()
    text_pinyin = self.intent_router.to_pinyin(text).lower().strip()
    wake_word_pinyin = self.wake_word.lower().strip()
    if not wake_word_pinyin:
        return ""
    # Match the wake word as a contiguous run of pinyin syllables.
    text_pinyin_parts = text_pinyin.split()
    wake_word_parts = wake_word_pinyin.split()
    start_idx = -1
    for i in range(len(text_pinyin_parts) - len(wake_word_parts) + 1):
        if text_pinyin_parts[i:i+len(wake_word_parts)] == wake_word_parts:
            start_idx = i
            break
    if start_idx == -1:
        return ""
    if current_state == ConversationState.CHECK_VOICE:
        return text
    # Remove the matched span from the original text by counting CJK characters.
    # Assumes one pinyin syllable per Chinese character and that non-CJK chars
    # produce no pinyin syllables — TODO confirm against to_pinyin's behavior.
    hanzi_count = 0
    new_text = ""
    for c in text:
        if '\u4e00' <= c <= '\u9fa5':
            if hanzi_count < start_idx or hanzi_count >= start_idx + len(wake_word_parts):
                new_text += c
            hanzi_count += 1
        else:
            new_text += c
    return new_text.strip()
def destroy_node(self):
    """Shut down worker threads, stop TTS, persist the speaker DB, then destroy the node."""
    self.get_logger().info("[Speaker] 语音节点正在关闭...")
    self.stop_event.set()
    self.interrupt_event.set()
    self.get_logger().info("[Speaker] 强制停止TTS播放...")
    self._interrupt_tts()
    workers = [self.vad_thread, self.process_thread, self.tts_thread]
    if self.sv_thread:
        workers.append(self.sv_thread)
    for worker in workers:
        if worker and worker.is_alive():
            worker.join(timeout=1.0)
    # Second interrupt: a worker may have queued more TTS while joining.
    self._interrupt_tts()
    sv_client = getattr(self, 'sv_client', None)
    if sv_client:
        try:
            sv_client.save_speakers()
            sv_client.cleanup()
        except Exception as e:
            self.get_logger().warning(f"[Speaker] 清理声纹识别资源时出错: {e}")
    super().destroy_node()
def main(args=None):
    """Entry point: spin the speaker node on a multi-threaded executor so
    async service callbacks cannot deadlock the main loop."""
    from rclpy.executors import MultiThreadedExecutor
    rclpy.init(args=args)
    node = RobotSpeakerNode()
    executor = MultiThreadedExecutor(num_threads=4)
    executor.add_node(node)
    try:
        executor.spin()
    except KeyboardInterrupt:
        node.get_logger().info("[Speaker] 收到中断信号,正在关闭节点")
    finally:
        node.destroy_node()
        rclpy.shutdown()
# Standard script entry point.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,185 @@
"""技能接口文件解析器"""
import json
import os
from typing import Any, Optional

import yaml

from ament_index_python.packages import get_package_share_directory
class SkillInterfaceParser:
    """Parses ROS interface (.action/.srv) files referenced by robot_skills.yaml
    and renders a parameter-documentation string for LLM skill prompts."""

    def __init__(self, interfaces_root: str):
        """interfaces_root: directory containing 'action/' and 'srv/' subdirectories."""
        self.interfaces_root = interfaces_root
        # Lazily-populated caches; the config and interface files are read at most once.
        self._cached_skill_config: list[dict] | None = None
        self._cached_skill_interfaces: dict[str, dict] | None = None

    def get_skill_names(self) -> list[str]:
        """Return all skill names declared in robot_skills.yaml (single source of truth)."""
        skill_config = self._load_skill_config()
        return [entry["name"] for entry in skill_config if isinstance(entry, dict) and entry.get("name")]

    def _load_skill_config(self) -> list[dict]:
        """Load robot_skills.yaml from the 'brain' package share directory (cached).

        Returns [] when the file is missing/unreadable or not a YAML list.
        """
        if self._cached_skill_config is not None:
            return self._cached_skill_config
        try:
            brain_share = get_package_share_directory("brain")
            skill_path = os.path.join(brain_share, "config", "robot_skills.yaml")
            with open(skill_path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or []
            self._cached_skill_config = data if isinstance(data, list) else []
            return self._cached_skill_config
        except Exception:
            # Best-effort: a missing/broken config simply yields no skills.
            self._cached_skill_config = []
            return []

    def parse_skill_interfaces(self) -> dict[str, dict]:
        """Map each skill name to its interface type and goal fields (cached).

        For each skill, the first listed interface whose file exists wins.
        """
        if self._cached_skill_interfaces is not None:
            return self._cached_skill_interfaces
        result = {}
        for skill_entry in self._load_skill_config():
            skill_name = skill_entry.get("name")
            if not skill_name:
                continue
            for iface in skill_entry.get("interfaces", []):
                iface_name = iface.get("name", "") if isinstance(iface, dict) else str(iface)
                if ".action" in iface_name:
                    iface_type = "action"
                elif ".srv" in iface_name:
                    iface_type = "srv"
                else:
                    continue
                # Interface files live under <root>/action or <root>/srv.
                file_path = os.path.join(self.interfaces_root, iface_type, iface_name)
                if os.path.exists(file_path):
                    result[skill_name] = {
                        "type": iface_type,
                        "goal_fields": self._parse_goal_fields(file_path)
                    }
                    break
        self._cached_skill_interfaces = result
        return result

    def _parse_goal_fields(self, file_path: str) -> list[dict]:
        """Parse the goal section (lines before the first '---') of an interface file.

        Returns a list of {"name", "type", "comment"} dicts; [] on read errors.
        """
        goal_fields = []
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            for line in lines:
                line = line.strip()
                if line.startswith("---"):
                    break  # end of the goal section
                if not line or line.startswith("#"):
                    continue
                parts = line.split()
                if len(parts) >= 2:
                    field_type = parts[0]
                    field_name = parts[1]
                    comment = line.split("#", 1)[1].strip() if "#" in line else ""
                    goal_fields.append({
                        "name": field_name,
                        "type": field_type,
                        "comment": comment
                    })
        except Exception:
            return []
        return goal_fields

    def generate_params_documentation(self) -> str:
        """Render a human-readable parameters document (with JSON examples) for all skills."""
        skill_interfaces = self.parse_skill_interfaces()
        doc_lines = []
        for skill_name, skill_info in skill_interfaces.items():
            doc_lines.append(f"{skill_name}技能的parameters字段")
            goal_fields = skill_info.get("goal_fields", [])
            if not goal_fields:
                doc_lines.append(" - 无参数,使用 null")
            else:
                doc_lines.append(" parameters字典必须包含以下字段")
                for field in goal_fields:
                    field_name = field["name"]
                    field_type = field["type"]
                    comment = field.get("comment", "")
                    if field_name == "body_id":
                        # body_id gets a hand-written description tied to robot layout.
                        doc_lines.append(
                            f" - {field_name} ({field_type}): 身体部位ID0=左臂1=右臂2=头部。"
                            f"根据目标物在图片中的方位选择左侧用0右侧用1中央用2。"
                        )
                    else:
                        type_desc = self._get_type_description(field_type)
                        doc_lines.append(f" - {field_name} ({field_type}): {type_desc} {comment}")
                example_params = {
                    field["name"]: self._get_example_value(field["name"], field["type"])
                    for field in goal_fields
                }
                doc_lines.append(f" 示例:{json.dumps(example_params, ensure_ascii=False)}")
            doc_lines.append("")
        return "\n".join(doc_lines)

    def _get_type_description(self, field_type: str) -> str:
        """Return a Chinese description for a ROS primitive type; arrays share
        the element type's description; unknown types echo the raw name."""
        type_map = {
            "int8": "整数,范围-128到127",
            "int16": "整数,范围-32768到32767",
            "int32": "整数",
            "int64": "整数",
            "uint8": "无符号整数范围0到255",
            "float32": "浮点数",
            "float64": "浮点数",
            "string": "字符串",
        }
        # Strip the array suffix so e.g. "float64[]" maps like "float64".
        base_type = field_type.replace("[]", "")
        return type_map.get(base_type, field_type)

    def _get_example_value(self, field_name: str, field_type: str) -> Any:
        """Generate a JSON-serializable example value for a goal field.

        Array types are checked BEFORE scalar substring checks — previously
        "int"/"float" matched first, so array fields like "int32[]" produced
        scalar examples and "string[]" produced "".
        """
        if field_name == "body_id":
            return 0
        if field_name == "data_array" and "float64[]" in field_type:
            # Canonical 6-DOF pose example used by arm-motion skills.
            return [0.1, 0.2, 0.3, 0.0, 0.0, 0.0]
        if "[]" in field_type:
            if "int" in field_type:
                return [0, 0, 0]
            if "float" in field_type:
                return [0.0, 0.0, 0.0]
            return []
        if "int" in field_type:
            return 0
        if "float" in field_type:
            return 0.0
        if "string" in field_type:
            return ""
        return None

View File

@@ -0,0 +1,199 @@
"""
声纹识别模块
"""
import numpy as np
import threading
import os
import time
import json
from enum import Enum
class SpeakerState(Enum):
    """Speaker-recognition verdict states."""
    UNKNOWN = "unknown"    # no verdict yet (empty DB, too little audio, not run)
    VERIFIED = "verified"  # best similarity reached the threshold
    REJECTED = "rejected"  # best similarity fell below the threshold
    ERROR = "error"        # embedding extraction or matching raised
class SpeakerVerificationClient:
    """Speaker verification client — non-realtime, low-frequency processing.

    Wraps a FunASR speaker-embedding model plus a small JSON-backed database
    of registered speakers. Database access is guarded by an internal lock so
    it can be shared between the SV worker thread and service callbacks.
    """
    def __init__(self, model_path: str, threshold: float, speaker_db_path: str = None, logger=None):
        """Load the embedding model and, when configured, the speaker database.

        model_path: path to the FunASR model directory ('~' is expanded).
        threshold: cosine-similarity threshold for a VERIFIED verdict.
        speaker_db_path: optional JSON file persisting registered speakers.
        logger: optional ROS-style logger (info/warning/error/debug methods).
        """
        self.model_path = model_path
        self.threshold = threshold
        self.speaker_db_path = speaker_db_path
        self.logger = logger
        self.speaker_db = {}  # {speaker_id: {"embedding": np.ndarray, "env": str, "registered_at": float}}
        self._lock = threading.Lock()
        # CPU optimization: cap Torch at one thread — multi-thread contention
        # here caused severe slowdowns.
        import torch
        torch.set_num_threads(1)
        from funasr import AutoModel
        model_path = os.path.expanduser(self.model_path)
        # disable_update=True prevents a network update-check on every init.
        self.model = AutoModel(model=model_path, device="cpu", disable_update=True)
        if self.logger:
            self.logger.info(f"声纹模型已加载: {model_path}, 阈值: {self.threshold}")
        if self.speaker_db_path:
            self.load_speakers()
    def _log(self, level: str, msg: str):
        """Log via the injected logger, swallowing errors — works around ROS2
        logger issues when called from non-executor threads."""
        if self.logger:
            try:
                if level == "info":
                    self.logger.info(msg)
                elif level == "warning":
                    self.logger.warning(msg)
                elif level == "error":
                    self.logger.error(msg)
                elif level == "debug":
                    self.logger.debug(msg)
            except Exception:
                pass
    def load_speakers(self):
        """Load the speaker DB from its JSON file; a missing file is not an error."""
        if not self.speaker_db_path:
            return
        db_path = os.path.expanduser(self.speaker_db_path)
        if not os.path.exists(db_path):
            self._log("info", f"声纹数据库文件不存在: {db_path},将创建新文件")
            return
        try:
            with open(db_path, 'rb') as f:
                data = json.load(f)
            with self._lock:
                self.speaker_db = {}
                for speaker_id, info in data.items():
                    embedding_array = np.array(info["embedding"], dtype=np.float32)
                    if embedding_array.ndim > 1:
                        # Stored embeddings may be nested lists; normalize to 1-D.
                        embedding_array = embedding_array.flatten()
                    self.speaker_db[speaker_id] = {
                        "embedding": embedding_array,
                        "env": info.get("env", ""),
                        "registered_at": info.get("registered_at", 0.0)
                    }
            self._log("info", f"已加载 {len(self.speaker_db)} 个已注册说话人")
        except Exception as e:
            self._log("error", f"加载声纹数据库失败: {e}")
    def save_speakers(self):
        """Persist the speaker DB to JSON (embeddings serialized as lists)."""
        if not self.speaker_db_path:
            return
        db_path = os.path.expanduser(self.speaker_db_path)
        try:
            os.makedirs(os.path.dirname(db_path), exist_ok=True)
            with self._lock:
                # Snapshot under the lock; write to disk outside it.
                data = {}
                for speaker_id, info in self.speaker_db.items():
                    data[speaker_id] = {
                        "embedding": info["embedding"].tolist(),
                        "env": info.get("env", ""),
                        "registered_at": info.get("registered_at", 0.0)
                    }
            with open(db_path, 'w') as f:
                json.dump(data, f, indent=2)
            self._log("info", f"已保存 {len(data)} 个已注册说话人到: {db_path}")
        except Exception as e:
            self._log("error", f"保存声纹数据库失败: {e}")
    def extract_embedding(self, audio_array: np.ndarray, sample_rate: int = 16000) -> tuple[np.ndarray | None, bool]:
        """Extract a 1-D speaker embedding from int16 PCM audio.

        Returns (embedding, True) on success, (None, False) otherwise.
        NOTE(review): sample_rate is not passed to the model call — the model
        presumably assumes 16 kHz input; confirm before feeding other rates.
        """
        try:
            if len(audio_array) == 0:
                return None, False
            # Ensure int16 before normalizing.
            if audio_array.dtype != np.int16:
                audio_array = audio_array.astype(np.int16)
            # Convert to float32 normalized to [-1, 1].
            audio_float = audio_array.astype(np.float32) / 32768.0
            # Run the model to extract the embedding.
            result = self.model.generate(input=audio_float, cache={})
            if result and len(result) > 0 and "spk_embedding" in result[0]:
                embedding = result[0]["spk_embedding"]
                if embedding is not None and len(embedding) > 0:
                    embedding_array = np.array(embedding, dtype=np.float32)
                    if embedding_array.ndim > 1:
                        embedding_array = embedding_array.flatten()
                    return embedding_array, True
            return None, False
        except Exception as e:
            self._log("error", f"提取声纹特征失败: {e}")
            return None, False
    def match_speaker(self, embedding: np.ndarray) -> tuple[str | None, SpeakerState, float, float]:
        """Match an embedding against the DB by cosine similarity.

        Returns (best_speaker_id, state, score, threshold). State is VERIFIED
        when the best score reaches the threshold, REJECTED otherwise,
        UNKNOWN for empty input/DB, ERROR on exception.
        """
        if embedding is None or len(embedding) == 0:
            return None, SpeakerState.UNKNOWN, 0.0, float(self.threshold)
        with self._lock:
            if len(self.speaker_db) == 0:
                return None, SpeakerState.UNKNOWN, 0.0, float(self.threshold)
        try:
            best_speaker_id = None
            best_score = 0.0
            with self._lock:
                for speaker_id, info in self.speaker_db.items():
                    stored_embedding = info["embedding"]
                    # Cosine similarity against each registered embedding.
                    dot_product = np.dot(embedding, stored_embedding)
                    norm_embedding = np.linalg.norm(embedding)
                    norm_stored = np.linalg.norm(stored_embedding)
                    if norm_embedding > 0 and norm_stored > 0:
                        score = dot_product / (norm_embedding * norm_stored)
                        if score > best_score:
                            best_score = score
                            best_speaker_id = speaker_id
            state = SpeakerState.VERIFIED if best_score >= self.threshold else SpeakerState.REJECTED
            return best_speaker_id, state, float(best_score), float(self.threshold)
        except Exception as e:
            self._log("error", f"匹配说话人失败: {e}")
            return None, SpeakerState.ERROR, 0.0, float(self.threshold)
    def register_speaker(self, speaker_id: str, embedding: np.ndarray, env: str = "") -> bool:
        """Register (or overwrite) a speaker's embedding; returns success."""
        if embedding is None or len(embedding) == 0:
            return False
        try:
            with self._lock:
                self.speaker_db[speaker_id] = {
                    "embedding": np.array(embedding, dtype=np.float32),
                    "env": env,
                    "registered_at": time.time()
                }
            self._log("info", f"已注册说话人: {speaker_id}")
            return True
        except Exception as e:
            self._log("error", f"注册说话人失败: {e}")
            return False
    def get_speaker_count(self) -> int:
        """Number of registered speakers."""
        with self._lock:
            return len(self.speaker_db)
    def get_speaker_list(self) -> list[str]:
        """Registered speaker IDs (snapshot)."""
        with self._lock:
            return list(self.speaker_db.keys())
    def remove_speaker(self, speaker_id: str) -> bool:
        """Delete a speaker from the in-memory DB; returns True when found."""
        with self._lock:
            if speaker_id in self.speaker_db:
                del self.speaker_db[speaker_id]
                self._log("info", f"已删除说话人: {speaker_id}")
                return True
            return False
    def cleanup(self):
        """Persist the DB and release the model."""
        try:
            self.save_speakers()
            if hasattr(self, 'model') and self.model:
                del self.model
        except Exception as e:
            self._log("error", f"清理资源失败: {e}")

View File

@@ -0,0 +1,256 @@
"""
音频处理模块:录音 + VAD
"""
import time
import pyaudio
import webrtcvad
import struct
import queue
class VADDetector:
    """Thin wrapper bundling a webrtcvad.Vad instance with its sample rate."""

    def __init__(self, mode: int, sample_rate: int):
        # mode is the webrtcvad aggressiveness level (0-3).
        self.sample_rate = sample_rate
        self.vad = webrtcvad.Vad(mode)
class AudioRecorder:
"""音频录音器 - 录音线程"""
def __init__(self, device_index: int, sample_rate: int, channels: int,
chunk: int, vad_detector: VADDetector,
audio_queue: queue.Queue, # 音频队列:录音线程 → ASR线程
silence_duration_ms: int = 1000,
min_energy_threshold: int = 300, # 音频能量 > 300有语音
heartbeat_interval: float = 2.0,
on_heartbeat=None,
is_playing=None,
on_new_segment=None, # 检测到新的人声段
on_speech_start=None, # 检测到人声开始
on_speech_end=None, # 检测到静音结束(说话结束)
stop_flag=None,
on_audio_chunk=None, # 音频chunk回调用于声纹录音等可选
should_put_to_queue=None, # 检查是否应该将音频放入队列用于阻止ASR可选
get_silence_threshold=None, # 获取动态静音阈值(毫秒,可选)
logger=None):
self.device_index = device_index
self.sample_rate = sample_rate
self.channels = channels
self.chunk = chunk
self.vad_detector = vad_detector
self.audio_queue = audio_queue
self.silence_duration_ms = int(silence_duration_ms)
self.min_energy_threshold = int(min_energy_threshold)
self.heartbeat_interval = heartbeat_interval
self.on_heartbeat = on_heartbeat
self.is_playing = is_playing or (lambda: False)
self.on_new_segment = on_new_segment
self.on_speech_start = on_speech_start
self.on_speech_end = on_speech_end
self.stop_flag = stop_flag or (lambda: False)
self.on_audio_chunk = on_audio_chunk # 音频chunk回调用于声纹录音等
self.should_put_to_queue = should_put_to_queue or (lambda: True) # 默认允许放入队列
self.get_silence_threshold = get_silence_threshold # 动态静音阈值回调
self.logger = logger
self.audio = pyaudio.PyAudio()
# 自动查找 iFLYTEK 麦克风设备
try:
count = self.audio.get_device_count()
found_index = -1
if self.logger:
self.logger.info(f"开始扫描音频设备 (总数: {count})...")
for i in range(count):
device_info = self.audio.get_device_info_by_index(i)
device_name = device_info.get('name', '')
max_input_channels = device_info.get('maxInputChannels', 0)
if self.logger:
try:
self.logger.info(f"扫描设备 [{i}]: Name='{device_name}', MaxInput={max_input_channels}, Rate={int(device_info.get('defaultSampleRate'))}")
except:
pass
# 检查是否包含 iFLYTEK 且支持录音(输入通道 > 0
if 'iFLYTEK' in device_name and max_input_channels > 0:
found_index = i
if self.logger:
self.logger.info(f"已自动定位到麦克风设备: {device_name} (Index: {i})")
break
if found_index != -1:
self.device_index = found_index
else:
if self.logger:
self.logger.warning(f"未自动检测到 iFLYTEK 设备请检查USB连接或执行 'arecord -l' 确认系统是否识别到录音设备,将继续使用配置的索引: {self.device_index}")
except Exception as e:
if self.logger:
self.logger.error(f"设备自动检测过程出错: {e}")
self.format = pyaudio.paInt16
self._debug_counter = 0
def record_with_vad(self):
"""录音线程VAD + 能量检测"""
if self.on_heartbeat:
self.on_heartbeat()
try:
stream = self.audio.open(
format=self.format,
channels=self.channels,
rate=self.sample_rate,
input=True,
input_device_index=self.device_index if self.device_index >= 0 else None,
frames_per_buffer=self.chunk
)
except Exception as e:
raise RuntimeError(f"无法打开音频输入设备: {e}")
# VAD检测窗口, 最快 0.5s 内发现说话
window_sec = 0.5
# 连续 1s 没有检测到语音,就判定为静音状态
no_speech_threshold = max(self.silence_duration_ms / 1000.0, 0.1)
last_heartbeat_time = time.time()
audio_buffer = [] # VAD 滑动窗口
last_active_time = time.time() # 静音计时基准
in_speech_segment = False # 是否处于语音段中(从检测到人声开始,直到静音超时结束)
try:
while not self.stop_flag():
# exception_on_overflow=False, 宁可丢帧,也不阻塞
data = stream.read(self.chunk, exception_on_overflow=False)
processed_data = data
# 检查是否应该将音频放入队列用于阻止ASR例如无声纹文件时需要注册
if self.should_put_to_queue():
# 队列满时丢弃最旧的数据ASR 跟不上时系统仍然听得见
if self.audio_queue.full():
self.audio_queue.get_nowait()
# 使用处理后的音频数据(经过回声消除)
self.audio_queue.put_nowait(processed_data)
# 音频chunk回调用于声纹录音等仅在需要时调用
if self.on_audio_chunk:
# 回调使用处理后的音频数据
self.on_audio_chunk(processed_data)
# VAD检测使用处理后的音频经过回声消除
audio_buffer.append(processed_data) # 只用于 VAD不用于 ASR
# VAD检测窗口
now = time.time()
if len(audio_buffer) * self.chunk / self.sample_rate >= window_sec:
raw_audio = b''.join(audio_buffer)
energy = self._calculate_energy(raw_audio)
vad_result = self._check_activity(raw_audio)
self._debug_counter += 1
if self._debug_counter >= 10:
if self.logger:
self.logger.info(f"[VAD调试] 能量={energy:.1f}, 阈值={self.min_energy_threshold}, VAD结果={vad_result}")
self._debug_counter = 0
if vad_result:
last_active_time = now
if not in_speech_segment: # 上一轮没说话,本轮开始说话
in_speech_segment = True
if self.on_speech_start:
self.on_speech_start()
# 检测当前 TTS 是否在播放
if self.is_playing() and self.on_new_segment:
self.on_new_segment() # 打断 TTS的回调
else:
if in_speech_segment:
# 处于语音段中,但当前帧为静音,检查静音时长
silence_duration = now - last_active_time
# 动态获取静音阈值(如果提供回调函数)
if self.get_silence_threshold:
current_silence_ms = self.get_silence_threshold()
current_no_speech_threshold = max(current_silence_ms / 1000.0, 0.1)
else:
current_no_speech_threshold = no_speech_threshold
# 添加调试日志
if self.logger and silence_duration < current_no_speech_threshold:
self.logger.debug(f"[VAD] 静音中: {silence_duration:.3f}秒 < {current_no_speech_threshold:.3f}秒阈值")
if silence_duration >= current_no_speech_threshold:
if self.on_speech_end:
if self.logger:
self.logger.debug(f"[VAD] 触发speech_end: 静音持续时间 {silence_duration:.3f}秒 >= 阈值 {current_no_speech_threshold:.3f}")
self.on_speech_end() # 通知系统用户停止说话
in_speech_segment = False
if self.on_heartbeat and now - last_heartbeat_time >= self.heartbeat_interval:
self.on_heartbeat()
last_heartbeat_time = now
audio_buffer = []
finally:
if stream.is_active():
stream.stop_stream()
stream.close()
@staticmethod
def _calculate_energy(audio_chunk: bytes) -> float:
"""计算音频能量RMS"""
if not audio_chunk:
return 0.0
# 计算样本数:音频字节数 // 2因为是16位PCM1个样本=2字节
n = len(audio_chunk) // 2
if n <= 0:
return 0.0
# 把字节数据解包为16位有符号整数小端序
samples = struct.unpack(f'<{n}h', audio_chunk[: n * 2])
if not samples:
return 0.0
return (sum(s * s for s in samples) / len(samples)) ** 0.5
def _check_activity(self, audio_data: bytes) -> bool:
    """Decide whether `audio_data` contains speech.

    Combines frame-wise VAD voting with an RMS-energy floor: a minimum
    fraction of 20 ms frames must be flagged as speech, and very
    low-energy "speech" is rejected as a likely false positive.
    """
    energy = self._calculate_energy(audio_data)
    rate = 0.4  # empirical fraction of voiced frames for continuous speech
    num = 0
    # At 16000 Hz a 20 ms frame holds 16000 * 0.02 = 320 samples,
    # i.e. 320 * 2 = 640 bytes of 16-bit PCM.
    bytes_per_sample = 2  # paInt16
    frame_samples = int(self.sample_rate * 0.02)
    frame_bytes = frame_samples * bytes_per_sample
    # Too little data for even one frame: report no activity.
    if frame_bytes <= 0 or len(audio_data) < frame_bytes:
        return False
    total_frames = len(audio_data) // frame_bytes
    required = max(1, int(total_frames * rate))
    for i in range(0, len(audio_data), frame_bytes):
        chunk = audio_data[i:i + frame_bytes]
        if len(chunk) == frame_bytes:  # skip a short trailing remainder
            if self.vad_detector.vad.is_speech(chunk, sample_rate=self.sample_rate):
                num += 1
    # Speech onsets carry high energy while trailing sounds decay, so an
    # energy gate at half the threshold filters VAD false positives.
    vad_result = num >= required
    if vad_result and energy < self.min_energy_threshold * 0.5:
        return False
    return vad_result
def cleanup(self):
    """Release the PyAudio handle, if one was ever created."""
    audio_handle = getattr(self, 'audio', None)
    if audio_handle:
        audio_handle.terminate()

View File

@@ -1,55 +0,0 @@
import threading
import time
from queue import Empty, Queue

import espeakng
import pyttsx3
import rclpy
from example_interfaces.msg import String
from rclpy.node import Node
class RobotSpeakerNode(Node):
    """ROS2 node that queues incoming text messages and speaks them via pyttsx3."""

    def __init__(self, node_name):
        super().__init__(node_name)
        # FIFO of pending utterances fed by the subscription callback.
        self.novels_queue_ = Queue()
        self.novel_subscriber_ = self.create_subscription(
            String, 'robot_msg', self.novel_callback, 10)
        # daemon=True so a blocked speech thread cannot keep the process
        # alive after rclpy shuts down (the original thread was non-daemon).
        self.speech_thread_ = threading.Thread(target=self.speak_thread, daemon=True)
        self.speech_thread_.start()

    def novel_callback(self, msg):
        """Subscription callback: enqueue received text for speaking."""
        self.novels_queue_.put(msg.data)

    def speak_thread(self):
        """Worker loop: set up the TTS engine, pick a Chinese voice, speak queued text."""
        engine = pyttsx3.init()
        engine.setProperty('rate', 150)    # speaking rate; 150 sounds natural
        engine.setProperty('volume', 1.0)  # volume in [0.0, 1.0]
        # Pick a voice whose supported-language list mentions Chinese
        # ('zh', 'zh-CN', ...). voice.languages entries may be str or bytes
        # depending on the pyttsx3 backend, so normalize before matching —
        # the original `'zh' in lang` raises TypeError on bytes entries.
        voices = engine.getProperty('voices')
        for voice in voices:
            langs = [
                lang.decode('utf-8', 'ignore') if isinstance(lang, bytes) else str(lang)
                for lang in (voice.languages or [])
            ]
            if any('zh' in lang for lang in langs):
                engine.setProperty('voice', voice.id)
                self.get_logger().info(f'已选择中文语音:{voice.id}')
                break
        else:
            self.get_logger().warning('未找到中文语音库,将使用默认语音')
        while rclpy.ok():
            try:
                # Blocking get with a timeout instead of qsize()+sleep(0.5)
                # polling: speaks immediately when text arrives and still
                # re-checks rclpy.ok() twice a second.
                text = self.novels_queue_.get(timeout=0.5)
            except Empty:
                continue
            engine.say(text)
            engine.runAndWait()  # block until playback finishes
def main(args=None):
    """Entry point: spin the speaker node until ROS shuts down."""
    rclpy.init(args=args)
    speaker_node = RobotSpeakerNode("robot_speaker_node")
    rclpy.spin(speaker_node)
    rclpy.shutdown()

View File

@@ -0,0 +1,22 @@
"""
Service节点模块
"""

View File

@@ -0,0 +1,703 @@
import rclpy
from rclpy.node import Node
from interfaces.srv import ASRRecognize, AudioData, VADEvent
import threading
import queue
import time
import pyaudio
import yaml
import os
import collections
import numpy as np
import base64
import dashscope
from dashscope.audio.qwen_omni import OmniRealtimeConversation, OmniRealtimeCallback
from dashscope.audio.qwen_omni.omni_realtime import TranscriptionParams, MultiModality
from ament_index_python.packages import get_package_share_directory
class AudioRecorder:
    """Captures microphone audio with PyAudio and feeds raw chunks into a queue.

    Prefers an iFLYTEK input device when one is present; otherwise keeps the
    configured index, falling back to the system default when the index is -1.
    """

    def __init__(self, device_index: int, sample_rate: int, channels: int,
                 chunk: int, audio_queue: queue.Queue, stop_event, logger=None):
        self.device_index = device_index
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk = chunk          # frames per read
        self.audio_queue = audio_queue
        self.stop_event = stop_event
        self.logger = logger
        self.audio = pyaudio.PyAudio()
        original_index = self.device_index
        try:
            # Scan all devices and prefer an iFLYTEK microphone if one exists.
            for i in range(self.audio.get_device_count()):
                device_info = self.audio.get_device_info_by_index(i)
                if 'iFLYTEK' in device_info['name'] and device_info['maxInputChannels'] > 0:
                    self.device_index = i
                    if self.logger:
                        self.logger.info(f"[ASR-Recorder] 已自动定位到麦克风设备: {device_info['name']} (Index: {i})")
                    break
        except Exception as e:
            if self.logger:
                self.logger.error(f"[ASR-Recorder] 设备自动检测过程出错: {e}")
        # No iFLYTEK device found and no explicit index configured (-1):
        # use device 0 (treated as the system default input here).
        if self.device_index == original_index and original_index == -1:
            self.device_index = 0
            if self.logger:
                self.logger.info("[ASR-Recorder] 未找到 iFLYTEK 设备,使用系统默认输入设备")
        self.format = pyaudio.paInt16

    def record(self):
        """Blocking capture loop; runs until stop_event is set or the device fails.

        When the queue is full the oldest chunk is dropped so capture never
        stalls (ASR may simply fall behind instead).
        """
        if self.logger:
            self.logger.info(f"[ASR-Recorder] 录音线程启动,设备索引: {self.device_index}")
        stream = None
        try:
            stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.sample_rate,
                input=True,
                # -1 means "let PyAudio choose": pass None in that case.
                input_device_index=self.device_index if self.device_index >= 0 else None,
                frames_per_buffer=self.chunk
            )
            if self.logger:
                self.logger.info("[ASR-Recorder] 音频输入设备已打开")
        except Exception as e:
            if self.logger:
                self.logger.error(f"[ASR-Recorder] 无法打开音频输入设备: {e}")
            return
        try:
            while not self.stop_event.is_set():
                try:
                    data = stream.read(self.chunk, exception_on_overflow=False)
                    # Drop the oldest chunk rather than block when full.
                    if self.audio_queue.full():
                        self.audio_queue.get_nowait()
                    self.audio_queue.put_nowait(data)
                except OSError as e:
                    # Device-level failure: exit the loop and clean up.
                    if self.logger:
                        self.logger.debug(f"[ASR-Recorder] 录音设备错误: {e}")
                    break
        except KeyboardInterrupt:
            if self.logger:
                self.logger.info("[ASR-Recorder] 录音线程收到中断信号")
        finally:
            if stream is not None:
                try:
                    if stream.is_active():
                        stream.stop_stream()
                    stream.close()
                except Exception as e:
                    pass  # best-effort teardown; nothing useful to do on failure
            if self.logger:
                self.logger.info("[ASR-Recorder] 录音线程已退出")
class DashScopeASR:
    """Streaming ASR client over DashScope's Omni realtime WebSocket API.

    Owns a single OmniRealtimeConversation and proactively rebuilds it when
    it ages out, idles too long, has served many recognitions, or keeps
    failing to accept audio — working around server-side WebSocket timeouts.
    """

    def __init__(self, api_key: str, sample_rate: int, model: str, url: str, logger=None):
        dashscope.api_key = api_key
        self.sample_rate = sample_rate
        self.model = model
        self.url = url
        self.logger = logger
        self.conversation = None  # active OmniRealtimeConversation, or None
        self.running = False
        # Hooks wired up by the owning node.
        self.on_sentence_end = None
        self.on_speech_started = None
        self.on_speech_stopped = None
        self._stop_lock = threading.Lock()            # serializes stop paths
        self._final_result_event = threading.Event()  # set when final transcript arrives
        self._pending_commit = False                  # commit() awaiting its result
        # ========== Connection lifecycle management: mitigates DashScope ASR
        # WebSocket timeouts that caused unstable recognition ==========
        self._connection_start_time = None  # when current connection was created
        self._last_audio_time = None        # last successful audio send
        self._recognition_count = 0         # recognitions served on this connection
        self._audio_send_count = 0          # audio chunks sent on this connection
        self._last_audio_send_success = True
        self._consecutive_send_failures = 0
        # Reconnect policy knobs.
        self.MAX_CONNECTION_AGE = 300  # max connection lifetime: 5 minutes
        self.MAX_IDLE_TIME = 180       # max idle time: 3 minutes
        self.MAX_RECOGNITIONS = 30     # rebuild after 30 recognitions
        self.MAX_CONSECUTIVE_FAILURES = 3  # max consecutive send failures

    def _log(self, level: str, msg: str):
        """Best-effort logging through the injected ROS logger; never raises."""
        if not self.logger:
            return
        try:
            if level == "debug":
                self.logger.debug(msg)
            elif level == "warning":
                self.logger.warn(msg)
            elif level == "error":
                self.logger.error(msg)
            elif level == "info":
                self.logger.info(msg)
        except Exception:
            pass

    def _should_reconnect(self) -> tuple[bool, str]:
        """Return (True, human-readable reason) when the connection should be rebuilt."""
        if not self.running or not self.conversation:
            return False, ""
        current_time = time.time()
        # Check 1: connection age
        if self._connection_start_time:
            connection_age = current_time - self._connection_start_time
            if connection_age > self.MAX_CONNECTION_AGE:
                return True, f"连接已存活{connection_age:.0f}秒,超过{self.MAX_CONNECTION_AGE}秒阈值"
        # Check 2: idle time
        if self._last_audio_time:
            idle_time = current_time - self._last_audio_time
            if idle_time > self.MAX_IDLE_TIME:
                return True, f"连接已空闲{idle_time:.0f}秒,超过{self.MAX_IDLE_TIME}秒阈值"
        # Check 3: recognition count
        if self._recognition_count >= self.MAX_RECOGNITIONS:
            return True, f"已完成{self._recognition_count}次识别,达到重连阈值"
        # Check 4: consecutive send failures
        if self._consecutive_send_failures >= self.MAX_CONSECUTIVE_FAILURES:
            return True, f"连续{self._consecutive_send_failures}次音频发送失败"
        return False, ""

    def _reset_connection_stats(self):
        """Reset per-connection counters after a (re)connect."""
        self._connection_start_time = time.time()
        self._last_audio_time = time.time()
        self._recognition_count = 0
        self._audio_send_count = 0
        self._last_audio_send_success = True
        self._consecutive_send_failures = 0

    def start(self):
        """Open a new realtime conversation and configure transcription.

        Returns True on success; on failure the partially opened connection
        is closed and False is returned. No-op (False) if already running.
        """
        if self.running:
            return False
        try:
            callback = _ASRCallback(self)
            self.conversation = OmniRealtimeConversation(
                model=self.model,
                url=self.url,
                callback=callback
            )
            # Give the callback a backref before connecting.
            callback.conversation = self.conversation
            self.conversation.connect()
            transcription_params = TranscriptionParams(
                language='zh',
                sample_rate=self.sample_rate,
                input_audio_format="pcm",
            )
            # Text-only output with server-side VAD turn detection.
            self.conversation.update_session(
                output_modalities=[MultiModality.TEXT],
                enable_input_audio_transcription=True,
                transcription_params=transcription_params,
                enable_turn_detection=True,
                turn_detection_type='server_vad',
                prefix_padding_ms=1000,
                turn_detection_threshold=0.3,
                turn_detection_silence_duration_ms=800,
            )
            self.running = True
            self._reset_connection_stats()
            self._log("info", f"[ASR] 已启动 | 连接ID:{id(self.conversation)}")
            return True
        except Exception as e:
            self.running = False
            self._log("error", f"[ASR] 启动失败: {e}")
            if self.conversation:
                try:
                    self.conversation.close()
                except Exception:
                    pass
            self.conversation = None
            return False

    def send_audio(self, audio_chunk: bytes):
        """Send one PCM chunk (base64-encoded) to the server.

        Transparently reconnects when the lifecycle policy requires it, and
        marks the connection dead on WebSocket-level errors. Returns True
        only when the chunk was accepted.
        """
        should_reconnect, reason = self._should_reconnect()
        if should_reconnect:
            self._log("warning", f"[ASR] 检测到需要重连: {reason}")
            self.running = False
            try:
                if self.conversation:
                    self.conversation.close()
            except:
                pass
            self.conversation = None
            time.sleep(1.0)  # brief backoff before reconnecting
            if not self.start():
                self._log("error", "[ASR] 自动重连失败")
                return False
            self._log("info", "[ASR] 自动重连成功")
        import threading  # local import retained; used only for thread-name logging
        self._log("debug", f"[ASR] send_audio 被调用 | 线程:{threading.current_thread().name} | running:{self.running} | conversation:{self.conversation is not None}")
        if not self.running or not self.conversation:
            self._log("debug", f"[ASR] send_audio 跳过 | running:{self.running} | conversation:{self.conversation is not None}")
            return False
        try:
            audio_b64 = base64.b64encode(audio_chunk).decode('ascii')
            self.conversation.append_audio(audio_b64)
            self._last_audio_time = time.time()
            self._audio_send_count += 1
            self._last_audio_send_success = True
            self._consecutive_send_failures = 0
            self._log("debug", f"[ASR] 音频发送成功 | 总计:{self._audio_send_count} | 连接年龄:{time.time() - self._connection_start_time:.1f}")
            return True
        except Exception as e:
            self._last_audio_send_success = False
            self._consecutive_send_failures += 1
            error_msg = str(e)
            error_type = type(e).__name__
            # Heuristic: classify as a dead WebSocket and drop the connection
            # so the next send (or the worker) re-establishes it.
            if "Connection is already closed" in error_msg or "WebSocketConnectionClosedException" in error_type or "ConnectionClosed" in error_type or "websocket" in error_msg.lower():
                self._log("warning", f"[ASR] WebSocket 连接已断开 | 错误:{error_msg} | 连续失败:{self._consecutive_send_failures}")
                self.running = False
                try:
                    if self.conversation:
                        self.conversation.close()
                except:
                    pass
                self.conversation = None
            else:
                self._log("error", f"[ASR] send_audio 异常 | 错误:{error_msg} | 类型:{error_type} | 连续失败:{self._consecutive_send_failures}")
            return False

    def stop_current_recognition(self):
        """Commit the current audio, wait briefly for the final transcript,
        then close the connection (it is rebuilt lazily on the next audio).

        Non-blocking with respect to concurrent callers: if another thread
        holds the stop lock this returns False immediately. Returns True
        when the connection was committed and closed.
        """
        import threading  # local import retained; used only for thread-name logging
        self._log("debug", f"[ASR] stop_current_recognition 被调用 | 线程:{threading.current_thread().name} | running:{self.running}")
        # Non-blocking acquire: avoid stacking stop requests from callback threads.
        if not self._stop_lock.acquire(blocking=False):
            self._log("debug", f"[ASR] 锁获取失败,有其他线程正在执行 stop_current_recognition")
            return False
        self._final_result_event.clear()
        self._pending_commit = True
        try:
            self._log("debug", f"[ASR] 获得锁,开始停止识别 | conversation:{self.conversation is not None}")
            if not self.running or not self.conversation:
                self._log("debug", f"[ASR] 无法停止 | running:{self.running} | conversation:{self.conversation is not None}")
                return False
            self._recognition_count += 1
            should_reconnect, reason = self._should_reconnect()
            if should_reconnect:
                self._log("info", f"[ASR] 识别完成后检测到需要重连: {reason}")
            # Commit buffered audio and wait (bounded) for the final transcript,
            # which the event callback signals via _final_result_event.
            self._final_result_event.clear()
            self._pending_commit = True
            try:
                self.conversation.commit()
                self._final_result_event.wait(timeout=3.0)
            except Exception as e:
                self._log("debug", f"[ASR] commit 异常: {e}")
            self._log("debug", f"[ASR] 准备关闭旧连接 | conversation_id:{id(self.conversation)}")
            # Detach before closing so no other thread sends into a closing socket.
            self.running = False
            old_conversation = self.conversation
            self.conversation = None
            self._log("debug", f"[ASR] conversation已设为None,准备关闭旧连接")
            try:
                old_conversation.close()
                self._log("debug", f"[ASR] 旧连接已关闭")
            except Exception as e:
                self._log("warning", f"[ASR] 关闭连接异常: {e}")
            self._log("debug", f"[ASR] 连接已关闭,等待下次语音活动时重连")
            return True
        finally:
            self._pending_commit = False
            self._stop_lock.release()
            self._log("debug", f"[ASR] stop_current_recognition 完成,锁已释放")

    def stop(self):
        """Fully stop the client and release the connection (used at shutdown)."""
        with self._stop_lock:
            self.running = False
            self._final_result_event.set()  # unblock any waiter
            if self.conversation:
                try:
                    self.conversation.close()
                except Exception:
                    pass
                self.conversation = None
            self._log("info", "[ASR] 已完全停止")
class _ASRCallback(OmniRealtimeCallback):
    """Bridges DashScope realtime server events to DashScopeASR's hooks."""

    def __init__(self, asr_client: DashScopeASR):
        self.asr_client = asr_client
        self.conversation = None  # backref assigned by DashScopeASR.start()

    def on_event(self, response):
        """Dispatch one server event; every failure is swallowed silently."""
        client = self.asr_client
        try:
            event_type = response['type']
            if event_type == 'conversation.item.input_audio_transcription.completed':
                text = response['transcript'].strip()
                if text and client.on_sentence_end:
                    client.on_sentence_end(text)
                # Release stop_current_recognition() waiting on the final result.
                if client._pending_commit:
                    client._final_result_event.set()
            elif event_type == 'input_audio_buffer.speech_started':
                if client.logger:
                    client.logger.info("[ASR] 检测到语音开始")
                if client.on_speech_started:
                    client.on_speech_started()
            elif event_type == 'input_audio_buffer.speech_stopped':
                if client.logger:
                    client.logger.info("[ASR] 检测到语音结束")
                if client.on_speech_stopped:
                    client.on_speech_stopped()
        except Exception:
            pass
class ASRAudioNode(Node):
    """ROS2 node: captures microphone audio, streams it to DashScope ASR,
    and exposes recognition, raw-audio, and VAD-event services."""

    def __init__(self):
        super().__init__('asr_audio_node')
        self._load_config()
        self.audio_queue = queue.Queue(maxsize=100)  # mic chunks -> ASR worker
        self.stop_event = threading.Event()
        self._shutdown_in_progress = False
        self._init_components()
        self.recognize_service = self.create_service(
            ASRRecognize, '/asr/recognize', self._recognize_callback
        )
        self.audio_data_service = self.create_service(
            AudioData, '/asr/audio_data', self._audio_data_callback
        )
        self.vad_event_service = self.create_service(
            VADEvent, '/vad/event', self._vad_event_callback
        )
        self._last_result = None          # most recent final transcript
        self._result_event = threading.Event()
        self._last_result_time = None
        self.vad_event_queue = queue.Queue()
        # int16 sample buffer for voiceprint recording
        # (240000 samples ~= 15 s at 16 kHz — TODO confirm configured rate).
        self.audio_buffer = collections.deque(maxlen=240000)
        self.audio_recording = False
        self.audio_lock = threading.Lock()  # guards audio_buffer / audio_recording
        # ========== Abnormal-recognition detection ==========
        self._abnormal_results = ["嗯。", "", "啊。", "哦。"]  # known junk transcripts
        self._consecutive_abnormal_count = 0
        self.MAX_CONSECUTIVE_ABNORMAL = 5  # force an ASR reconnect after this many in a row
        self.recording_thread = threading.Thread(
            target=self.audio_recorder.record, name="RecordingThread", daemon=True
        )
        self.recording_thread.start()
        self.asr_thread = threading.Thread(
            target=self._asr_worker, name="ASRThread", daemon=True
        )
        self.asr_thread.start()
        self.get_logger().info("ASR Audio节点已启动")

    def _load_config(self):
        """Load microphone and DashScope settings from the packaged voice.yaml."""
        config_file = os.path.join(
            get_package_share_directory('robot_speaker'),
            'config',
            'voice.yaml'
        )
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        mic = config['audio']['microphone']
        self.input_device_index = mic['device_index']
        self.sample_rate = mic['sample_rate']
        self.channels = mic['channels']
        self.chunk = mic['chunk']
        dashscope = config['dashscope']  # NOTE: local name shadows the dashscope module here
        self.dashscope_api_key = dashscope['api_key']
        self.asr_model = dashscope['asr']['model']
        self.asr_url = dashscope['asr']['url']

    def _init_components(self):
        """Construct the recorder and ASR client and wire up ASR callbacks."""
        self.audio_recorder = AudioRecorder(
            device_index=self.input_device_index,
            sample_rate=self.sample_rate,
            channels=self.channels,
            chunk=self.chunk,
            audio_queue=self.audio_queue,
            stop_event=self.stop_event,
            logger=self.get_logger()
        )
        self.asr_client = DashScopeASR(
            api_key=self.dashscope_api_key,
            sample_rate=self.sample_rate,
            model=self.asr_model,
            url=self.asr_url,
            logger=self.get_logger()
        )
        self.asr_client.on_sentence_end = self._on_asr_result
        self.asr_client.on_speech_started = lambda: self._put_vad_event("speech_started")
        # On speech stop, clear any stale result first so old transcripts
        # cannot be served for the new utterance, then publish the event.
        self.asr_client.on_speech_stopped = lambda: (self._clear_result(), self._put_vad_event("speech_stopped"))
        self.asr_client.start()

    def _on_asr_result(self, text: str):
        """ASR final-transcript hook: store the result and track junk outputs."""
        if not text or not text.strip():
            return
        self._last_result = text.strip()
        self._last_result_time = time.time()
        self._result_event.set()
        # Very short transcripts from the junk list indicate a degraded session.
        is_abnormal = self._last_result in self._abnormal_results and len(self._last_result) <= 2
        if is_abnormal:
            self._consecutive_abnormal_count += 1
            self.get_logger().warn(f"[ASR] 检测到异常识别结果: '{self._last_result}' | 连续异常:{self._consecutive_abnormal_count}")
            # After repeated junk results, force the client over its failure
            # threshold so its lifecycle policy rebuilds the connection.
            if self._consecutive_abnormal_count >= self.MAX_CONSECUTIVE_ABNORMAL:
                self.get_logger().error(f"[ASR] 连续{self._consecutive_abnormal_count}次异常识别,标记需要重连")
                self.asr_client._consecutive_send_failures = self.asr_client.MAX_CONSECUTIVE_FAILURES
                self._consecutive_abnormal_count = 0
        else:
            # Normal result: reset the abnormal streak.
            self._consecutive_abnormal_count = 0
        try:
            self.get_logger().info(f"[ASR] 识别结果: {self._last_result}")
        except Exception:
            pass

    def _put_vad_event(self, event_type):
        """Queue a VAD event for /vad/event consumers; drop it if the queue is full."""
        try:
            self.vad_event_queue.put(event_type, timeout=0.1)
        except queue.Full:
            try:
                self.get_logger().warn(f"[ASR] VAD事件队列已满,丢弃{event_type}事件")
            except Exception:
                pass

    def _audio_data_callback(self, request, response):
        """/asr/audio_data service: start/stop/get raw PCM for voiceprint capture."""
        import threading  # local import retained; used only for thread-name logging
        self.get_logger().debug(f"[ASR-AudioData] 回调触发 | command:{request.command} | 线程:{threading.current_thread().name}")
        response.sample_rate = self.sample_rate
        response.channels = self.channels
        if request.command == "start":
            with self.audio_lock:
                self.get_logger().debug(f"[ASR-AudioData] start命令 | 旧buffer大小:{len(self.audio_buffer)} | recording:{self.audio_recording}")
                self.audio_buffer.clear()
                self.audio_recording = True
                self.get_logger().debug(f"[ASR-AudioData] buffer已清空,recording=True")
            response.success = True
            response.message = "开始录音"
            response.samples = 0
            return response
        if request.command == "stop":
            self.get_logger().debug(f"[ASR-AudioData] stop命令 | recording:{self.audio_recording}")
            with self.audio_lock:
                self.audio_recording = False
                audio_list = list(self.audio_buffer)
                self.get_logger().debug(f"[ASR-AudioData] 读取buffer | 大小:{len(audio_list)}")
                self.audio_buffer.clear()
            if len(audio_list) > 0:
                audio_array = np.array(audio_list, dtype=np.int16)
                response.success = True
                response.audio_data = audio_array.tobytes()
                response.samples = len(audio_list)
                response.message = f"录音完成,{len(audio_list)}样本"
                self.get_logger().debug(f"[ASR-AudioData] 返回音频 | samples:{len(audio_list)}")
            else:
                response.success = False
                response.message = "缓冲区为空"
                response.samples = 0
                self.get_logger().debug(f"[ASR-AudioData] buffer为空")
            return response
        if request.command == "get":
            # Snapshot the buffer without stopping the recording.
            with self.audio_lock:
                audio_list = list(self.audio_buffer)
            if len(audio_list) > 0:
                audio_array = np.array(audio_list, dtype=np.int16)
                response.success = True
                response.audio_data = audio_array.tobytes()
                response.samples = len(audio_list)
                response.message = f"获取到{len(audio_list)}样本"
            else:
                response.success = False
                response.message = "缓冲区为空"
                response.samples = 0
            return response

    def _vad_event_callback(self, request, response):
        """/vad/event service: block (up to timeout_ms) for the next VAD event."""
        timeout = request.timeout_ms / 1000.0 if request.timeout_ms > 0 else None
        try:
            event = self.vad_event_queue.get(timeout=timeout)
            response.success = True
            response.event = event
            response.message = "收到VAD事件"
        except queue.Empty:
            response.success = False
            response.event = "none"
            response.message = "等待超时"
        except KeyboardInterrupt:
            try:
                self.get_logger().info("[ASR-VAD] 收到中断信号,正在关闭")
            except Exception:
                pass
            response.success = False
            response.event = "none"
            response.message = "节点正在关闭"
            self.stop_event.set()
        return response

    def _clear_result(self):
        """Drop any cached transcript so stale results cannot be re-served."""
        self._last_result = None
        self._last_result_time = None
        self._result_event.clear()

    def _return_result(self, response, text, message):
        """Fill a success response with `text` and clear the cached result."""
        response.success = True
        response.text = text
        response.message = message
        self._clear_result()
        return response

    def _recognize_callback(self, request, response):
        """/asr/recognize service: return a transcript, or stop/reset the recognizer.

        Serving order: a fresh cached result (< 0.3 s old or already
        signalled), then a short 2 s wait, then a full stop/restart cycle
        with a 5 s wait.
        """
        if request.command == "stop":
            if self.asr_client.running:
                self.asr_client.stop_current_recognition()
            response.success = True
            response.text = ""
            response.message = "识别已停止"
            return response
        if request.command == "reset":
            self.asr_client.stop_current_recognition()
            time.sleep(0.1)  # small gap before reopening the connection
            self.asr_client.start()
            response.success = True
            response.text = ""
            response.message = "识别器已重置"
            return response
        if self.asr_client.running:
            current_time = time.time()
            # Serve a very recent cached result directly (0.3 s reuse window).
            if (self._last_result and self._last_result_time and
                    (current_time - self._last_result_time) < 0.3) or (self._result_event.is_set() and self._last_result):
                return self._return_result(response, self._last_result, "返回最近识别结果")
            # Otherwise wait briefly in case a transcript is about to land.
            if self._result_event.wait(timeout=2.0) and self._last_result:
                return self._return_result(response, self._last_result, "识别成功(等待中)")
            # Still nothing: finish the current recognition cycle.
            self.asr_client.stop_current_recognition()
            time.sleep(0.2)
        self._clear_result()
        if not self.asr_client.running and not self.asr_client.start():
            response.success = False
            response.text = ""
            response.message = "ASR启动失败"
            return response
        if self._result_event.wait(timeout=5.0) and self._last_result:
            response.success = True
            response.text = self._last_result
            response.message = "识别成功"
        else:
            response.success = False
            response.text = ""
            response.message = "识别超时" if not self._result_event.is_set() else "识别结果为空"
        self._clear_result()
        return response

    def _asr_worker(self):
        """Background loop: drain mic chunks, mirror them into the voiceprint
        buffer while recording, and stream them to the ASR client."""
        while not self.stop_event.is_set():
            try:
                audio_chunk = self.audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            except KeyboardInterrupt:
                try:
                    self.get_logger().info("[ASR-Worker] 收到中断信号")
                except Exception:
                    pass
                break
            if self.audio_recording:
                self.get_logger().debug(f"[ASR-Worker] 收到音频chunk | recording:{self.audio_recording} | buffer_size:{len(self.audio_buffer)}")
                try:
                    audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
                    with self.audio_lock:
                        self.audio_buffer.extend(audio_array)
                except Exception as e:
                    self.get_logger().error(f"[ASR-Worker] buffer写入异常 | 错误:{e}")
                    pass
            if self.asr_client.running:
                self.asr_client.send_audio(audio_chunk)
            else:
                # Client is down: try to revive it, backing off on failure.
                if not self.asr_client.start():
                    time.sleep(1.0)

    def destroy_node(self):
        """Idempotent shutdown: stop worker threads, release audio and ASR resources."""
        if self._shutdown_in_progress:
            return
        self._shutdown_in_progress = True
        try:
            self.get_logger().info("ASR Audio节点正在关闭...")
        except Exception:
            pass
        self.stop_event.set()
        if hasattr(self, 'recording_thread') and self.recording_thread.is_alive():
            self.recording_thread.join(timeout=1.0)
        if hasattr(self, 'asr_thread') and self.asr_thread.is_alive():
            self.asr_thread.join(timeout=1.0)
        try:
            if hasattr(self, 'audio_recorder'):
                self.audio_recorder.audio.terminate()
        except Exception:
            pass
        try:
            if hasattr(self, 'asr_client'):
                self.asr_client.stop()
        except Exception:
            pass
        try:
            super().destroy_node()
        except Exception:
            pass
def main(args=None):
    """Entry point: spin the ASR node; always release ROS resources on exit."""
    rclpy.init(args=args)
    node = ASRAudioNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        try:
            node.get_logger().info("收到中断信号,正在关闭节点")
        except Exception:
            pass
    finally:
        # Best-effort teardown: each step is independent of the others.
        for shutdown_step in (node.destroy_node, rclpy.shutdown):
            try:
                shutdown_step()
            except Exception:
                pass
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,341 @@
import rclpy
from rclpy.node import Node
from rclpy.callback_groups import ReentrantCallbackGroup
from interfaces.srv import TTSSynthesize
import threading
import yaml
import os
import signal
import subprocess
import time
import dashscope
from dashscope.audio.tts_v2 import SpeechSynthesizer, ResultCallback, AudioFormat
from ament_index_python.packages import get_package_share_directory
class DashScopeTTSClient:
    """Synthesizes speech via DashScope TTS and plays it through ffmpeg/ALSA."""

    def __init__(self, api_key: str,
                 model: str,
                 voice: str,
                 card_index: int,
                 device_index: int,
                 output_sample_rate: int,
                 output_channels: int,
                 output_volume: float,
                 tts_source_sample_rate: int,
                 tts_source_channels: int,
                 tts_ffmpeg_thread_queue_size: int,
                 force_stop_delay: float,
                 cleanup_timeout: float,
                 terminate_timeout: float,
                 logger):
        dashscope.api_key = api_key
        self.model = model
        self.voice = voice  # default voice when a request specifies none
        self.card_index = card_index
        self.device_index = device_index
        self.output_sample_rate = output_sample_rate
        self.output_channels = output_channels
        self.output_volume = output_volume
        self.tts_source_sample_rate = tts_source_sample_rate
        self.tts_source_channels = tts_source_channels
        self.tts_ffmpeg_thread_queue_size = tts_ffmpeg_thread_queue_size
        self.force_stop_delay = force_stop_delay    # grace period between SIGTERM and SIGKILL
        self.cleanup_timeout = cleanup_timeout
        self.terminate_timeout = terminate_timeout
        self.logger = logger
        self.current_ffmpeg_pid = None   # pid of the ffmpeg player while playing
        self._current_callback = None    # active _TTSCallback, used by force_stop
        # Explicit ALSA device when both indices are configured, else "default".
        self.alsa_device = f"plughw:{card_index},{device_index}" if (
            card_index >= 0 and device_index >= 0
        ) else "default"

    def force_stop(self):
        """Abort playback: flag the callback interrupted and kill ffmpeg.

        Escalates SIGTERM -> SIGKILL if the process survives force_stop_delay.
        """
        if self._current_callback:
            self._current_callback._interrupted = True
        if not self.current_ffmpeg_pid:
            if self.logger:
                self.logger.warn("[TTS] force_stop: current_ffmpeg_pid is None")
            return
        pid = self.current_ffmpeg_pid
        try:
            if self.logger:
                self.logger.info(f"[TTS] force_stop: 正在kill进程 {pid}")
            os.kill(pid, signal.SIGTERM)
            time.sleep(self.force_stop_delay)
            try:
                os.kill(pid, 0)  # signal 0 = existence probe only
                os.kill(pid, signal.SIGKILL)
                if self.logger:
                    self.logger.info(f"[TTS] force_stop: 已发送SIGKILL到进程 {pid}")
            except ProcessLookupError:
                if self.logger:
                    self.logger.info(f"[TTS] force_stop: 进程 {pid} 已退出")
        except (ProcessLookupError, OSError) as e:
            if self.logger:
                self.logger.warn(f"[TTS] force_stop: kill进程失败 {pid}: {e}")
        finally:
            self.current_ffmpeg_pid = None
            self._current_callback = None

    def synthesize(self, text: str, voice: str = None,
                   on_chunk=None,
                   interrupt_check=None) -> bool:
        """Synthesize `text` and stream it to the audio player.

        `voice` overrides the configured default when non-empty; `on_chunk`
        receives each PCM chunk; `interrupt_check` is polled to abort
        playback. Returns True when playback completed uninterrupted,
        False on interruption or an invalid voice.
        """
        callback = _TTSCallback(self, interrupt_check, on_chunk)
        self._current_callback = callback
        voice_to_use = voice if voice and voice.strip() else self.voice
        if not voice_to_use or not voice_to_use.strip():
            if self.logger:
                self.logger.error(f"[TTS] Voice参数无效: '{voice_to_use}'")
            self._current_callback = None
            return False
        synthesizer = SpeechSynthesizer(
            model=self.model,
            voice=voice_to_use,
            format=AudioFormat.PCM_22050HZ_MONO_16BIT,
            callback=callback,
        )
        try:
            synthesizer.streaming_call(text)
            synthesizer.streaming_complete()
        finally:
            # Always reap the ffmpeg process, even if synthesis raised.
            callback.cleanup()
            self._current_callback = None
        return not callback._interrupted
class _TTSCallback(ResultCallback):
    """Streams DashScope TTS PCM chunks into an ffmpeg process playing to ALSA."""

    def __init__(self, tts_client: DashScopeTTSClient,
                 interrupt_check=None,
                 on_chunk=None):
        self.tts_client = tts_client
        self.interrupt_check = interrupt_check  # callable -> True to abort playback
        self.on_chunk = on_chunk                # optional tap for synthesized PCM
        self._proc = None
        self._interrupted = False
        self._cleaned_up = False

    def on_open(self):
        """Spawn the ffmpeg player reading s16le PCM from stdin."""
        ffmpeg_cmd = [
            'ffmpeg',
            '-f', 's16le',
            '-ar', str(self.tts_client.tts_source_sample_rate),
            '-ac', str(self.tts_client.tts_source_channels),
            '-i', 'pipe:0',
            '-f', 'alsa',
            '-ar', str(self.tts_client.output_sample_rate),
            '-ac', str(self.tts_client.output_channels),
            '-acodec', 'pcm_s16le',
            '-fflags', 'nobuffer',
            '-flags', 'low_delay',
            '-avioflags', 'direct',
            self.tts_client.alsa_device
        ]
        # Input-side options must precede '-i'.
        insert_pos = ffmpeg_cmd.index('-i')
        ffmpeg_cmd.insert(insert_pos, str(self.tts_client.tts_ffmpeg_thread_queue_size))
        ffmpeg_cmd.insert(insert_pos, '-thread_queue_size')
        if self.tts_client.output_volume != 1.0:
            # Volume filter goes before the codec option.
            acodec_idx = ffmpeg_cmd.index('-acodec')
            ffmpeg_cmd.insert(acodec_idx, f'volume={self.tts_client.output_volume}')
            ffmpeg_cmd.insert(acodec_idx, '-af')
        self._proc = subprocess.Popen(
            ffmpeg_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE
        )
        self.tts_client.current_ffmpeg_pid = self._proc.pid

    def on_data(self, data: bytes) -> None:
        """Forward one PCM chunk to ffmpeg unless playback was interrupted."""
        if self._interrupted:
            return
        if self.interrupt_check and self.interrupt_check():
            self._interrupted = True
            if self._proc:
                self._proc.terminate()
            return
        if self._proc and self._proc.stdin and not self._interrupted:
            try:
                self._proc.stdin.write(data)
                self._proc.stdin.flush()
            except (BrokenPipeError, OSError):
                # ffmpeg died or its pipe closed; stop streaming further chunks.
                self._interrupted = True
        if self.on_chunk and not self._interrupted:
            self.on_chunk(data)

    def cleanup(self):
        """Close ffmpeg's stdin and make sure the process exits (idempotent).

        Fix: Popen.wait(timeout=...) raises subprocess.TimeoutExpired on
        timeout; the original let it propagate, which skipped the
        terminate()/kill() escalation below and leaked the exception into
        synthesize()'s finally block. The timeouts are now caught so a
        stuck ffmpeg is reliably terminated and, if needed, killed.
        """
        if self._cleaned_up or not self._proc:
            return
        self._cleaned_up = True
        if self._proc.stdin and not self._proc.stdin.closed:
            try:
                self._proc.stdin.close()  # EOF lets ffmpeg drain and exit normally
            except OSError:
                pass
        if self._proc.poll() is None:
            try:
                self._proc.wait(timeout=self.tts_client.cleanup_timeout)
            except subprocess.TimeoutExpired:
                pass
        if self._proc.poll() is None:
            self._proc.terminate()
            try:
                self._proc.wait(timeout=self.tts_client.terminate_timeout)
            except subprocess.TimeoutExpired:
                pass
        if self._proc.poll() is None:
            self._proc.kill()
        if self.tts_client.current_ffmpeg_pid == self._proc.pid:
            self.tts_client.current_ffmpeg_pid = None
class TTSAudioNode(Node):
    """ROS2 service node exposing /tts/synthesize for speech synthesis and interrupt."""

    def __init__(self):
        super().__init__('tts_audio_node')
        self._load_config()
        self._init_tts_client()
        # Reentrant group so an "interrupt" request can be served while a
        # "synthesize" request is still in flight.
        self.callback_group = ReentrantCallbackGroup()
        self.synthesize_service = self.create_service(
            TTSSynthesize, '/tts/synthesize', self._synthesize_callback,
            callback_group=self.callback_group
        )
        self.interrupt_event = threading.Event()  # polled by the playback worker
        self.playing_lock = threading.Lock()      # guards is_playing
        self.is_playing = False
        self.get_logger().info("[TTS] TTS Audio节点已启动")

    def _load_config(self):
        """Load soundcard/TTS settings from the packaged voice.yaml."""
        config_file = os.path.join(
            get_package_share_directory('robot_speaker'),
            'config',
            'voice.yaml'
        )
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        audio = config['audio']
        soundcard = audio['soundcard']
        tts_audio = audio['tts']
        dashscope = config['dashscope']  # NOTE: local name shadows the dashscope module here
        self.output_card_index = soundcard['card_index']
        self.output_device_index = soundcard['device_index']
        self.output_sample_rate = soundcard['sample_rate']
        self.output_channels = soundcard['channels']
        self.output_volume = soundcard['volume']
        self.tts_source_sample_rate = tts_audio['source_sample_rate']
        self.tts_source_channels = tts_audio['source_channels']
        self.tts_ffmpeg_thread_queue_size = tts_audio['ffmpeg_thread_queue_size']
        self.force_stop_delay = tts_audio['force_stop_delay']
        self.cleanup_timeout = tts_audio['cleanup_timeout']
        self.terminate_timeout = tts_audio['terminate_timeout']
        self.interrupt_wait = tts_audio['interrupt_wait']
        self.dashscope_api_key = dashscope['api_key']
        self.tts_model = dashscope['tts']['model']
        self.tts_voice = dashscope['tts']['voice']

    def _init_tts_client(self):
        """Build the DashScope TTS client from the loaded configuration."""
        self.tts_client = DashScopeTTSClient(
            api_key=self.dashscope_api_key,
            model=self.tts_model,
            voice=self.tts_voice,
            card_index=self.output_card_index,
            device_index=self.output_device_index,
            output_sample_rate=self.output_sample_rate,
            output_channels=self.output_channels,
            output_volume=self.output_volume,
            tts_source_sample_rate=self.tts_source_sample_rate,
            tts_source_channels=self.tts_source_channels,
            tts_ffmpeg_thread_queue_size=self.tts_ffmpeg_thread_queue_size,
            force_stop_delay=self.force_stop_delay,
            cleanup_timeout=self.cleanup_timeout,
            terminate_timeout=self.terminate_timeout,
            logger=self.get_logger()
        )

    def _synthesize_callback(self, request, response):
        """/tts/synthesize service: start asynchronous playback or interrupt it.

        Synthesis runs in a daemon worker thread so the service returns
        immediately with status "playing"; "interrupt" stops any active
        playback via the interrupt event and a hard stop of ffmpeg.
        """
        command = request.command if request.command else "synthesize"
        if command == "interrupt":
            with self.playing_lock:
                was_playing = self.is_playing
                has_pid = self.tts_client.current_ffmpeg_pid is not None
                if was_playing or has_pid:
                    self.interrupt_event.set()
                    self.tts_client.force_stop()
                    self.is_playing = False
                    response.success = True
                    response.message = "已中断播放"
                    response.status = "interrupted"
                else:
                    response.success = False
                    response.message = "没有正在播放的内容"
                    response.status = "none"
            return response
        if not request.text or not request.text.strip():
            response.success = False
            response.message = "文本为空"
            response.status = "error"
            return response
        with self.playing_lock:
            if self.is_playing:
                # Preempt the current playback before starting the new one.
                self.tts_client.force_stop()
                time.sleep(self.interrupt_wait)
            self.is_playing = True
            self.interrupt_event.clear()

        def synthesize_worker():
            # Runs off the executor thread; clears is_playing when done.
            try:
                success = self.tts_client.synthesize(
                    request.text.strip(),
                    voice=request.voice if request.voice else None,
                    interrupt_check=lambda: self.interrupt_event.is_set()
                )
                with self.playing_lock:
                    self.is_playing = False
                if self.get_logger():
                    if success:
                        self.get_logger().info("[TTS] 合成并播放成功")
                    else:
                        self.get_logger().info("[TTS] 播放被中断")
            except Exception as e:
                with self.playing_lock:
                    self.is_playing = False
                if self.get_logger():
                    self.get_logger().error(f"[TTS] 合成失败: {e}")

        thread = threading.Thread(target=synthesize_worker, daemon=True)
        thread.start()
        response.success = True
        response.message = "合成任务已启动"
        response.status = "playing"
        return response
def main(args=None):
    """Entry point: spin the TTS node; always release ROS resources on exit.

    Fix: the original called destroy_node()/shutdown() only on a clean
    return from spin(), so Ctrl-C (KeyboardInterrupt) skipped cleanup.
    Mirrors the shutdown handling of the ASR node's main().
    """
    rclpy.init(args=args)
    node = TTSAudioNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop the node; fall through to cleanup.
        pass
    finally:
        try:
            node.destroy_node()
        except Exception:
            pass
        try:
            rclpy.shutdown()
        except Exception:
            pass
if __name__ == '__main__':
main()

View File

@@ -1,26 +1,38 @@
from setuptools import find_packages, setup
from setuptools import setup, find_packages
import os
from glob import glob
package_name = 'robot_speaker'
setup(
name=package_name,
version='0.0.0',
packages=[package_name],
version='0.0.1',
packages=find_packages(where='.'),
package_dir={'': '.'},
data_files=[
('share/ament_index/resource_index/packages',
['resource/' + package_name]),
('share/' + package_name, ['package.xml']),
(os.path.join('share', package_name, 'launch'), glob('launch/*.launch.py')),
(os.path.join('share', package_name, 'config'), glob('config/*.yaml') + glob('config/*.json')),
(os.path.join('share', package_name, 'srv'), glob('srv/*.srv')),
],
install_requires=[
'setuptools',
'pypinyin',
],
install_requires=['setuptools'],
zip_safe=True,
maintainer='mzebra',
maintainer_email='mzebra@foxmail.com',
description='TODO: Package description',
description='语音识别和合成ROS2包',
license='Apache-2.0',
tests_require=['pytest'],
entry_points={
'console_scripts': [
'robot_speaker_node=robot_speaker.robot_speaker_node:main'
'robot_speaker_node = robot_speaker.core.robot_speaker_node:main',
'register_speaker_node = robot_speaker.core.register_speaker_node:main',
'skill_bridge_node = robot_speaker.bridge.skill_bridge_node:main',
'asr_audio_node = robot_speaker.services.asr_audio_node:main',
'tts_audio_node = robot_speaker.services.tts_audio_node:main',
],
},
)

10
srv/ASRRecognize.srv Normal file
View File

@@ -0,0 +1,10 @@
# 请求:启动识别
string command # "start" (默认), "stop", "reset"
---
# 响应:识别结果
bool success
string text # 识别文本(空字符串表示未识别到)
string message # 状态消息

27
srv/AudioData.srv Normal file
View File

@@ -0,0 +1,27 @@
# 请求:获取音频数据
string command # "start" (开始录音), "stop" (停止并返回), "get" (获取当前缓冲区)
int32 duration_ms # 录音时长毫秒仅用于start命令
---
# 响应:音频数据
bool success
uint8[] audio_data # PCM音频数据int16格式
int32 sample_rate
int32 channels
int32 samples # 样本数
string message

14
srv/TTSSynthesize.srv Normal file
View File

@@ -0,0 +1,14 @@
# 请求:合成文本或中断命令
string command # "synthesize" (默认), "interrupt"
string text
string voice # 可选,默认使用配置
---
# 响应:合成状态
bool success
string message
string status # "playing", "completed", "interrupted"

11
srv/VADEvent.srv Normal file
View File

@@ -0,0 +1,11 @@
# 请求等待VAD事件
string command # "wait" (等待下一个事件)
int32 timeout_ms # 超时时间毫秒0表示无限等待
---
# 响应VAD事件
bool success
string event # "speech_started", "speech_stopped", "none"
string message