Compare commits
26 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a0ceb934ce | |||
|
|
ed861a9fb1 | ||
| aaa17c10f2 | |||
|
|
c65395c50f | ||
| 9c8bd017e1 | |||
|
|
856c07715c | ||
| e8a9821ce4 | |||
| ab1fb4f3f8 | |||
| dd6ccf77bb | |||
| 7324630458 | |||
|
|
04ca80c3f9 | ||
| 98c0eb5ca5 | |||
| 71062701e1 | |||
| 0409ce0de4 | |||
|
|
ce0d581770 | ||
|
|
a1b91ed52f | ||
| 6d101b9d9e | |||
|
|
c282f9b4de | ||
| 9fd658990c | |||
| 0c118412ec | |||
| eb91e2f139 | |||
| 838a4a357c | |||
| 9c775cff5c | |||
| 63a21999bb | |||
| 8fffd4ab42 | |||
| b90d84c325 |
9
.gitignore
vendored
Normal file
9
.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
build/
|
||||
install/
|
||||
log/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.egg-info/
|
||||
dist/
|
||||
lib/
|
||||
installed_files.txt
|
||||
116
CMakeLists.txt
Normal file
116
CMakeLists.txt
Normal file
@@ -0,0 +1,116 @@
|
||||
cmake_minimum_required(VERSION 3.8)
project(robot_speaker)

# Extra warnings for any C/C++ sources added later; the package itself is
# Python-only and is installed through pip in the install(CODE) step below.
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  add_compile_options(-Wall -Wextra -Wpedantic)
endif()

find_package(ament_cmake REQUIRED)
find_package(ament_cmake_python REQUIRED)
find_package(interfaces REQUIRED)

# Prefer the system Python over a conda/miniconda interpreter that may come
# first on PATH: ROS 2 nodes must run against the system interpreter.
find_program(PYTHON3_CMD python3 PATHS /usr/bin /usr/local/bin NO_DEFAULT_PATH)
if(NOT PYTHON3_CMD)
  find_program(PYTHON3_CMD python3)
endif()
if(PYTHON3_CMD)
  # NOTE(review): FORCE is deliberate here — it overrides a previously cached
  # (possibly conda) interpreter so colcon rebuilds stay on the system Python.
  set(Python3_EXECUTABLE ${PYTHON3_CMD} CACHE FILEPATH "Python 3 executable" FORCE)
  set(PYTHON_EXECUTABLE ${PYTHON3_CMD} CACHE FILEPATH "Python executable" FORCE)
endif()

# Install the Python package via pip at install time, then relocate the result
# to the layout ament expects:
#   <prefix>/lib/python<ver>/site-packages/robot_speaker   (importable code)
#   <prefix>/lib/robot_speaker/<*_node>                    ('ros2 run' entry points)
install(CODE "
  execute_process(
    COMMAND ${PYTHON3_CMD} -m pip install --prefix=${CMAKE_INSTALL_PREFIX} --no-deps ${CMAKE_CURRENT_SOURCE_DIR}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    RESULT_VARIABLE install_result
    OUTPUT_VARIABLE install_output
    ERROR_VARIABLE install_error
  )
  if(NOT install_result EQUAL 0)
    message(FATAL_ERROR \"Failed to install Python package. Output: ${install_output} Error: ${install_error}\")
  endif()

  execute_process(
    COMMAND ${PYTHON3_CMD} -c \"
import os
import shutil
import glob
import sysconfig

install_prefix = '${CMAKE_INSTALL_PREFIX}'
python_version = sysconfig.get_python_version()

# Location where ROS 2 expects the Python package.
ros2_site_packages = os.path.join(install_prefix, 'lib', f'python{python_version}', 'site-packages')
os.makedirs(ros2_site_packages, exist_ok=True)

# pip install --prefix may have placed the package elsewhere (on Debian-based
# systems typically local/lib/pythonX/dist-packages).
pip_locations = [
    os.path.join(install_prefix, 'local', 'lib', f'python{python_version}', 'dist-packages'),
    os.path.join(install_prefix, 'lib', f'python{python_version}', 'site-packages'),
    os.path.join(install_prefix, 'local', 'lib', f'python{python_version}', 'site-packages'),
]

# Find the installed robot_speaker package and copy it where ROS 2 looks.
robot_speaker_src = None
for location in pip_locations:
    candidate = os.path.join(location, 'robot_speaker')
    if os.path.exists(candidate) and os.path.isdir(candidate):
        robot_speaker_src = candidate
        break

if robot_speaker_src:
    robot_speaker_dest = os.path.join(ros2_site_packages, 'robot_speaker')
    if robot_speaker_src != robot_speaker_dest:
        # Only remove the destination when it is NOT the source; removing it
        # first unconditionally would delete the freshly installed package
        # when pip already placed it in the correct location.
        if os.path.exists(robot_speaker_dest):
            shutil.rmtree(robot_speaker_dest)
        shutil.copytree(robot_speaker_src, robot_speaker_dest)
        print(f'Copied robot_speaker from {robot_speaker_src} to {ros2_site_packages}')
    else:
        print('robot_speaker already in correct location')

# Expose entry-point scripts under lib/<pkg>/ so 'ros2 run' can find them.
lib_dir = os.path.join(install_prefix, 'lib', 'robot_speaker')
os.makedirs(lib_dir, exist_ok=True)

# The console scripts may land in local/bin or bin depending on pip.
for bin_dir in [os.path.join(install_prefix, 'local', 'bin'), os.path.join(install_prefix, 'bin')]:
    if os.path.exists(bin_dir):
        scripts = glob.glob(os.path.join(bin_dir, '*_node'))
        for script in scripts:
            script_name = os.path.basename(script)
            dest = os.path.join(lib_dir, script_name)
            if script != dest:
                shutil.copy2(script, dest)
                os.chmod(dest, 0o755)
                print(f'Copied {script_name} to {lib_dir}')
\"
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    RESULT_VARIABLE python_result
    OUTPUT_VARIABLE python_output
    ERROR_VARIABLE python_error
  )
  if(python_result EQUAL 0)
    message(STATUS \"${python_output}\")
  else()
    # Include stderr as well — without ERROR_VARIABLE the actual failure
    # reason was lost and the warning only showed stdout.
    message(WARNING \"Failed to setup Python package: ${python_output} Error: ${python_error}\")
  endif()
")

# Launch files for 'ros2 launch robot_speaker <file>.launch.py'.
install(DIRECTORY launch/
  DESTINATION share/${PROJECT_NAME}/launch
  FILES_MATCHING PATTERN "*.launch.py"
)

# Runtime configuration (voice.yaml, knowledge.json, speakers.json).
install(DIRECTORY config/
  DESTINATION share/${PROJECT_NAME}/config
  FILES_MATCHING PATTERN "*.yaml" PATTERN "*.json"
)

if(BUILD_TESTING)
  find_package(ament_lint_auto REQUIRED)
  ament_lint_auto_find_test_dependencies()
endif()

ament_package()
|
||||
102
README.md
102
README.md
@@ -1,2 +1,102 @@
|
||||
# hivecore_robot_voice
|
||||
# ROS 语音包 (robot_speaker)
|
||||
|
||||
## 注册阿里云百炼获取api_key
|
||||
https://bailian.console.aliyun.com/?tab=model#/api-key
|
||||
->密钥管理
|
||||
放到config/voice.yaml
|
||||
|
||||
## 安装依赖
|
||||
1. 系统依赖
|
||||
```bash
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg swig meson ninja-build build-essential pkg-config libwebrtc-audio-processing-dev
|
||||
```
|
||||
|
||||
2. Python依赖
|
||||
```bash
|
||||
cd ~/ros_learn/hivecore_robot_voice
|
||||
# 在 Python 3.10 环境下,需要单独安装 aec-audio-processing 以跳过版本检查
|
||||
pip3 install aec-audio-processing --no-binary :all: --ignore-requires-python --break-system-packages
|
||||
pip3 install -r requirements.txt --break-system-packages
|
||||
```
|
||||
|
||||
## 编译启动
|
||||
1. 注册声纹
|
||||
- 启动节点后可以说:er gou我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去。
|
||||
- 正确的注册姿势:包含唤醒词二狗,不要停顿的尽量说完3秒
|
||||
|
||||
- 现在的逻辑只要识别到二狗就注册,然后退出节点,识别不到二狗继续等待
|
||||
- 多注册几段,换方向距离注册,可以提高识别相似度,注册方向对声纹相似性影响很大
|
||||
```bash
|
||||
cd ~/ros_learn/hivecore_robot_voice
|
||||
colcon build
|
||||
source install/setup.bash
|
||||
```
|
||||
|
||||
```bash
|
||||
# 终端1: 启动ASR节点
|
||||
ros2 run robot_speaker asr_audio_node
|
||||
# 终端2: 注册声纹
|
||||
ros2 run robot_speaker register_speaker_node
|
||||
```
|
||||
|
||||
2. 主节点
|
||||
- 启动节点后每句交互包含唤醒词,唤醒词和语句之间不要有停顿
|
||||
- 二狗拍照看看开启图文交互
|
||||
- 支持已注册声纹用户打断
|
||||
```bash
|
||||
cd ~/ros_learn/hivecore_robot_voice
|
||||
colcon build
|
||||
source install/setup.bash
|
||||
ros2 launch robot_speaker voice.launch.py
|
||||
```
|
||||
|
||||
3. ASR节点
|
||||
```bash
|
||||
ros2 run robot_speaker asr_audio_node
|
||||
```
|
||||
|
||||
4. TTS节点
|
||||
```bash
|
||||
# 终端1: 启动TTS节点
|
||||
ros2 run robot_speaker tts_audio_node
|
||||
|
||||
# 终端2: 启动播放
|
||||
source install/setup.bash
|
||||
ros2 service call /tts/synthesize robot_speaker/srv/TTSSynthesize \
|
||||
"{command: 'synthesize', text: '这是一段很长的测试文本,用于测试TTS中断功能。我需要说很多很多内容,这样你才有足够的时间来测试中断命令。让我继续说下去,这是一段很长的测试文本,用于测试TTS中断功能。我需要说很多很多内容,这样你才有足够的时间来测试中断命令。让我继续说下去,这是一段很长的测试文本,用于测试TTS中断功能。我需要说很多很多内容,这样你才有足够的时间来测试中断命令。', voice: ''}"
|
||||
|
||||
# 终端3: 立即执行中断
|
||||
source install/setup.bash
|
||||
ros2 service call /tts/synthesize robot_speaker/srv/TTSSynthesize \
|
||||
"{command: 'interrupt', text: '', voice: ''}"
|
||||
```
|
||||
|
||||
5. 完整运行
|
||||
```bash
|
||||
# 终端1:启动 brain 节点
|
||||
# 终端2:启动 voice 节点
|
||||
# 终端3:启动 bridge 节点
|
||||
# 终端4:订阅相机
|
||||
```
|
||||
|
||||
## 用到的命令
|
||||
1. 音频设备
|
||||
```bash
|
||||
# 1. 查看所有音频设备
|
||||
cat /proc/asound/cards
|
||||
# 2. 查看 card(1)的流信息(设备参数)
|
||||
cat /proc/asound/card1/stream0
|
||||
```
|
||||
|
||||
2. 相机设备
|
||||
```bash
|
||||
# 1. 查看相机所有基础信息(型号、固件版本、序列号等)
|
||||
rs-enumerate-devices -c
|
||||
```
|
||||
|
||||
3. 模型下载
|
||||
```bash
|
||||
modelscope download --model iic/speech_campplus_sv_zh-cn_16k-common --local_dir [指定路径]
|
||||
```
|
||||
|
||||
|
||||
46
config/knowledge.json
Normal file
46
config/knowledge.json
Normal file
@@ -0,0 +1,46 @@
|
||||
{
|
||||
"entries": [
|
||||
{
|
||||
"id": "robot_identity_1",
|
||||
"patterns": [
|
||||
"ni shi shui"
|
||||
],
|
||||
"answer": "我叫二狗,是蜂核科技的机器人,很高兴为你服务"
|
||||
},
|
||||
{
|
||||
"id": "robot_identity_2",
|
||||
"patterns": [
|
||||
"ni jiao sha"
|
||||
],
|
||||
"answer": "我叫二狗呀,我是你的好帮手"
|
||||
},
|
||||
{
|
||||
"id": "wake_word",
|
||||
"patterns": [
|
||||
"ni de ming zi"
|
||||
],
|
||||
"answer": "我的名字是二狗"
|
||||
},
|
||||
{
|
||||
"id": "skill_1",
|
||||
"patterns": [
|
||||
"tiao ge wu"
|
||||
],
|
||||
"answer": "这个我真不会,我怕跳起来吓到你"
|
||||
},
|
||||
{
|
||||
"id": "skill_2",
|
||||
"patterns": [
|
||||
"ni neng gan"
|
||||
],
|
||||
"answer": "我可以陪你聊天,也能帮你干活"
|
||||
},
|
||||
{
|
||||
"id": "skill_3",
|
||||
"patterns": [
|
||||
"ni hui gan"
|
||||
],
|
||||
"answer": "我可以陪你聊天,你也可以发布具体的指令让我干活"
|
||||
}
|
||||
]
|
||||
}
|
||||
596
config/speakers.json
Normal file
596
config/speakers.json
Normal file
@@ -0,0 +1,596 @@
|
||||
{
|
||||
"user_1769589229": {
|
||||
"embedding": [
|
||||
0.018443606793880463,
|
||||
0.12385621666908264,
|
||||
0.42172902822494507,
|
||||
1.3724409341812134,
|
||||
-0.4492957293987274,
|
||||
-0.6218937635421753,
|
||||
-0.9678031802177429,
|
||||
0.678302526473999,
|
||||
1.744055151939392,
|
||||
-1.8670854568481445,
|
||||
-1.9064403772354126,
|
||||
0.5380862951278687,
|
||||
0.16627110540866852,
|
||||
-0.6322636008262634,
|
||||
-1.7715388536453247,
|
||||
-0.2003282904624939,
|
||||
-2.1722018718719482,
|
||||
0.5719940662384033,
|
||||
-0.6866416931152344,
|
||||
1.5751206874847412,
|
||||
0.27836838364601135,
|
||||
-0.03192685544490814,
|
||||
-0.486663818359375,
|
||||
1.6337751150131226,
|
||||
-1.0401458740234375,
|
||||
0.0581182986497879,
|
||||
0.9309709072113037,
|
||||
-0.00908487569540739,
|
||||
-0.05825135484337807,
|
||||
1.042805552482605,
|
||||
0.95391845703125,
|
||||
0.5708717107772827,
|
||||
-1.3427493572235107,
|
||||
-0.46104469895362854,
|
||||
-0.4387856423854828,
|
||||
-2.2000691890716553,
|
||||
-1.2598334550857544,
|
||||
-0.34516626596450806,
|
||||
-1.5205646753311157,
|
||||
-1.3810551166534424,
|
||||
-0.9685532450675964,
|
||||
0.33360639214515686,
|
||||
0.7115882039070129,
|
||||
-0.6262675523757935,
|
||||
-1.831620216369629,
|
||||
-1.0514777898788452,
|
||||
0.677291750907898,
|
||||
1.6341345310211182,
|
||||
1.0802626609802246,
|
||||
0.2750645875930786,
|
||||
2.517354726791382,
|
||||
-0.5022090077400208,
|
||||
-0.512808084487915,
|
||||
-1.0913103818893433,
|
||||
-0.5228419899940491,
|
||||
0.7334955334663391,
|
||||
-0.04904095083475113,
|
||||
0.5420397520065308,
|
||||
0.76543128490448,
|
||||
-0.28510582447052,
|
||||
-0.015149342827498913,
|
||||
-0.38553595542907715,
|
||||
-0.8873414993286133,
|
||||
-0.7940725684165955,
|
||||
2.0196990966796875,
|
||||
1.079050064086914,
|
||||
-0.3385912775993347,
|
||||
0.687140703201294,
|
||||
0.8218201994895935,
|
||||
-0.8151140809059143,
|
||||
-0.12016838788986206,
|
||||
-0.5360821485519409,
|
||||
1.5735585689544678,
|
||||
2.2081315517425537,
|
||||
-0.8545964956283569,
|
||||
-0.7184719443321228,
|
||||
1.0227694511413574,
|
||||
1.004757285118103,
|
||||
1.279994010925293,
|
||||
1.0615602731704712,
|
||||
-0.026518817991018295,
|
||||
-0.12089776247739792,
|
||||
1.9652493000030518,
|
||||
-2.219129800796509,
|
||||
1.3730603456497192,
|
||||
-0.2324638068675995,
|
||||
1.1085208654403687,
|
||||
0.38454243540763855,
|
||||
-0.7640709280967712,
|
||||
1.8690227270126343,
|
||||
-2.371783971786499,
|
||||
0.4353397786617279,
|
||||
0.6538525223731995,
|
||||
-1.0312976837158203,
|
||||
-0.06995117664337158,
|
||||
2.4163870811462402,
|
||||
0.16073228418827057,
|
||||
-0.6870989799499512,
|
||||
-1.6179540157318115,
|
||||
-1.3476271629333496,
|
||||
0.20239552855491638,
|
||||
-0.050261445343494415,
|
||||
-0.038828205317258835,
|
||||
0.4753866195678711,
|
||||
0.6126185059547424,
|
||||
0.8918412923812866,
|
||||
-0.3909176290035248,
|
||||
0.2147030234336853,
|
||||
0.39352068305015564,
|
||||
-0.6788452863693237,
|
||||
-2.1740481853485107,
|
||||
1.1571974754333496,
|
||||
-0.4064839482307434,
|
||||
1.2412688732147217,
|
||||
0.7256757616996765,
|
||||
1.7226027250289917,
|
||||
-0.0026558407116681337,
|
||||
-0.5800378918647766,
|
||||
-0.15300726890563965,
|
||||
-0.7650083899497986,
|
||||
-2.0132904052734375,
|
||||
-1.0595450401306152,
|
||||
-0.49976038932800293,
|
||||
0.9254617094993591,
|
||||
-1.2378792762756348,
|
||||
1.6656403541564941,
|
||||
-0.7135428786277771,
|
||||
-0.9382724761962891,
|
||||
0.9358375668525696,
|
||||
0.3685700595378876,
|
||||
-0.10180468112230301,
|
||||
-0.1037834882736206,
|
||||
-0.23670005798339844,
|
||||
1.75762140750885,
|
||||
-0.17887072265148163,
|
||||
0.046728529036045074,
|
||||
-0.8897371888160706,
|
||||
-1.3732428550720215,
|
||||
-1.258161187171936,
|
||||
-1.8424062728881836,
|
||||
-0.20653045177459717,
|
||||
1.2090659141540527,
|
||||
-2.8419432640075684,
|
||||
-0.21915671229362488,
|
||||
0.9777458310127258,
|
||||
-0.4830246567726135,
|
||||
-1.0184019804000854,
|
||||
-1.981907606124878,
|
||||
-0.9043097496032715,
|
||||
1.2316601276397705,
|
||||
0.4337644577026367,
|
||||
-1.4176150560379028,
|
||||
-0.0775287076830864,
|
||||
1.9701248407363892,
|
||||
-0.49479153752326965,
|
||||
-0.8893828988075256,
|
||||
-1.4819709062576294,
|
||||
1.7628812789916992,
|
||||
-1.1569868326187134,
|
||||
-0.5023629069328308,
|
||||
1.0665892362594604,
|
||||
0.380581796169281,
|
||||
0.8616085052490234,
|
||||
1.566547155380249,
|
||||
-0.08466020226478577,
|
||||
-6.428647611755878e-05,
|
||||
-0.4506562650203705,
|
||||
1.4498881101608276,
|
||||
-0.8292654752731323,
|
||||
-1.5012402534484863,
|
||||
-2.3441176414489746,
|
||||
-0.1354956328868866,
|
||||
0.9400366544723511,
|
||||
-2.566408157348633,
|
||||
-0.6355810761451721,
|
||||
0.6913732290267944,
|
||||
-1.6313157081604004,
|
||||
-0.7377245426177979,
|
||||
-0.6275296807289124,
|
||||
1.2654041051864624,
|
||||
-1.2346998453140259,
|
||||
-0.9682437181472778,
|
||||
1.750296950340271,
|
||||
0.145521342754364,
|
||||
0.3888598680496216,
|
||||
-0.10642947256565094,
|
||||
0.534409761428833,
|
||||
-0.07756417989730835,
|
||||
-0.36027759313583374,
|
||||
0.45393145084381104,
|
||||
0.48670390248298645,
|
||||
-0.41557130217552185
|
||||
],
|
||||
"env": "",
|
||||
"registered_at": 1769589229.8906083
|
||||
},
|
||||
"user_1769589397": {
|
||||
"embedding": [
|
||||
-0.4532654285430908,
|
||||
0.9910935163497925,
|
||||
0.7677441835403442,
|
||||
0.6021982431411743,
|
||||
-0.15526464581489563,
|
||||
0.07699152082204819,
|
||||
-0.20115968585014343,
|
||||
1.1546334028244019,
|
||||
1.3028098344802856,
|
||||
-1.102020263671875,
|
||||
-1.785357117652893,
|
||||
1.0002834796905518,
|
||||
0.29556989669799805,
|
||||
-1.1847732067108154,
|
||||
-1.6235555410385132,
|
||||
-0.37263453006744385,
|
||||
-1.0660096406936646,
|
||||
1.1186366081237793,
|
||||
-0.2739306390285492,
|
||||
1.2053704261779785,
|
||||
-0.4484007656574249,
|
||||
-0.036067165434360504,
|
||||
-0.22930052876472473,
|
||||
0.7094787359237671,
|
||||
-1.289236307144165,
|
||||
0.6730620265007019,
|
||||
0.139224573969841,
|
||||
0.9508735537528992,
|
||||
0.19451767206192017,
|
||||
0.09167198091745377,
|
||||
0.6681411266326904,
|
||||
0.5114644169807434,
|
||||
-0.41296282410621643,
|
||||
-0.3286001980304718,
|
||||
-0.13978855311870575,
|
||||
-1.4886829853057861,
|
||||
-1.125450849533081,
|
||||
-0.5365853309631348,
|
||||
-1.491755723953247,
|
||||
-0.9122400879859924,
|
||||
-0.336325466632843,
|
||||
0.4180590510368347,
|
||||
0.28993961215019226,
|
||||
-0.18810254335403442,
|
||||
-0.8575659990310669,
|
||||
-0.7043600082397461,
|
||||
0.1335042417049408,
|
||||
0.7772237658500671,
|
||||
0.5636520385742188,
|
||||
-0.7948008179664612,
|
||||
1.7150989770889282,
|
||||
-0.13010169565677643,
|
||||
-0.17901964485645294,
|
||||
0.049516208469867706,
|
||||
-0.3525894284248352,
|
||||
0.47636479139328003,
|
||||
0.4723852276802063,
|
||||
0.21579991281032562,
|
||||
0.4706135094165802,
|
||||
-0.7862219214439392,
|
||||
0.3285289406776428,
|
||||
0.06317808479070663,
|
||||
-0.44086384773254395,
|
||||
-0.48760634660720825,
|
||||
0.5548083782196045,
|
||||
0.9824976921081543,
|
||||
0.002366408007219434,
|
||||
0.9341856837272644,
|
||||
0.7644594311714172,
|
||||
-0.4781777560710907,
|
||||
0.140120267868042,
|
||||
-0.27633413672447205,
|
||||
0.2346642166376114,
|
||||
1.050230860710144,
|
||||
-1.269995927810669,
|
||||
-0.05720380321145058,
|
||||
1.291229248046875,
|
||||
0.9839679002761841,
|
||||
0.8129491209983826,
|
||||
1.5021783113479614,
|
||||
-0.3042735457420349,
|
||||
-0.5572257041931152,
|
||||
0.9156222343444824,
|
||||
-1.9603447914123535,
|
||||
0.43610018491744995,
|
||||
0.4057847559452057,
|
||||
0.7319568395614624,
|
||||
0.20832139253616333,
|
||||
-0.3430367410182953,
|
||||
1.1169347763061523,
|
||||
-1.3572204113006592,
|
||||
-0.338941365480423,
|
||||
0.68513023853302,
|
||||
-0.5876723527908325,
|
||||
0.028429267928004265,
|
||||
1.647197961807251,
|
||||
0.16790558397769928,
|
||||
-0.39321064949035645,
|
||||
-0.6376479864120483,
|
||||
-0.8013231754302979,
|
||||
0.2443818897008896,
|
||||
-0.4631305932998657,
|
||||
0.22423194348812103,
|
||||
1.2424927949905396,
|
||||
-0.29924842715263367,
|
||||
0.8623120784759521,
|
||||
-0.1876244992017746,
|
||||
0.4357032775878906,
|
||||
-0.1294589787721634,
|
||||
-0.6075098514556885,
|
||||
-0.13139747083187103,
|
||||
0.7296662330627441,
|
||||
-0.535290539264679,
|
||||
0.36691513657569885,
|
||||
0.7906659841537476,
|
||||
1.353682279586792,
|
||||
-0.09513506293296814,
|
||||
-0.25815069675445557,
|
||||
0.49696165323257446,
|
||||
-0.8457471132278442,
|
||||
-1.6415969133377075,
|
||||
-1.4221503734588623,
|
||||
-0.8390084505081177,
|
||||
0.78926020860672,
|
||||
-0.6399183869361877,
|
||||
1.2397722005844116,
|
||||
-0.4215489625930786,
|
||||
-1.6843048334121704,
|
||||
0.2801710367202759,
|
||||
0.14025956392288208,
|
||||
-0.07066306471824646,
|
||||
0.6200811862945557,
|
||||
0.06813270598649979,
|
||||
1.0460718870162964,
|
||||
-0.10868484526872635,
|
||||
-0.4543164074420929,
|
||||
-0.2009115219116211,
|
||||
-1.5997940301895142,
|
||||
-0.901277482509613,
|
||||
-0.6989807486534119,
|
||||
-0.6416334509849548,
|
||||
0.6334083676338196,
|
||||
-1.9596667289733887,
|
||||
0.5712984204292297,
|
||||
0.46919143199920654,
|
||||
-0.29728618264198303,
|
||||
-1.1560853719711304,
|
||||
-1.0001498460769653,
|
||||
-0.514187753200531,
|
||||
0.5281404256820679,
|
||||
-0.30581149458885193,
|
||||
-0.509894073009491,
|
||||
-0.5975268483161926,
|
||||
1.3572251796722412,
|
||||
-0.6662765145301819,
|
||||
-0.42911258339881897,
|
||||
-1.1632274389266968,
|
||||
1.3836815357208252,
|
||||
-0.3148840367794037,
|
||||
-0.4249371290206909,
|
||||
0.7550786733627319,
|
||||
-0.05023616552352905,
|
||||
0.4652675986289978,
|
||||
0.5009594559669495,
|
||||
-0.539340615272522,
|
||||
0.5251657366752625,
|
||||
-0.3844148814678192,
|
||||
1.1907575130462646,
|
||||
-0.05959271639585495,
|
||||
-1.3751143217086792,
|
||||
-1.4880049228668213,
|
||||
0.07974031567573547,
|
||||
1.0876556634902954,
|
||||
-1.8819210529327393,
|
||||
-0.33337870240211487,
|
||||
0.8860157132148743,
|
||||
-0.7781083583831787,
|
||||
-0.18586120009422302,
|
||||
0.36383724212646484,
|
||||
-0.05233919247984886,
|
||||
-1.4240131378173828,
|
||||
-0.6472991704940796,
|
||||
0.9354408383369446,
|
||||
-0.22309261560440063,
|
||||
0.8367215991020203,
|
||||
-0.20836658775806427,
|
||||
0.7580796480178833,
|
||||
-0.06159410998225212,
|
||||
-0.1761341243982315,
|
||||
-0.4837302267551422,
|
||||
-0.1933494508266449,
|
||||
-0.23003722727298737
|
||||
],
|
||||
"env": "",
|
||||
"registered_at": 1769589397.5840247
|
||||
},
|
||||
"user_1769589494": {
|
||||
"embedding": [
|
||||
0.23541471362113953,
|
||||
0.667961597442627,
|
||||
0.38707974553108215,
|
||||
0.6673084497451782,
|
||||
-1.869005560874939,
|
||||
-0.4901138246059418,
|
||||
-0.9352726936340332,
|
||||
0.49656397104263306,
|
||||
0.004735413007438183,
|
||||
1.1503483057022095,
|
||||
-0.7223904728889465,
|
||||
1.1780078411102295,
|
||||
-1.1934415102005005,
|
||||
0.5933876633644104,
|
||||
-0.047901105135679245,
|
||||
-0.6350924372673035,
|
||||
0.9101377725601196,
|
||||
0.9945328235626221,
|
||||
-0.6955628395080566,
|
||||
-1.4766680002212524,
|
||||
0.14297445118427277,
|
||||
1.0183905363082886,
|
||||
-0.5544767379760742,
|
||||
0.7108471989631653,
|
||||
0.12324491143226624,
|
||||
0.8664625287055969,
|
||||
-1.0339009761810303,
|
||||
0.6388123035430908,
|
||||
-0.3606623709201813,
|
||||
1.1092636585235596,
|
||||
-0.2134912759065628,
|
||||
-1.0129042863845825,
|
||||
1.1676888465881348,
|
||||
-0.25849631428718567,
|
||||
0.21622547507286072,
|
||||
-0.21850265562534332,
|
||||
-2.146343469619751,
|
||||
0.9746832251548767,
|
||||
-1.0417606830596924,
|
||||
-1.118934988975525,
|
||||
0.45158135890960693,
|
||||
-0.12440077215433121,
|
||||
0.9278182983398438,
|
||||
0.673552393913269,
|
||||
-1.4133691787719727,
|
||||
-0.9833011031150818,
|
||||
1.7980570793151855,
|
||||
1.1249372959136963,
|
||||
0.6850293278694153,
|
||||
-0.4094180762767792,
|
||||
1.3220067024230957,
|
||||
-0.5562354922294617,
|
||||
0.35797858238220215,
|
||||
0.7082096338272095,
|
||||
0.38267695903778076,
|
||||
-0.3067215085029602,
|
||||
-0.12430296093225479,
|
||||
-1.3622304201126099,
|
||||
-1.2127659320831299,
|
||||
-0.14369715750217438,
|
||||
0.744861900806427,
|
||||
0.35735955834388733,
|
||||
0.30824899673461914,
|
||||
-0.3879246413707733,
|
||||
0.332281231880188,
|
||||
0.31966903805732727,
|
||||
-0.014374539256095886,
|
||||
0.37477824091911316,
|
||||
1.2712546586990356,
|
||||
-0.1365097314119339,
|
||||
0.5229204893112183,
|
||||
0.47963225841522217,
|
||||
0.8237362504005432,
|
||||
0.7043209671974182,
|
||||
-1.673892855644226,
|
||||
0.13583803176879883,
|
||||
0.5652695298194885,
|
||||
0.40299320220947266,
|
||||
0.08790996670722961,
|
||||
0.2492693066596985,
|
||||
-0.4379039406776428,
|
||||
-1.14923894405365,
|
||||
-0.5844811797142029,
|
||||
-1.132568359375,
|
||||
0.49928411841392517,
|
||||
-0.4650140404701233,
|
||||
1.1566886901855469,
|
||||
-0.07155625522136688,
|
||||
0.36949872970581055,
|
||||
0.31576940417289734,
|
||||
-0.4941798746585846,
|
||||
0.8808521628379822,
|
||||
0.12892158329486847,
|
||||
-0.3473222255706787,
|
||||
-0.1342766135931015,
|
||||
0.6350370645523071,
|
||||
-1.524943470954895,
|
||||
0.11389171332120895,
|
||||
-0.14301487803459167,
|
||||
-1.9267250299453735,
|
||||
-1.5791492462158203,
|
||||
-0.19560043513774872,
|
||||
1.5311495065689087,
|
||||
1.9668593406677246,
|
||||
-0.964552104473114,
|
||||
-1.3139442205429077,
|
||||
-0.9792137145996094,
|
||||
0.4413124918937683,
|
||||
-0.18592560291290283,
|
||||
-0.5387620329856873,
|
||||
-0.7066377997398376,
|
||||
0.9972496032714844,
|
||||
-0.12376223504543304,
|
||||
-0.6737706661224365,
|
||||
0.7983350157737732,
|
||||
0.5444274544715881,
|
||||
-1.3038272857666016,
|
||||
1.101620078086853,
|
||||
-1.5507662296295166,
|
||||
0.02854086272418499,
|
||||
-0.6057300567626953,
|
||||
-0.782597005367279,
|
||||
0.3482932448387146,
|
||||
-0.055229704827070236,
|
||||
0.38987356424331665,
|
||||
-0.35090646147727966,
|
||||
-0.190815731883049,
|
||||
-0.5883421301841736,
|
||||
0.6471948027610779,
|
||||
0.5951821804046631,
|
||||
0.4943574070930481,
|
||||
-0.1316496580839157,
|
||||
-0.8007314205169678,
|
||||
-0.13866537809371948,
|
||||
-0.012848706915974617,
|
||||
1.1189842224121094,
|
||||
-1.1396784782409668,
|
||||
-0.33659735321998596,
|
||||
-0.27989667654037476,
|
||||
0.15101654827594757,
|
||||
-0.44554460048675537,
|
||||
0.4468748867511749,
|
||||
0.4023851454257965,
|
||||
-0.37321993708610535,
|
||||
-0.4136735200881958,
|
||||
-0.22391735017299652,
|
||||
-0.3109915256500244,
|
||||
0.9604361057281494,
|
||||
-0.6297188401222229,
|
||||
1.3016139268875122,
|
||||
0.36373990774154663,
|
||||
-1.05316162109375,
|
||||
0.41111207008361816,
|
||||
1.8767585754394531,
|
||||
-0.754970133304596,
|
||||
0.16698729991912842,
|
||||
-0.2632003128528595,
|
||||
-0.4256270229816437,
|
||||
1.7379480600357056,
|
||||
1.2178281545639038,
|
||||
-0.0028167024720460176,
|
||||
0.42778730392456055,
|
||||
-0.12732906639575958,
|
||||
-0.3295230567455292,
|
||||
0.36760953068733215,
|
||||
0.057388786226511,
|
||||
-0.4098236858844757,
|
||||
0.9829326868057251,
|
||||
-0.34538817405700684,
|
||||
-1.3545023202896118,
|
||||
-0.4676443040370941,
|
||||
0.7782469987869263,
|
||||
0.14342212677001953,
|
||||
-1.7002856731414795,
|
||||
0.4266798794269562,
|
||||
-0.33054685592651367,
|
||||
0.9089714884757996,
|
||||
0.5873302221298218,
|
||||
-0.9908685088157654,
|
||||
-0.6938693523406982,
|
||||
-1.5290637016296387,
|
||||
-0.0892898365855217,
|
||||
0.5326513648033142,
|
||||
-0.07912395894527435,
|
||||
0.4673354923725128,
|
||||
-1.0052272081375122,
|
||||
0.13853217661380768,
|
||||
-0.08604929596185684,
|
||||
0.3112524449825287,
|
||||
-1.377512812614441,
|
||||
-0.05614912137389183,
|
||||
0.2633572220802307
|
||||
],
|
||||
"env": "",
|
||||
"registered_at": 1769589494.0118024
|
||||
}
|
||||
}
|
||||
67
config/voice.yaml
Normal file
67
config/voice.yaml
Normal file
@@ -0,0 +1,67 @@
|
||||
# ROS 语音包配置文件
|
||||
|
||||
dashscope:
|
||||
api_key: "YOUR_DASHSCOPE_API_KEY"  # SECURITY(review): a live-looking DashScope key was committed here — revoke it immediately and supply the key via an environment variable or an untracked local config instead of versioning it
|
||||
asr:
|
||||
model: "qwen3-asr-flash-realtime"
|
||||
url: "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||
llm:
|
||||
model: "qwen3-vl-flash"
|
||||
base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
temperature: 0.7
|
||||
max_tokens: 4096
|
||||
max_history: 10
|
||||
summary_trigger: 3
|
||||
tts:
|
||||
model: "cosyvoice-v3-flash"
|
||||
voice: "longanyang"
|
||||
|
||||
audio:
|
||||
microphone:
|
||||
device_index: -1 # 使用系统默认输入设备
|
||||
sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz,避免重采样可能导致的问题
|
||||
channels: 1 # 输入声道数:单声道(MONO,适合语音采集)
|
||||
chunk: 1024
|
||||
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
|
||||
soundcard:
|
||||
card_index: -1 # 使用默认声卡
|
||||
device_index: -1 # 使用默认输出设备
|
||||
sample_rate: 48000 # 输出采样率:默认 44100
|
||||
channels: 2 # 输出声道数:立体声(2声道,FL+FR)
|
||||
volume: 1.0 # 音量比例(0.0-1.0,0.2表示20%音量)
|
||||
tts:
|
||||
source_sample_rate: 22050 # TTS服务固定输出采样率(DashScope服务固定值,不可修改)
|
||||
source_channels: 1 # TTS服务固定输出声道数(DashScope服务固定值,不可修改)
|
||||
ffmpeg_thread_queue_size: 4096 # ffmpeg输入线程队列大小(增大以减少卡顿)
|
||||
force_stop_delay: 0.1 # 强制停止时的延迟(秒)
|
||||
cleanup_timeout: 30.0 # 清理超时(秒)
|
||||
terminate_timeout: 1.0 # 终止超时(秒)
|
||||
interrupt_wait: 0.1 # 中断等待时间(秒)
|
||||
|
||||
vad:
|
||||
vad_mode: 3 # VAD模式:0-3,3最严格
|
||||
silence_duration_ms: 1000 # 静音持续时长(毫秒)
|
||||
min_energy_threshold: 300 # 最小能量阈值
|
||||
|
||||
system:
|
||||
use_wake_word: true # 是否启用唤醒词检测
|
||||
wake_word: "er gou" # 唤醒词(拼音)
|
||||
session_timeout: 3.0 # 会话超时时间(秒)
|
||||
shutup_keywords: "bi zui" # 闭嘴指令关键词(拼音,逗号分隔)
|
||||
interrupt_command_queue_depth: 10 # 中断命令订阅的队列深度(QoS)
|
||||
sv_enabled: false # 是否启用声纹识别
|
||||
# sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
|
||||
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
|
||||
sv_threshold: 0.65 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
|
||||
# sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json" # 声纹数据库保存路径(JSON格式,相对于ROS2包share目录)
|
||||
sv_speaker_db_path: "~/ros_learn/hivecore_robot_voice/config/speakers.json" # 声纹数据库保存路径(JSON格式,相对于ROS2包share目录)
|
||||
sv_buffer_size: 96000 # 声纹验证录音缓冲区大小(样本数,48kHz下2秒=96000)
|
||||
continue_without_image: true # 多模态意图(skill_sequence/chat_camera)未获取到图片时是否继续推理
|
||||
|
||||
camera:
|
||||
image:
|
||||
jpeg_quality: 85 # JPEG压缩质量(0-100,85是质量和大小平衡点)
|
||||
|
||||
interfaces:
|
||||
# root_path: "~/hivecore_robot_os1/hivecore_robot_interfaces/src" # 接口文件根目录,支持 ~ 展开和相对路径
|
||||
root_path: "~/ros_learn/hivecore_robot_interfaces/src" # 接口文件根目录,支持 ~ 展开和相对路径
|
||||
54
launch/register_speaker.launch.py
Normal file
54
launch/register_speaker.launch.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import SetEnvironmentVariable, RegisterEventHandler
from launch.event_handlers import OnProcessExit
from launch.actions import EmitEvent
from launch.events import Shutdown
import os


def generate_launch_description():
    """Launch the voiceprint registration node alongside the ASR service it requires."""
    # Install prefix of the interfaces package (custom message/service types).
    interfaces_install = os.path.expanduser('~/ros_learn/hivecore_robot_interfaces/install')

    # Extend AMENT_PREFIX_PATH so the interfaces package can be resolved.
    prefix_path = os.environ.get('AMENT_PREFIX_PATH', '')
    if interfaces_install not in prefix_path:
        prefix_path = f'{prefix_path}:{interfaces_install}' if prefix_path else interfaces_install

    # ASR + audio input node (provides the ASR and AudioData services).
    asr_node = Node(
        package='robot_speaker',
        executable='asr_audio_node',
        name='asr_audio_node',
        output='screen',
    )

    # Voiceprint registration node.
    register_node = Node(
        package='robot_speaker',
        executable='register_speaker_node',
        name='register_speaker_node',
        output='screen',
    )

    # Shut the whole launch down once registration finishes.
    shutdown_on_exit = RegisterEventHandler(
        OnProcessExit(
            target_action=register_node,
            on_exit=[
                EmitEvent(event=Shutdown(reason='注册完成,关闭所有节点'))
            ],
        )
    )

    return LaunchDescription([
        SetEnvironmentVariable('AMENT_PREFIX_PATH', prefix_path),
        asr_node,
        register_node,
        shutdown_on_exit,
    ])
|
||||
46
launch/voice.launch.py
Normal file
46
launch/voice.launch.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import SetEnvironmentVariable
import os


def generate_launch_description():
    """Launch the voice-interaction nodes; all parameters are read from voice.yaml."""
    # Install prefix of the interfaces package (custom message/service types).
    interfaces_install = os.path.expanduser('~/ros_learn/hivecore_robot_interfaces/install')

    # Extend AMENT_PREFIX_PATH so the interfaces package can be resolved.
    prefix_path = os.environ.get('AMENT_PREFIX_PATH', '')
    if interfaces_install not in prefix_path:
        prefix_path = ':'.join(p for p in (prefix_path, interfaces_install) if p)

    def speaker_node(executable):
        # All three nodes share the same package, node name and screen output.
        return Node(
            package='robot_speaker',
            executable=executable,
            name=executable,
            output='screen',
        )

    return LaunchDescription([
        SetEnvironmentVariable('AMENT_PREFIX_PATH', prefix_path),
        # ASR + audio input node (also serves VAD events from the cloud ASR).
        speaker_node('asr_audio_node'),
        # TTS + audio output node.
        speaker_node('tts_audio_node'),
        # Main business-logic node.
        speaker_node('robot_speaker_node'),
    ])
|
||||
|
||||
|
||||
|
||||
21
package.xml
21
package.xml
@@ -2,13 +2,26 @@
|
||||
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
|
||||
<package format="3">
|
||||
<name>robot_speaker</name>
|
||||
<version>0.0.0</version>
|
||||
<description>TODO: Package description</description>
|
||||
<version>0.0.1</version>
|
||||
<description>语音识别和合成ROS2包</description>
|
||||
<maintainer email="mzebra@foxmail.com">mzebra</maintainer>
|
||||
<license>Apache-2.0</license>
|
||||
|
||||
<depend>rclpy</depend>
|
||||
<depend>example_interfaces</depend>
|
||||
<depend>std_msgs</depend>
|
||||
<depend>sensor_msgs</depend>
|
||||
<depend>cv_bridge</depend>
|
||||
<depend>ament_index_python</depend>
|
||||
<depend>interfaces</depend>
|
||||
<buildtool_depend>ament_cmake</buildtool_depend>
|
||||
<buildtool_depend>ament_cmake_python</buildtool_depend>
|
||||
|
||||
<exec_depend>python3-pyaudio</exec_depend>
|
||||
<exec_depend>python3-requests</exec_depend>
|
||||
<exec_depend>python3-edge-tts</exec_depend>
|
||||
<exec_depend>python3-webrtcvad</exec_depend>
|
||||
<exec_depend>python3-yaml</exec_depend>
|
||||
<exec_depend>python3-pypinyin</exec_depend>
|
||||
|
||||
<test_depend>ament_copyright</test_depend>
|
||||
<test_depend>ament_flake8</test_depend>
|
||||
@@ -16,6 +29,6 @@
|
||||
<test_depend>python3-pytest</test_depend>
|
||||
|
||||
<export>
|
||||
<build_type>ament_python</build_type>
|
||||
<build_type>ament_cmake</build_type>
|
||||
</export>
|
||||
</package>
|
||||
|
||||
9
requirements.txt
Normal file
9
requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
dashscope>=1.20.0
|
||||
openai>=1.0.0
|
||||
pyaudio>=0.2.11
|
||||
pypinyin>=0.49.0
|
||||
rclpy>=3.0.0
|
||||
Pillow>=10.0.0
|
||||
numpy>=1.24.0,<2.0.0 # cv_bridge需要NumPy 1.x,NumPy 2.x会导致段错误
|
||||
PyYAML>=6.0
|
||||
funasr>=1.0.0
|
||||
@@ -0,0 +1,6 @@
|
||||
# robot_speaker package
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
24
robot_speaker/bridge/__init__.py
Normal file
24
robot_speaker/bridge/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# Bridge package for connecting LLM outputs to brain execution.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
239
robot_speaker/bridge/skill_bridge_node.py
Normal file
239
robot_speaker/bridge/skill_bridge_node.py
Normal file
@@ -0,0 +1,239 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
桥接LLM技能序列到小脑ExecuteBtAction,并转发反馈/结果。
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import rclpy
|
||||
from rclpy.node import Node
|
||||
from rclpy.action import ActionClient
|
||||
from std_msgs.msg import String
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
from interfaces.action import ExecuteBtAction
|
||||
from interfaces.srv import BtRebuild
|
||||
|
||||
|
||||
class SkillBridgeNode(Node):
|
||||
def __init__(self):
    """Wire up the cerebrum action/service clients and the LLM-facing topics."""
    super().__init__('skill_bridge_node')

    # Cerebrum interfaces.
    self._action_client = ActionClient(self, ExecuteBtAction, '/execute_bt_action')
    self.run_trigger_ = self.create_client(BtRebuild, '/cerebrum/rebuild_now')

    self._current_epoch = 1
    self.rebuild_requests = 0
    self._allowed_skills = self._load_allowed_skills()

    # LLM-facing topics: sequences in, feedback/results out.
    self.skill_seq_sub = self.create_subscription(
        String, '/llm_skill_sequence', self._on_skill_sequence_received, 10
    )
    self.feedback_pub = self.create_publisher(String, '/skill_execution_feedback', 10)
    self.result_pub = self.create_publisher(String, '/skill_execution_result', 10)

    self.get_logger().info('SkillBridgeNode started')
|
||||
|
||||
def _on_skill_sequence_received(self, msg: String):
    """Validate an LLM skill sequence from /llm_skill_sequence and forward it.

    The payload must be a JSON document accepted by ``_parse_json_sequence``.
    Sequences containing vision/arm/gripper skills trigger the predefined
    dual-arm grasp tree; anything else is forwarded as a "Remote" rebuild
    with the parameter dicts flattened into "key: value" lines.
    """
    raw = (msg.data or "").strip()
    if not raw:
        return
    if not self._allowed_skills:
        self.get_logger().warning("No skill whitelist loaded; reject all sequences")
        return

    # Parse and validate the JSON payload.
    sequence_list = None
    try:
        data = json.loads(raw)
        sequence_list = self._parse_json_sequence(data)
        if sequence_list is None:
            self.get_logger().error("Invalid skill sequence format; must be JSON or plain text")
            return
    except (json.JSONDecodeError, ValueError) as e:
        self.get_logger().debug(f"JSON解析失败,尝试文本解析: {e}")

    # BUG FIX: when json.loads() raised, sequence_list stayed None and the
    # code below crashed with a TypeError that was logged as a generic
    # processing error. Plain-text parsing is not implemented, so bail out
    # explicitly instead of falling through.
    if sequence_list is None:
        return

    try:
        skill_names = [item["skill"] for item in sequence_list]
        if any(skill in skill_names for skill in ["VisionObjectRecognition", "Arm", "GripperCmd0"]):
            # Vision/arm/gripper skills use the predefined dual-arm grasp tree.
            self.get_logger().info(f"Skill sequence contains special skills, triggering rebuild: {skill_names}")
            self.rebuild_now("Trigger", "bt_vision_grasp_dual_arm", "")
        else:
            # Flatten each skill's parameter dict into "key: value" lines.
            skill_params = []
            for item in sequence_list:
                p = item.get("parameters")
                params = ""
                if isinstance(p, dict):
                    lines = [f"{k}: {v}" for k, v in p.items()]
                    if lines:
                        params = "\n".join(lines) + "\n"
                skill_params.append(params)

            self.get_logger().info(f"Sending skill sequence: {skill_names}")
            self.get_logger().info(f"Sending skill parameters: {skill_params}")

            names_str = ", ".join(skill_names)
            params_str = ", ".join(skill_params)
            self.rebuild_now("Remote", names_str, params_str)
    except Exception as e:
        self.get_logger().error(f"Error processing skill sequence: {e}")
|
||||
|
||||
def _load_allowed_skills(self) -> set[str]:
    """Load the skill whitelist from the brain package's robot_skills.yaml.

    Returns an empty set when the file is missing or unreadable.
    """
    try:
        brain_share = get_package_share_directory("brain")
        skill_path = os.path.join(brain_share, "config", "robot_skills.yaml")
        if not os.path.exists(skill_path):
            return set()
        import yaml
        with open(skill_path, "r", encoding="utf-8") as fh:
            entries = yaml.safe_load(fh) or []
        names: set[str] = set()
        for entry in entries:
            if isinstance(entry, dict) and entry.get("name"):
                names.add(str(entry["name"]))
        return names
    except Exception as e:
        self.get_logger().warning(f"Load skills failed: {e}")
        return set()
|
||||
|
||||
def _extract_skill_sequence(self, text: str) -> tuple[str, list[str]]:
    """Split free text on commas/whitespace/semicolons and keep CamelCase tokens.

    Returns (comma-joined skill names, names not in the whitelist).
    """
    candidates = [
        tok for tok in re.split(r'[,\s;]+', text.strip())
        if re.match(r'^[A-Z][A-Za-z0-9]*$', tok)
    ]
    if not candidates:
        return "", []
    unknown = [name for name in candidates if name not in self._allowed_skills]
    return ",".join(candidates), unknown
|
||||
|
||||
def _parse_json_sequence(self, data: dict) -> list[dict] | None:
    """Validate a JSON skill sequence; returns normalized entries or None.

    Each kept entry has skill (whitelisted), execution ("serial"/"parallel",
    defaulting to "serial"), body_id (0/1/2/None, matching intent routing)
    and the raw parameters value.
    """
    if not isinstance(data, dict) or "sequence" not in data:
        return None
    seq = data["sequence"]
    if not isinstance(seq, list):
        return None

    normalized: list[dict] = []
    for entry in seq:
        if not isinstance(entry, dict):
            continue

        skill = entry.get("skill")
        if not skill or skill not in self._allowed_skills:
            continue

        mode = entry.get("execution", "serial")
        if mode not in ("serial", "parallel"):
            mode = "serial"

        body = entry.get("body_id")
        # Only numeric ids (0, 1, 2) and null are supported.
        if body not in (0, 1, 2, None):
            body = None

        normalized.append({
            "skill": skill,
            "execution": mode,
            "body_id": body,
            "parameters": entry.get("parameters"),
        })

    return normalized if normalized else None
|
||||
|
||||
def _send_skill_sequence(self, skill_sequence: str):
    """Send a skill sequence via ExecuteBtAction and wait for the result.

    Blocks (spinning this node) until the goal is accepted and finished;
    feedback is forwarded by ``_feedback_callback``.
    """
    if not self._action_client.wait_for_server(timeout_sec=2.0):
        self.get_logger().error('ExecuteBtAction server unavailable')
        return

    goal = ExecuteBtAction.Goal()
    goal.epoch = self._current_epoch
    self._current_epoch += 1
    goal.action_name = skill_sequence
    goal.calls = []

    self.get_logger().info(f"Dispatch skill sequence: {skill_sequence}")
    pending = self._action_client.send_goal_async(goal, feedback_callback=self._feedback_callback)
    rclpy.spin_until_future_complete(self, pending, timeout_sec=5.0)
    if not pending.done():
        self.get_logger().warning("Send goal timed out")
        return

    handle = pending.result()
    if not handle or not handle.accepted:
        self.get_logger().error("Goal rejected")
        return

    result_future = handle.get_result_async()
    rclpy.spin_until_future_complete(self, result_future)
    if result_future.done():
        self._handle_result(result_future.result())
|
||||
|
||||
def _feedback_callback(self, feedback_msg):
    """Republish action feedback as JSON on /skill_execution_feedback."""
    fb = feedback_msg.feedback
    out = String()
    out.data = json.dumps(
        {
            "stage": fb.stage,
            "current_skill": fb.current_skill,
            "progress": float(fb.progress),
            "detail": fb.detail,
            "epoch": int(fb.epoch),
        },
        ensure_ascii=True,
    )
    self.feedback_pub.publish(out)
|
||||
|
||||
def _handle_result(self, result_wrapper):
    """Republish the final action result as JSON on /skill_execution_result."""
    result = result_wrapper.result
    if not result:
        return
    out = String()
    out.data = json.dumps(
        {
            "success": bool(result.success),
            "message": result.message,
            "total_skills": int(result.total_skills),
            "succeeded_skills": int(result.succeeded_skills),
        },
        ensure_ascii=True,
    )
    self.result_pub.publish(out)
|
||||
|
||||
def rebuild_now(self, type: str, config: str, param: str) -> None:
    """Request an async behavior-tree rebuild via /cerebrum/rebuild_now.

    `type` selects the rebuild kind (e.g. "Trigger", "Remote"); `config`
    and `param` are passed through to the service verbatim.
    """
    if not self.run_trigger_.service_is_ready():
        self.get_logger().error('Rebuild service not ready')
        return

    self.rebuild_requests += 1
    self.get_logger().info(f'Rebuild BehaviorTree now. Total requests: {self.rebuild_requests}')

    request = BtRebuild.Request()
    request.type = type
    request.config = config
    request.param = param

    self.get_logger().info(f'Calling rebuild service... request info: {request}')
    self.run_trigger_.call_async(request).add_done_callback(self._rebuild_done_callback)
|
||||
|
||||
def _rebuild_done_callback(self, future):
    """Log the outcome of an async BtRebuild service call."""
    try:
        response = future.result()
        if response.success:
            self.get_logger().info('Rebuild request successful')
        else:
            self.get_logger().warning(f'Rebuild request failed: {response.message}')
    except Exception as e:
        self.get_logger().error(f'Rebuild request exception: {str(e)}')

    self.get_logger().info(f"Rebuild requested. Total rebuild requests: {str(self.rebuild_requests)}")
|
||||
|
||||
|
||||
def main(args=None):
    """Entry point: spin the bridge node until ROS shuts down."""
    rclpy.init(args=args)
    bridge = SkillBridgeNode()
    rclpy.spin(bridge)
    bridge.destroy_node()
    rclpy.shutdown()


if __name__ == '__main__':
    main()
|
||||
|
||||
28
robot_speaker/core/__init__.py
Normal file
28
robot_speaker/core/__init__.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""核心模块"""
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
130
robot_speaker/core/context_manager.py
Normal file
130
robot_speaker/core/context_manager.py
Normal file
@@ -0,0 +1,130 @@
|
||||
"""
|
||||
对话历史管理模块
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
import threading
|
||||
|
||||
|
||||
@dataclass
class LLMMessage:
    """One chat message exchanged with the LLM."""
    role: str  # "user", "assistant" or "system"
    content: str


class ConversationHistory:
    """Conversation-history manager for realtime voice interaction.

    A user message is staged by start_turn() and only written to the
    history together with the assistant reply in commit_turn(), so an
    interrupted turn never pollutes the history. Once the history grows
    past ``summary_trigger`` turns, older messages are folded into a
    plain-text summary. All public methods are guarded by a single
    non-reentrant lock.
    """

    def __init__(self, max_history: int, summary_trigger: int):
        # Number of recent turns exposed by get_messages(); <= 0 disables history.
        self.max_history = max_history
        # Turn count beyond which older messages are compressed into `summary`.
        self.summary_trigger = summary_trigger
        self.conversation_history: list[LLMMessage] = []
        self.summary: str | None = None

        # Pending-confirmation mechanism: the user message of the turn in flight.
        self._pending_user_message: LLMMessage | None = None
        self._lock = threading.Lock()  # non-reentrant; never acquire twice

    def start_turn(self, user_content: str):
        """Stage the user message of a new turn; committed by commit_turn()."""
        with self._lock:
            self._pending_user_message = LLMMessage(role="user", content=user_content)

    def commit_turn(self, assistant_content: str) -> bool:
        """Finish the pending turn by appending user + assistant messages.

        Returns False (dropping any pending message) when no turn is
        pending or the assistant reply is blank.
        """
        with self._lock:
            if self._pending_user_message is None:
                return False

            if not assistant_content or not assistant_content.strip():
                self._pending_user_message = None
                return False

            self.conversation_history.append(self._pending_user_message)
            self.conversation_history.append(
                LLMMessage(role="assistant", content=assistant_content.strip())
            )
            self._pending_user_message = None

            self._maybe_compress()
            return True

    def cancel_turn(self):
        """Discard the staged user message (e.g. the turn was interrupted)."""
        with self._lock:
            self._pending_user_message = None

    def add_message(self, role: str, content: str):
        """Append a message directly, discarding any pending turn first.

        BUG FIX: this previously called cancel_turn() while already holding
        the non-reentrant lock, deadlocking on the nested acquire; the
        pending turn is now cleared inline.
        """
        with self._lock:
            self._pending_user_message = None
            self.conversation_history.append(LLMMessage(role=role, content=content))
            self._maybe_compress()

    def get_messages(self) -> list[LLMMessage]:
        """Return the summary (as a system message), recent history, and pending turn."""
        with self._lock:
            messages: list[LLMMessage] = []

            if self.summary:
                messages.append(LLMMessage(role="system", content=self.summary))

            if self.max_history > 0:
                # Two messages (user + assistant) per turn.
                messages.extend(self.conversation_history[-self.max_history * 2:])

            if self._pending_user_message is not None:
                messages.append(self._pending_user_message)

            return messages

    def has_pending_turn(self) -> bool:
        """True while a user message is staged but not yet committed."""
        with self._lock:
            return self._pending_user_message is not None

    def _maybe_compress(self):
        """Fold messages beyond summary_trigger turns into the text summary.

        Caller must already hold ``self._lock``.
        """
        if self.max_history <= 0:
            self.conversation_history.clear()
            return

        max_len = self.summary_trigger * 2
        if len(self.conversation_history) <= max_len:
            return

        old = self.conversation_history[:-max_len]
        self.conversation_history = self.conversation_history[-max_len:]

        summary_text = [f"{msg.role}: {msg.content}" for msg in old]

        # Only the last 10 evicted messages are kept in the summary text.
        compressed = "对话摘要:\n" + "\n".join(summary_text[-10:])

        if self.summary:
            self.summary += "\n" + compressed
        else:
            self.summary = compressed

    def clear(self):
        """Drop the history, the summary, and any pending message."""
        with self._lock:
            self.conversation_history.clear()
            self.summary = None
            self._pending_user_message = None
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
272
robot_speaker/core/intent_router.py
Normal file
272
robot_speaker/core/intent_router.py
Normal file
@@ -0,0 +1,272 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
import os
|
||||
import yaml
|
||||
import json
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
from pypinyin import pinyin, Style
|
||||
from robot_speaker.core.skill_interface_parser import SkillInterfaceParser
|
||||
|
||||
|
||||
@dataclass
class IntentResult:
    """Routing decision for a single user utterance."""
    intent: str  # "skill_sequence" | "kb_qa" | "chat_text" | "chat_camera"
    text: str  # the original utterance
    need_camera: bool  # whether a photo should be captured
    camera_mode: Optional[str]  # "top" | "left" | "right" | "hand_r" | None
    system_prompt: Optional[str]  # prompt for the LLM; None for kb_qa
|
||||
|
||||
|
||||
class IntentRouter:
|
||||
def __init__(self):
    """Build keyword tables (pinyin) and the skill-interface parser."""
    # Pinyin phrases that request a photo capture.
    self.camera_capture_keywords = [
        "pai zhao", "pai ge zhao", "pai zhang zhao"
    ]
    # Action verbs (pinyin) used to detect skill-sequence intent.
    self.action_verbs = [
        "zou", "zou liang bu", "zou ji bu",                      # walk / walk a few steps
        "na", "na qi", "na zhu",                                 # take / pick up / hold
        "ban", "ban yun",                                        # carry / transport
        "zhua", "zhua qu",                                       # grab / grasp
        "tui", "tui dong",                                       # push
        "la", "la dong",                                         # pull
        "yi dong", "qian jin", "hou tui",                        # move / forward / backward
        "kong zhi", "cao zuo",                                   # control / operate
        "fang xia", "fang zhi",                                  # put down / place
        "ju qi", "sheng qi",                                     # lift / raise
        "jia zhua", "jia qi", "jia",                             # gripper / clamp
        "shen you bi", "shen zuo bi", "shen chu", "shen shou",   # extend right/left arm, reach out
        "zhuan quan", "zhuan yi quan", "zhuan",                  # spin / turn around
    ]
    # Pinyin phrases answered straight from the local knowledge base.
    self.kb_keywords = [
        "ni shi shui", "ni de ming zi", "tiao ge wu", "ni jiao sha", "ni hui gan", "ni neng gan"
    ]
    self._cached_skill_names: list[str] | None = None
    self._cached_kb_data: list[dict] | None = None

    interfaces_root = self._get_interfaces_root()
    self.interface_parser = SkillInterfaceParser(interfaces_root)
|
||||
|
||||
def _get_interfaces_root(self) -> str:
    """Resolve the skill-interface root directory from config/voice.yaml.

    Expands `~`, resolves relative paths against the share directory's
    parent, and raises ValueError when the setting is missing or the
    directory does not exist.
    """
    try:
        share_dir = get_package_share_directory("robot_speaker")
        cfg_file = os.path.join(share_dir, "config", "voice.yaml")
        with open(cfg_file, "r", encoding="utf-8") as fh:
            cfg = yaml.safe_load(fh) or {}

        root_path = cfg.get("interfaces", {}).get("root_path", "")
        if not root_path:
            raise ValueError("interfaces.root_path 未在配置文件中配置")

        if root_path.startswith("~"):
            root_path = os.path.expanduser(root_path)

        if not os.path.isabs(root_path):
            root_path = os.path.join(os.path.dirname(share_dir), root_path)

        abs_path = os.path.abspath(root_path)
        if not os.path.exists(abs_path):
            raise ValueError(f"接口文件根目录不存在: {abs_path}")

        return abs_path
    except Exception as e:
        raise ValueError(f"读取接口文件根目录失败: {e}")
|
||||
|
||||
def _load_brain_skill_names(self) -> list[str]:
    """Return the skill names from the interface parser, cached after first use."""
    if self._cached_skill_names is None:
        self._cached_skill_names = self.interface_parser.get_skill_names()
    return self._cached_skill_names
|
||||
|
||||
def to_pinyin(self, text: str) -> str:
    """Convert the CJK characters of `text` to space-separated toneless pinyin."""
    han = ''.join(c for c in text if '\u4e00' <= c <= '\u9fa5')
    if not han:
        return ""
    syllables = pinyin(han, style=Style.NORMAL)
    return ' '.join(s[0] for s in syllables).lower().strip()
|
||||
|
||||
def is_skill_sequence_intent(self, text: str, text_pinyin: str | None = None) -> bool:
    """True when the utterance contains a whole-word action-verb phrase (pinyin)."""
    if text_pinyin is None:
        text_pinyin = self.to_pinyin(text)

    words = text_pinyin.split()
    for verb in self.action_verbs:
        verb_words = verb.split()
        span = len(verb_words)
        # Exact match: the verb's words must appear as a contiguous run.
        if any(words[i:i + span] == verb_words
               for i in range(len(words) - span + 1)):
            return True
    return False
|
||||
|
||||
|
||||
def check_camera_command(self, text: str, text_pinyin: str | None = None) -> tuple[bool, Optional[str]]:
    """Return (needs_camera, camera_mode) when the utterance asks for a photo."""
    if not text:
        return False, None
    if text_pinyin is None:
        text_pinyin = self.to_pinyin(text)
    # A capture keyword must appear as a phrase within the pinyin text.
    for keyword in self.camera_capture_keywords:
        if keyword in text_pinyin:
            return True, self.detect_camera_mode(text, text_pinyin)
    return False, None
|
||||
|
||||
def detect_camera_mode(self, text: str, text_pinyin: str | None = None) -> str:
    """Map the utterance to a camera position value: left/right/hand_r/top.

    Matches the position values used by the camera driver; defaults to "top".
    """
    if text_pinyin is None:
        text_pinyin = self.to_pinyin(text)

    # Checked in priority order; first group with a hit wins.
    keyword_groups = (
        ("left", ("zuo shou", "zuo bi", "zuo bian", "zuo shou bi")),
        ("right", ("you shou", "you bi", "you bian", "you shou bi")),
        ("hand_r", ("shou bu", "shou", "shou xiang ji", "shou bi xiang ji")),
        ("top", ("tou", "nao dai", "ding bu", "shang fang")),
    )
    for mode, keywords in keyword_groups:
        if any(kw in text_pinyin for kw in keywords):
            return mode
    return "top"
|
||||
|
||||
def build_skill_prompt(self, execution_status: Optional[str] = None) -> str:
    """Assemble the system prompt for skill-sequence planning.

    Combines the allowed-skill whitelist, the previous round's execution
    status (when given), the per-skill parameter documentation, and the
    required JSON output format.
    """
    skill_names = self._load_brain_skill_names()
    skills_text = ", ".join(skill_names) if skill_names else ""
    if skills_text:
        skill_guard = "【技能限制】只能使用以下技能名称:" + skills_text
    else:
        skill_guard = "【技能限制】技能列表不可用,请不要输出任何技能名称。"

    if execution_status:
        execution_hint = f"【上一轮执行状态】{execution_status}\n请参考上述执行状态,根据成功/失败信息调整本次技能序列。\n"
    else:
        execution_hint = "【注意】这是首次执行或没有上一轮执行状态,请根据当前图片和用户请求规划技能序列。\n"

    skill_params_doc = self.interface_parser.generate_params_documentation()

    return (
        "你是机器人任务规划器。\n"
        "本任务必须拍照。请根据用户请求选择使用哪个相机拍照,并结合当前环境信息生成简洁、可执行的技能序列。\n"
        "如果用户明确要求或者任务明显需要双手/双臂协作(如扶稳+操作、抓取大体积的物体),必须规划双手技能。\n"
        + execution_hint
        + "\n"
        "【规划要求】\n"
        "1. execution规划:判断技能之间的执行关系\n"
        " - serial(串行):技能必须按顺序执行,前一个完成后再执行下一个\n"
        " - parallel(并行):技能可以同时执行\n"
        "2. parameters规划:根据目标物距离和任务需求,规划具体参数值\n"
        " - parameters字典必须包含该技能接口文件目标字段的所有字段\n"
        "【输出格式要求】\n"
        "必须输出JSON格式,包含sequence数组。每个技能对象包含3个一级字段:\n"
        "1. skill: 技能名称(字符串)\n"
        "2. execution: 执行方式,serial(串行)或 parallel(并行)\n"
        "3. parameters: 参数字典,包含该技能接口文件目标字段的所有字段,并填入合理的预测值。如果技能无参数,使用null。\n"
        "\n"
        "注意:一级字段(skill, execution, parameters)是固定结构。\n"
        "\n"
        "【技能参数说明】\n"
        + skill_params_doc +
        "\n"
        "示例格式:\n"
        "{\n"
        ' "sequence": [\n'
        ' {"skill": "MoveWheel", "execution": "serial", "parameters": {"move_distance": 1.5, "move_angle": 0.0}},\n'
        ' {"skill": "Arm", "execution": "serial", "parameters": {"body_id": 0, "data_type": 1, "data_length": 6, "command_id": 0, "frame_time_stamp": 0, "data_array": [0.1, 0.2, 0.3, 0.0, 0.0, 0.0]}},\n'
        ' {"skill": "GripperCmd0", "execution": "parallel", "parameters": {"loc": 128, "speed": 100, "torque": 80, "mode": 1}}\n'
        " ]\n"
        "}\n"
        + skill_guard
    )
|
||||
|
||||
def build_chat_prompt(self, need_camera: bool) -> str:
    """System prompt for chat; vision-assistant wording when a photo is attached."""
    vision_prompt = (
        "你是一个机器人视觉助理,擅长分析图片中物体的相对位置和空间关系。\n"
        "请结合图片内容,重点描述物体之间的相对位置(如左右、前后、上下、远近),仅基于可观察信息回答。\n"
        "回答应简短、客观,不要超过100个token。"
    )
    text_prompt = (
        "你是一个表达清晰、语气自然的真人助理。\n"
        "请简短地与用户对话,不要超过100个token。"
    )
    return vision_prompt if need_camera else text_prompt
|
||||
|
||||
def _load_kb_data(self) -> list[dict]:
    """Load knowledge-base entries from config/knowledge.json (cached; [] on error)."""
    if self._cached_kb_data is not None:
        return self._cached_kb_data
    try:
        share_dir = get_package_share_directory("robot_speaker")
        kb_file = os.path.join(share_dir, "config", "knowledge.json")
        with open(kb_file, "r", encoding="utf-8") as fh:
            entries = json.load(fh)["entries"]
    except Exception:
        entries = []
    self._cached_kb_data = entries
    return entries
|
||||
|
||||
def search_kb(self, text: str) -> Optional[str]:
    """Look the utterance up in the knowledge base (pinyin substring match).

    Returns the first matching non-empty answer, or None when nothing matches.
    """
    if not text:
        return None
    text_pinyin = self.to_pinyin(text)

    for entry in self._load_kb_data():
        # Robustness fix: tolerate malformed entries missing "patterns" or
        # "answer" instead of raising KeyError (previously crashed routing).
        for pattern in entry.get("patterns") or []:
            if pattern in text_pinyin:
                answer = entry.get("answer")
                if answer:
                    return answer
    return None
|
||||
|
||||
def build_default_system_prompt(self) -> str:
    """Fallback system prompt: general factory-assistant persona."""
    return (
        "你是一个工厂专业的助手。\n"
        "- 当用户发送图片时,请仔细观察图片内容,结合用户的问题或描述,提供简短、专业的回答。\n"
        "- 当用户没有发送图片时,请自然、友好地与用户对话。\n"
        "请根据对话模式调整你的回答风格。"
    )
|
||||
|
||||
def route(self, text: str) -> IntentResult:
    """Classify an utterance as skill_sequence / kb_qa / chat_camera / chat_text."""
    text_pinyin = self.to_pinyin(text)
    need_camera, camera_mode = self.check_camera_command(text, text_pinyin)

    # Skill sequences always need a photo; honor an explicitly requested
    # camera via detect_camera_mode, otherwise it defaults to "top".
    if self.is_skill_sequence_intent(text, text_pinyin):
        return IntentResult(
            intent="skill_sequence",
            text=text,
            need_camera=True,
            camera_mode=self.detect_camera_mode(text, text_pinyin),
            system_prompt=self.build_skill_prompt(),
        )

    # Knowledge-base QA bypasses the LLM, so no system prompt is needed.
    if any(kw in text_pinyin for kw in self.kb_keywords):
        return IntentResult(
            intent="kb_qa",
            text=text,
            need_camera=False,
            camera_mode=None,
            system_prompt=None,
        )

    return IntentResult(
        intent="chat_camera" if need_camera else "chat_text",
        text=text,
        need_camera=need_camera,
        camera_mode=camera_mode,
        system_prompt=self.build_chat_prompt(need_camera),
    )
|
||||
|
||||
236
robot_speaker/core/register_speaker_node.py
Normal file
236
robot_speaker/core/register_speaker_node.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""声纹注册独立节点:运行完成后退出"""
|
||||
import os
|
||||
import time
|
||||
import yaml
|
||||
import numpy as np
|
||||
import threading
|
||||
import queue
|
||||
|
||||
import rclpy
|
||||
from rclpy.node import Node
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
from interfaces.srv import ASRRecognize, AudioData, VADEvent
|
||||
from robot_speaker.core.speaker_verifier import SpeakerVerificationClient
|
||||
from pypinyin import pinyin, Style
|
||||
|
||||
|
||||
class RegisterSpeakerNode(Node):
|
||||
def __init__(self):
    """Connect to ASR/audio/VAD services and start the registration workflow."""
    super().__init__('register_speaker_node')
    self._load_config()

    # Service clients provided by the asr_audio_node.
    self.asr_client = self.create_client(ASRRecognize, '/asr/recognize')
    self.audio_data_client = self.create_client(AudioData, '/asr/audio_data')
    self.vad_client = self.create_client(VADEvent, '/vad/event')

    self.get_logger().info('等待服务启动...')
    self.asr_client.wait_for_service(timeout_sec=10.0)
    self.audio_data_client.wait_for_service(timeout_sec=10.0)
    self.vad_client.wait_for_service(timeout_sec=10.0)
    self.get_logger().info('所有服务已就绪')

    # Speaker-verification backend that extracts and stores voiceprints.
    self.sv_client = SpeakerVerificationClient(
        model_path=self.sv_model_path,
        threshold=self.sv_threshold,
        speaker_db_path=self.sv_speaker_db_path,
        logger=self.get_logger()
    )

    self.registered = False
    self.shutting_down = False
    self.get_logger().info("声纹注册节点启动,请说唤醒词开始注册(例如:'二狗我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去')")

    # VAD events cross threads through a queue so the worker thread never
    # needs spin_until_future_complete.
    self.vad_event_queue = queue.Queue()
    self.recording = False            # currently capturing audio?
    self.pending_asr_future = None    # in-flight ASR request (handled in _main_loop)
    self.pending_audio_future = None  # in-flight AudioData request
    self.state = "waiting_speech"     # state machine: waiting_speech / waiting_asr / waiting_audio

    self.vad_thread = threading.Thread(target=self._vad_event_worker, daemon=True)
    self.vad_thread.start()
    self.timer = self.create_timer(0.1, self._main_loop)
|
||||
|
||||
def _load_config(self):
    """Read speaker-verification settings and the wake word from config/voice.yaml."""
    config_file = os.path.join(
        get_package_share_directory('robot_speaker'),
        'config',
        'voice.yaml'
    )
    with open(config_file, 'r') as fh:
        system = yaml.safe_load(fh)['system']

    self.sv_model_path = os.path.expanduser(system['sv_model_path'])
    self.sv_threshold = system['sv_threshold']
    self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
    self.wake_word = system['wake_word']
|
||||
|
||||
def _vad_event_worker(self):
    """Background thread: poll /vad/event and enqueue speech start/stop events.

    Only receives events and puts them on the queue; futures are resolved
    by plain polling, never via spin_until_future_complete (unsafe off the
    executor thread).
    """
    while not self.registered and not self.shutting_down:
        try:
            request = VADEvent.Request()
            request.command = "wait"
            request.timeout_ms = 1000
            future = self.vad_client.call_async(request)

            # Poll the future with a small margin over the service timeout.
            deadline = time.time() + 1.5
            while not future.done() and time.time() < deadline:
                time.sleep(0.01)

            if not future.done() or self.registered or self.shutting_down:
                continue

            response = future.result()
            if response.success and response.event in ["speech_started", "speech_stopped"]:
                # Hand the event to the main thread for processing.
                try:
                    self.vad_event_queue.put(response.event, timeout=0.1)
                except queue.Full:
                    self.get_logger().warn(f"[VAD] 事件队列已满,丢弃事件: {response.event}")
        except Exception as e:
            if not self.shutting_down:
                self.get_logger().error(f"[VAD] 线程异常: {e}")
            break
|
||||
|
||||
def _start_recording(self):
    """Ask the AudioData service to begin capturing; returns the pending future."""
    req = AudioData.Request()
    req.command = "start"
    return self.audio_data_client.call_async(req)
|
||||
|
||||
def _to_pinyin(self, text: str) -> str:
    """Space-joined toneless pinyin of the CJK characters in `text` ("" if none)."""
    han_chars = [ch for ch in text if '\u4e00' <= ch <= '\u9fa5']
    if not han_chars:
        return ""
    syllables = pinyin(han_chars, style=Style.NORMAL)
    return ' '.join(item[0] for item in syllables).lower().strip()
|
||||
|
||||
def _check_wake_word(self, text: str):
    """If the wake word occurs in `text` (whole-word pinyin match), stop recording.

    On a hit, a "stop" AudioData request is issued; the returned future is
    tagged and handed to _main_loop for completion.
    """
    text_pinyin = self._to_pinyin(text)
    wake_pinyin = self.wake_word.lower().strip()
    if not wake_pinyin:
        return

    spoken = text_pinyin.split() if text_pinyin else []
    target = wake_pinyin.split()
    span = len(target)
    matched = any(
        spoken[i:i + span] == target
        for i in range(len(spoken) - span + 1)
    )
    if not matched:
        return

    self.get_logger().info(f"[注册唤醒词] 检测到唤醒词 '{self.wake_word}',停止录音并获取音频")
    request = AudioData.Request()
    request.command = "stop"
    future = self.audio_data_client.call_async(request)
    future._future_type = "stop"  # tag so _main_loop knows how to handle it
    self.pending_audio_future = future
|
||||
|
||||
def _process_voiceprint_audio(self, response):
    """Extract a voiceprint embedding from the recorded audio and register it.

    The AudioData payload is used as-is (no extra VAD filtering): the
    service is driven by DashScope's VAD, so the buffer already holds only
    speech activity.
    """
    if not response or not response.success or response.samples == 0:
        self.get_logger().error(f"[注册录音] 获取音频数据失败: {response.message if response else '无响应'}")
        return

    audio_array = np.frombuffer(response.audio_data, dtype=np.int16)
    buffer_sec = response.samples / response.sample_rate
    self.get_logger().info(f"[注册录音] 音频长度: {buffer_sec:.2f}秒")

    embedding, ok = self.sv_client.extract_embedding(
        audio_array,
        sample_rate=response.sample_rate
    )
    if not ok or embedding is None:
        self.get_logger().error("[注册录音] 提取embedding失败")
        return

    speaker_id = f"user_{int(time.time())}"
    if self.sv_client.register_speaker(speaker_id, embedding):
        # Persist immediately so the enrollment survives this node's exit.
        self.sv_client.save_speakers()
        self.get_logger().info(f"[注册录音] 注册成功,用户ID: {speaker_id},已保存到文件,准备退出")
        self.registered = True
    else:
        self.get_logger().error("[注册录音] 注册失败")
|
||||
|
||||
def _main_loop(self):
    """Timer-driven main loop; all async completions are handled here.

    Polls the pending ASR / AudioData futures and the VAD event queue, so
    that node state is only mutated from this single callback.
    NOTE(review): indentation reconstructed from a flattened source — the
    state reset after ASR completion is assumed unconditional; confirm.
    """
    # Exit path: once registration succeeded, tear the node down.
    if self.registered:
        self.get_logger().info("注册完成,节点退出")
        self.shutting_down = True
        self.timer.cancel()
        rclpy.shutdown()
        return

    # Pending ASR future: a completed recognition drives wake-word checking.
    if self.pending_asr_future and self.pending_asr_future.done():
        response = self.pending_asr_future.result()
        self.pending_asr_future = None

        if response.success and response.text:
            text = response.text.strip()
            if text:
                self._check_wake_word(text)

        self.state = "waiting_speech"

    # Pending AudioData future: "start" acks recording, "stop" yields audio.
    if self.pending_audio_future and self.pending_audio_future.done():
        response = self.pending_audio_future.result()
        # Read the tag before dropping the reference to the future.
        future_type = getattr(self.pending_audio_future, '_future_type', None)
        self.pending_audio_future = None

        if future_type == "start":
            if response.success:
                self.get_logger().info("[注册录音] 已开始录音")
                self.recording = True
            else:
                self.get_logger().warn(f"[注册录音] 启动录音失败: {response.message}")
                self.state = "waiting_speech"
        elif future_type == "stop":
            self.recording = False
            self._process_voiceprint_audio(response)

    # VAD events: start recording on speech start, request ASR on speech stop.
    try:
        event = self.vad_event_queue.get_nowait()

        if event == "speech_started" and self.state == "waiting_speech" and not self.recording:
            self.get_logger().info("[VAD] 检测到语音开始,启动录音")
            future = self._start_recording()
            future._future_type = "start"
            self.pending_audio_future = future

        elif event == "speech_stopped" and self.recording and self.state == "waiting_speech":
            self.get_logger().info("[VAD] 检测到语音结束,请求 ASR 识别")
            self.state = "waiting_asr"
            request = ASRRecognize.Request()
            request.command = "start"
            self.pending_asr_future = self.asr_client.call_async(request)

    except queue.Empty:
        pass
|
||||
def main(args=None):
    """Entry point: spin the registration node until it shuts itself down.

    Fix: Ctrl-C during ``rclpy.spin`` previously raised KeyboardInterrupt
    past ``destroy_node()``, leaking the node. Cleanup now runs in a
    ``finally`` block. ``rclpy.shutdown()`` may already have been called
    from the node's own main loop after a successful registration, so the
    final shutdown stays best-effort.
    """
    rclpy.init(args=args)
    node = RegisterSpeakerNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        # Fall through to cleanup instead of propagating out of main().
        pass
    finally:
        node.destroy_node()
        try:
            rclpy.shutdown()
        except Exception:
            # Context already shut down by the node's main loop; ignore.
            pass


if __name__ == '__main__':
    main()
802
robot_speaker/core/robot_speaker_node.py
Normal file
802
robot_speaker/core/robot_speaker_node.py
Normal file
@@ -0,0 +1,802 @@
|
||||
import rclpy
|
||||
from rclpy.node import Node
|
||||
from std_msgs.msg import String
|
||||
import threading
|
||||
import queue
|
||||
import time
|
||||
import base64
|
||||
import io
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from cv_bridge import CvBridge
|
||||
from interfaces.msg import ImgMsg
|
||||
import collections
|
||||
import os
|
||||
import yaml
|
||||
from typing import Optional
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
from interfaces.srv import VADEvent, ASRRecognize, TTSSynthesize, AudioData
|
||||
from openai import OpenAI
|
||||
from robot_speaker.core.context_manager import ConversationHistory
|
||||
from robot_speaker.core.speaker_verifier import SpeakerVerificationClient, SpeakerState
|
||||
from robot_speaker.core.intent_router import IntentRouter, IntentResult
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ConversationState(Enum):
    # Dialogue authorization states used by RobotSpeakerNode's state machine.
    IDLE = "idle"                # no active utterance / not authorized
    CHECK_VOICE = "check_voice"  # speech detected, speaker verification pending
    AUTHORIZED = "authorized"    # speaker verified (or verification disabled)
class RobotSpeakerNode(Node):
|
||||
|
||||
def __init__(self):
    """Set up queues, synchronization primitives, components, and workers."""
    super().__init__('robot_speaker_node')
    self._load_config()

    # Cross-thread hand-off queues: ASR text -> process worker -> TTS worker.
    self.text_queue = queue.Queue()
    self.tts_queue = queue.Queue()

    self.interrupt_event = threading.Event()  # aborts LLM streaming / queued TTS
    self.stop_event = threading.Event()       # set on shutdown; worker loops exit

    self.conversation_state = ConversationState.IDLE
    self.state_lock = threading.Lock()

    # Latest speaker-verification result; all fields guarded by sv_lock.
    self.current_speaker_id = None
    self.current_speaker_state = SpeakerState.UNKNOWN
    self.current_speaker_score = 0.0
    self.current_speaker_threshold = 0.0
    self.sv_lock = threading.Lock()
    self.sv_speech_end_event = threading.Event()    # audio buffer filled after speech end
    self.sv_result_ready_event = threading.Event()  # verification result published
    self.sv_audio_buffer = None  # created as a deque in _init_components
    self.sv_recording = False

    self._init_components()

    if self.sv_enabled and self.sv_client:
        speaker_count = self.sv_client.get_speaker_count()
        if speaker_count == 0:
            self.get_logger().info("[Speaker] 声纹数据库为空,请注册声纹")

    self.skill_sequence_pub = self.create_publisher(String, '/llm_skill_sequence', 10)

    # Last skill-execution status string, fed back into later skill prompts.
    self.last_execution_status: Optional[str] = None
    self.execution_status_lock = threading.Lock()
    self.skill_result_sub = self.create_subscription(
        String, '/skill_execution_result', self._on_skill_result_received, 10
    )

    self._start_threads()
    self.get_logger().info("[Speaker] 语音节点已启动")
def _load_config(self):
    """Load config/voice.yaml from the package share dir into attributes."""
    path = os.path.join(
        get_package_share_directory('robot_speaker'),
        'config',
        'voice.yaml'
    )
    with open(path, 'r') as fh:
        cfg = yaml.safe_load(fh)

    audio_cfg = cfg['audio']
    mic_cfg = audio_cfg['microphone']
    card_cfg = audio_cfg['soundcard']
    tts_audio_cfg = audio_cfg['tts']

    self.input_device_index = mic_cfg['device_index']
    self.output_card_index = card_cfg['card_index']
    self.output_device_index = card_cfg['device_index']
    self.sample_rate = mic_cfg['sample_rate']
    self.channels = mic_cfg['channels']
    self.chunk = mic_cfg['chunk']
    self.audio_microphone_heartbeat_interval = mic_cfg['heartbeat_interval']
    self.output_sample_rate = card_cfg['sample_rate']
    self.output_channels = card_cfg['channels']
    self.output_volume = card_cfg['volume']
    self.audio_tts_source_sample_rate = tts_audio_cfg['source_sample_rate']
    self.audio_tts_source_channels = tts_audio_cfg['source_channels']
    self.audio_tts_ffmpeg_thread_queue_size = tts_audio_cfg['ffmpeg_thread_queue_size']

    vad_cfg = cfg['vad']
    self.vad_mode = vad_cfg['vad_mode']
    self.silence_duration_ms = vad_cfg['silence_duration_ms']
    self.min_energy_threshold = vad_cfg['min_energy_threshold']

    ds_cfg = cfg['dashscope']
    self.dashscope_api_key = ds_cfg['api_key']
    self.asr_model = ds_cfg['asr']['model']
    self.asr_url = ds_cfg['asr']['url']
    llm_cfg = ds_cfg['llm']
    self.llm_model = llm_cfg['model']
    self.llm_base_url = llm_cfg['base_url']
    self.llm_temperature = llm_cfg['temperature']
    self.llm_max_tokens = llm_cfg['max_tokens']
    self.llm_max_history = llm_cfg['max_history']
    self.llm_summary_trigger = llm_cfg['summary_trigger']
    self.tts_model = ds_cfg['tts']['model']
    self.tts_voice = ds_cfg['tts']['voice']

    sys_cfg = cfg['system']
    self.use_wake_word = sys_cfg['use_wake_word']
    self.wake_word = sys_cfg['wake_word']
    self.system_shutup_keywords = sys_cfg['shutup_keywords']
    self.sv_enabled = sys_cfg['sv_enabled']
    self.sv_model_path = os.path.expanduser(sys_cfg['sv_model_path'])
    self.sv_threshold = sys_cfg['sv_threshold']
    self.sv_speaker_db_path = os.path.expanduser(sys_cfg['sv_speaker_db_path'])
    self.sv_buffer_size = sys_cfg['sv_buffer_size']
    self.continue_without_image = sys_cfg['continue_without_image']

    self.camera_image_jpeg_quality = cfg['camera']['image']['jpeg_quality']
def _init_components(self):
    """Create service clients, LLM client, image cache, and the SV client.

    Called once from __init__, after _load_config has populated the
    configuration attributes.
    """
    # Pinyin keywords that trigger the "shut up" (interrupt) command.
    self.shutup_keywords = [k.strip() for k in self.system_shutup_keywords.split(',') if k.strip()]

    self.intent_router = IntentRouter()
    self.sv_audio_buffer = collections.deque(maxlen=self.sv_buffer_size)

    self.vad_client = self.create_client(VADEvent, '/vad/event')
    self.asr_client = self.create_client(ASRRecognize, '/asr/recognize')
    self.tts_client = self.create_client(TTSSynthesize, '/tts/synthesize')
    self.audio_data_client = self.create_client(AudioData, '/asr/audio_data')

    self.get_logger().info("[Speaker] 等待service节点启动...")
    self.vad_client.wait_for_service(timeout_sec=5.0)
    self.asr_client.wait_for_service(timeout_sec=5.0)
    self.tts_client.wait_for_service(timeout_sec=5.0)
    self.audio_data_client.wait_for_service(timeout_sec=5.0)
    self.get_logger().info("[Speaker] 所有service节点已就绪")

    self.llm_client = OpenAI(api_key=self.dashscope_api_key, base_url=self.llm_base_url)
    self.history = ConversationHistory(
        max_history=self.llm_max_history,
        summary_trigger=self.llm_summary_trigger
    )

    self.cv_bridge = CvBridge()
    self.img_msg_cache = {}  # position -> latest ImgMsg, guarded by img_msg_lock
    self.img_msg_lock = threading.Lock()

    def _cache_img_msg(msg):
        # Fix: the previous lambda called lock.acquire()/release() manually,
        # which would leave the lock held forever if the cache update raised.
        # A `with` block releases it on every exit path.
        with self.img_msg_lock:
            self.img_msg_cache[msg.position] = msg

    self.img_sub = self.create_subscription(ImgMsg, '/img_msg', _cache_img_msg, 10)

    if self.sv_enabled and self.sv_model_path:
        try:
            self.sv_client = SpeakerVerificationClient(
                model_path=self.sv_model_path,
                threshold=self.sv_threshold,
                speaker_db_path=self.sv_speaker_db_path,
                logger=self.get_logger()
            )
        except Exception as e:
            # SV is optional: degrade gracefully instead of crashing startup.
            self.get_logger().warning(f"[Speaker] 声纹识别初始化失败: {e},声纹功能将不可用")
            self.sv_client = None
            self.sv_enabled = False
    else:
        self.sv_client = None
def _start_threads(self):
    """Spawn the daemon worker threads (VAD polling, processing, TTS, SV)."""
    worker_specs = [
        ("vad_thread", self._vad_event_worker, "VADEventThread"),
        ("process_thread", self._process_worker, "ProcessThread"),
        ("tts_thread", self._tts_worker, "TTSThread"),
    ]
    for attr_name, target, label in worker_specs:
        worker = threading.Thread(target=target, name=label, daemon=True)
        setattr(self, attr_name, worker)
        worker.start()

    # The speaker-verification worker only runs when SV is enabled and loaded.
    if self.sv_enabled and self.sv_client:
        self.sv_thread = threading.Thread(
            target=self._sv_worker,
            name="SVThread",
            daemon=True
        )
        self.sv_thread.start()
    else:
        self.sv_thread = None
def _change_state(self, new_state: ConversationState, reason: str):
    """Atomically switch conversation_state and log the transition."""
    with self.state_lock:
        previous = self.conversation_state
        self.conversation_state = new_state
        transition = f"[Speaker-State] {previous.value} -> {new_state.value}: {reason}"
        self.get_logger().info(transition)
def _on_speech_started(self):
    """VAD 'speech_started' handler: start SV recording and advance state.

    NOTE(review): indentation reconstructed from a flattened source — the
    branches below are assumed to sit outside the `with` block, because
    _change_state re-acquires state_lock (threading.Lock is not reentrant).
    """
    self.get_logger().info("[Speaker-VAD] 检测到人声开始")
    with self.state_lock:
        state = self.conversation_state
    if state == ConversationState.AUTHORIZED:
        # A new utterance while authorized: re-verify the speaker.
        if self.sv_enabled and self.sv_client:
            self._start_sv_recording()
            self._change_state(ConversationState.CHECK_VOICE, "新指令,重新验证声纹")
    if state == ConversationState.IDLE:
        if self.sv_enabled and self.sv_client:
            self._start_sv_recording()
            self._change_state(ConversationState.CHECK_VOICE, "检测到语音,开始检查声纹")
        else:
            # No SV configured: any speech is authorized directly.
            self._change_state(ConversationState.AUTHORIZED, "未启用声纹,直接授权")
    elif state == ConversationState.CHECK_VOICE:
        # Still mid-verification: just (re)start the SV recording.
        self._start_sv_recording()
def _on_speech_stopped(self):
    """VAD 'speech_stopped' handler: stop SV recording and request ASR.

    NOTE(review): indentation reconstructed from a flattened source — the
    ASR call is assumed to run regardless of the SV branch, since the
    non-SV flow would otherwise never receive text; confirm.
    """
    import threading
    self.get_logger().debug(f"[Speaker-VAD] speech_stopped 被调用 | 线程:{threading.current_thread().name} | 当前状态:{self.conversation_state.value}")
    with self.state_lock:
        state = self.conversation_state
    self.get_logger().debug(f"[Speaker-VAD] 准备停止声纹录音 | sv_enabled:{self.sv_enabled} | state:{state}")
    if self.sv_enabled and state in [ConversationState.CHECK_VOICE, ConversationState.AUTHORIZED]:
        # Clear the hand-off event before requesting the buffered audio;
        # _on_sv_audio_ready re-sets it once the audio arrives.
        self.sv_speech_end_event.clear()
        self._stop_sv_recording()
    self._call_asr_service()
def _start_sv_recording(self):
    """Fire-and-forget 'start' command to the AudioData recording service."""
    if not self.sv_enabled:
        return
    req = AudioData.Request()
    req.command = "start"
    # duration_ms 0: presumably open-ended recording until an explicit
    # "stop" — confirm against the AudioData service contract.
    req.duration_ms = 0
    self.audio_data_client.call_async(req)
def _stop_sv_recording(self):
    """Ask AudioData to stop; the audio arrives via _on_sv_audio_ready."""
    import threading
    self.get_logger().debug(f"[Speaker-SV] _stop_sv_recording 开始 | 线程:{threading.current_thread().name} | 时间:{time.time()}")
    req = AudioData.Request()
    req.command = "stop"
    pending = self.audio_data_client.call_async(req)
    pending.add_done_callback(self._on_sv_audio_ready)
    self.get_logger().debug(f"[Speaker-SV] _stop_sv_recording 已发送异步请求 | future_id:{id(pending)}")
def _on_sv_audio_ready(self, future):
    """Completion callback for the SV 'stop' request.

    Copies the recorded samples into sv_audio_buffer (under sv_lock), then
    sets sv_speech_end_event so _sv_worker / _handle_speaker_verification
    can proceed.
    """
    import threading
    self.get_logger().debug(f"[Speaker-SV] _on_sv_audio_ready 回调触发 | 线程:{threading.current_thread().name} | future_id:{id(future)} | 时间:{time.time()}")
    try:
        response = future.result()
        self.get_logger().debug(f"[Speaker-SV] 收到响应 | success:{response.success} | samples:{response.samples}")
        if response.success and response.samples > 0:
            audio_array = np.frombuffer(response.audio_data, dtype=np.int16)
            with self.sv_lock:
                self.get_logger().debug(f"[Speaker-SV] 准备写入buffer | 旧大小:{len(self.sv_audio_buffer)} | 新数据:{len(audio_array)}")
                self.sv_audio_buffer.clear()
                self.sv_audio_buffer.extend(audio_array)
                self.get_logger().debug(f"[Speaker-SV] buffer已更新 | 新大小:{len(self.sv_audio_buffer)}")
            self.get_logger().debug(f"[Speaker-SV] 准备设置 sv_speech_end_event")
            # NOTE(review): indentation reconstructed — the event is assumed
            # set only on success; waiters have timeouts for the failure path.
            self.sv_speech_end_event.set()
    except Exception as e:
        self.get_logger().error(f"[Speaker-SV] _on_sv_audio_ready 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _call_asr_service(self):
    """Request the buffered ASR recognition result asynchronously."""
    self.get_logger().info("[Speaker] 调用ASR服务获取识别结果")
    req = ASRRecognize.Request()
    req.command = "start"
    self.asr_client.call_async(req).add_done_callback(self._asr_service_callback)
def _asr_service_callback(self, future):
    """ASR completion callback: push recognized text onto text_queue."""
    import threading
    self.get_logger().debug(f"[Speaker-ASR] ASR回调触发 | 线程:{threading.current_thread().name} | 时间:{time.time()}")
    try:
        resp = future.result()
        self.get_logger().debug(f"[Speaker-ASR] 收到响应 | success:{resp.success} | text:{resp.text if resp.success else 'N/A'}")
        if not (resp.success and resp.text):
            self.get_logger().warn(f"[Speaker-ASR] 识别失败或为空: success={resp.success}, message={resp.message}")
            return
        self.text_queue.put(resp.text)
        self.get_logger().debug(f"[Speaker-ASR] 文本已放入队列 | queue_size:{self.text_queue.qsize()}")
    except Exception as e:
        self.get_logger().error(f"[Speaker-ASR] 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _vad_event_worker(self):
    """Daemon loop: poll the VAD service (~20 Hz) for speech events."""
    import threading
    self.get_logger().info(f"[Speaker-VAD] 启动 | 线程ID:{threading.current_thread().ident}")
    while not self.stop_event.is_set():
        req = VADEvent.Request()
        req.command = "wait"
        req.timeout_ms = 500
        self.vad_client.call_async(req).add_done_callback(self._on_vad_event_response)
        time.sleep(0.05)
def _on_vad_event_response(self, future):
    """Dispatch a VAD poll result to the speech start/stop handlers."""
    import threading
    self.get_logger().debug(f"[Speaker-VAD] 回调触发 | 线程:{threading.current_thread().name}")
    try:
        resp = future.result()
        if not resp.success or resp.event == "none":
            return
        self.get_logger().debug(f"[Speaker-VAD] 收到事件 | event:{resp.event} | 线程:{threading.current_thread().name} | 时间:{time.time()}")
        handlers = {
            "speech_started": self._on_speech_started,
            "speech_stopped": self._on_speech_stopped,
        }
        handler = handlers.get(resp.event)
        if handler is not None:
            handler()
    except Exception as e:
        self.get_logger().error(f"[Speaker-VAD] 异常 | 错误:{e} | 类型:{type(e).__name__}")
def _process_worker(self):
    """Main pipeline thread: text -> state transition -> wake word ->
    shut-up check -> intent routing -> request handling.

    NOTE(review): indentation reconstructed from a flattened source —
    _handle_state_transition and everything after it must run outside the
    `with self.state_lock` block, because _change_state re-acquires the
    (non-reentrant) lock.
    """
    self.get_logger().info("[Speaker] 主线程启动")
    while not self.stop_event.is_set():
        try:
            text = self.text_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        self.interrupt_event.clear()
        with self.state_lock:
            current_state = self.conversation_state
        previous_state = current_state
        current_state = self._handle_state_transition(current_state, text)
        if current_state is None:
            # Transition rejected the utterance (failed SV / no wake word).
            continue
        if current_state == ConversationState.AUTHORIZED and previous_state == ConversationState.CHECK_VOICE:
            # Newly authorized speaker: cut off any ongoing playback.
            self._interrupt_tts()
        processed_text = self._handle_wake_word(text, current_state)
        if not processed_text:
            continue
        if self._check_shutup_command(processed_text):
            self._handle_shutup_command()
            continue
        intent_result = self.intent_router.route(processed_text)
        self.get_logger().info(f"[Speaker-Intent] intent={intent_result.intent}, need_camera={intent_result.need_camera}, camera_mode={intent_result.camera_mode}")
        if intent_result.intent == "kb_qa":
            # Knowledge-base QA bypasses the LLM entirely.
            self.interrupt_event.clear()
            if self._handle_kb_qa(processed_text):
                continue
            self._put_tts_text("抱歉,我没有找到相关信息")
            continue
        self.interrupt_event.clear()
        self._handle_llm_request(intent_result, processed_text)
def _handle_state_transition(self, current_state: ConversationState, text: str) -> ConversationState | None:
    """Gate an utterance through SV and wake-word checks per current state.

    Returns the (possibly updated) conversation state, or None when the
    utterance should be dropped.
    NOTE(review): indentation reconstructed from a flattened source — the
    wake-word check is assumed to follow the SV check within each branch.
    """
    if current_state == ConversationState.CHECK_VOICE:
        if self.sv_enabled and self.sv_client:
            if not self._handle_speaker_verification():
                return None
        else:
            self._change_state(ConversationState.AUTHORIZED, "未启用声纹")
        if self.use_wake_word:
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    elif current_state == ConversationState.AUTHORIZED:
        # Re-verify every new utterance even when already authorized.
        if self.sv_enabled and self.sv_client:
            if not self._handle_speaker_verification():
                return None
        if self.use_wake_word:
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    elif current_state == ConversationState.IDLE:
        if self.sv_enabled and self.sv_client:
            # With SV enabled, IDLE text has no pending verification; drop it.
            return None
        else:
            self._change_state(ConversationState.AUTHORIZED, "收到文本但状态为IDLE,未启用声纹,直接授权")
        if self.use_wake_word:
            wake_result = self._handle_wake_word(text, current_state)
            if not wake_result:
                self._change_state(ConversationState.IDLE, "未检测到唤醒词")
                return None
    # Re-read under the lock: _change_state may have updated the state above.
    with self.state_lock:
        return self.conversation_state
def _handle_speaker_verification(self) -> bool:
    """Block until the SV worker publishes a result; authorize on success.

    Waits (with timeouts) on sv_speech_end_event and sv_result_ready_event,
    which _on_sv_audio_ready and _sv_worker set respectively. Returns True
    when the utterance matched a registered speaker; otherwise moves the
    state machine back to IDLE and returns False.
    """
    import threading
    self.get_logger().debug(f"[Speaker-SV] 开始声纹验证 | 线程:{threading.current_thread().name} | result_ready:{self.sv_result_ready_event.is_set()}")
    if self.sv_result_ready_event.is_set():
        # Fast path: the worker already finished — skip the speech-end wait.
        self.get_logger().debug(f"[Speaker-SV] 结果已ready,跳过等待")
        pass
    elif not self.sv_speech_end_event.wait(timeout=2.0):
        self.get_logger().warn(f"[Speaker-SV] speech_end_event 等待超时")
        self._change_state(ConversationState.IDLE, "没有录音数据,无法验证")
        return False
    self.get_logger().debug(f"[Speaker-SV] speech_end_event 已触发,等待result_ready_event")
    if not self.sv_result_ready_event.wait(timeout=3.0):
        self.get_logger().warn(f"[Speaker-SV] result_ready_event 等待超时")
        # Discard the stale recording so it cannot poison the next attempt.
        with self.sv_lock:
            self.sv_audio_buffer.clear()
        self._change_state(ConversationState.IDLE, "声纹结果未ready")
        return False
    self.get_logger().debug(f"[Speaker-SV] result_ready_event 已触发,读取结果")
    self.sv_result_ready_event.clear()
    # Snapshot the result fields under sv_lock; _sv_worker writes them.
    with self.sv_lock:
        speaker_id = self.current_speaker_id
        speaker_state = self.current_speaker_state
        score = self.current_speaker_score
    self.get_logger().debug(f"[Speaker-SV] 验证结果 | speaker_id:{speaker_id} | state:{speaker_state.value} | score:{score:.4f}")
    if not (speaker_id and speaker_state == SpeakerState.VERIFIED):
        if self.sv_client.get_speaker_count() == 0:
            self._change_state(ConversationState.IDLE, "声纹数据库为空")
        else:
            self._change_state(ConversationState.IDLE, f"声纹验证失败, 得分: {score:.4f}")
        return False
    self._change_state(ConversationState.AUTHORIZED, f"声纹验证成功: {speaker_id}, 得分: {score:.4f}")
    return True
def _handle_shutup_command(self):
    """Interrupt TTS and drop the in-flight turn on a shut-up command.

    Only honored when the speaker is authorized or SV is not in use.
    """
    with self.state_lock:
        state_snapshot = self.conversation_state

    allowed = (
        state_snapshot == ConversationState.AUTHORIZED
        or not self.sv_enabled
        or not self.sv_client
    )
    if not allowed:
        return
    self._interrupt_tts()
    if self.history:
        self.history.cancel_turn()
def _handle_kb_qa(self, text: str) -> bool:
    """Try a knowledge-base lookup; speak the answer and return True on a hit."""
    answer = self.intent_router.search_kb(text)
    if not answer:
        return False
    self._put_tts_text(answer)
    return True
def _handle_llm_request(self, intent_result, processed_text: str):
    """Run the LLM for a routed intent and dispatch the reply.

    Conversational intents go through ConversationHistory (start_turn here,
    commit_turn on success, cancel_turn on empty reply); "skill_sequence"
    intents bypass history and publish the raw reply on /llm_skill_sequence
    instead of speaking it.
    """
    is_skill_sequence = intent_result.intent == "skill_sequence"
    # Open a history turn only for conversational intents.
    if self.history and not is_skill_sequence:
        self.history.start_turn(intent_result.text)
    if not self.llm_client:
        # No LLM configured: echo the input to TTS as a degraded fallback.
        self._put_tts_text(processed_text)
        return
    if is_skill_sequence:
        self.get_logger().info(f"[Speaker-Skill] 任务: {processed_text}")
        # Fold the last skill-execution report into the skill prompt.
        with self.execution_status_lock:
            last_status = self.last_execution_status
        self.get_logger().debug(f"[Speaker-Skill] 读取执行状态 | 线程:{threading.current_thread().name} | 时间:{time.time()} | 状态:{last_status}")
        system_prompt_with_status = self.intent_router.build_skill_prompt(execution_status=last_status)
    else:
        system_prompt_with_status = intent_result.system_prompt
    self.get_logger().debug(f"[Speaker-LLM] intent={intent_result.intent} | system_prompt前100字符: {system_prompt_with_status[:100] if system_prompt_with_status else 'None'}")
    reply = self._llm_process_stream_with_camera(
        intent_result.text,
        intent_result.need_camera,
        intent_result.camera_mode,
        system_prompt_with_status,
        intent_result.intent
    )
    if not reply or not reply.strip():
        # Empty or interrupted reply: roll back the opened turn.
        if self.history and not is_skill_sequence:
            self.history.cancel_turn()
        return
    if self.history and not is_skill_sequence:
        self.history.commit_turn(reply)
    if is_skill_sequence and reply.strip():
        msg = String()
        msg.data = reply.strip()
        self.skill_sequence_pub.publish(msg)
        self.get_logger().info(f"[Speaker-Skill] 开始新任务: {processed_text}")
def _check_shutup_command(self, text: str) -> bool:
    """Return True when the text's pinyin contains any shut-up keyword."""
    spoken = self.intent_router.to_pinyin(text).lower().strip()
    for raw_keyword in self.shutup_keywords:
        candidate = raw_keyword.lower().strip()
        if candidate in spoken:
            self.get_logger().info(f"[Speaker-Intent] 闭嘴指令匹配到关键词: {raw_keyword} (文本拼音: {spoken})")
            return True
    return False
def _interrupt_tts(self):
    """Signal interruption, drain queued TTS text, and tell TTS to stop."""
    self.interrupt_event.set()
    # Drain without blocking; racing producers are cut off by the event.
    while True:
        try:
            self.tts_queue.get_nowait()
        except queue.Empty:
            break
    req = TTSSynthesize.Request()
    req.command = "interrupt"
    req.text = ""
    req.voice = ""
    pending = self.tts_client.call_async(req)
    pending.add_done_callback(lambda f: self.get_logger().info("[Speaker-TTS] interrupt sent"))
def _on_skill_result_received(self, msg: String):
    """Parse a JSON skill-execution report and cache it for skill prompts."""
    try:
        import json
        report = json.loads(msg.data)
        ok = report.get("success", False)
        detail = report.get("message", "")
        total = report.get("total_skills", 0)
        succeeded = report.get("succeeded_skills", 0)
        summary = f"执行结果: {'成功' if ok else '失败'}"
        if detail:
            summary += f", 详情: {detail}"
        if total > 0:
            summary += f", 总技能数: {total}, 成功: {succeeded}, 失败: {total - succeeded}"
        with self.execution_status_lock:
            self.last_execution_status = summary
        self.get_logger().info(f"[Speaker-Skill] 执行状态已更新: {summary}")
    except Exception as e:
        self.get_logger().warning(f"[Speaker-Skill] 解析执行结果失败: {e}")
def _capture_image_from_img_dev(self, camera_mode: Optional[str] = None) -> Optional[np.ndarray]:
    """Fetch the latest cached camera frame as an RGB ndarray.

    Polls img_msg_cache (for up to ~1 s) for the requested camera position,
    or any cached position when camera_mode is None. Returns None on timeout.
    """
    deadline = time.time() + 1.0
    while time.time() < deadline:
        with self.img_msg_lock:
            if camera_mode and camera_mode in self.img_msg_cache:
                cached = self.img_msg_cache[camera_mode]
                frame = self.cv_bridge.imgmsg_to_cv2(cached.image_color, desired_encoding='rgb8')
                self.get_logger().info(f"[Speaker-Camera] 使用{camera_mode}相机获取图像成功 (position={cached.position})")
                return frame
            elif camera_mode is None and len(self.img_msg_cache) > 0:
                cached = next(iter(self.img_msg_cache.values()))
                frame = self.cv_bridge.imgmsg_to_cv2(cached.image_color, desired_encoding='rgb8')
                self.get_logger().info(f"[Speaker-Camera] 未指定相机位置,使用{cached.position}相机获取图像成功")
                return frame
        time.sleep(0.1)
    with self.img_msg_lock:
        available = list(self.img_msg_cache.keys()) if self.img_msg_cache else []
    self.get_logger().warning(f"[Speaker-Camera] 等待图像超时 (期望位置={camera_mode}, 可用位置={available})")
    return None
def _encode_image_to_base64(self, image_data: np.ndarray, quality: int = 85) -> str:
    """JPEG-encode an image array and return it as a base64 string.

    Args:
        image_data: HxWx3 RGB array, or any array PIL can infer a mode
            for (e.g. an HxW grayscale array).
        quality: JPEG quality passed to PIL.

    Returns:
        Base64-encoded JPEG bytes as str, or "" on failure.
    """
    try:
        # Fix: the old check read image_data.shape[2] unconditionally, which
        # raises IndexError for 2-D (grayscale) arrays — so the fallback
        # Image.fromarray branch was unreachable and such images always came
        # back as "". Guard on ndim first.
        if image_data.ndim == 3 and image_data.shape[2] == 3:
            pil_image = Image.fromarray(image_data, 'RGB')
        else:
            pil_image = Image.fromarray(image_data)
        buffer = io.BytesIO()
        pil_image.save(buffer, format='JPEG', quality=quality)
        image_bytes = buffer.getvalue()
        return base64.b64encode(image_bytes).decode('utf-8')
    except Exception as e:
        self.get_logger().error(f"[Speaker-Camera] 图像编码失败: {e}")
        return ""
def _llm_process_stream_with_camera(self, user_text: str, need_camera: bool, camera_mode: Optional[str] = None, system_prompt: Optional[str] = None, intent: str = "chat_text") -> str:
    """Stream an (optionally multimodal) LLM completion and return the reply.

    For conversational intents the message list comes from
    ConversationHistory (the caller has already opened the turn, so the
    last message is the current user turn); "skill_sequence" builds a
    standalone message list. Speaks the reply via TTS except for skill
    sequences. Returns "" on failure or interruption.
    """
    if not self.llm_client:
        return ""
    if intent == "skill_sequence":
        messages = []
    else:
        if not self.history:
            return ""
        messages = [{"role": msg.role, "content": msg.content} for msg in self.history.get_messages()]
        # Ensure exactly one system prompt at the head of the conversation.
        has_system_msg = any(msg.get("role") == "system" for msg in messages)
        if not has_system_msg:
            if system_prompt is None:
                system_prompt = self.intent_router.build_default_system_prompt()
            messages.insert(0, {"role": "system", "content": system_prompt})
    image_base64_list = []
    if need_camera:
        image_data = self._capture_image_from_img_dev(camera_mode)
        if image_data is not None:
            image_base64 = self._encode_image_to_base64(image_data, quality=self.camera_image_jpeg_quality)
            if image_base64:
                image_base64_list.append(image_base64)
        # NOTE(review): indentation reconstructed — this bail-out is assumed
        # to apply only when a camera frame was actually required.
        if not image_base64_list and not self.continue_without_image:
            self.get_logger().warning(f"[Speaker-LLM] 需要相机但未获取到图片,且配置为不继续推理,放弃请求")
            return ""
    if image_base64_list:
        # Multimodal turn: text plus data-URL images (OpenAI-style schema).
        content_list = [{"type": "text", "text": user_text}]
        for img_b64 in image_base64_list:
            content_list.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
            })
        if intent == "skill_sequence":
            messages.append({"role": "user", "content": content_list})
        else:
            # Replace the current user turn's content with the rich payload.
            messages[-1]["content"] = content_list
    else:
        if intent == "skill_sequence":
            messages.append({"role": "user", "content": user_text})
    full_reply = ""
    interrupted = False
    try:
        stream = self.llm_client.chat.completions.create(
            model=self.llm_model,
            messages=messages,
            temperature=self.llm_temperature,
            max_tokens=self.llm_max_tokens,
            stream=True
        )
        for chunk in stream:
            # Abort promptly when a shut-up/interrupt arrives mid-stream.
            if self.interrupt_event.is_set():
                interrupted = True
                break

            if chunk.choices and chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_reply += content
    except Exception as e:
        self.get_logger().error(f"[Speaker-LLM] 调用失败: {e}")
        return ""
    if interrupted:
        self.get_logger().info("[Speaker-LLM] 流式处理被中断")
        return ""
    reply = full_reply.strip() if full_reply else ""
    self.get_logger().info(f"[Speaker-LLM] 生成回复: {reply}")
    if reply and intent != "skill_sequence" and not self.interrupt_event.is_set():
        self._put_tts_text(reply)
    return reply
def _tts_worker(self):
    """Daemon loop: pull text from tts_queue and send it to the TTS service."""
    self.get_logger().info("[Speaker-TTS] TTS播放线程启动")
    while not self.stop_event.is_set():
        try:
            pending_text = self.tts_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        # Skip anything queued before an interrupt, and skip blank text.
        if self.interrupt_event.is_set():
            continue
        cleaned = str(pending_text).strip()
        if not cleaned:
            continue
        req = TTSSynthesize.Request()
        req.command = "synthesize"
        req.text = cleaned
        req.voice = ""
        self.tts_client.call_async(req).add_done_callback(self._on_tts_done)
def _on_tts_done(self, future):
    """Log TTS synthesis failures reported by the service."""
    try:
        resp = future.result()
        if resp.success:
            return
        self.get_logger().warn(f"[Speaker-TTS] 播放失败: {resp.message}")
    except Exception as e:
        self.get_logger().error(f"[Speaker-TTS] error: {e}")
def _sv_worker(self):
    """Speaker-verification thread.

    Protocol: waits on sv_speech_end_event (set by _on_sv_audio_ready),
    drains sv_audio_buffer under sv_lock, extracts an embedding, writes the
    match result fields under sv_lock, then sets sv_result_ready_event for
    _handle_speaker_verification.
    NOTE(review): indentation reconstructed from a flattened source — the
    event .set() calls are assumed to sit outside the sv_lock blocks.
    """
    self.get_logger().info("[Speaker-SV] 启动")
    # Require at least 0.5 s of audio for a meaningful embedding.
    min_audio_samples = int(self.sample_rate * 0.5)
    while not self.stop_event.is_set():
        try:
            self.get_logger().debug(f"[Speaker-SV] 等待 sv_speech_end_event...")
            if not self.sv_speech_end_event.wait(timeout=0.1):
                continue
            self.get_logger().debug(f"[Speaker-SV] sv_speech_end_event 触发 | 时间:{time.time()}")
            self.sv_speech_end_event.clear()
            if not (self.sv_enabled and self.sv_client):
                continue
            self.sv_result_ready_event.clear()
            speaker_count = self.sv_client.get_speaker_count()
            if speaker_count == 0:
                # Empty database: publish UNKNOWN without running the model.
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.UNKNOWN
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                self.get_logger().info("[Speaker-SV] 数据库为空,跳过验证,直接设置UNKNOWN状态")
                continue
            # Drain the recorded samples atomically.
            with self.sv_lock:
                audio_list = list(self.sv_audio_buffer)
                buffer_size = len(audio_list)
                self.get_logger().debug(f"[Speaker-SV] 读取buffer | 大小:{buffer_size} | 时间:{time.time()}")
                self.sv_audio_buffer.clear()
            self.get_logger().info(f"[Speaker-SV] 收到speech_end事件,录音长度: {buffer_size} 样本({buffer_size/self.sample_rate:.2f}秒)")
            if buffer_size < min_audio_samples:
                # Too short to verify: publish UNKNOWN so waiters can proceed.
                self.get_logger().debug(f"[Speaker-SV] 录音太短: {buffer_size} < {min_audio_samples},跳过处理")
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.UNKNOWN
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                continue
            audio_array = np.array(audio_list, dtype=np.int16)

            embedding, success = self.sv_client.extract_embedding(
                audio_array,
                sample_rate=self.sample_rate
            )
            if not success or embedding is None:
                # Model failure: publish ERROR so waiters can proceed.
                self.get_logger().debug("[Speaker-SV] 提取embedding失败")
                with self.sv_lock:
                    self.current_speaker_id = None
                    self.current_speaker_state = SpeakerState.ERROR
                    self.current_speaker_score = 0.0
                    self.current_speaker_threshold = self.sv_client.threshold
                self.sv_result_ready_event.set()
                continue
            speaker_id, match_state, score, threshold = self.sv_client.match_speaker(embedding)
            with self.sv_lock:
                self.current_speaker_id = speaker_id
                self.current_speaker_state = match_state
                self.current_speaker_score = score
                self.current_speaker_threshold = threshold
            if match_state == SpeakerState.VERIFIED:
                self.get_logger().info(f"[Speaker-SV] 识别到说话人: {speaker_id}, 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            elif match_state == SpeakerState.REJECTED:
                self.get_logger().info(f"[Speaker-SV] 未匹配到已知说话人(相似度不足), 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            else:
                self.get_logger().info(f"[Speaker-SV] 状态: {match_state.value}, 相似度: {score:.4f}, 阈值: {threshold:.4f}")
            self.sv_result_ready_event.set()
        except Exception as e:
            self.get_logger().error(f"[Speaker-SV] 错误: {e}")
            time.sleep(0.1)
def _put_tts_text(self, text: str):
|
||||
try:
|
||||
self.tts_queue.put(text, timeout=0.2)
|
||||
except queue.Full:
|
||||
self.get_logger().warning(f"[Speaker-TTS] 队列已满,无法发送文本: {text[:50]}")
|
||||
|
||||
def _handle_wake_word(self, text: str, current_state: ConversationState = None) -> str:
    """Wake-word gate: in CHECK_VOICE only verify presence; otherwise strip the wake word.

    Returns the stripped text when the wake word is present, the raw text in
    CHECK_VOICE state, the trimmed text when wake words are disabled, and ""
    when the wake word is missing or unconfigured.
    """
    if not self.use_wake_word:
        return text.strip()

    text_pinyin = self.intent_router.to_pinyin(text).lower().strip()
    wake_word_pinyin = self.wake_word.lower().strip()
    if not wake_word_pinyin:
        return ""

    tokens = text_pinyin.split()
    wake_tokens = wake_word_pinyin.split()
    span = len(wake_tokens)

    # Locate the wake word as a contiguous token run in the pinyin stream.
    match_at = -1
    for i in range(len(tokens) - span + 1):
        if tokens[i:i + span] == wake_tokens:
            match_at = i
            break
    if match_at == -1:
        return ""

    if current_state == ConversationState.CHECK_VOICE:
        # Authorization phase: only presence matters, keep the text intact.
        return text

    # Remove the matched wake word from the original text by skipping the
    # Han characters whose ordinal falls inside [match_at, match_at + span).
    # NOTE(review): assumes to_pinyin emits exactly one space-separated token
    # per Han character — confirm against IntentRouter.to_pinyin.
    kept = []
    cjk_seen = 0
    for ch in text:
        if '\u4e00' <= ch <= '\u9fa5':
            if not (match_at <= cjk_seen < match_at + span):
                kept.append(ch)
            cjk_seen += 1
        else:
            # Non-CJK characters (punctuation, latin) are always preserved.
            kept.append(ch)
    return "".join(kept).strip()
||||
def destroy_node(self):
    """Shut the speaker node down: stop workers, flush TTS, persist voiceprints."""
    self.get_logger().info("[Speaker] 语音节点正在关闭...")
    # Signal every worker loop to exit and break any in-progress playback.
    self.stop_event.set()
    self.interrupt_event.set()
    self.get_logger().info("[Speaker] 强制停止TTS播放...")
    self._interrupt_tts()

    workers = [self.vad_thread, self.process_thread, self.tts_thread]
    if self.sv_thread:
        workers.append(self.sv_thread)
    # Bounded join: never hang shutdown on a stuck worker.
    for worker in workers:
        if worker and worker.is_alive():
            worker.join(timeout=1.0)

    # Second interrupt in case a worker re-queued audio while joining.
    self._interrupt_tts()

    if hasattr(self, 'sv_client') and self.sv_client:
        try:
            # Persist registered speakers before releasing the model.
            self.sv_client.save_speakers()
            self.sv_client.cleanup()
        except Exception as e:
            self.get_logger().warning(f"[Speaker] 清理声纹识别资源时出错: {e}")

    super().destroy_node()
def main(args=None):
    """Entry point: spin the speaker node on a multi-threaded executor."""
    from rclpy.executors import MultiThreadedExecutor

    rclpy.init(args=args)
    node = RobotSpeakerNode()
    # Multiple threads so callbacks are not starved by long-running handlers.
    executor = MultiThreadedExecutor(num_threads=4)
    executor.add_node(node)
    try:
        executor.spin()
    except KeyboardInterrupt:
        node.get_logger().info("[Speaker] 收到中断信号,正在关闭节点")
    finally:
        node.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()
185
robot_speaker/core/skill_interface_parser.py
Normal file
185
robot_speaker/core/skill_interface_parser.py
Normal file
@@ -0,0 +1,185 @@
|
||||
"""技能接口文件解析器"""
|
||||
import os
|
||||
import yaml
|
||||
import json
|
||||
from typing import Optional
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
|
||||
class SkillInterfaceParser:
|
||||
def __init__(self, interfaces_root: str):
|
||||
"""初始化解析器"""
|
||||
self.interfaces_root = interfaces_root
|
||||
self._cached_skill_config: list[dict] | None = None
|
||||
self._cached_skill_interfaces: dict[str, dict] | None = None
|
||||
|
||||
def get_skill_names(self) -> list[str]:
|
||||
"""获取所有技能名称(统一读取 robot_skills.yaml,避免重复)"""
|
||||
skill_config = self._load_skill_config()
|
||||
return [entry["name"] for entry in skill_config if isinstance(entry, dict) and entry.get("name")]
|
||||
|
||||
def _load_skill_config(self) -> list[dict]:
|
||||
"""加载 robot_skills.yaml(带缓存,避免重复读取)"""
|
||||
if self._cached_skill_config is not None:
|
||||
return self._cached_skill_config
|
||||
|
||||
try:
|
||||
brain_share = get_package_share_directory("brain")
|
||||
skill_path = os.path.join(brain_share, "config", "robot_skills.yaml")
|
||||
with open(skill_path, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f) or []
|
||||
self._cached_skill_config = data if isinstance(data, list) else []
|
||||
return self._cached_skill_config
|
||||
except Exception:
|
||||
self._cached_skill_config = []
|
||||
return []
|
||||
|
||||
def parse_skill_interfaces(self) -> dict[str, dict]:
|
||||
"""解析所有技能接口文件的目标字段(带缓存)"""
|
||||
if self._cached_skill_interfaces is not None:
|
||||
return self._cached_skill_interfaces
|
||||
|
||||
result = {}
|
||||
skill_config = self._load_skill_config()
|
||||
|
||||
for skill_entry in skill_config:
|
||||
skill_name = skill_entry.get("name")
|
||||
if not skill_name:
|
||||
continue
|
||||
|
||||
interfaces = skill_entry.get("interfaces", [])
|
||||
for iface in interfaces:
|
||||
if isinstance(iface, dict):
|
||||
iface_name = iface.get("name", "")
|
||||
else:
|
||||
iface_name = str(iface)
|
||||
|
||||
if ".action" in iface_name:
|
||||
iface_type = "action"
|
||||
file_path = os.path.join(self.interfaces_root, "action", iface_name)
|
||||
elif ".srv" in iface_name:
|
||||
iface_type = "srv"
|
||||
file_path = os.path.join(self.interfaces_root, "srv", iface_name)
|
||||
else:
|
||||
continue
|
||||
|
||||
if os.path.exists(file_path):
|
||||
goal_fields = self._parse_goal_fields(file_path)
|
||||
result[skill_name] = {
|
||||
"type": iface_type,
|
||||
"goal_fields": goal_fields
|
||||
}
|
||||
break
|
||||
|
||||
self._cached_skill_interfaces = result
|
||||
return result
|
||||
|
||||
def _parse_goal_fields(self, file_path: str) -> list[dict]:
|
||||
"""解析接口文件的目标字段(第一个---之前的所有字段)"""
|
||||
goal_fields = []
|
||||
|
||||
try:
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
lines = f.readlines()
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line.startswith("---"):
|
||||
break
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
|
||||
parts = line.split()
|
||||
if len(parts) >= 2:
|
||||
field_type = parts[0]
|
||||
field_name = parts[1]
|
||||
|
||||
comment = ""
|
||||
if "#" in line:
|
||||
comment = line.split("#", 1)[1].strip()
|
||||
|
||||
goal_fields.append({
|
||||
"name": field_name,
|
||||
"type": field_type,
|
||||
"comment": comment
|
||||
})
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
return goal_fields
|
||||
|
||||
def generate_params_documentation(self) -> str:
|
||||
"""生成技能参数说明文档"""
|
||||
skill_interfaces = self.parse_skill_interfaces()
|
||||
doc_lines = []
|
||||
|
||||
for skill_name, skill_info in skill_interfaces.items():
|
||||
doc_lines.append(f"{skill_name}技能的parameters字段:")
|
||||
goal_fields = skill_info.get("goal_fields", [])
|
||||
|
||||
if not goal_fields:
|
||||
doc_lines.append(" - 无参数,使用 null")
|
||||
else:
|
||||
doc_lines.append(" parameters字典必须包含以下字段:")
|
||||
for field in goal_fields:
|
||||
field_name = field["name"]
|
||||
field_type = field["type"]
|
||||
comment = field.get("comment", "")
|
||||
|
||||
if field_name == "body_id":
|
||||
doc_lines.append(
|
||||
f" - {field_name} ({field_type}): 身体部位ID,0=左臂,1=右臂,2=头部。"
|
||||
f"根据目标物在图片中的方位选择:左侧用0,右侧用1,中央用2。"
|
||||
)
|
||||
else:
|
||||
type_desc = self._get_type_description(field_type)
|
||||
doc_lines.append(f" - {field_name} ({field_type}): {type_desc} {comment}")
|
||||
|
||||
example_params = {}
|
||||
for field in goal_fields:
|
||||
field_name = field["name"]
|
||||
field_type = field["type"]
|
||||
example_params[field_name] = self._get_example_value(field_name, field_type)
|
||||
|
||||
doc_lines.append(f" 示例:{json.dumps(example_params, ensure_ascii=False)}")
|
||||
|
||||
doc_lines.append("")
|
||||
|
||||
return "\n".join(doc_lines)
|
||||
|
||||
def _get_type_description(self, field_type: str) -> str:
|
||||
"""根据字段类型返回描述"""
|
||||
type_map = {
|
||||
"int8": "整数,范围-128到127",
|
||||
"int16": "整数,范围-32768到32767",
|
||||
"int32": "整数",
|
||||
"int64": "整数",
|
||||
"uint8": "无符号整数,范围0到255",
|
||||
"float32": "浮点数",
|
||||
"float64": "浮点数",
|
||||
"string": "字符串",
|
||||
}
|
||||
|
||||
base_type = field_type.replace("[]", "").replace("_", "")
|
||||
return type_map.get(base_type, field_type)
|
||||
|
||||
def _get_example_value(self, field_name: str, field_type: str) -> any:
|
||||
"""根据字段名和类型生成示例值"""
|
||||
if field_name == "body_id":
|
||||
return 0
|
||||
elif field_name == "data_array" and "float64[]" in field_type:
|
||||
return [0.1, 0.2, 0.3, 0.0, 0.0, 0.0]
|
||||
elif "int" in field_type:
|
||||
return 0
|
||||
elif "float" in field_type:
|
||||
return 0.0
|
||||
elif "string" in field_type:
|
||||
return ""
|
||||
elif "[]" in field_type:
|
||||
if "int" in field_type:
|
||||
return [0, 0, 0]
|
||||
elif "float" in field_type:
|
||||
return [0.0, 0.0, 0.0]
|
||||
return []
|
||||
else:
|
||||
return None
|
||||
|
||||
199
robot_speaker/core/speaker_verifier.py
Normal file
199
robot_speaker/core/speaker_verifier.py
Normal file
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
声纹识别模块
|
||||
"""
|
||||
import numpy as np
|
||||
import threading
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SpeakerState(Enum):
|
||||
"""说话人识别状态"""
|
||||
UNKNOWN = "unknown"
|
||||
VERIFIED = "verified"
|
||||
REJECTED = "rejected"
|
||||
ERROR = "error"
|
||||
|
||||
|
||||
class SpeakerVerificationClient:
|
||||
"""声纹识别客户端 - 非实时、低频处理"""
|
||||
|
||||
def __init__(self, model_path: str, threshold: float, speaker_db_path: str = None, logger=None):
|
||||
self.model_path = model_path
|
||||
self.threshold = threshold
|
||||
self.speaker_db_path = speaker_db_path
|
||||
self.logger = logger
|
||||
self.speaker_db = {} # {speaker_id: {"embedding": np.ndarray, "env": str, "registered_at": float}}
|
||||
self._lock = threading.Lock()
|
||||
|
||||
# # 优化CPU性能:限制Torch使用的线程数,防止多线程竞争导致性能骤降
|
||||
import torch
|
||||
torch.set_num_threads(1)
|
||||
|
||||
from funasr import AutoModel
|
||||
model_path = os.path.expanduser(self.model_path)
|
||||
# 禁用自动更新检查,防止每次初始化都联网检查
|
||||
self.model = AutoModel(model=model_path, device="cpu", disable_update=True)
|
||||
if self.logger:
|
||||
self.logger.info(f"声纹模型已加载: {model_path}, 阈值: {self.threshold}")
|
||||
|
||||
if self.speaker_db_path:
|
||||
self.load_speakers()
|
||||
|
||||
def _log(self, level: str, msg: str):
|
||||
"""记录日志 - 修复ROS2 logger在多线程环境中的问题"""
|
||||
if self.logger:
|
||||
try:
|
||||
if level == "info":
|
||||
self.logger.info(msg)
|
||||
elif level == "warning":
|
||||
self.logger.warning(msg)
|
||||
elif level == "error":
|
||||
self.logger.error(msg)
|
||||
elif level == "debug":
|
||||
self.logger.debug(msg)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def load_speakers(self):
|
||||
if not self.speaker_db_path:
|
||||
return
|
||||
|
||||
db_path = os.path.expanduser(self.speaker_db_path)
|
||||
if not os.path.exists(db_path):
|
||||
self._log("info", f"声纹数据库文件不存在: {db_path},将创建新文件")
|
||||
return
|
||||
try:
|
||||
with open(db_path, 'rb') as f:
|
||||
data = json.load(f)
|
||||
with self._lock:
|
||||
self.speaker_db = {}
|
||||
for speaker_id, info in data.items():
|
||||
embedding_array = np.array(info["embedding"], dtype=np.float32)
|
||||
if embedding_array.ndim > 1:
|
||||
embedding_array = embedding_array.flatten()
|
||||
self.speaker_db[speaker_id] = {
|
||||
"embedding": embedding_array,
|
||||
"env": info.get("env", ""),
|
||||
"registered_at": info.get("registered_at", 0.0)
|
||||
}
|
||||
self._log("info", f"已加载 {len(self.speaker_db)} 个已注册说话人")
|
||||
except Exception as e:
|
||||
self._log("error", f"加载声纹数据库失败: {e}")
|
||||
|
||||
def save_speakers(self):
|
||||
if not self.speaker_db_path:
|
||||
return
|
||||
db_path = os.path.expanduser(self.speaker_db_path)
|
||||
try:
|
||||
os.makedirs(os.path.dirname(db_path), exist_ok=True)
|
||||
with self._lock:
|
||||
data = {}
|
||||
for speaker_id, info in self.speaker_db.items():
|
||||
data[speaker_id] = {
|
||||
"embedding": info["embedding"].tolist(),
|
||||
"env": info.get("env", ""),
|
||||
"registered_at": info.get("registered_at", 0.0)
|
||||
}
|
||||
with open(db_path, 'w') as f:
|
||||
json.dump(data, f, indent=2)
|
||||
self._log("info", f"已保存 {len(data)} 个已注册说话人到: {db_path}")
|
||||
except Exception as e:
|
||||
self._log("error", f"保存声纹数据库失败: {e}")
|
||||
|
||||
def extract_embedding(self, audio_array: np.ndarray, sample_rate: int = 16000) -> tuple[np.ndarray | None, bool]:
|
||||
try:
|
||||
if len(audio_array) == 0:
|
||||
return None, False
|
||||
# 确保是int16格式
|
||||
if audio_array.dtype != np.int16:
|
||||
audio_array = audio_array.astype(np.int16)
|
||||
# 转换为float32并归一化到[-1, 1]
|
||||
audio_float = audio_array.astype(np.float32) / 32768.0
|
||||
# 调用模型提取embedding
|
||||
result = self.model.generate(input=audio_float, cache={})
|
||||
if result and len(result) > 0 and "spk_embedding" in result[0]:
|
||||
embedding = result[0]["spk_embedding"]
|
||||
if embedding is not None and len(embedding) > 0:
|
||||
embedding_array = np.array(embedding, dtype=np.float32)
|
||||
if embedding_array.ndim > 1:
|
||||
embedding_array = embedding_array.flatten()
|
||||
return embedding_array, True
|
||||
return None, False
|
||||
except Exception as e:
|
||||
self._log("error", f"提取声纹特征失败: {e}")
|
||||
return None, False
|
||||
|
||||
def match_speaker(self, embedding: np.ndarray) -> tuple[str | None, SpeakerState, float, float]:
|
||||
if embedding is None or len(embedding) == 0:
|
||||
return None, SpeakerState.UNKNOWN, 0.0, float(self.threshold)
|
||||
|
||||
with self._lock:
|
||||
if len(self.speaker_db) == 0:
|
||||
return None, SpeakerState.UNKNOWN, 0.0, float(self.threshold)
|
||||
try:
|
||||
best_speaker_id = None
|
||||
best_score = 0.0
|
||||
with self._lock:
|
||||
for speaker_id, info in self.speaker_db.items():
|
||||
stored_embedding = info["embedding"]
|
||||
# 计算余弦相似度
|
||||
dot_product = np.dot(embedding, stored_embedding)
|
||||
norm_embedding = np.linalg.norm(embedding)
|
||||
norm_stored = np.linalg.norm(stored_embedding)
|
||||
|
||||
if norm_embedding > 0 and norm_stored > 0:
|
||||
score = dot_product / (norm_embedding * norm_stored)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_speaker_id = speaker_id
|
||||
|
||||
state = SpeakerState.VERIFIED if best_score >= self.threshold else SpeakerState.REJECTED
|
||||
return best_speaker_id, state, float(best_score), float(self.threshold)
|
||||
except Exception as e:
|
||||
self._log("error", f"匹配说话人失败: {e}")
|
||||
return None, SpeakerState.ERROR, 0.0, float(self.threshold)
|
||||
|
||||
def register_speaker(self, speaker_id: str, embedding: np.ndarray, env: str = "") -> bool:
|
||||
if embedding is None or len(embedding) == 0:
|
||||
return False
|
||||
|
||||
try:
|
||||
with self._lock:
|
||||
self.speaker_db[speaker_id] = {
|
||||
"embedding": np.array(embedding, dtype=np.float32),
|
||||
"env": env,
|
||||
"registered_at": time.time()
|
||||
}
|
||||
self._log("info", f"已注册说话人: {speaker_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
self._log("error", f"注册说话人失败: {e}")
|
||||
return False
|
||||
|
||||
def get_speaker_count(self) -> int:
|
||||
with self._lock:
|
||||
return len(self.speaker_db)
|
||||
|
||||
def get_speaker_list(self) -> list[str]:
|
||||
with self._lock:
|
||||
return list(self.speaker_db.keys())
|
||||
|
||||
def remove_speaker(self, speaker_id: str) -> bool:
|
||||
with self._lock:
|
||||
if speaker_id in self.speaker_db:
|
||||
del self.speaker_db[speaker_id]
|
||||
self._log("info", f"已删除说话人: {speaker_id}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def cleanup(self):
|
||||
try:
|
||||
self.save_speakers()
|
||||
if hasattr(self, 'model') and self.model:
|
||||
del self.model
|
||||
except Exception as e:
|
||||
self._log("error", f"清理资源失败: {e}")
|
||||
|
||||
256
robot_speaker/perception/audio_pipeline.py
Normal file
256
robot_speaker/perception/audio_pipeline.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
音频处理模块:录音 + VAD
|
||||
"""
|
||||
import time
|
||||
import pyaudio
|
||||
import webrtcvad
|
||||
import struct
|
||||
import queue
|
||||
|
||||
|
||||
class VADDetector:
    """Thin wrapper around webrtcvad.Vad bound to a fixed sample rate."""

    def __init__(self, mode: int, sample_rate: int):
        # mode: webrtcvad aggressiveness, 0 (least) .. 3 (most aggressive).
        self.vad = webrtcvad.Vad(mode)
        self.sample_rate = sample_rate


class AudioRecorder:
    """Recording thread: captures PCM chunks, feeds the ASR queue, runs VAD segmentation."""

    def __init__(self, device_index: int, sample_rate: int, channels: int,
                 chunk: int, vad_detector: VADDetector,
                 audio_queue: queue.Queue,  # audio queue: recorder thread -> ASR thread
                 silence_duration_ms: int = 1000,
                 min_energy_threshold: int = 300,  # energy > 300: treat as speech
                 heartbeat_interval: float = 2.0,
                 on_heartbeat=None,
                 is_playing=None,
                 on_new_segment=None,  # a new voiced segment was detected
                 on_speech_start=None,  # voice onset detected
                 on_speech_end=None,  # trailing silence detected (utterance over)
                 stop_flag=None,
                 on_audio_chunk=None,  # per-chunk callback (e.g. voiceprint capture; optional)
                 should_put_to_queue=None,  # gate for feeding ASR (optional)
                 get_silence_threshold=None,  # dynamic silence threshold in ms (optional)
                 logger=None):
        self.device_index = device_index
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk = chunk
        self.vad_detector = vad_detector
        self.audio_queue = audio_queue
        self.silence_duration_ms = int(silence_duration_ms)
        self.min_energy_threshold = int(min_energy_threshold)
        self.heartbeat_interval = heartbeat_interval

        self.on_heartbeat = on_heartbeat
        self.is_playing = is_playing or (lambda: False)
        self.on_new_segment = on_new_segment
        self.on_speech_start = on_speech_start
        self.on_speech_end = on_speech_end
        self.stop_flag = stop_flag or (lambda: False)
        self.on_audio_chunk = on_audio_chunk
        self.should_put_to_queue = should_put_to_queue or (lambda: True)  # default: always feed ASR
        self.get_silence_threshold = get_silence_threshold
        self.logger = logger
        self.audio = pyaudio.PyAudio()

        # Auto-detect the iFLYTEK microphone; fall back to the configured index.
        try:
            device_count = self.audio.get_device_count()
            selected = -1
            if self.logger:
                self.logger.info(f"开始扫描音频设备 (总数: {device_count})...")

            for i in range(device_count):
                device_info = self.audio.get_device_info_by_index(i)
                device_name = device_info.get('name', '')
                max_input_channels = device_info.get('maxInputChannels', 0)

                if self.logger:
                    try:
                        self.logger.info(f"扫描设备 [{i}]: Name='{device_name}', MaxInput={max_input_channels}, Rate={int(device_info.get('defaultSampleRate'))}")
                    except:
                        pass

                # Must mention iFLYTEK and actually support input (capture) channels.
                if 'iFLYTEK' in device_name and max_input_channels > 0:
                    selected = i
                    if self.logger:
                        self.logger.info(f"已自动定位到麦克风设备: {device_name} (Index: {i})")
                    break

            if selected != -1:
                self.device_index = selected
            else:
                if self.logger:
                    self.logger.warning(f"未自动检测到 iFLYTEK 设备,请检查USB连接,或执行 'arecord -l' 确认系统是否识别到录音设备,将继续使用配置的索引: {self.device_index}")

        except Exception as e:
            if self.logger:
                self.logger.error(f"设备自动检测过程出错: {e}")

        self.format = pyaudio.paInt16
        self._debug_counter = 0  # throttles periodic VAD debug logging

    def record_with_vad(self):
        """Recording loop: VAD + energy gating, with speech start/end callbacks."""
        if self.on_heartbeat:
            self.on_heartbeat()

        try:
            stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.sample_rate,
                input=True,
                input_device_index=self.device_index if self.device_index >= 0 else None,
                frames_per_buffer=self.chunk
            )
        except Exception as e:
            raise RuntimeError(f"无法打开音频输入设备: {e}")

        # VAD decision window — speech is noticed within ~0.5 s at the fastest.
        window_sec = 0.5
        # Default: declare silence after silence_duration_ms without speech.
        no_speech_threshold = max(self.silence_duration_ms / 1000.0, 0.1)

        last_heartbeat_time = time.time()
        window_chunks = []          # sliding VAD window (not used for ASR itself)
        last_active_time = time.time()  # silence timer baseline
        in_speech_segment = False   # inside a voiced segment (onset .. silence timeout)

        try:
            while not self.stop_flag():
                # exception_on_overflow=False: prefer dropping frames to blocking.
                data = stream.read(self.chunk, exception_on_overflow=False)
                processed_data = data

                # Gate feeding the ASR queue (e.g. blocked during voiceprint enrollment).
                if self.should_put_to_queue():
                    # When the queue is full drop the oldest chunk so the
                    # recorder keeps "hearing" even if ASR lags behind.
                    if self.audio_queue.full():
                        self.audio_queue.get_nowait()
                    self.audio_queue.put_nowait(processed_data)

                # Optional per-chunk tap (voiceprint capture etc.).
                if self.on_audio_chunk:
                    self.on_audio_chunk(processed_data)

                window_chunks.append(processed_data)  # VAD-only buffer

                now = time.time()
                if len(window_chunks) * self.chunk / self.sample_rate >= window_sec:
                    raw_audio = b''.join(window_chunks)
                    energy = self._calculate_energy(raw_audio)
                    vad_result = self._check_activity(raw_audio)

                    # Throttled debug trace: one line per ~10 windows.
                    self._debug_counter += 1
                    if self._debug_counter >= 10:
                        if self.logger:
                            self.logger.info(f"[VAD调试] 能量={energy:.1f}, 阈值={self.min_energy_threshold}, VAD结果={vad_result}")
                        self._debug_counter = 0

                    if vad_result:
                        last_active_time = now

                        if not in_speech_segment:  # transition: silence -> speech
                            in_speech_segment = True
                            if self.on_speech_start:
                                self.on_speech_start()

                        # Speaking over TTS playback -> barge-in callback.
                        if self.is_playing() and self.on_new_segment:
                            self.on_new_segment()
                    else:
                        if in_speech_segment:
                            # In a voiced segment but this window is silent.
                            silence_duration = now - last_active_time

                            # Dynamic silence threshold if a provider was given.
                            if self.get_silence_threshold:
                                current_silence_ms = self.get_silence_threshold()
                                current_no_speech_threshold = max(current_silence_ms / 1000.0, 0.1)
                            else:
                                current_no_speech_threshold = no_speech_threshold

                            if self.logger and silence_duration < current_no_speech_threshold:
                                self.logger.debug(f"[VAD] 静音中: {silence_duration:.3f}秒 < {current_no_speech_threshold:.3f}秒阈值")

                            if silence_duration >= current_no_speech_threshold:
                                if self.on_speech_end:
                                    if self.logger:
                                        self.logger.debug(f"[VAD] 触发speech_end: 静音持续时间 {silence_duration:.3f}秒 >= 阈值 {current_no_speech_threshold:.3f}秒")
                                    self.on_speech_end()  # user stopped speaking
                                in_speech_segment = False

                    if self.on_heartbeat and now - last_heartbeat_time >= self.heartbeat_interval:
                        self.on_heartbeat()
                        last_heartbeat_time = now

                    window_chunks = []
        finally:
            if stream.is_active():
                stream.stop_stream()
            stream.close()

    @staticmethod
    def _calculate_energy(audio_chunk: bytes) -> float:
        """RMS energy of a 16-bit little-endian PCM buffer (0.0 for empty/short input)."""
        if not audio_chunk:
            return 0.0
        # 16-bit PCM: one sample = 2 bytes.
        n = len(audio_chunk) // 2
        if n <= 0:
            return 0.0
        samples = struct.unpack(f'<{n}h', audio_chunk[: n * 2])
        if not samples:
            return 0.0
        return (sum(s * s for s in samples) / len(samples)) ** 0.5

    def _check_activity(self, audio_data: bytes) -> bool:
        """Combined decision: webrtcvad frame voting, with RMS energy as a veto."""
        energy = self._calculate_energy(audio_data)

        voiced_ratio = 0.4  # empirical fraction of voiced frames for "speech"
        voiced_frames = 0

        # 20 ms frames: at 16 kHz that is 320 samples = 640 bytes per frame.
        bytes_per_sample = 2  # paInt16
        frame_samples = int(self.sample_rate * 0.02)
        frame_bytes = frame_samples * bytes_per_sample

        if frame_bytes <= 0 or len(audio_data) < frame_bytes:
            return False

        total_frames = len(audio_data) // frame_bytes
        required = max(1, int(total_frames * voiced_ratio))

        for offset in range(0, len(audio_data), frame_bytes):
            frame = audio_data[offset:offset + frame_bytes]
            if len(frame) == frame_bytes:
                if self.vad_detector.vad.is_speech(frame, sample_rate=self.sample_rate):
                    voiced_frames += 1

        # Speech starts loud and decays; a "voiced" verdict with very low
        # energy is most likely residual noise — veto it.
        vad_result = voiced_frames >= required
        if vad_result and energy < self.min_energy_threshold * 0.5:
            return False

        return vad_result

    def cleanup(self):
        """Release the PyAudio handle."""
        if hasattr(self, 'audio') and self.audio:
            self.audio.terminate()
|
||||
@@ -1,55 +0,0 @@
|
||||
import rclpy
|
||||
from rclpy.node import Node
|
||||
from example_interfaces.msg import String
|
||||
import threading
|
||||
from queue import Queue
|
||||
import time
|
||||
import espeakng
|
||||
import pyttsx3
|
||||
|
||||
|
||||
class RobotSpeakerNode(Node):
    """Legacy speaker node: queues 'robot_msg' strings and speaks them with pyttsx3."""

    def __init__(self, node_name):
        super().__init__(node_name)
        self.novels_queue_ = Queue()
        self.novel_subscriber_ = self.create_subscription(
            String, 'robot_msg', self.novel_callback, 10)
        # Dedicated thread so TTS playback never blocks the ROS executor.
        self.speech_thread_ = threading.Thread(target=self.speak_thread)
        self.speech_thread_.start()

    def novel_callback(self, msg):
        """Buffer incoming text for the speech thread."""
        self.novels_queue_.put(msg.data)

    def speak_thread(self):
        """Speech worker: configure pyttsx3 once, then drain the queue while ROS is up."""
        engine = pyttsx3.init()
        engine.setProperty('rate', 150)    # speaking rate (150 sounds natural)
        engine.setProperty('volume', 1.0)  # volume in [0.0, 1.0]

        # Pick a Chinese voice via the (list-valued) 'languages' attribute.
        voices = engine.getProperty('voices')
        for voice in voices:
            if any('zh' in lang for lang in voice.languages):
                engine.setProperty('voice', voice.id)
                self.get_logger().info(f'已选择中文语音:{voice.id}')
                break
        else:
            # for-else: no voice advertised Chinese support.
            self.get_logger().warning('未找到中文语音库,将使用默认语音')

        while rclpy.ok():
            if self.novels_queue_.qsize() > 0:
                text = self.novels_queue_.get()
                engine.say(text)
                engine.runAndWait()  # block until playback finishes
            else:
                time.sleep(0.5)  # idle poll
||||
def main(args=None):
    """Legacy entry point: spin the speaker node on the default executor."""
    rclpy.init(args=args)
    node = RobotSpeakerNode("robot_speaker_node")
    rclpy.spin(node)
    rclpy.shutdown()
22
robot_speaker/services/__init__.py
Normal file
22
robot_speaker/services/__init__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""
|
||||
Service节点模块
|
||||
"""
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
703
robot_speaker/services/asr_audio_node.py
Normal file
703
robot_speaker/services/asr_audio_node.py
Normal file
@@ -0,0 +1,703 @@
|
||||
import rclpy
|
||||
from rclpy.node import Node
|
||||
from interfaces.srv import ASRRecognize, AudioData, VADEvent
|
||||
import threading
|
||||
import queue
|
||||
import time
|
||||
import pyaudio
|
||||
import yaml
|
||||
import os
|
||||
import collections
|
||||
import numpy as np
|
||||
import base64
|
||||
import dashscope
|
||||
from dashscope.audio.qwen_omni import OmniRealtimeConversation, OmniRealtimeCallback
|
||||
from dashscope.audio.qwen_omni.omni_realtime import TranscriptionParams, MultiModality
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
|
||||
class AudioRecorder:
    """ASR-node recorder: streams raw PCM chunks into a bounded queue."""

    def __init__(self, device_index: int, sample_rate: int, channels: int,
                 chunk: int, audio_queue: queue.Queue, stop_event, logger=None):
        self.device_index = device_index
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk = chunk
        self.audio_queue = audio_queue
        self.stop_event = stop_event
        self.logger = logger
        self.audio = pyaudio.PyAudio()

        # Prefer the iFLYTEK microphone when one is attached.
        original_index = self.device_index
        try:
            for i in range(self.audio.get_device_count()):
                device_info = self.audio.get_device_info_by_index(i)
                if 'iFLYTEK' in device_info['name'] and device_info['maxInputChannels'] > 0:
                    self.device_index = i
                    if self.logger:
                        self.logger.info(f"[ASR-Recorder] 已自动定位到麦克风设备: {device_info['name']} (Index: {i})")
                    break
        except Exception as e:
            if self.logger:
                self.logger.error(f"[ASR-Recorder] 设备自动检测过程出错: {e}")

        # No iFLYTEK device found and none configured: fall back to device 0.
        if self.device_index == original_index and original_index == -1:
            self.device_index = 0
            if self.logger:
                self.logger.info("[ASR-Recorder] 未找到 iFLYTEK 设备,使用系统默认输入设备")
        self.format = pyaudio.paInt16

    def record(self):
        """Recording loop: read chunks until stop_event; drop oldest data when the queue is full."""
        if self.logger:
            self.logger.info(f"[ASR-Recorder] 录音线程启动,设备索引: {self.device_index}")
        stream = None
        try:
            stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.sample_rate,
                input=True,
                input_device_index=self.device_index if self.device_index >= 0 else None,
                frames_per_buffer=self.chunk
            )
            if self.logger:
                self.logger.info("[ASR-Recorder] 音频输入设备已打开")
        except Exception as e:
            if self.logger:
                self.logger.error(f"[ASR-Recorder] 无法打开音频输入设备: {e}")
            return
        try:
            while not self.stop_event.is_set():
                try:
                    # exception_on_overflow=False: prefer dropped frames to blocking.
                    data = stream.read(self.chunk, exception_on_overflow=False)
                    # Bounded queue: evict the oldest chunk rather than stall.
                    if self.audio_queue.full():
                        self.audio_queue.get_nowait()
                    self.audio_queue.put_nowait(data)
                except OSError as e:
                    if self.logger:
                        self.logger.debug(f"[ASR-Recorder] 录音设备错误: {e}")
                    break
        except KeyboardInterrupt:
            if self.logger:
                self.logger.info("[ASR-Recorder] 录音线程收到中断信号")
        finally:
            if stream is not None:
                try:
                    if stream.is_active():
                        stream.stop_stream()
                    stream.close()
                except Exception:
                    # Best-effort close; the device may already be gone.
                    pass
            if self.logger:
                self.logger.info("[ASR-Recorder] 录音线程已退出")
|
||||
class DashScopeASR:
|
||||
def __init__(self, api_key: str, sample_rate: int, model: str, url: str, logger=None):
|
||||
dashscope.api_key = api_key
|
||||
self.sample_rate = sample_rate
|
||||
self.model = model
|
||||
self.url = url
|
||||
self.logger = logger
|
||||
|
||||
self.conversation = None
|
||||
self.running = False
|
||||
self.on_sentence_end = None
|
||||
self.on_speech_started = None
|
||||
self.on_speech_stopped = None
|
||||
|
||||
self._stop_lock = threading.Lock()
|
||||
self._final_result_event = threading.Event()
|
||||
self._pending_commit = False
|
||||
|
||||
# ========== 连接生命周期管理: 解决 DashScope ASR WebSocket 连接超时导致的识别不稳定 ==========
|
||||
self._connection_start_time = None # 连接创建时间
|
||||
self._last_audio_time = None # 最后一次发送音频的时间
|
||||
self._recognition_count = 0 # 识别次数计数
|
||||
self._audio_send_count = 0 # 音频发送次数计数
|
||||
self._last_audio_send_success = True # 最后一次音频发送是否成功
|
||||
self._consecutive_send_failures = 0 # 连续发送失败次数
|
||||
|
||||
# 配置参数
|
||||
self.MAX_CONNECTION_AGE = 300 # 连接最大存活时间:5分钟
|
||||
self.MAX_IDLE_TIME = 180 # 最大空闲时间:3分钟
|
||||
self.MAX_RECOGNITIONS = 30 # 最大识别次数:30次后重建连接
|
||||
self.MAX_CONSECUTIVE_FAILURES = 3 # 最大连续失败次数
|
||||
|
||||
def _log(self, level: str, msg: str):
|
||||
if not self.logger:
|
||||
return
|
||||
try:
|
||||
if level == "debug":
|
||||
self.logger.debug(msg)
|
||||
elif level == "warning":
|
||||
self.logger.warn(msg)
|
||||
elif level == "error":
|
||||
self.logger.error(msg)
|
||||
elif level == "info":
|
||||
self.logger.info(msg)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _should_reconnect(self) -> tuple[bool, str]:
|
||||
if not self.running or not self.conversation:
|
||||
return False, ""
|
||||
current_time = time.time()
|
||||
# 检查1:连接时间
|
||||
if self._connection_start_time:
|
||||
connection_age = current_time - self._connection_start_time
|
||||
if connection_age > self.MAX_CONNECTION_AGE:
|
||||
return True, f"连接已存活{connection_age:.0f}秒,超过{self.MAX_CONNECTION_AGE}秒阈值"
|
||||
# 检查2:空闲时间
|
||||
if self._last_audio_time:
|
||||
idle_time = current_time - self._last_audio_time
|
||||
if idle_time > self.MAX_IDLE_TIME:
|
||||
return True, f"连接已空闲{idle_time:.0f}秒,超过{self.MAX_IDLE_TIME}秒阈值"
|
||||
# 检查3:识别次数
|
||||
if self._recognition_count >= self.MAX_RECOGNITIONS:
|
||||
return True, f"已完成{self._recognition_count}次识别,达到重连阈值"
|
||||
# 检查4:连续发送失败
|
||||
if self._consecutive_send_failures >= self.MAX_CONSECUTIVE_FAILURES:
|
||||
return True, f"连续{self._consecutive_send_failures}次音频发送失败"
|
||||
|
||||
return False, ""
|
||||
|
||||
def _reset_connection_stats(self):
|
||||
self._connection_start_time = time.time()
|
||||
self._last_audio_time = time.time()
|
||||
self._recognition_count = 0
|
||||
self._audio_send_count = 0
|
||||
self._last_audio_send_success = True
|
||||
self._consecutive_send_failures = 0
|
||||
|
||||
def start(self):
    # Open a fresh DashScope realtime session configured for server-side VAD
    # transcription (Chinese, PCM input, text-only output). Returns True on
    # success; False when already running or on any failure, in which case
    # partial state is cleaned up.
    if self.running:
        return False

    try:
        callback = _ASRCallback(self)
        self.conversation = OmniRealtimeConversation(
            model=self.model,
            url=self.url,
            callback=callback
        )
        # Back-reference so the callback can act on the live session.
        callback.conversation = self.conversation

        self.conversation.connect()

        transcription_params = TranscriptionParams(
            language='zh',
            sample_rate=self.sample_rate,
            input_audio_format="pcm",
        )

        # Server-side VAD: ~1 s of audio kept before speech onset, 800 ms of
        # silence ends a turn.
        self.conversation.update_session(
            output_modalities=[MultiModality.TEXT],
            enable_input_audio_transcription=True,
            transcription_params=transcription_params,
            enable_turn_detection=True,
            turn_detection_type='server_vad',
            prefix_padding_ms=1000,
            turn_detection_threshold=0.3,
            turn_detection_silence_duration_ms=800,
        )

        self.running = True
        # Health counters start fresh for the new connection.
        self._reset_connection_stats()
        self._log("info", f"[ASR] 已启动 | 连接ID:{id(self.conversation)}")
        return True
    except Exception as e:
        self.running = False
        self._log("error", f"[ASR] 启动失败: {e}")
        if self.conversation:
            try:
                self.conversation.close()
            except Exception:
                pass
        self.conversation = None
        return False
def send_audio(self, audio_chunk: bytes):
    """Send one raw PCM chunk to the realtime session, reconnecting when unhealthy.

    Consults _should_reconnect() first; if a health rule tripped, tears the
    session down, backs off 1 s and calls start() again. On send failure the
    consecutive-failure counter is bumped; WebSocket-closed errors also drop
    the connection so the worker loop can rebuild it on demand.
    Returns True only when the chunk was handed to the SDK successfully.
    """
    should_reconnect, reason = self._should_reconnect()
    if should_reconnect:
        self._log("warning", f"[ASR] 检测到需要重连: {reason}")
        self.running = False
        try:
            if self.conversation:
                self.conversation.close()
        except Exception:  # fix: was a bare except (caught KeyboardInterrupt/SystemExit)
            pass
        self.conversation = None
        time.sleep(1.0)  # brief back-off before rebuilding the session
        if not self.start():
            self._log("error", "[ASR] 自动重连失败")
            return False
        self._log("info", "[ASR] 自动重连成功")

    import threading
    self._log("debug", f"[ASR] send_audio 被调用 | 线程:{threading.current_thread().name} | running:{self.running} | conversation:{self.conversation is not None}")
    if not self.running or not self.conversation:
        self._log("debug", f"[ASR] send_audio 跳过 | running:{self.running} | conversation:{self.conversation is not None}")
        return False
    try:
        # SDK expects base64-encoded PCM.
        audio_b64 = base64.b64encode(audio_chunk).decode('ascii')
        self.conversation.append_audio(audio_b64)
        self._last_audio_time = time.time()
        self._audio_send_count += 1
        self._last_audio_send_success = True
        self._consecutive_send_failures = 0
        # Fix: guard against a missing start timestamp so the debug log can
        # never raise and be miscounted as a send failure below.
        age = time.time() - self._connection_start_time if self._connection_start_time else 0.0
        self._log("debug", f"[ASR] 音频发送成功 | 总计:{self._audio_send_count} | 连接年龄:{age:.1f}秒")
        return True
    except Exception as e:
        self._last_audio_send_success = False
        self._consecutive_send_failures += 1

        error_msg = str(e)
        error_type = type(e).__name__
        # Heuristic match on closed-connection errors from the websocket stack.
        if ("Connection is already closed" in error_msg
                or "WebSocketConnectionClosedException" in error_type
                or "ConnectionClosed" in error_type
                or "websocket" in error_msg.lower()):
            self._log("warning", f"[ASR] WebSocket 连接已断开 | 错误:{error_msg} | 连续失败:{self._consecutive_send_failures}次")
            self.running = False
            try:
                if self.conversation:
                    self.conversation.close()
            except Exception:  # fix: was a bare except
                pass
            self.conversation = None
        else:
            self._log("error", f"[ASR] send_audio 异常 | 错误:{error_msg} | 类型:{error_type} | 连续失败:{self._consecutive_send_failures}次")

        return False
def stop_current_recognition(self):
    # Finish the current recognition turn and close the connection so the
    # worker loop re-opens a fresh one on the next audio chunk.
    #
    # Non-blocking lock acquisition makes concurrent callers no-ops (returns
    # False). Commits the buffered audio, waits up to 3 s for the final
    # transcript (released by _ASRCallback via _final_result_event), then
    # closes the session.
    #
    # NOTE(review): the original indentation was lost in extraction; the
    # should_reconnect branch below may originally have gated the commit/close
    # sequence rather than just the log line — confirm against VCS history.
    import threading
    self._log("debug", f"[ASR] stop_current_recognition 被调用 | 线程:{threading.current_thread().name} | running:{self.running}")
    if not self._stop_lock.acquire(blocking=False):
        self._log("debug", f"[ASR] 锁获取失败,有其他线程正在执行 stop_current_recognition")
        return False

    # Arm the final-result handshake before doing anything else.
    self._final_result_event.clear()
    self._pending_commit = True

    try:
        self._log("debug", f"[ASR] 获得锁,开始停止识别 | conversation:{self.conversation is not None}")
        if not self.running or not self.conversation:
            self._log("debug", f"[ASR] 无法停止 | running:{self.running} | conversation:{self.conversation is not None}")
            return False

        self._recognition_count += 1
        should_reconnect, reason = self._should_reconnect()
        if should_reconnect:
            self._log("info", f"[ASR] 识别完成后检测到需要重连: {reason}")

        # Re-arm the handshake immediately before committing.
        self._final_result_event.clear()
        self._pending_commit = True

        try:
            self.conversation.commit()
            # Wait for the final transcript event (3 s cap).
            self._final_result_event.wait(timeout=3.0)
        except Exception as e:
            self._log("debug", f"[ASR] commit 异常: {e}")

        self._log("debug", f"[ASR] 准备关闭旧连接 | conversation_id:{id(self.conversation)}")
        self.running = False

        # Detach the session reference before closing so other threads see
        # conversation=None while close() is in flight.
        old_conversation = self.conversation
        self.conversation = None

        self._log("debug", f"[ASR] conversation已设为None,准备关闭旧连接")
        try:
            old_conversation.close()
            self._log("debug", f"[ASR] 旧连接已关闭")
        except Exception as e:
            self._log("warning", f"[ASR] 关闭连接异常: {e}")
        self._log("debug", f"[ASR] 连接已关闭,等待下次语音活动时重连")
        return True

    finally:
        self._pending_commit = False
        self._stop_lock.release()
        self._log("debug", f"[ASR] stop_current_recognition 完成,锁已释放")
def stop(self):
    """Shut the ASR client down completely.

    Drops the running flag, releases any waiter blocked on the final-result
    event, and closes the live session (errors during close are ignored).
    """
    with self._stop_lock:
        self.running = False
        # Wake anyone blocked waiting for a final transcript.
        self._final_result_event.set()
        conversation, self.conversation = self.conversation, None
        if conversation is not None:
            try:
                conversation.close()
            except Exception:
                pass
        self._log("info", "[ASR] 已完全停止")
class _ASRCallback(OmniRealtimeCallback):
    """Bridges DashScope realtime events back onto the owning DashScopeASR.

    Final transcripts go to on_sentence_end, server-VAD start/stop events go
    to the corresponding hooks, and a pending commit() waiter is released
    once the final transcript of a turn arrives.
    """

    def __init__(self, asr_client: DashScopeASR):
        self.asr_client = asr_client
        # Back-reference to the session; filled in by DashScopeASR.start().
        self.conversation = None

    def on_event(self, response):
        # Events are dict-like; a malformed payload must never kill the
        # SDK's callback thread, so everything is wrapped.
        client = self.asr_client
        try:
            event_type = response['type']

            if event_type == 'conversation.item.input_audio_transcription.completed':
                transcript = response['transcript'].strip()
                if transcript and client.on_sentence_end:
                    client.on_sentence_end(transcript)
                # Release stop_current_recognition() if it awaits commit().
                if client._pending_commit:
                    client._final_result_event.set()
                return

            if event_type == 'input_audio_buffer.speech_started':
                if client.logger:
                    client.logger.info("[ASR] 检测到语音开始")
                if client.on_speech_started:
                    client.on_speech_started()
                return

            if event_type == 'input_audio_buffer.speech_stopped':
                if client.logger:
                    client.logger.info("[ASR] 检测到语音结束")
                if client.on_speech_stopped:
                    client.on_speech_stopped()
        except Exception:
            pass
class ASRAudioNode(Node):
|
||||
def __init__(self):
    # Wires up the full ASR pipeline: config, mic recorder + DashScope
    # client, three services (/asr/recognize, /asr/audio_data, /vad/event),
    # shared result/VAD/raw-audio state, and the two worker threads.
    super().__init__('asr_audio_node')
    self._load_config()

    self.audio_queue = queue.Queue(maxsize=100)  # mic chunks from the recorder
    self.stop_event = threading.Event()          # global shutdown flag
    self._shutdown_in_progress = False           # guards destroy_node re-entry

    self._init_components()

    self.recognize_service = self.create_service(
        ASRRecognize, '/asr/recognize', self._recognize_callback
    )
    self.audio_data_service = self.create_service(
        AudioData, '/asr/audio_data', self._audio_data_callback
    )
    self.vad_event_service = self.create_service(
        VADEvent, '/vad/event', self._vad_event_callback
    )

    # Latest recognition result, shared between the SDK callback thread and
    # the service handlers; _result_event flags "a result is ready".
    self._last_result = None
    self._result_event = threading.Event()
    self._last_result_time = None
    self.vad_event_queue = queue.Queue()
    # Bounded ring buffer of raw int16 samples for /asr/audio_data.
    self.audio_buffer = collections.deque(maxlen=240000)
    self.audio_recording = False
    self.audio_lock = threading.Lock()

    # ========== Abnormal-recognition detection ==========
    self._abnormal_results = ["嗯。", "。", "啊。", "哦。"]  # known junk transcripts
    self._consecutive_abnormal_count = 0  # consecutive junk results seen
    self.MAX_CONSECUTIVE_ABNORMAL = 5     # junk streak that forces a reconnect

    self.recording_thread = threading.Thread(
        target=self.audio_recorder.record, name="RecordingThread", daemon=True
    )
    self.recording_thread.start()

    self.asr_thread = threading.Thread(
        target=self._asr_worker, name="ASRThread", daemon=True
    )
    self.asr_thread.start()

    self.get_logger().info("ASR Audio节点已启动")
def _load_config(self):
    # Load voice.yaml from the installed package share directory and cache
    # the microphone and DashScope ASR settings on the node. Raises (and
    # thus aborts node startup) if the file or any key is missing.
    config_file = os.path.join(
        get_package_share_directory('robot_speaker'),
        'config',
        'voice.yaml'
    )
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    mic = config['audio']['microphone']
    self.input_device_index = mic['device_index']
    self.sample_rate = mic['sample_rate']
    self.channels = mic['channels']
    self.chunk = mic['chunk']

    # NOTE(review): this local shadows any imported `dashscope` module for
    # the rest of this method; harmless here, but easy to trip over.
    dashscope = config['dashscope']
    self.dashscope_api_key = dashscope['api_key']
    self.asr_model = dashscope['asr']['model']
    self.asr_url = dashscope['asr']['url']
def _init_components(self):
    # Build the mic recorder and the DashScope streaming client, hook the
    # client's callbacks into the node's result/VAD plumbing, then open the
    # first session.
    self.audio_recorder = AudioRecorder(
        device_index=self.input_device_index,
        sample_rate=self.sample_rate,
        channels=self.channels,
        chunk=self.chunk,
        audio_queue=self.audio_queue,
        stop_event=self.stop_event,
        logger=self.get_logger()
    )

    self.asr_client = DashScopeASR(
        api_key=self.dashscope_api_key,
        sample_rate=self.sample_rate,
        model=self.asr_model,
        url=self.asr_url,
        logger=self.get_logger()
    )

    self.asr_client.on_sentence_end = self._on_asr_result
    self.asr_client.on_speech_started = lambda: self._put_vad_event("speech_started")
    # On speech end: drop any stale cached result before signalling the event.
    self.asr_client.on_speech_stopped = lambda: (self._clear_result(), self._put_vad_event("speech_stopped"))
    self.asr_client.start()
def _on_asr_result(self, text: str):
|
||||
if not text or not text.strip():
|
||||
return
|
||||
|
||||
self._last_result = text.strip()
|
||||
self._last_result_time = time.time()
|
||||
self._result_event.set()
|
||||
|
||||
is_abnormal = self._last_result in self._abnormal_results and len(self._last_result) <= 2
|
||||
if is_abnormal:
|
||||
self._consecutive_abnormal_count += 1
|
||||
self.get_logger().warn(f"[ASR] 检测到异常识别结果: '{self._last_result}' | 连续异常:{self._consecutive_abnormal_count}次")
|
||||
# 如果连续多次异常,强制重置 ASR 连接
|
||||
if self._consecutive_abnormal_count >= self.MAX_CONSECUTIVE_ABNORMAL:
|
||||
self.get_logger().error(f"[ASR] 连续{self._consecutive_abnormal_count}次异常识别,标记需要重连")
|
||||
self.asr_client._consecutive_send_failures = self.asr_client.MAX_CONSECUTIVE_FAILURES
|
||||
self._consecutive_abnormal_count = 0
|
||||
else:
|
||||
# 正常识别,重置异常计数
|
||||
self._consecutive_abnormal_count = 0
|
||||
try:
|
||||
self.get_logger().info(f"[ASR] 识别结果: {self._last_result}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _put_vad_event(self, event_type):
|
||||
try:
|
||||
self.vad_event_queue.put(event_type, timeout=0.1)
|
||||
except queue.Full:
|
||||
try:
|
||||
self.get_logger().warn(f"[ASR] VAD事件队列已满,丢弃{event_type}事件")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _audio_data_callback(self, request, response):
|
||||
import threading
|
||||
self.get_logger().debug(f"[ASR-AudioData] 回调触发 | command:{request.command} | 线程:{threading.current_thread().name}")
|
||||
response.sample_rate = self.sample_rate
|
||||
response.channels = self.channels
|
||||
|
||||
if request.command == "start":
|
||||
with self.audio_lock:
|
||||
self.get_logger().debug(f"[ASR-AudioData] start命令 | 旧buffer大小:{len(self.audio_buffer)} | recording:{self.audio_recording}")
|
||||
self.audio_buffer.clear()
|
||||
self.audio_recording = True
|
||||
self.get_logger().debug(f"[ASR-AudioData] buffer已清空,recording=True")
|
||||
response.success = True
|
||||
response.message = "开始录音"
|
||||
response.samples = 0
|
||||
return response
|
||||
|
||||
if request.command == "stop":
|
||||
self.get_logger().debug(f"[ASR-AudioData] stop命令 | recording:{self.audio_recording}")
|
||||
with self.audio_lock:
|
||||
self.audio_recording = False
|
||||
audio_list = list(self.audio_buffer)
|
||||
self.get_logger().debug(f"[ASR-AudioData] 读取buffer | 大小:{len(audio_list)}")
|
||||
self.audio_buffer.clear()
|
||||
if len(audio_list) > 0:
|
||||
audio_array = np.array(audio_list, dtype=np.int16)
|
||||
response.success = True
|
||||
response.audio_data = audio_array.tobytes()
|
||||
response.samples = len(audio_list)
|
||||
response.message = f"录音完成{len(audio_list)}样本"
|
||||
self.get_logger().debug(f"[ASR-AudioData] 返回音频 | samples:{len(audio_list)}")
|
||||
else:
|
||||
response.success = False
|
||||
response.message = "缓冲区为空"
|
||||
response.samples = 0
|
||||
self.get_logger().debug(f"[ASR-AudioData] buffer为空!")
|
||||
return response
|
||||
|
||||
if request.command == "get":
|
||||
with self.audio_lock:
|
||||
audio_list = list(self.audio_buffer)
|
||||
if len(audio_list) > 0:
|
||||
audio_array = np.array(audio_list, dtype=np.int16)
|
||||
response.success = True
|
||||
response.audio_data = audio_array.tobytes()
|
||||
response.samples = len(audio_list)
|
||||
response.message = f"获取到{len(audio_list)}样本"
|
||||
else:
|
||||
response.success = False
|
||||
response.message = "缓冲区为空"
|
||||
response.samples = 0
|
||||
return response
|
||||
|
||||
def _vad_event_callback(self, request, response):
|
||||
timeout = request.timeout_ms / 1000.0 if request.timeout_ms > 0 else None
|
||||
try:
|
||||
event = self.vad_event_queue.get(timeout=timeout)
|
||||
response.success = True
|
||||
response.event = event
|
||||
response.message = "收到VAD事件"
|
||||
except queue.Empty:
|
||||
response.success = False
|
||||
response.event = "none"
|
||||
response.message = "等待超时"
|
||||
except KeyboardInterrupt:
|
||||
try:
|
||||
self.get_logger().info("[ASR-VAD] 收到中断信号,正在关闭")
|
||||
except Exception:
|
||||
pass
|
||||
response.success = False
|
||||
response.event = "none"
|
||||
response.message = "节点正在关闭"
|
||||
self.stop_event.set()
|
||||
return response
|
||||
|
||||
def _clear_result(self):
|
||||
self._last_result = None
|
||||
self._last_result_time = None
|
||||
self._result_event.clear()
|
||||
|
||||
def _return_result(self, response, text, message):
|
||||
response.success = True
|
||||
response.text = text
|
||||
response.message = message
|
||||
self._clear_result()
|
||||
return response
|
||||
|
||||
def _recognize_callback(self, request, response):
    # Service handler for /asr/recognize.
    #
    # command == "stop":  end the current turn and report success.
    # command == "reset": tear the recognizer down and start a fresh session.
    # otherwise ("start"): serve a cached result if it is fresh (< 0.3 s) or
    # already flagged ready; else wait up to 2 s for one; failing that,
    # commit the current turn, restart the client if needed, and wait up to
    # 5 s for a new result.
    if request.command == "stop":
        if self.asr_client.running:
            self.asr_client.stop_current_recognition()
        response.success = True
        response.text = ""
        response.message = "识别已停止"
        return response

    if request.command == "reset":
        self.asr_client.stop_current_recognition()
        time.sleep(0.1)
        self.asr_client.start()
        response.success = True
        response.text = ""
        response.message = "识别器已重置"
        return response

    if self.asr_client.running:
        current_time = time.time()
        # A result from the last 300 ms, or one already signalled, is
        # considered the answer to this request.
        if (self._last_result and self._last_result_time and
                (current_time - self._last_result_time) < 0.3) or (self._result_event.is_set() and self._last_result):
            return self._return_result(response, self._last_result, "返回最近识别结果")
        # Short grace period: the transcript may be just about to land.
        if self._result_event.wait(timeout=2.0) and self._last_result:
            return self._return_result(response, self._last_result, "识别成功(等待中)")
        # Nothing usable: close out the current turn before retrying below.
        self.asr_client.stop_current_recognition()
        time.sleep(0.2)

    self._clear_result()

    # (Re)open a session if the previous turn closed it.
    if not self.asr_client.running and not self.asr_client.start():
        response.success = False
        response.text = ""
        response.message = "ASR启动失败"
        return response
    if self._result_event.wait(timeout=5.0) and self._last_result:
        response.success = True
        response.text = self._last_result
        response.message = "识别成功"
    else:
        response.success = False
        response.text = ""
        response.message = "识别超时" if not self._result_event.is_set() else "识别结果为空"
    self._clear_result()
    return response
def _asr_worker(self):
    # Background loop: drain mic chunks from audio_queue, mirror them into
    # the raw-audio ring buffer while /asr/audio_data capture is active, and
    # stream every chunk to DashScope (restarting the session on demand).
    while not self.stop_event.is_set():
        try:
            # Short timeout so stop_event is polled ~10x per second.
            audio_chunk = self.audio_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        except KeyboardInterrupt:
            try:
                self.get_logger().info("[ASR-Worker] 收到中断信号")
            except Exception:
                pass
            break

        if self.audio_recording:
            self.get_logger().debug(f"[ASR-Worker] 收到音频chunk | recording:{self.audio_recording} | buffer_size:{len(self.audio_buffer)}")
            try:
                # Mirror the chunk into the int16 ring buffer for the
                # /asr/audio_data service.
                audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
                with self.audio_lock:
                    self.audio_buffer.extend(audio_array)
            except Exception as e:
                self.get_logger().error(f"[ASR-Worker] buffer写入异常 | 错误:{e}")
                pass

        if self.asr_client.running:
            self.asr_client.send_audio(audio_chunk)
        else:
            # Session closed (e.g. after a forced reconnect); try to reopen,
            # backing off 1 s on failure. This chunk is not retried.
            if not self.asr_client.start():
                time.sleep(1.0)
def destroy_node(self):
    # Idempotent shutdown: stop worker threads, release audio resources,
    # close the ASR session, then destroy the underlying rclpy node. Every
    # step is best-effort so one failure cannot block the rest.
    if self._shutdown_in_progress:
        return
    self._shutdown_in_progress = True
    try:
        self.get_logger().info("ASR Audio节点正在关闭...")
    except Exception:
        pass
    self.stop_event.set()
    # Give the capture and ASR worker threads a moment to exit.
    if hasattr(self, 'recording_thread') and self.recording_thread.is_alive():
        self.recording_thread.join(timeout=1.0)
    if hasattr(self, 'asr_thread') and self.asr_thread.is_alive():
        self.asr_thread.join(timeout=1.0)
    try:
        if hasattr(self, 'audio_recorder'):
            # Release the PortAudio handle held by the recorder.
            self.audio_recorder.audio.terminate()
    except Exception:
        pass
    try:
        if hasattr(self, 'asr_client'):
            self.asr_client.stop()
    except Exception:
        pass
    try:
        super().destroy_node()
    except Exception:
        pass
def main(args=None):
    # Entry point: spin the ASR node until interrupted, then tear everything
    # down even if spin() raised; shutdown errors are swallowed so Ctrl-C
    # exits cleanly.
    rclpy.init(args=args)
    node = ASRAudioNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        try:
            node.get_logger().info("收到中断信号,正在关闭节点")
        except Exception:
            pass
    finally:
        try:
            node.destroy_node()
        except Exception:
            pass
        try:
            rclpy.shutdown()
        except Exception:
            pass
||||
if __name__ == '__main__':
|
||||
main()
|
||||
341
robot_speaker/services/tts_audio_node.py
Normal file
341
robot_speaker/services/tts_audio_node.py
Normal file
@@ -0,0 +1,341 @@
|
||||
import rclpy
|
||||
from rclpy.node import Node
|
||||
from rclpy.callback_groups import ReentrantCallbackGroup
|
||||
from interfaces.srv import TTSSynthesize
|
||||
import threading
|
||||
import yaml
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
import dashscope
|
||||
from dashscope.audio.tts_v2 import SpeechSynthesizer, ResultCallback, AudioFormat
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
|
||||
class DashScopeTTSClient:
|
||||
def __init__(self, api_key: str,
             model: str,
             voice: str,
             card_index: int,
             device_index: int,
             output_sample_rate: int,
             output_channels: int,
             output_volume: float,
             tts_source_sample_rate: int,
             tts_source_channels: int,
             tts_ffmpeg_thread_queue_size: int,
             force_stop_delay: float,
             cleanup_timeout: float,
             terminate_timeout: float,
             logger):
    # Wrapper around DashScope streaming TTS plus an ffmpeg/ALSA playback
    # pipeline. All parameters come straight from voice.yaml (see
    # TTSAudioNode._load_config).
    dashscope.api_key = api_key  # module-level key consumed by the SDK
    self.model = model
    self.voice = voice  # default voice; synthesize() may override per call
    self.card_index = card_index
    self.device_index = device_index
    self.output_sample_rate = output_sample_rate
    self.output_channels = output_channels
    self.output_volume = output_volume  # 1.0 means no ffmpeg volume filter
    self.tts_source_sample_rate = tts_source_sample_rate
    self.tts_source_channels = tts_source_channels
    self.tts_ffmpeg_thread_queue_size = tts_ffmpeg_thread_queue_size
    # Shutdown pacing used by force_stop() and _TTSCallback.cleanup().
    self.force_stop_delay = force_stop_delay
    self.cleanup_timeout = cleanup_timeout
    self.terminate_timeout = terminate_timeout
    self.logger = logger
    self.current_ffmpeg_pid = None  # pid of the active playback process
    self._current_callback = None   # active _TTSCallback, for interruption

    # Explicit card/device pair maps to a plughw address; otherwise fall
    # back to the ALSA default device.
    self.alsa_device = f"plughw:{card_index},{device_index}" if (
        card_index >= 0 and device_index >= 0
    ) else "default"
def force_stop(self):
    # Immediately abort playback: flag the active callback as interrupted,
    # then SIGTERM the ffmpeg process and escalate to SIGKILL if it is still
    # alive after force_stop_delay seconds. Always clears the pid/callback
    # references, even on failure.
    if self._current_callback:
        self._current_callback._interrupted = True
    if not self.current_ffmpeg_pid:
        if self.logger:
            self.logger.warn("[TTS] force_stop: current_ffmpeg_pid is None")
        return
    pid = self.current_ffmpeg_pid
    try:
        if self.logger:
            self.logger.info(f"[TTS] force_stop: 正在kill进程 {pid}")
        os.kill(pid, signal.SIGTERM)
        time.sleep(self.force_stop_delay)
        try:
            # Signal 0 probes liveness: raises ProcessLookupError if gone.
            os.kill(pid, 0)
            os.kill(pid, signal.SIGKILL)
            if self.logger:
                self.logger.info(f"[TTS] force_stop: 已发送SIGKILL到进程 {pid}")
        except ProcessLookupError:
            if self.logger:
                self.logger.info(f"[TTS] force_stop: 进程 {pid} 已退出")
    except (ProcessLookupError, OSError) as e:
        if self.logger:
            self.logger.warn(f"[TTS] force_stop: kill进程失败 {pid}: {e}")
    finally:
        self.current_ffmpeg_pid = None
        self._current_callback = None
def synthesize(self, text: str, voice: str = None,
               on_chunk=None,
               interrupt_check=None) -> bool:
    # Stream `text` through DashScope TTS and play it via the ffmpeg pipeline
    # managed by _TTSCallback. Returns True if playback ran to completion,
    # False when interrupted or when no usable voice is configured.
    #
    # voice:           overrides the configured default when non-blank.
    # on_chunk:        optional hook receiving each raw PCM chunk.
    # interrupt_check: polled during playback; a truthy return aborts it.
    callback = _TTSCallback(self, interrupt_check, on_chunk)
    self._current_callback = callback
    voice_to_use = voice if voice and voice.strip() else self.voice

    if not voice_to_use or not voice_to_use.strip():
        if self.logger:
            self.logger.error(f"[TTS] Voice参数无效: '{voice_to_use}'")
        self._current_callback = None
        return False
    synthesizer = SpeechSynthesizer(
        model=self.model,
        voice=voice_to_use,
        format=AudioFormat.PCM_22050HZ_MONO_16BIT,
        callback=callback,
    )

    try:
        synthesizer.streaming_call(text)
        synthesizer.streaming_complete()
    finally:
        # Always reap the ffmpeg process, even if the SDK raised mid-stream.
        callback.cleanup()
        self._current_callback = None

    return not callback._interrupted
class _TTSCallback(ResultCallback):
|
||||
def __init__(self, tts_client: DashScopeTTSClient,
             interrupt_check=None,
             on_chunk=None):
    # Per-synthesis state: the owning client, an optional interrupt poll, an
    # optional raw-PCM tap, and the ffmpeg playback process (spawned lazily
    # in on_open).
    self.tts_client = tts_client
    self.interrupt_check = interrupt_check
    self.on_chunk = on_chunk
    self._proc = None          # ffmpeg Popen handle
    self._interrupted = False  # set once playback is aborted
    self._cleaned_up = False   # guards cleanup() against double-running
def on_open(self):
    """Spawn an ffmpeg process that reads raw PCM from stdin and plays it on
    the configured ALSA device, resampling from the TTS source format to the
    output format. A volume filter is inserted only when output_volume != 1.0.

    The command is now built in final order directly, replacing the original
    index()/insert() gymnastics.
    """
    ffmpeg_cmd = [
        'ffmpeg',
        # Input: raw 16-bit little-endian PCM streamed on stdin.
        '-f', 's16le',
        '-ar', str(self.tts_client.tts_source_sample_rate),
        '-ac', str(self.tts_client.tts_source_channels),
        '-thread_queue_size', str(self.tts_client.tts_ffmpeg_thread_queue_size),
        '-i', 'pipe:0',
        # Output: ALSA playback at the soundcard's native rate/channels.
        '-f', 'alsa',
        '-ar', str(self.tts_client.output_sample_rate),
        '-ac', str(self.tts_client.output_channels),
    ]
    if self.tts_client.output_volume != 1.0:
        ffmpeg_cmd += ['-af', f'volume={self.tts_client.output_volume}']
    ffmpeg_cmd += [
        '-acodec', 'pcm_s16le',
        # Low-latency flags so playback starts as soon as audio arrives.
        '-fflags', 'nobuffer',
        '-flags', 'low_delay',
        '-avioflags', 'direct',
        self.tts_client.alsa_device,
    ]

    self._proc = subprocess.Popen(
        ffmpeg_cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.DEVNULL,
        # Fix: stderr was PIPE but never read — a chatty ffmpeg could fill
        # the OS pipe buffer and stall playback. Discard it instead.
        stderr=subprocess.DEVNULL,
    )
    # Publish the pid so force_stop() can kill this process from outside.
    self.tts_client.current_ffmpeg_pid = self._proc.pid
def on_data(self, data: bytes) -> None:
    # Called from the SDK's streaming thread for each synthesized PCM chunk:
    # feed it to ffmpeg's stdin and the optional on_chunk tap, honoring
    # interruption at every stage.
    if self._interrupted:
        return

    # Poll the external interrupt hook and abort playback if it fires.
    if self.interrupt_check and self.interrupt_check():
        self._interrupted = True
        if self._proc:
            self._proc.terminate()
        return

    if self._proc and self._proc.stdin and not self._interrupted:
        try:
            self._proc.stdin.write(data)
            self._proc.stdin.flush()
        except BrokenPipeError:
            # ffmpeg died (or was force-stopped); stop feeding it.
            self._interrupted = True
        except OSError:
            self._interrupted = True

    if self.on_chunk and not self._interrupted:
        self.on_chunk(data)
def cleanup(self):
    """Close ffmpeg's stdin and escalate shutdown: wait -> terminate -> kill.

    Idempotent; also clears the client's current_ffmpeg_pid if it still
    points at this process. Fix: Popen.wait(timeout=...) RAISES
    subprocess.TimeoutExpired rather than returning, so the original code
    could blow up (out of synthesize()'s finally) before ever terminating
    or killing a stuck process. The timeouts are now caught so the
    escalation actually runs.
    """
    if self._cleaned_up or not self._proc:
        return
    self._cleaned_up = True

    # Closing stdin signals EOF so ffmpeg can drain and exit on its own.
    if self._proc.stdin and not self._proc.stdin.closed:
        try:
            self._proc.stdin.close()
        except OSError:
            pass

    # Stage 1: give ffmpeg cleanup_timeout seconds to exit gracefully.
    if self._proc.poll() is None:
        try:
            self._proc.wait(timeout=self.tts_client.cleanup_timeout)
        except subprocess.TimeoutExpired:
            pass
    # Stage 2: SIGTERM, then wait terminate_timeout seconds.
    if self._proc.poll() is None:
        self._proc.terminate()
        try:
            self._proc.wait(timeout=self.tts_client.terminate_timeout)
        except subprocess.TimeoutExpired:
            pass
    # Stage 3: SIGKILL and reap (kill cannot be refused on POSIX).
    if self._proc.poll() is None:
        self._proc.kill()
        self._proc.wait()

    if self.tts_client.current_ffmpeg_pid == self._proc.pid:
        self.tts_client.current_ffmpeg_pid = None
class TTSAudioNode(Node):
|
||||
def __init__(self):
    # Sets up the /tts/synthesize service (reentrant callback group so an
    # "interrupt" request can be handled while a synthesis request is in
    # flight) plus the shared playback state.
    super().__init__('tts_audio_node')
    self._load_config()
    self._init_tts_client()

    self.callback_group = ReentrantCallbackGroup()
    self.synthesize_service = self.create_service(
        TTSSynthesize, '/tts/synthesize', self._synthesize_callback,
        callback_group=self.callback_group
    )

    self.interrupt_event = threading.Event()  # signals the playback worker to abort
    self.playing_lock = threading.Lock()      # guards is_playing
    self.is_playing = False

    self.get_logger().info("[TTS] TTS Audio节点已启动")
def _load_config(self):
    # Load voice.yaml from the installed package share directory and cache
    # the soundcard, ffmpeg-pipeline, and DashScope TTS settings on the node.
    config_file = os.path.join(
        get_package_share_directory('robot_speaker'),
        'config',
        'voice.yaml'
    )
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    audio = config['audio']
    soundcard = audio['soundcard']
    tts_audio = audio['tts']
    # NOTE(review): this local shadows the imported `dashscope` module for
    # the rest of this method; harmless here, but easy to trip over.
    dashscope = config['dashscope']

    # Output (playback) device settings.
    self.output_card_index = soundcard['card_index']
    self.output_device_index = soundcard['device_index']
    self.output_sample_rate = soundcard['sample_rate']
    self.output_channels = soundcard['channels']
    self.output_volume = soundcard['volume']

    # ffmpeg pipeline + shutdown timing knobs.
    self.tts_source_sample_rate = tts_audio['source_sample_rate']
    self.tts_source_channels = tts_audio['source_channels']
    self.tts_ffmpeg_thread_queue_size = tts_audio['ffmpeg_thread_queue_size']
    self.force_stop_delay = tts_audio['force_stop_delay']
    self.cleanup_timeout = tts_audio['cleanup_timeout']
    self.terminate_timeout = tts_audio['terminate_timeout']
    self.interrupt_wait = tts_audio['interrupt_wait']

    self.dashscope_api_key = dashscope['api_key']
    self.tts_model = dashscope['tts']['model']
    self.tts_voice = dashscope['tts']['voice']
def _init_tts_client(self):
    # Construct the TTS client from the values cached by _load_config().
    self.tts_client = DashScopeTTSClient(
        api_key=self.dashscope_api_key,
        model=self.tts_model,
        voice=self.tts_voice,
        card_index=self.output_card_index,
        device_index=self.output_device_index,
        output_sample_rate=self.output_sample_rate,
        output_channels=self.output_channels,
        output_volume=self.output_volume,
        tts_source_sample_rate=self.tts_source_sample_rate,
        tts_source_channels=self.tts_source_channels,
        tts_ffmpeg_thread_queue_size=self.tts_ffmpeg_thread_queue_size,
        force_stop_delay=self.force_stop_delay,
        cleanup_timeout=self.cleanup_timeout,
        terminate_timeout=self.terminate_timeout,
        logger=self.get_logger()
    )
def _synthesize_callback(self, request, response):
|
||||
command = request.command if request.command else "synthesize"
|
||||
|
||||
if command == "interrupt":
|
||||
with self.playing_lock:
|
||||
was_playing = self.is_playing
|
||||
has_pid = self.tts_client.current_ffmpeg_pid is not None
|
||||
if was_playing or has_pid:
|
||||
self.interrupt_event.set()
|
||||
self.tts_client.force_stop()
|
||||
self.is_playing = False
|
||||
response.success = True
|
||||
response.message = "已中断播放"
|
||||
response.status = "interrupted"
|
||||
else:
|
||||
response.success = False
|
||||
response.message = "没有正在播放的内容"
|
||||
response.status = "none"
|
||||
return response
|
||||
|
||||
if not request.text or not request.text.strip():
|
||||
response.success = False
|
||||
response.message = "文本为空"
|
||||
response.status = "error"
|
||||
return response
|
||||
|
||||
with self.playing_lock:
|
||||
if self.is_playing:
|
||||
self.tts_client.force_stop()
|
||||
time.sleep(self.interrupt_wait)
|
||||
self.is_playing = True
|
||||
|
||||
self.interrupt_event.clear()
|
||||
|
||||
def synthesize_worker():
|
||||
try:
|
||||
success = self.tts_client.synthesize(
|
||||
request.text.strip(),
|
||||
voice=request.voice if request.voice else None,
|
||||
interrupt_check=lambda: self.interrupt_event.is_set()
|
||||
)
|
||||
with self.playing_lock:
|
||||
self.is_playing = False
|
||||
if self.get_logger():
|
||||
if success:
|
||||
self.get_logger().info("[TTS] 合成并播放成功")
|
||||
else:
|
||||
self.get_logger().info("[TTS] 播放被中断")
|
||||
except Exception as e:
|
||||
with self.playing_lock:
|
||||
self.is_playing = False
|
||||
if self.get_logger():
|
||||
self.get_logger().error(f"[TTS] 合成失败: {e}")
|
||||
|
||||
thread = threading.Thread(target=synthesize_worker, daemon=True)
|
||||
thread.start()
|
||||
|
||||
response.success = True
|
||||
response.message = "合成任务已启动"
|
||||
response.status = "playing"
|
||||
return response
|
||||
|
||||
|
||||
def main(args=None):
    """Entry point: spin the TTS node; shut down cleanly on Ctrl-C.

    Fix: mirrors the ASR node's entry point — KeyboardInterrupt is caught so
    Ctrl-C doesn't dump a traceback, and node/context teardown runs even
    when spin() raises (the original skipped cleanup on any exception).
    """
    rclpy.init(args=args)
    node = TTSAudioNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    finally:
        try:
            node.destroy_node()
        except Exception:
            pass
        try:
            rclpy.shutdown()
        except Exception:
            pass
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
26
setup.py
26
setup.py
@@ -1,26 +1,38 @@
|
||||
from setuptools import find_packages, setup
|
||||
from setuptools import setup, find_packages
|
||||
import os
|
||||
from glob import glob
|
||||
|
||||
package_name = 'robot_speaker'
|
||||
|
||||
setup(
|
||||
name=package_name,
|
||||
version='0.0.0',
|
||||
packages=[package_name],
|
||||
version='0.0.1',
|
||||
packages=find_packages(where='.'),
|
||||
package_dir={'': '.'},
|
||||
data_files=[
|
||||
('share/ament_index/resource_index/packages',
|
||||
['resource/' + package_name]),
|
||||
('share/' + package_name, ['package.xml']),
|
||||
(os.path.join('share', package_name, 'launch'), glob('launch/*.launch.py')),
|
||||
(os.path.join('share', package_name, 'config'), glob('config/*.yaml') + glob('config/*.json')),
|
||||
(os.path.join('share', package_name, 'srv'), glob('srv/*.srv')),
|
||||
],
|
||||
install_requires=[
|
||||
'setuptools',
|
||||
'pypinyin',
|
||||
],
|
||||
install_requires=['setuptools'],
|
||||
zip_safe=True,
|
||||
maintainer='mzebra',
|
||||
maintainer_email='mzebra@foxmail.com',
|
||||
description='TODO: Package description',
|
||||
description='语音识别和合成ROS2包',
|
||||
license='Apache-2.0',
|
||||
tests_require=['pytest'],
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'robot_speaker_node=robot_speaker.robot_speaker_node:main'
|
||||
'robot_speaker_node = robot_speaker.core.robot_speaker_node:main',
|
||||
'register_speaker_node = robot_speaker.core.register_speaker_node:main',
|
||||
'skill_bridge_node = robot_speaker.bridge.skill_bridge_node:main',
|
||||
'asr_audio_node = robot_speaker.services.asr_audio_node:main',
|
||||
'tts_audio_node = robot_speaker.services.tts_audio_node:main',
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
10
srv/ASRRecognize.srv
Normal file
10
srv/ASRRecognize.srv
Normal file
@@ -0,0 +1,10 @@
|
||||
# 请求:启动识别
|
||||
string command # "start" (默认), "stop", "reset"
|
||||
---
|
||||
# 响应:识别结果
|
||||
bool success
|
||||
string text # 识别文本(空字符串表示未识别到)
|
||||
string message # 状态消息
|
||||
|
||||
|
||||
|
||||
27
srv/AudioData.srv
Normal file
27
srv/AudioData.srv
Normal file
@@ -0,0 +1,27 @@
|
||||
# 请求:获取音频数据
|
||||
string command # "start" (开始录音), "stop" (停止并返回), "get" (获取当前缓冲区)
|
||||
int32 duration_ms # 录音时长(毫秒),仅用于start命令
|
||||
---
|
||||
# 响应:音频数据
|
||||
bool success
|
||||
uint8[] audio_data # PCM音频数据(int16格式)
|
||||
int32 sample_rate
|
||||
int32 channels
|
||||
int32 samples # 样本数
|
||||
string message
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
14
srv/TTSSynthesize.srv
Normal file
14
srv/TTSSynthesize.srv
Normal file
@@ -0,0 +1,14 @@
|
||||
# 请求:合成文本或中断命令
|
||||
string command # "synthesize" (默认), "interrupt"
|
||||
string text
|
||||
string voice # 可选,默认使用配置
|
||||
---
|
||||
# 响应:合成状态
|
||||
bool success
|
||||
string message
|
||||
string status # "playing", "completed", "interrupted"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
11
srv/VADEvent.srv
Normal file
11
srv/VADEvent.srv
Normal file
@@ -0,0 +1,11 @@
|
||||
# 请求:等待VAD事件
|
||||
string command # "wait" (等待下一个事件)
|
||||
int32 timeout_ms # 超时时间(毫秒),0表示无限等待
|
||||
---
|
||||
# 响应:VAD事件
|
||||
bool success
|
||||
string event # "speech_started", "speech_stopped", "none"
|
||||
string message
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user