Compare commits
3 Commits
develop
...
feature-de
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e5714e3a8b | ||
|
|
293e69e9f2 | ||
| 6d101b9d9e |
@@ -32,7 +32,7 @@ audio:
|
||||
channels: 2 # 输出声道数:立体声(2声道,FL+FR)
|
||||
volume: 1.0 # 音量比例(0.0-1.0,0.2表示20%音量)
|
||||
echo_cancellation:
|
||||
enable: false # 是否启用回声消除
|
||||
enabled: false # 是否启用回声消除(true/false)
|
||||
max_duration_ms: 500 # 参考信号缓冲区最大时长(毫秒)
|
||||
tts:
|
||||
source_sample_rate: 22050 # TTS服务固定输出采样率(DashScope服务固定值,不可修改)
|
||||
|
||||
@@ -9,6 +9,8 @@
|
||||
|
||||
<depend>rclpy</depend>
|
||||
<depend>std_msgs</depend>
|
||||
<depend>ament_index_python</depend>
|
||||
<depend>interfaces</depend>
|
||||
|
||||
<exec_depend>python3-pyaudio</exec_depend>
|
||||
<exec_depend>python3-requests</exec_depend>
|
||||
|
||||
2
robot_speaker/bridge/__init__.py
Normal file
2
robot_speaker/bridge/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# Bridge package for connecting LLM outputs to brain execution.
|
||||
|
||||
136
robot_speaker/bridge/skill_bridge_node.py
Normal file
136
robot_speaker/bridge/skill_bridge_node.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
桥接LLM技能序列到小脑ExecuteBtAction,并转发反馈/结果。
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import rclpy
|
||||
from rclpy.node import Node
|
||||
from rclpy.action import ActionClient
|
||||
from std_msgs.msg import String
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
from interfaces.action import ExecuteBtAction
|
||||
|
||||
|
||||
class SkillBridgeNode(Node):
    """Bridge LLM-produced skill sequences to the cerebellum's ExecuteBtAction server.

    Subscribes to /llm_skill_sequence (comma/space/semicolon-separated CamelCase
    skill names), validates tokens against the whitelist loaded from the brain
    package's robot_skills.yaml, dispatches the sequence as an ExecuteBtAction
    goal, and republishes action feedback/result as JSON strings on
    /skill_execution_feedback and /skill_execution_result.
    """

    def __init__(self):
        super().__init__('skill_bridge_node')
        self._action_client = ActionClient(self, ExecuteBtAction, '/execute_bt_action')
        self._current_epoch = 1  # monotonically increasing epoch attached to each goal
        self._allowed_skills = self._load_allowed_skills()

        self.skill_seq_sub = self.create_subscription(
            String, '/llm_skill_sequence', self._on_skill_sequence_received, 10
        )
        self.feedback_pub = self.create_publisher(String, '/skill_execution_feedback', 10)
        self.result_pub = self.create_publisher(String, '/skill_execution_result', 10)

        self.get_logger().info('SkillBridgeNode started')

    def _on_skill_sequence_received(self, msg: String):
        """Validate an incoming raw skill-sequence string and dispatch it if acceptable.

        Rejects empty input, rejects everything when no whitelist could be
        loaded (fail-closed), and rejects sequences containing any
        non-whitelisted skill.
        """
        raw = (msg.data or "").strip()
        if not raw:
            return
        if not self._allowed_skills:
            self.get_logger().warning("No skill whitelist loaded; reject all sequences")
            return
        sequence, invalid = self._extract_skill_sequence(raw)
        if invalid:
            self.get_logger().warning(f"Rejected sequence with invalid skills: {invalid}")
            return
        if not sequence:
            self.get_logger().warning(f"Invalid skill sequence: {raw}")
            return
        self._send_skill_sequence(sequence)

    def _load_allowed_skills(self) -> set[str]:
        """Load the skill-name whitelist from brain/config/robot_skills.yaml.

        Returns:
            Set of skill names, or an empty set (which causes all sequences to
            be rejected) when the file is missing or unreadable.
        """
        try:
            brain_share = get_package_share_directory("brain")
            skill_path = os.path.join(brain_share, "config", "robot_skills.yaml")
            if not os.path.exists(skill_path):
                return set()
            import yaml  # local import: only needed when the config file exists
            with open(skill_path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or []
            return {str(entry["name"]) for entry in data if isinstance(entry, dict) and entry.get("name")}
        except Exception as e:
            self.get_logger().warning(f"Load skills failed: {e}")
            return set()

    def _extract_skill_sequence(self, text: str) -> tuple[str, list[str]]:
        """Split *text* into CamelCase skill tokens and check them against the whitelist.

        Returns:
            (comma-joined skill names, list of tokens absent from the whitelist).
            ("", []) when no CamelCase token was found at all.
        """
        # Accept CSV/space/semicolon and filter by CamelCase tokens
        tokens = re.split(r'[,\s;]+', text.strip())
        skills = [t for t in tokens if re.match(r'^[A-Z][A-Za-z0-9]*$', t)]
        if not skills:
            return "", []
        invalid = [s for s in skills if s not in self._allowed_skills]
        return ",".join(skills), invalid

    def _send_skill_sequence(self, skill_sequence: str):
        """Send the validated sequence as an ExecuteBtAction goal (non-blocking).

        BUGFIX: the original implementation called
        rclpy.spin_until_future_complete() from inside a subscription callback
        while the node was already being spun by rclpy.spin() in main().
        Nested spinning of an already-spinning node deadlocks or raises in
        rclpy; the goal/result futures are now chained with done-callbacks so
        the single outer executor drives everything.
        """
        if not self._action_client.wait_for_server(timeout_sec=2.0):
            self.get_logger().error('ExecuteBtAction server unavailable')
            return
        goal = ExecuteBtAction.Goal()
        goal.epoch = self._current_epoch
        self._current_epoch += 1
        goal.action_name = skill_sequence
        goal.calls = []

        self.get_logger().info(f"Dispatch skill sequence: {skill_sequence}")
        send_future = self._action_client.send_goal_async(goal, feedback_callback=self._feedback_callback)
        send_future.add_done_callback(self._on_goal_response)

    def _on_goal_response(self, future):
        """Done-callback for goal submission: chain the result future on acceptance."""
        goal_handle = future.result()
        if not goal_handle or not goal_handle.accepted:
            self.get_logger().error("Goal rejected")
            return
        result_future = goal_handle.get_result_async()
        result_future.add_done_callback(self._on_result_ready)

    def _on_result_ready(self, future):
        """Done-callback for the result future: forward to the result handler."""
        self._handle_result(future.result())

    def _feedback_callback(self, feedback_msg):
        """Republish action feedback as a JSON-encoded String message."""
        fb = feedback_msg.feedback
        payload = {
            "stage": fb.stage,
            "current_skill": fb.current_skill,
            "progress": float(fb.progress),
            "detail": fb.detail,
            "epoch": int(fb.epoch),
        }
        msg = String()
        msg.data = json.dumps(payload, ensure_ascii=True)
        self.feedback_pub.publish(msg)

    def _handle_result(self, result_wrapper):
        """Republish the final action result as a JSON-encoded String message."""
        result = result_wrapper.result
        if not result:
            return
        payload = {
            "success": bool(result.success),
            "message": result.message,
            "total_skills": int(result.total_skills),
            "succeeded_skills": int(result.succeeded_skills),
        }
        msg = String()
        msg.data = json.dumps(payload, ensure_ascii=True)
        self.result_pub.publish(msg)
|
||||
|
||||
|
||||
|
||||
def main(args=None):
    """Entry point: initialise rclpy and spin the bridge node until shutdown.

    BUGFIX/robustness: the original skipped destroy_node()/shutdown() when
    rclpy.spin() was interrupted (e.g. Ctrl-C); cleanup now runs in finally.
    """
    rclpy.init(args=args)
    node = SkillBridgeNode()
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass  # Ctrl-C is a normal way to stop the node, not an error
    finally:
        node.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()
|
||||
|
||||
@@ -2,3 +2,4 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
import os
|
||||
import yaml
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
from pypinyin import pinyin, Style
|
||||
|
||||
@@ -24,6 +27,24 @@ class IntentRouter:
|
||||
self.kb_keywords = [
|
||||
"ni shi shei", "ni de ming zi"
|
||||
]
|
||||
self._cached_skill_names: list[str] | None = None
|
||||
|
||||
def _load_brain_skill_names(self) -> list[str]:
|
||||
if self._cached_skill_names is not None:
|
||||
return self._cached_skill_names
|
||||
skill_names: list[str] = []
|
||||
try:
|
||||
brain_share = get_package_share_directory("brain")
|
||||
skill_path = os.path.join(brain_share, "config", "robot_skills.yaml")
|
||||
with open(skill_path, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f) or []
|
||||
for entry in data:
|
||||
if isinstance(entry, dict) and entry.get("name"):
|
||||
skill_names.append(str(entry["name"]))
|
||||
except Exception:
|
||||
skill_names = []
|
||||
self._cached_skill_names = skill_names
|
||||
return skill_names
|
||||
|
||||
def to_pinyin(self, text: str) -> str:
|
||||
chars = [c for c in text if '\u4e00' <= c <= '\u9fa5']
|
||||
@@ -63,9 +84,19 @@ class IntentRouter:
|
||||
return "head"
|
||||
|
||||
def build_skill_prompt(self) -> str:
|
||||
skills = self._load_brain_skill_names()
|
||||
skills_text = ", ".join(skills) if skills else ""
|
||||
skill_guard = (
|
||||
"【技能限制】只能使用以下技能名称:" + skills_text
|
||||
if skills_text
|
||||
else "【技能限制】技能列表不可用,请不要输出任何技能名称。"
|
||||
)
|
||||
return (
|
||||
"你是机器人任务规划器。\n"
|
||||
"本任务必须拍照。请根据用户请求选择使用哪个相机拍照(默认头部相机),并结合当前环境信息生成简洁、可执行的技能序列。"
|
||||
"本任务必须拍照。请根据用户请求选择使用哪个相机拍照(默认头部相机),并结合当前环境信息生成简洁、可执行的技能序列。\n"
|
||||
"【重要】如果对话历史中包含【执行结果】或【执行状态】,请参考上一轮技能序列的执行情况,根据成功/失败信息调整本次技能序列。\n"
|
||||
"【输出格式要求】只输出逗号分隔的技能名称,不要任何解释说明。\n"
|
||||
+ skill_guard
|
||||
)
|
||||
|
||||
def build_chat_prompt(self, need_camera: bool) -> str:
|
||||
|
||||
@@ -171,7 +171,7 @@ class RegisterSpeakerNode(Node):
|
||||
self.output_volume = soundcard['volume']
|
||||
|
||||
echo = audio.get('echo_cancellation', {})
|
||||
self.audio_echo_cancellation_enabled = echo['enable']
|
||||
self.audio_echo_cancellation_enabled = echo.get('enabled', True) # 默认启用
|
||||
self.audio_echo_cancellation_max_duration_ms = echo.get('max_duration_ms', 200)
|
||||
|
||||
tts_audio = audio.get('tts', {})
|
||||
|
||||
@@ -96,6 +96,16 @@ class RobotSpeakerNode(Node):
|
||||
self.interrupt_sub = self.create_subscription(
|
||||
String, 'interrupt_command', self.callbacks.handle_interrupt_command, self.system_interrupt_command_queue_depth
|
||||
)
|
||||
self.skill_sequence_pub = self.create_publisher(String, '/llm_skill_sequence', 10)
|
||||
self.skill_feedback_sub = self.create_subscription(
|
||||
String, '/skill_execution_feedback', self._on_skill_feedback, 10
|
||||
)
|
||||
self.skill_result_sub = self.create_subscription(
|
||||
String, '/skill_execution_result', self._on_skill_result, 10
|
||||
)
|
||||
|
||||
self.latest_skill_feedback = None
|
||||
self.latest_skill_result = None
|
||||
|
||||
# 启动线程
|
||||
self._start_threads()
|
||||
@@ -129,7 +139,7 @@ class RobotSpeakerNode(Node):
|
||||
self.output_sample_rate = soundcard['sample_rate']
|
||||
self.output_channels = soundcard['channels']
|
||||
self.output_volume = soundcard['volume']
|
||||
self.audio_echo_cancellation_enabled = echo['enable']
|
||||
self.audio_echo_cancellation_enabled = echo.get('enabled', True) # 默认启用
|
||||
self.audio_echo_cancellation_max_duration_ms = echo['max_duration_ms']
|
||||
self.audio_tts_source_sample_rate = tts_audio['source_sample_rate']
|
||||
self.audio_tts_source_channels = tts_audio['source_channels']
|
||||
@@ -243,7 +253,7 @@ class RobotSpeakerNode(Node):
|
||||
on_audio_chunk=self.callbacks.on_audio_chunk_for_sv if self.sv_enabled else None, # 声纹录音回调
|
||||
should_put_to_queue=self.callbacks.should_put_audio_to_queue, # 检查是否应该将音频放入队列
|
||||
get_silence_threshold=self.callbacks.get_silence_threshold, # 动态静音阈值回调
|
||||
enable_echo_cancellation=self.audio_echo_cancellation_enabled, # 启用回声消除
|
||||
enable_echo_cancellation=self.audio_echo_cancellation_enabled, # 从配置文件读取
|
||||
reference_signal_buffer=self.reference_signal_buffer, # 传递参考信号缓冲区
|
||||
logger=self.get_logger()
|
||||
)
|
||||
@@ -452,7 +462,13 @@ class RobotSpeakerNode(Node):
|
||||
self.get_logger().error(f"图像编码失败: {e}")
|
||||
return ""
|
||||
|
||||
def _llm_process_stream_with_camera(self, user_text: str, need_camera: bool, system_prompt: str | None = None) -> str:
|
||||
def _llm_process_stream_with_camera(
|
||||
self,
|
||||
user_text: str,
|
||||
need_camera: bool,
|
||||
system_prompt: str | None = None,
|
||||
suppress_tts: bool = False
|
||||
) -> str:
|
||||
"""LLM流式处理 - 支持多模态(文本+图像)"""
|
||||
if not self.llm_client or not self.history:
|
||||
return ""
|
||||
@@ -531,7 +547,7 @@ class RobotSpeakerNode(Node):
|
||||
tts_text_to_send = ""
|
||||
self.get_logger().warning("[流式TTS] reply和tts_text_buffer都为空,无法发送TTS文本")
|
||||
|
||||
if not self.interrupt_event.is_set() and tts_text_to_send:
|
||||
if not self.interrupt_event.is_set() and tts_text_to_send and not suppress_tts:
|
||||
text_len = len(tts_text_to_send)
|
||||
self.get_logger().info(
|
||||
f"[流式TTS] 发送完整文本到TTS队列: {tts_text_to_send[:100]}... (总长度: {text_len}字符)"
|
||||
@@ -539,6 +555,8 @@ class RobotSpeakerNode(Node):
|
||||
if text_len > 100:
|
||||
self.get_logger().debug(f"[流式TTS] 完整文本内容: {tts_text_to_send}")
|
||||
self._put_tts_text(tts_text_to_send)
|
||||
elif suppress_tts:
|
||||
self.get_logger().info("[流式TTS] suppress_tts开启,跳过TTS输出")
|
||||
|
||||
return reply.strip() if reply else ""
|
||||
|
||||
@@ -723,11 +741,17 @@ class RobotSpeakerNode(Node):
|
||||
reply = self._llm_process_stream_with_camera(
|
||||
text,
|
||||
need_camera=need_camera,
|
||||
system_prompt=system_prompt
|
||||
system_prompt=system_prompt,
|
||||
suppress_tts=(intent == "skill_sequence")
|
||||
)
|
||||
if reply:
|
||||
if self.history:
|
||||
self.history.commit_turn(reply)
|
||||
if intent == "skill_sequence":
|
||||
skill_msg = String()
|
||||
skill_msg.data = reply.strip()
|
||||
self.skill_sequence_pub.publish(skill_msg)
|
||||
self.get_logger().info(f"[技能序列] 已发布: {skill_msg.data}")
|
||||
else:
|
||||
if self.history:
|
||||
self.history.cancel_turn()
|
||||
@@ -772,6 +796,36 @@ class RobotSpeakerNode(Node):
|
||||
|
||||
super().destroy_node()
|
||||
|
||||
def _on_skill_feedback(self, msg: String):
|
||||
try:
|
||||
feedback = json.loads(msg.data)
|
||||
self.latest_skill_feedback = feedback
|
||||
feedback_text = (
|
||||
f"【执行状态】阶段:{feedback.get('stage','')}, "
|
||||
f"技能:{feedback.get('current_skill','')}, "
|
||||
f"进度:{feedback.get('progress', 0):.1%}, "
|
||||
f"详情:{feedback.get('detail','')}"
|
||||
)
|
||||
if self.history:
|
||||
self.history.add_message("system", feedback_text)
|
||||
except Exception as e:
|
||||
self.get_logger().warning(f"[技能反馈] 解析失败: {e}")
|
||||
|
||||
def _on_skill_result(self, msg: String):
|
||||
try:
|
||||
result = json.loads(msg.data)
|
||||
self.latest_skill_result = result
|
||||
result_text = (
|
||||
f"【执行结果】{'成功' if result.get('success') else '失败'}, "
|
||||
f"总技能数:{result.get('total_skills', 0)}, "
|
||||
f"成功数:{result.get('succeeded_skills', 0)}, "
|
||||
f"消息:{result.get('message','')}"
|
||||
)
|
||||
if self.history:
|
||||
self.history.add_message("system", result_text)
|
||||
except Exception as e:
|
||||
self.get_logger().warning(f"[技能结果] 解析失败: {e}")
|
||||
|
||||
|
||||
def _init_ros(args):
|
||||
rclpy.init(args=args)
|
||||
|
||||
@@ -2,3 +2,4 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -2,3 +2,4 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -10,3 +10,4 @@ class ASRClient:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -116,56 +116,45 @@ class DashScopeASR(ASRClient):
|
||||
|
||||
def stop_current_recognition(self):
|
||||
"""
|
||||
停止当前识别,触发final结果,然后重新启动
|
||||
优化:
|
||||
1. 使用事件代替 sleep,等待 final 回调完成
|
||||
2. 使用锁防止并发调用
|
||||
3. 处理 start() 失败的情况,确保 running 状态正确
|
||||
4. 添加超时机制,避免无限等待
|
||||
触发提交操作获取当前识别结果,但不关闭连接
|
||||
"""
|
||||
if not self.running or not self.conversation:
|
||||
return False
|
||||
|
||||
# 使用锁防止并发调用
|
||||
if not self._stop_lock.acquire(blocking=False):
|
||||
self._log("warning", "stop_current_recognition 正在执行,跳过本次调用")
|
||||
return False
|
||||
|
||||
|
||||
try:
|
||||
if not self.running or not self.conversation:
|
||||
return False
|
||||
|
||||
# 重置事件,准备等待 final 回调
|
||||
self._final_result_event.clear()
|
||||
self._pending_commit = True
|
||||
|
||||
|
||||
# 触发 commit,等待 final 结果
|
||||
self.conversation.commit()
|
||||
|
||||
|
||||
# 等待 final 回调完成(最多等待1秒)
|
||||
if self._final_result_event.wait(timeout=1.0):
|
||||
self._log("debug", "已收到 final 回调,准备关闭连接")
|
||||
self._log("debug", "已收到 final 回调")
|
||||
else:
|
||||
self._log("warning", "等待 final 回调超时,继续执行")
|
||||
|
||||
# 先设置running=False,防止ASR线程继续发送音频
|
||||
self.running = False
|
||||
|
||||
# 关闭当前连接
|
||||
old_conversation = self.conversation
|
||||
self.conversation = None # 立即清空,防止send_audio继续使用
|
||||
try:
|
||||
old_conversation.close()
|
||||
except Exception as e:
|
||||
self._log("warning", f"关闭连接时出错: {e}")
|
||||
|
||||
# 短暂等待,确保连接完全关闭
|
||||
time.sleep(0.1)
|
||||
|
||||
# 重新启动,如果失败则保持 running=False
|
||||
if not self.start():
|
||||
self._log("error", "ASR重启失败,running状态已重置")
|
||||
return False
|
||||
|
||||
# 启动成功,running已在start()中设置为True
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"提交当前识别结果失败: {e}")
|
||||
# 出现错误时尝试重启连接
|
||||
self.running = False
|
||||
try:
|
||||
if self.conversation:
|
||||
self.conversation.close()
|
||||
except:
|
||||
pass
|
||||
self.conversation = None
|
||||
time.sleep(0.1)
|
||||
return self.start()
|
||||
|
||||
finally:
|
||||
self._pending_commit = False
|
||||
self._stop_lock.release()
|
||||
|
||||
@@ -2,3 +2,4 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -12,3 +12,4 @@ class LLMClient:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -2,3 +2,4 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -11,3 +11,4 @@ class TTSClient:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -2,3 +2,4 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -107,11 +107,19 @@ class SpeakerVerificationClient:
|
||||
|
||||
temp_wav_path = None
|
||||
try:
|
||||
temp_wav_path = self._write_temp_wav(audio_data, sample_rate)
|
||||
result = self.model.generate(input=temp_wav_path)
|
||||
|
||||
# 限制Torch在推理时使用单线程,避免在多任务环境下(尤其是一边录音一边识别)
|
||||
# 出现的极端CPU竞争和上下文切换开销
|
||||
import torch
|
||||
embedding = result[0]['spk_embedding'].detach().cpu().numpy()[0] # shape [1, 192] -> [192]
|
||||
with torch.inference_mode():
|
||||
# 临时设置,虽然全局已经设置了,但在调用前再次确保
|
||||
# 注意:set_num_threads 是全局的,这里再次确认
|
||||
if torch.get_num_threads() != 1:
|
||||
torch.set_num_threads(1)
|
||||
|
||||
temp_wav_path = self._write_temp_wav(audio_data, sample_rate)
|
||||
result = self.model.generate(input=temp_wav_path)
|
||||
|
||||
embedding = result[0]['spk_embedding'].detach().cpu().numpy()[0] # shape [1, 192] -> [192]
|
||||
|
||||
embedding_dim = len(embedding)
|
||||
if embedding_dim == 0:
|
||||
|
||||
@@ -2,3 +2,4 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user