配置文件增加开关:skill_sequence/chat_camera 意图未获取到图像时是否继续推理;扩充 kb_qa 的回复;减少闲聊模式的回复长度

This commit is contained in:
lxy
2026-01-21 18:04:26 +08:00
parent ab1fb4f3f8
commit e8a9821ce4
5 changed files with 67 additions and 29 deletions

View File

@@ -23,9 +23,10 @@ pip3 install -r requirements.txt --break-system-packages
## 编译启动
1. 注册声纹
- 启动节点后可以说:“er gou,我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去。”
- 正确的注册姿势包含唤醒词二狗不要停顿的说完3秒
- 正确的注册姿势:包含唤醒词二狗,不要停顿,尽量说满3秒
- 现在的逻辑只要识别到二狗就注册,然后退出节点,识别不到二狗继续等待
- 多注册几段,并变换方向和距离进行注册,可以提高识别相似度;注册方向对声纹相似性影响很大
```bash
cd ~/ros_learn/hivecore_robot_voice
colcon build

View File

@@ -1,11 +1,18 @@
{
"entries": [
{
"id": "robot_identity",
"id": "robot_identity_1",
"patterns": [
"ni shi shui"
],
"answer": "我叫二狗,是蜂核科技的机器人,很高兴为你服务"
},
{
"id": "robot_identity_2",
"patterns": [
"ni jiao sha"
],
"answer": "我叫二狗呀,我是你的好帮手"
},
{
"id": "wake_word",
@@ -13,6 +20,27 @@
"ni de ming zi"
],
"answer": "我的名字是二狗"
},
{
"id": "skill_1",
"patterns": [
"tiao ge wu"
],
"answer": "这个我真不会,我怕跳起来吓到你"
},
{
"id": "skill_2",
"patterns": [
"ni neng gan"
],
"answer": "我可以陪你聊天,也能帮你干活"
},
{
"id": "skill_3",
"patterns": [
"ni hui gan"
],
"answer": "我可以陪你聊天,你也可以发布具体的指令让我干活"
}
]
}

View File

@@ -18,20 +18,20 @@ dashscope:
audio:
microphone:
device_index: 3 # 指向 iFLYTEK-M2 (hw:1,0)
sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz,避免重采样可能导致的问题
# device_index: -1 # 使用系统默认输入设备
# sample_rate: 16000
# device_index: 3 # 指向 iFLYTEK-M2 (hw:1,0)
# sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz,避免重采样可能导致的问题
device_index: -1 # 使用系统默认输入设备
sample_rate: 16000
channels: 1 # 输入声道数:单声道(MONO),适合语音采集
chunk: 1024
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
soundcard:
card_index: 1 # USB Audio Device (card 1)
device_index: 0 # USB Audio [USB Audio] (device 0)
sample_rate: 48000 # 输出采样率:48kHz(iFLYTEK 支持 48000)
# card_index: -1 # 使用默认声卡
# device_index: -1 # 使用默认输出设备
# sample_rate: 44100 # 输出采样率:默认 44100
# card_index: 1 # USB Audio Device (card 1)
# device_index: 0 # USB Audio [USB Audio] (device 0)
# sample_rate: 48000 # 输出采样率:48kHz(iFLYTEK 支持 48000)
card_index: -1 # 使用默认声卡
device_index: -1 # 使用默认输出设备
sample_rate: 48000 # 输出采样率:48000(此前的默认配置为 44100)
channels: 2 # 输出声道数:立体声(2声道,FL+FR)
volume: 1.0 # 音量比例(0.0-1.0,0.2表示20%音量)
tts:
@@ -54,10 +54,11 @@ system:
sv_enabled: true # 是否启用声纹识别
# sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
sv_threshold: 0.45 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
sv_threshold: 0.40 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
# sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json" # 声纹数据库保存路径(JSON格式,相对于ROS2包share目录)
sv_speaker_db_path: "~/ros_learn/hivecore_robot_voice/config/speakers.json" # 声纹数据库保存路径(JSON格式,相对于ROS2包share目录)
sv_buffer_size: 240000 # 声纹验证录音缓冲区大小(样本数,48kHz下5秒=240000)
continue_without_image: false # 多模态意图(skill_sequence/chat_camera)未获取到图片时是否继续推理
camera:
image:

View File

@@ -36,9 +36,10 @@ class IntentRouter:
"fang xia", "fang zhi", # 放下、放置
"ju qi", "sheng qi", # 举起、升起
"jia zhua", "jia qi", "jia", # 夹爪、夹起、夹
"shen you bi", "shen zuo bi", "shen chu", "shen shou", # 伸右臂、伸左臂、伸出、伸手
]
self.kb_keywords = [
"ni shi shui", "ni de ming zi"
"ni shi shui", "ni de ming zi", "tiao ge wu", "ni jiao sha", "ni hui gan", "ni neng gan"
]
self._cached_skill_names: list[str] | None = None
self._cached_kb_data: list[dict] | None = None
@@ -96,8 +97,15 @@ class IntentRouter:
if text_pinyin is None:
text_pinyin = self.to_pinyin(text)
# 检查动作词(精确匹配)
return any(action in text_pinyin for action in self.action_verbs)
# 检查动作词(精确匹配:动作词必须是完整的单词序列)
text_words = text_pinyin.split()
for action in self.action_verbs:
action_words = action.split()
# 检查动作词的单词序列是否是文本单词序列的连续子序列
for i in range(len(text_words) - len(action_words) + 1):
if text_words[i:i+len(action_words)] == action_words:
return True
return False
def check_camera_command(self, text: str, text_pinyin: str | None = None) -> tuple[bool, Optional[str]]:
@@ -184,11 +192,11 @@ class IntentRouter:
if need_camera:
return (
"你是一个智能语音助手。\n"
"请结合图片内容简短回答。"
"请结合图片内容简短回答。不要超过100个token。"
)
return (
"你是一个智能语音助手。\n"
"请自然、简短地与用户对话。"
"请自然、简短地与用户对话。不要超过100个token。"
)
def _load_kb_data(self) -> list[dict]:
@@ -236,12 +244,13 @@ class IntentRouter:
need_camera, camera_mode = self.check_camera_command(text, text_pinyin)
if self.is_skill_sequence_intent(text, text_pinyin):
# 用户没有指定相机模式时,保持 None使用第一个收到的消息
# 技能序列意图总是需要相机,复用 detect_camera_mode:用户指定相机就用指定的,否则默认 "top"
skill_camera_mode = self.detect_camera_mode(text, text_pinyin)
return IntentResult(
intent="skill_sequence",
text=text,
need_camera=True,
camera_mode=camera_mode,
camera_mode=skill_camera_mode,
system_prompt=self.build_skill_prompt()
)

View File

@@ -151,6 +151,7 @@ class RobotSpeakerNode(Node):
self.sv_threshold = system['sv_threshold']
self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
self.sv_buffer_size = system['sv_buffer_size']
self.continue_without_image = system.get('continue_without_image', True)
camera = config['camera']
self.camera_image_jpeg_quality = camera['image']['jpeg_quality']
@@ -728,15 +729,6 @@ class RobotSpeakerNode(Node):
except Exception as e:
self.get_logger().error(f"[相机] 图像转换失败: {e}")
return None
elif camera_mode is None and len(self.img_msg_cache) > 0:
msg = next(iter(self.img_msg_cache.values()))
try:
cv_image = self.cv_bridge.imgmsg_to_cv2(msg.image_color, desired_encoding='rgb8')
self.get_logger().info(f"[相机] 未指定相机位置,使用{msg.position}相机获取图像成功")
return cv_image
except Exception as e:
self.get_logger().error(f"[相机] 图像转换失败: {e}")
return None
time.sleep(0.1)
with self.img_msg_lock:
@@ -809,6 +801,13 @@ class RobotSpeakerNode(Node):
actual_position = camera_mode if camera_mode and camera_mode in self.img_msg_cache else (list(self.img_msg_cache.keys())[0] if self.img_msg_cache else "unknown")
self.get_logger().info(f"[相机] 已拍照,使用{actual_position}相机 (期望位置={camera_mode_str})")
if not image_base64_list and intent in ["skill_sequence", "chat_camera"]:
if not self.continue_without_image:
self.get_logger().warning(f"[多模态] {intent}意图未获取到图片跳过推理continue_without_image=False")
return ""
else:
self.get_logger().info(f"[多模态] {intent}意图未获取到图片继续推理continue_without_image=True")
if image_base64_list:
self.get_logger().info(f"[LLM] 准备发送给LLM: {len(image_base64_list)}张图片,用户文本: {user_text[:50]}")
for idx, img_b64 in enumerate(image_base64_list):