配置文件增加“未获取到图像时 skill_sequence/chat_camera 是否继续推理”的开关(continue_without_image),扩充 kb_qa 的回复,减少闲聊模式的回复长度
This commit is contained in:
@@ -23,9 +23,10 @@ pip3 install -r requirements.txt --break-system-packages
|
||||
## 编译启动
|
||||
1. 注册声纹
|
||||
- 启动节点后可以说:er gou我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去。
|
||||
- 正确的注册姿势:包含唤醒词二狗,不要停顿的说完3秒
|
||||
- 正确的注册姿势:包含唤醒词“二狗”,不要停顿,尽量连续说满3秒
|
||||
|
||||
- 现在的逻辑只要识别到二狗就注册,然后退出节点,识别不到二狗继续等待
|
||||
- 多注册几段,换方向距离注册,可以提高识别相似度,注册方向对声纹相似性影响很大
|
||||
```bash
|
||||
cd ~/ros_learn/hivecore_robot_voice
|
||||
colcon build
|
||||
|
||||
@@ -1,11 +1,18 @@
|
||||
{
|
||||
"entries": [
|
||||
{
|
||||
"id": "robot_identity",
|
||||
"id": "robot_identity_1",
|
||||
"patterns": [
|
||||
"ni shi shui"
|
||||
],
|
||||
"answer": "我叫二狗,是蜂核科技的机器人,很高兴为你服务"
|
||||
},
|
||||
{
|
||||
"id": "robot_identity_2",
|
||||
"patterns": [
|
||||
"ni jiao sha"
|
||||
],
|
||||
"answer": "我叫二狗呀,我是你的好帮手"
|
||||
},
|
||||
{
|
||||
"id": "wake_word",
|
||||
@@ -13,6 +20,27 @@
|
||||
"ni de ming zi"
|
||||
],
|
||||
"answer": "我的名字是二狗"
|
||||
},
|
||||
{
|
||||
"id": "skill_1",
|
||||
"patterns": [
|
||||
"tiao ge wu"
|
||||
],
|
||||
"answer": "这个我真不会,我怕跳起来吓到你"
|
||||
},
|
||||
{
|
||||
"id": "skill_2",
|
||||
"patterns": [
|
||||
"ni neng gan"
|
||||
],
|
||||
"answer": "我可以陪你聊天,也能帮你干活"
|
||||
},
|
||||
{
|
||||
"id": "skill_3",
|
||||
"patterns": [
|
||||
"ni hui gan"
|
||||
],
|
||||
"answer": "我可以陪你聊天,你也可以发布具体的指令让我干活"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -18,20 +18,20 @@ dashscope:
|
||||
|
||||
audio:
|
||||
microphone:
|
||||
device_index: 3 # 指向 iFLYTEK-M2 (hw:1,0)
|
||||
sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz,避免重采样可能导致的问题
|
||||
# device_index: -1 # 使用系统默认输入设备
|
||||
# sample_rate: 16000
|
||||
# device_index: 3 # 指向 iFLYTEK-M2 (hw:1,0)
|
||||
# sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz,避免重采样可能导致的问题
|
||||
device_index: -1 # 使用系统默认输入设备
|
||||
sample_rate: 16000
|
||||
channels: 1 # 输入声道数:单声道(MONO,适合语音采集)
|
||||
chunk: 1024
|
||||
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
|
||||
soundcard:
|
||||
card_index: 1 # USB Audio Device (card 1)
|
||||
device_index: 0 # USB Audio [USB Audio] (device 0)
|
||||
sample_rate: 48000 # 输出采样率:48kHz(iFLYTEK 支持 48000)
|
||||
# card_index: -1 # 使用默认声卡
|
||||
# device_index: -1 # 使用默认输出设备
|
||||
# sample_rate: 44100 # 输出采样率:默认 44100
|
||||
# card_index: 1 # USB Audio Device (card 1)
|
||||
# device_index: 0 # USB Audio [USB Audio] (device 0)
|
||||
# sample_rate: 48000 # 输出采样率:48kHz(iFLYTEK 支持 48000)
|
||||
card_index: -1 # 使用默认声卡
|
||||
device_index: -1 # 使用默认输出设备
|
||||
sample_rate: 48000 # 输出采样率:48kHz(原默认值为 44100)
|
||||
channels: 2 # 输出声道数:立体声(2声道,FL+FR)
|
||||
volume: 1.0 # 音量比例(0.0-1.0,0.2表示20%音量)
|
||||
tts:
|
||||
@@ -54,10 +54,11 @@ system:
|
||||
sv_enabled: true # 是否启用声纹识别
|
||||
# sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
|
||||
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
|
||||
sv_threshold: 0.45 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
|
||||
sv_threshold: 0.40 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
|
||||
# sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json" # 声纹数据库保存路径(JSON格式,相对于ROS2包share目录)
|
||||
sv_speaker_db_path: "~/ros_learn/hivecore_robot_voice/config/speakers.json" # 声纹数据库保存路径(JSON格式,路径中的 ~ 会展开为用户主目录)
|
||||
sv_buffer_size: 240000 # 声纹验证录音缓冲区大小(样本数,48kHz下5秒=240000;注意:若按麦克风 16kHz 采样率计算则约为15秒,待确认)
|
||||
continue_without_image: false # 多模态意图(skill_sequence/chat_camera)未获取到图片时是否继续推理
|
||||
|
||||
camera:
|
||||
image:
|
||||
|
||||
@@ -36,9 +36,10 @@ class IntentRouter:
|
||||
"fang xia", "fang zhi", # 放下、放置
|
||||
"ju qi", "sheng qi", # 举起、升起
|
||||
"jia zhua", "jia qi", "jia", # 夹爪、夹起、夹
|
||||
"shen you bi", "shen zuo bi", "shen chu", "shen shou", # 伸右臂、伸左臂、伸出、伸手
|
||||
]
|
||||
self.kb_keywords = [
|
||||
"ni shi shui", "ni de ming zi"
|
||||
"ni shi shui", "ni de ming zi", "tiao ge wu", "ni jiao sha", "ni hui gan", "ni neng gan"
|
||||
]
|
||||
self._cached_skill_names: list[str] | None = None
|
||||
self._cached_kb_data: list[dict] | None = None
|
||||
@@ -96,8 +97,15 @@ class IntentRouter:
|
||||
if text_pinyin is None:
|
||||
text_pinyin = self.to_pinyin(text)
|
||||
|
||||
# 检查动作词(精确匹配)
|
||||
return any(action in text_pinyin for action in self.action_verbs)
|
||||
# 检查动作词(精确匹配:动作词必须是完整的单词序列)
|
||||
text_words = text_pinyin.split()
|
||||
for action in self.action_verbs:
|
||||
action_words = action.split()
|
||||
# 检查动作词的单词序列是否是文本单词序列的连续子序列
|
||||
for i in range(len(text_words) - len(action_words) + 1):
|
||||
if text_words[i:i+len(action_words)] == action_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def check_camera_command(self, text: str, text_pinyin: str | None = None) -> tuple[bool, Optional[str]]:
|
||||
@@ -184,11 +192,11 @@ class IntentRouter:
|
||||
if need_camera:
|
||||
return (
|
||||
"你是一个智能语音助手。\n"
|
||||
"请结合图片内容简短回答。"
|
||||
"请结合图片内容简短回答。不要超过100个token。"
|
||||
)
|
||||
return (
|
||||
"你是一个智能语音助手。\n"
|
||||
"请自然、简短地与用户对话。"
|
||||
"请自然、简短地与用户对话。不要超过100个token。"
|
||||
)
|
||||
|
||||
def _load_kb_data(self) -> list[dict]:
|
||||
@@ -236,12 +244,13 @@ class IntentRouter:
|
||||
need_camera, camera_mode = self.check_camera_command(text, text_pinyin)
|
||||
|
||||
if self.is_skill_sequence_intent(text, text_pinyin):
|
||||
# 用户没有指定相机模式时,保持 None,使用第一个收到的消息
|
||||
# 技能序列意图总是需要相机,复用 detect_camera_mode:用户指定了相机就用指定的,否则默认 "top"
|
||||
skill_camera_mode = self.detect_camera_mode(text, text_pinyin)
|
||||
return IntentResult(
|
||||
intent="skill_sequence",
|
||||
text=text,
|
||||
need_camera=True,
|
||||
camera_mode=camera_mode,
|
||||
camera_mode=skill_camera_mode,
|
||||
system_prompt=self.build_skill_prompt()
|
||||
)
|
||||
|
||||
|
||||
@@ -151,6 +151,7 @@ class RobotSpeakerNode(Node):
|
||||
self.sv_threshold = system['sv_threshold']
|
||||
self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
|
||||
self.sv_buffer_size = system['sv_buffer_size']
|
||||
self.continue_without_image = system.get('continue_without_image', True)
|
||||
|
||||
camera = config['camera']
|
||||
self.camera_image_jpeg_quality = camera['image']['jpeg_quality']
|
||||
@@ -728,15 +729,6 @@ class RobotSpeakerNode(Node):
|
||||
except Exception as e:
|
||||
self.get_logger().error(f"[相机] 图像转换失败: {e}")
|
||||
return None
|
||||
elif camera_mode is None and len(self.img_msg_cache) > 0:
|
||||
msg = next(iter(self.img_msg_cache.values()))
|
||||
try:
|
||||
cv_image = self.cv_bridge.imgmsg_to_cv2(msg.image_color, desired_encoding='rgb8')
|
||||
self.get_logger().info(f"[相机] 未指定相机位置,使用{msg.position}相机获取图像成功")
|
||||
return cv_image
|
||||
except Exception as e:
|
||||
self.get_logger().error(f"[相机] 图像转换失败: {e}")
|
||||
return None
|
||||
time.sleep(0.1)
|
||||
|
||||
with self.img_msg_lock:
|
||||
@@ -809,6 +801,13 @@ class RobotSpeakerNode(Node):
|
||||
actual_position = camera_mode if camera_mode and camera_mode in self.img_msg_cache else (list(self.img_msg_cache.keys())[0] if self.img_msg_cache else "unknown")
|
||||
self.get_logger().info(f"[相机] 已拍照,使用{actual_position}相机 (期望位置={camera_mode_str})")
|
||||
|
||||
if not image_base64_list and intent in ["skill_sequence", "chat_camera"]:
|
||||
if not self.continue_without_image:
|
||||
self.get_logger().warning(f"[多模态] {intent}意图未获取到图片,跳过推理(continue_without_image=False)")
|
||||
return ""
|
||||
else:
|
||||
self.get_logger().info(f"[多模态] {intent}意图未获取到图片,继续推理(continue_without_image=True)")
|
||||
|
||||
if image_base64_list:
|
||||
self.get_logger().info(f"[LLM] 准备发送给LLM: {len(image_base64_list)}张图片,用户文本: {user_text[:50]}")
|
||||
for idx, img_b64 in enumerate(image_base64_list):
|
||||
|
||||
Reference in New Issue
Block a user