fix deploy issues

2026-01-19 09:09:28 +08:00
parent 9fd658990c
commit c282f9b4de
9 changed files with 135 additions and 32 deletions
--- a/README.md
+++ b/README.md
@@ -9,19 +9,28 @@ https://bailian.console.aliyun.com/?tab=model#/api-key
 1. 系统依赖
 ```bash
 sudo apt-get update
-sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg
+sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg swig meson ninja-build build-essential pkg-config libwebrtc-audio-processing-dev
 ```

 2. Python依赖
 ```bash
 cd ~/ros_learn/hivecore_robot_voice
+# 在 Python 3.10 环境下，需要单独安装 aec-audio-processing 以跳过版本检查
+pip3 install aec-audio-processing --no-binary :all: --ignore-requires-python --break-system-packages
 pip3 install -r requirements.txt --break-system-packages
 ```

 ## 编译启动
 1. 注册声纹
  - 启动节点后可以说：二狗今天天气真好开始注册声纹
-  - 注意：要包含唤醒词，语句不要停顿，尽量大于3秒
+  - 正确的注册姿势：
+    方法A（推荐）：唤醒后停顿一下，然后说一段长句子。
+    用户："二狗"
+    机器：（日志提示等待声纹语音）
+    用户："我现在正在注册声纹，这是一段很长的测试语音，请把我的声音录进去。"（持续说 3-5 秒）
+    方法B（连贯说）：一口气说很长的一句话。
+    用户："二狗你好，我是你的主人，请记住我的声音，这是一段用来注册的长语音。"
+  - 注意：方法B要包含唤醒词且语句中途不要停顿（方法A可在唤醒词后停顿）；注册语音尽量大于1.5秒
 ```bash
 cd ~/ros_learn/hivecore_robot_voice
 colcon build
--- a/config/voice.yaml
+++ b/config/voice.yaml
@@ -18,8 +18,8 @@ dashscope:

 audio:
  microphone:
-    device_index: -1  # -1 表示使用默认设备
-    sample_rate: 16000  # 输入采样率：16kHz（语音识别常用采样率）
+    device_index: 3  # 指向 iFLYTEK-M2 (hw:1,0)
+    sample_rate: 48000  # 尝试使用硬件原生采样率 48kHz，避免重采样可能导致的问题
    channels: 1  # 输入声道数：单声道（MONO，适合语音采集）
    chunk: 1024
    heartbeat_interval: 2.0  # 心跳间隔（秒），用于定期输出录音状态
@@ -28,7 +28,7 @@ audio:
    device_index: 0  # USB Audio [USB Audio] (device 0)
    # card_index: -1  # 使用默认声卡
    # device_index: -1  # 使用默认输出设备
-    sample_rate: 44100  # 输出采样率：44.1kHz（支持48000或44100）
+    sample_rate: 48000  # 输出采样率：48kHz（iFLYTEK 支持 48000）
    channels: 2  # 输出声道数：立体声（2声道，FL+FR）
    volume: 1.0  # 音量比例（0.0-1.0，0.2表示20%音量）
  echo_cancellation:
@@ -41,7 +41,7 @@ audio:
 vad:
  vad_mode: 3  # VAD模式：0-3，3最严格
  silence_duration_ms: 1000  # 静音持续时长（毫秒）
-  min_energy_threshold: 300  # 最小能量阈值
+  min_energy_threshold: 200  # 最小能量阈值

 system:
  use_llm: true  # 是否使用LLM
@@ -51,10 +51,10 @@ system:
  shutup_keywords: "bi zui"  # 闭嘴指令关键词（拼音，逗号分隔）
  interrupt_command_queue_depth: 10  # 中断命令订阅的队列深度（QoS）
  sv_enabled: true  # 是否启用声纹识别
-  sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common"  # 声纹模型路径
+  sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
  sv_threshold: 0.55  # 声纹识别阈值（0.0-1.0，值越小越宽松，值越大越严格）
-  sv_speaker_db_path: "config/speakers.json"  # 声纹数据库保存路径（JSON格式，相对于ROS2包share目录）
-  sv_buffer_size: 64000  # 声纹验证录音缓冲区大小（样本数）
+  sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json"  # Speaker database path (JSON); a leading "~" is expanded to the user's home directory
+  sv_buffer_size: 240000  # 声纹验证录音缓冲区大小（样本数，48kHz下5秒=240000）
  sv_registration_silence_threshold_ms: 500  # 声纹注册状态下的静音阈值（毫秒）

 camera:
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ numpy>=1.24.0
 PyYAML>=6.0
 aec-audio-processing
 modelscope>=1.33.0
+funasr>=1.0.0
 datasets==3.6.0


--- a/robot_speaker/core/node_workers.py
+++ b/robot_speaker/core/node_workers.py
@@ -60,9 +60,9 @@ class NodeWorkers:
                        current_seq = node.sv_result_seq
                        if not node.sv_result_cv.wait_for(
                            lambda: node.sv_result_seq > current_seq,
-                            timeout=2.0
+                            timeout=15.0
                        ):
-                            node.get_logger().warning("[主线程] CHECK_VOICE状态：声纹结果未ready（超时2秒），拒绝本轮")
+                            node.get_logger().warning("[主线程] CHECK_VOICE状态：声纹结果未ready（超时15秒），拒绝本轮")
                            with node.sv_lock:
                                node.sv_audio_buffer.clear()
                            node._change_state(ConversationState.IDLE, "声纹结果未ready")
--- a/robot_speaker/core/register_speaker_node.py
+++ b/robot_speaker/core/register_speaker_node.py
@@ -16,6 +16,8 @@ from ament_index_python.packages import get_package_share_directory
 from robot_speaker.perception.audio_pipeline import VADDetector, AudioRecorder
 from robot_speaker.perception.speaker_verifier import SpeakerVerificationClient
 from robot_speaker.models.asr.dashscope import DashScopeASR
+from robot_speaker.models.tts.dashscope import DashScopeTTSClient
+from robot_speaker.core.types import TTSRequest
 from pypinyin import pinyin, Style


@@ -100,6 +102,15 @@ class RegisterSpeakerNode(Node):
            logger=self.get_logger()
        )

+        self.tts_client = DashScopeTTSClient(
+            api_key=self.dashscope_api_key,
+            model=self.tts_model,
+            voice=self.tts_voice,
+            card_index=self.output_card_index,
+            device_index=self.output_device_index,
+            logger=self.get_logger()
+        )
+
        self.get_logger().info("声纹注册节点启动，请说'er gou......'唤醒注册")
        self.recording_thread = threading.Thread(
            target=self.audio_recorder.record_with_vad,
@@ -122,12 +133,15 @@ class RegisterSpeakerNode(Node):
        dashscope = config['dashscope']
        audio = config['audio']
        mic = audio['microphone']
+        soundcard = audio['soundcard']
        vad = config['vad']
        system = config['system']

        self.dashscope_api_key = dashscope['api_key']
        self.asr_model = dashscope['asr']['model']
        self.asr_url = dashscope['asr']['url']
+        self.tts_model = dashscope['tts']['model']
+        self.tts_voice = dashscope['tts']['voice']
        
        self.input_device_index = mic['device_index']
        self.sample_rate = mic['sample_rate']
@@ -135,13 +149,16 @@ class RegisterSpeakerNode(Node):
        self.chunk = mic['chunk']
        self.audio_microphone_heartbeat_interval = mic['heartbeat_interval']

+        self.output_card_index = soundcard['card_index']
+        self.output_device_index = soundcard['device_index']
+
        self.vad_mode = vad['vad_mode']
        self.silence_duration_ms = vad['silence_duration_ms']
        self.min_energy_threshold = vad['min_energy_threshold']

        self.sv_model_path = os.path.expanduser(system['sv_model_path'])
        self.sv_threshold = system['sv_threshold']
-        self.sv_speaker_db_path = system['sv_speaker_db_path']
+        self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
        self.sv_buffer_size = system['sv_buffer_size']
        self.wake_word = system['wake_word']

@@ -183,7 +200,8 @@ class RegisterSpeakerNode(Node):
        # 等待声纹语音时，用户说话结束，使用当前音频（即使不足3秒）
        if self.waiting_for_voiceprint:
            self._process_voiceprint_audio(use_current_audio_if_short=True)
-    
+            return  # 处理完毕后直接返回，防止重复调用
+
    def _process_voiceprint_audio(self, use_current_audio_if_short: bool = False):
        """处理声纹音频：使用用户完整的第一段语音进行注册
        
@@ -200,8 +218,7 @@ class RegisterSpeakerNode(Node):
        buffer_sec = buffer_size / self.sample_rate
        self.get_logger().info(f"[注册录音] 当前音频长度: {buffer_sec:.2f}秒")

-        # 需要3秒音频
-        required_samples = int(self.sample_rate * 3.0)
+        required_samples = int(self.sample_rate * 3)
        
        # 如果音频不足3秒
        if buffer_size < required_samples:
@@ -215,9 +232,19 @@ class RegisterSpeakerNode(Node):
                self.processing = False
                return
        else:
-            # 音频达到3秒，截取最后3秒
-            audio_to_use = audio_list[-required_samples:]
-            self.get_logger().info(f"[注册录音] 使用完整的第一段语音，截取最后3秒用于注册")
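+            # The buffer holds at least 3 seconds here (the shorter case is handled
+            # by the branch above), so keep only the most recent 3.0 seconds.
+            # Wake-word detection lags, so "er gou" tends to sit in the middle-to-late
+            # part of the buffer; the trailing window is intended to still cover it.
+            # When the buffer is exactly 3 seconds long it is used as-is.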
+            target_samples = int(self.sample_rate * 3.0)
+            if buffer_size > target_samples:
+                audio_to_use = audio_list[-target_samples:]
+            else:
+                audio_to_use = audio_list
+            
+            duration = len(audio_to_use) / self.sample_rate
+            self.get_logger().info(f"[注册录音] 使用最近 {duration:.2f} 秒音频用于注册（覆盖唤醒词）")
        
        # 清空缓冲区
        with self.buffer_lock:
@@ -237,6 +264,16 @@ class RegisterSpeakerNode(Node):
            speaker_id = f"user_{int(time.time())}"
            if self.sv_client.register_speaker(speaker_id, embedding):
                self.get_logger().info(f"[注册录音] 注册成功，用户ID: {speaker_id}，准备退出")
+                
+                # 播放成功提示
+                try:
+                    self.get_logger().info("[注册录音] 播放注册成功提示")
+                    request = TTSRequest(text="声纹注册成功", voice=self.tts_voice)
+                    self.tts_client.synthesize(request)
+                    time.sleep(0.5) 
+                except Exception as e:
+                    self.get_logger().error(f"[注册录音] 播放提示失败: {e}")
+                
                self.stop_event.set()
            else:
                self.get_logger().error("[注册录音] 注册失败")
@@ -264,7 +301,7 @@ class RegisterSpeakerNode(Node):
            
            # 使用更低的阈值来检测人声（降低阈值，避免误判静音）
            # 阈值可以动态调整，或者使用自适应阈值
-            threshold = self.min_energy_threshold * 0.5  # 降低阈值到原来的50%
+            threshold = self.min_energy_threshold * 0.30  # lower the threshold to 30% of the configured value
            
            # 如果能量超过阈值，认为是人声
            if energy >= threshold:
--- a/robot_speaker/core/robot_speaker_node.py
+++ b/robot_speaker/core/robot_speaker_node.py
@@ -165,7 +165,7 @@ class RobotSpeakerNode(Node):
        self.sv_enabled = system['sv_enabled']
        self.sv_model_path = os.path.expanduser(system['sv_model_path'])
        self.sv_threshold = system['sv_threshold']
-        self.sv_speaker_db_path = system['sv_speaker_db_path']
+        self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])  # 展开用户目录
        self.sv_buffer_size = system['sv_buffer_size']
        
        # 相机参数
--- a/robot_speaker/models/tts/dashscope.py
+++ b/robot_speaker/models/tts/dashscope.py
@@ -121,10 +121,14 @@ class _TTSCallback(ResultCallback):
            '-fflags', 'nobuffer',     # 减少缓冲
            '-flags', 'low_delay',     # 低延迟
            '-avioflags', 'direct',    # 尝试直通写入 ALSA，减少延迟
-            '-thread_queue_size', str(self.tts_client.tts_ffmpeg_thread_queue_size),  # 输入线程队列大小（从配置文件读取）
            self.tts_client.alsa_device
        ]
        
+        # 将 -thread_queue_size 放到输入文件之前
+        insert_pos = ffmpeg_cmd.index('-i')
+        ffmpeg_cmd.insert(insert_pos, str(self.tts_client.tts_ffmpeg_thread_queue_size))
+        ffmpeg_cmd.insert(insert_pos, '-thread_queue_size')
+        
        # 添加音量调节filter（如果音量不是1.0）
        if self.tts_client.output_volume != 1.0:
            # 在输出编码前插入音量filter
--- a/robot_speaker/perception/audio_pipeline.py
+++ b/robot_speaker/perception/audio_pipeline.py
@@ -24,7 +24,7 @@ class AudioRecorder:
                 chunk: int, vad_detector: VADDetector,
                 audio_queue: queue.Queue,  # 音频队列：录音线程 → ASR线程
                 silence_duration_ms: int = 1000,
-                 min_energy_threshold: int = 300, # 音频能量 > 300：有语音
+                 min_energy_threshold: int = 200, # 音频能量 > 200：有语音
                 heartbeat_interval: float = 2.0,
                 on_heartbeat=None,
                 is_playing=None,
@@ -59,6 +59,42 @@ class AudioRecorder:
        self.get_silence_threshold = get_silence_threshold  # 动态静音阈值回调
        self.logger = logger
        self.audio = pyaudio.PyAudio()
+
+        # 自动查找 iFLYTEK 麦克风设备
+        try:
+            count = self.audio.get_device_count()
+            found_index = -1
+            if self.logger:
+                self.logger.info(f"开始扫描音频设备 (总数: {count})...")
+
+            for i in range(count):
+                device_info = self.audio.get_device_info_by_index(i)
+                device_name = device_info.get('name', '')
+                max_input_channels = device_info.get('maxInputChannels', 0)
+                
+                if self.logger:
+                    try:
+                        self.logger.info(f"扫描设备 [{i}]: Name='{device_name}', MaxInput={max_input_channels}, Rate={int(device_info.get('defaultSampleRate'))}")
+                    except:
+                        pass
+
+                # 检查是否包含 iFLYTEK 且支持录音（输入通道 > 0）
+                if 'iFLYTEK' in device_name and max_input_channels > 0:
+                    found_index = i
+                    if self.logger:
+                        self.logger.info(f"已自动定位到麦克风设备: {device_name} (Index: {i})")
+                    break
+            
+            if found_index != -1:
+                self.device_index = found_index
+            else:
+                if self.logger:
+                    self.logger.warning(f"未自动检测到 iFLYTEK 设备，将继续使用配置的索引: {self.device_index}")
+
+        except Exception as e:
+            if self.logger:
+                self.logger.error(f"设备自动检测过程出错: {e}")
+
        self.format = pyaudio.paInt16
        self._debug_counter = 0
        
@@ -121,8 +157,7 @@ class AudioRecorder:

        audio_buffer = [] # VAD 滑动窗口
        last_active_time = time.time() # 静音计时基准
-        was_speaking = False # 上一窗口状态
-        is_speaking = False # 当前窗口状态
+        in_speech_segment = False # 是否处于语音段中（从检测到人声开始，直到静音超时结束）

        try:
            while not self.stop_flag():
@@ -171,13 +206,11 @@ class AudioRecorder:
                            self.logger.info(f"[VAD调试] 能量={energy:.1f}, 阈值={self.min_energy_threshold}, VAD结果={vad_result}")
                        self._debug_counter = 0

-                    was_speaking = is_speaking
-                    is_speaking = vad_result
-
-                    if is_speaking:
+                    if vad_result:
                        last_active_time = now
                        
-                        if not was_speaking: # 上一轮没说话，本轮开始说话
+                        if not in_speech_segment: # 上一轮没说话，本轮开始说话
+                            in_speech_segment = True
                            if self.on_speech_start:
                                self.on_speech_start()
                            
@@ -185,8 +218,10 @@ class AudioRecorder:
                            if self.is_playing() and self.on_new_segment:
                                self.on_new_segment() # 打断 TTS的回调
                    else:
-                        if was_speaking:
+                        if in_speech_segment:
+                            # 处于语音段中，但当前帧为静音，检查静音时长
                            silence_duration = now - last_active_time
+                            
                            # 动态获取静音阈值（如果提供回调函数）
                            if self.get_silence_threshold:
                                current_silence_ms = self.get_silence_threshold()
@@ -203,6 +238,7 @@ class AudioRecorder:
                                    if self.logger:
                                        self.logger.debug(f"[VAD] 触发speech_end: 静音持续时间 {silence_duration:.3f}秒 >= 阈值 {current_no_speech_threshold:.3f}秒")
                                    self.on_speech_end() # 通知系统用户停止说话
+                                in_speech_segment = False
                        
                        if self.on_heartbeat and now - last_heartbeat_time >= self.heartbeat_interval:
                            self.on_heartbeat()
@@ -256,7 +292,7 @@ class AudioRecorder:

        # 语音开头能量高, 中后段（拖音、尾音）能量下降
        vad_result = num >= required
-        if vad_result and energy < self.min_energy_threshold * 0.5:
+        if vad_result and energy < self.min_energy_threshold * 0.3:
            return False
        
        return vad_result
--- a/robot_speaker/perception/speaker_verifier.py
+++ b/robot_speaker/perception/speaker_verifier.py
@@ -32,7 +32,8 @@ class SpeakerVerificationClient:
        
        from funasr import AutoModel
        model_path = os.path.expanduser(self.model_path)
-        self.model = AutoModel(model=model_path, device="cpu")
+        # 禁用自动更新检查，防止每次初始化都联网检查
+        self.model = AutoModel(model=model_path, device="cpu", disable_update=True)
        if self.logger:
            self.logger.info(f"声纹模型已加载: {model_path}, 阈值: {self.threshold}")
        
@@ -82,6 +83,21 @@ class SpeakerVerificationClient:
        """
        提取说话人embedding（低频调用，一句话只调用一次）
        """
+        # 降采样到 16000Hz (如果需要)
+        # Cam++ 等模型通常只支持 16k，如果传入 48k 会导致内部重采样极慢或计算量剧增
+        target_sr = 16000
+        if sample_rate > target_sr:
+            if sample_rate % target_sr == 0:
+                step = sample_rate // target_sr
+                audio_data = audio_data[::step]
+                sample_rate = target_sr
+            else:
+                # NOTE(review): non-integer ratio (e.g. 44100 Hz): slicing by the floored step
+                # yields sample_rate/step Hz, not exactly target_sr, although it is labelled target_sr below.
+                step = int(sample_rate / target_sr)
+                audio_data = audio_data[::step]
+                sample_rate = target_sr
+        
        if len(audio_data) < int(sample_rate * 0.5):
            return None, False