fix deploy issues

This commit is contained in:
NuoDaJia02
2026-01-19 09:09:28 +08:00
parent 9fd658990c
commit c282f9b4de
9 changed files with 135 additions and 32 deletions

View File

@@ -9,19 +9,28 @@ https://bailian.console.aliyun.com/?tab=model#/api-key
1. 系统依赖
```bash
sudo apt-get update
sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg
sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg swig meson ninja-build build-essential pkg-config libwebrtc-audio-processing-dev
```
2. Python依赖
```bash
cd ~/ros_learn/hivecore_robot_voice
# 在 Python 3.10 环境下,需要单独安装 aec-audio-processing 以跳过版本检查
pip3 install aec-audio-processing --no-binary :all: --ignore-requires-python --break-system-packages
pip3 install -r requirements.txt --break-system-packages
```
## 编译启动
1. 注册声纹
- 启动节点后可以说:二狗今天天气真好开始注册声纹
- 注意要包含唤醒词语句不要停顿尽量大于3秒
- 正确的注册姿势:
方法A推荐唤醒后停顿一下然后说一段长句子。
用户:"二狗"
机器:(日志提示等待声纹语音)
用户:"我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去。"(持续说 3-5 秒)
方法B连贯说一口气说很长的一句话。
用户:"二狗你好,我是你的主人,请记住我的声音,这是一段用来注册的长语音。"
- 注意要包含唤醒词语句不要停顿尽量大于1.5秒
```bash
cd ~/ros_learn/hivecore_robot_voice
colcon build

View File

@@ -18,8 +18,8 @@ dashscope:
audio:
microphone:
device_index: -1 # -1 表示使用默认设备
sample_rate: 16000 # 输入采样率16kHz语音识别常用采样率
device_index: 3 # 指向 iFLYTEK-M2 (hw:1,0)
sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz避免重采样可能导致的问题
channels: 1 # 输入声道数单声道MONO适合语音采集
chunk: 1024
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
@@ -28,7 +28,7 @@ audio:
device_index: 0 # USB Audio [USB Audio] (device 0)
# card_index: -1 # 使用默认声卡
# device_index: -1 # 使用默认输出设备
sample_rate: 44100 # 输出采样率44.1kHz支持48000或44100
sample_rate: 48000 # 输出采样率48kHziFLYTEK 支持 48000
channels: 2 # 输出声道数立体声2声道FL+FR
volume: 1.0 # 音量比例0.0-1.00.2表示20%音量)
echo_cancellation:
@@ -41,7 +41,7 @@ audio:
vad:
vad_mode: 3 # VAD模式0-33最严格
silence_duration_ms: 1000 # 静音持续时长(毫秒)
min_energy_threshold: 300 # 最小能量阈值
min_energy_threshold: 200 # 最小能量阈值
system:
use_llm: true # 是否使用LLM
@@ -51,10 +51,10 @@ system:
shutup_keywords: "bi zui" # 闭嘴指令关键词(拼音,逗号分隔)
interrupt_command_queue_depth: 10 # 中断命令订阅的队列深度QoS
sv_enabled: true # 是否启用声纹识别
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
sv_threshold: 0.55 # 声纹识别阈值0.0-1.0,值越小越宽松,值越大越严格)
sv_speaker_db_path: "config/speakers.json" # 声纹数据库保存路径JSON格式相对于ROS2包share目录
sv_buffer_size: 64000 # 声纹验证录音缓冲区大小(样本数)
sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json" # 声纹数据库保存路径JSON格式绝对路径~ 会展开为用户主目录)
sv_buffer_size: 240000 # 声纹验证录音缓冲区大小(样本数48kHz下5秒=240000
sv_registration_silence_threshold_ms: 500 # 声纹注册状态下的静音阈值(毫秒)
camera:

View File

@@ -10,6 +10,7 @@ numpy>=1.24.0
PyYAML>=6.0
aec-audio-processing
modelscope>=1.33.0
funasr>=1.0.0
datasets==3.6.0

View File

@@ -60,9 +60,9 @@ class NodeWorkers:
current_seq = node.sv_result_seq
if not node.sv_result_cv.wait_for(
lambda: node.sv_result_seq > current_seq,
timeout=2.0
timeout=15.0
):
node.get_logger().warning("[主线程] CHECK_VOICE状态声纹结果未ready超时2秒),拒绝本轮")
node.get_logger().warning("[主线程] CHECK_VOICE状态声纹结果未ready超时15秒),拒绝本轮")
with node.sv_lock:
node.sv_audio_buffer.clear()
node._change_state(ConversationState.IDLE, "声纹结果未ready")

View File

@@ -16,6 +16,8 @@ from ament_index_python.packages import get_package_share_directory
from robot_speaker.perception.audio_pipeline import VADDetector, AudioRecorder
from robot_speaker.perception.speaker_verifier import SpeakerVerificationClient
from robot_speaker.models.asr.dashscope import DashScopeASR
from robot_speaker.models.tts.dashscope import DashScopeTTSClient
from robot_speaker.core.types import TTSRequest
from pypinyin import pinyin, Style
@@ -100,6 +102,15 @@ class RegisterSpeakerNode(Node):
logger=self.get_logger()
)
self.tts_client = DashScopeTTSClient(
api_key=self.dashscope_api_key,
model=self.tts_model,
voice=self.tts_voice,
card_index=self.output_card_index,
device_index=self.output_device_index,
logger=self.get_logger()
)
self.get_logger().info("声纹注册节点启动,请说'er gou......'唤醒注册")
self.recording_thread = threading.Thread(
target=self.audio_recorder.record_with_vad,
@@ -122,12 +133,15 @@ class RegisterSpeakerNode(Node):
dashscope = config['dashscope']
audio = config['audio']
mic = audio['microphone']
soundcard = audio['soundcard']
vad = config['vad']
system = config['system']
self.dashscope_api_key = dashscope['api_key']
self.asr_model = dashscope['asr']['model']
self.asr_url = dashscope['asr']['url']
self.tts_model = dashscope['tts']['model']
self.tts_voice = dashscope['tts']['voice']
self.input_device_index = mic['device_index']
self.sample_rate = mic['sample_rate']
@@ -135,13 +149,16 @@ class RegisterSpeakerNode(Node):
self.chunk = mic['chunk']
self.audio_microphone_heartbeat_interval = mic['heartbeat_interval']
self.output_card_index = soundcard['card_index']
self.output_device_index = soundcard['device_index']
self.vad_mode = vad['vad_mode']
self.silence_duration_ms = vad['silence_duration_ms']
self.min_energy_threshold = vad['min_energy_threshold']
self.sv_model_path = os.path.expanduser(system['sv_model_path'])
self.sv_threshold = system['sv_threshold']
self.sv_speaker_db_path = system['sv_speaker_db_path']
self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
self.sv_buffer_size = system['sv_buffer_size']
self.wake_word = system['wake_word']
@@ -183,7 +200,8 @@ class RegisterSpeakerNode(Node):
# 等待声纹语音时用户说话结束使用当前音频即使不足3秒
if self.waiting_for_voiceprint:
self._process_voiceprint_audio(use_current_audio_if_short=True)
return # 处理完毕后直接返回,防止重复调用
def _process_voiceprint_audio(self, use_current_audio_if_short: bool = False):
"""处理声纹音频:使用用户完整的第一段语音进行注册
@@ -200,8 +218,7 @@ class RegisterSpeakerNode(Node):
buffer_sec = buffer_size / self.sample_rate
self.get_logger().info(f"[注册录音] 当前音频长度: {buffer_sec:.2f}")
# 需要3秒音频
required_samples = int(self.sample_rate * 3.0)
required_samples = int(self.sample_rate * 3)
# 如果音频不足3秒
if buffer_size < required_samples:
@@ -215,9 +232,19 @@ class RegisterSpeakerNode(Node):
self.processing = False
return
else:
# 音频达到3秒截取最后3秒
audio_to_use = audio_list[-required_samples:]
self.get_logger().info(f"[注册录音] 使用完整的第一段语音截取最后3秒用于注册")
# 策略优化不再强行截取最后3秒因为唤醒词检测有延迟
# "er gou" 可能在缓冲区的中间偏后位置。
# 为了防止只截取到尾部的静音,并尽量包含完整唤醒词,
# 我们截取最近的 3.0 秒音频如果不足3秒则使用全部
# 这样能最大程度包含有效语音 "二狗"。
target_samples = int(self.sample_rate * 3.0)
if buffer_size > target_samples:
audio_to_use = audio_list[-target_samples:]
else:
audio_to_use = audio_list
duration = len(audio_to_use) / self.sample_rate
self.get_logger().info(f"[注册录音] 使用最近 {duration:.2f} 秒音频用于注册(覆盖唤醒词)")
# 清空缓冲区
with self.buffer_lock:
@@ -237,6 +264,16 @@ class RegisterSpeakerNode(Node):
speaker_id = f"user_{int(time.time())}"
if self.sv_client.register_speaker(speaker_id, embedding):
self.get_logger().info(f"[注册录音] 注册成功用户ID: {speaker_id},准备退出")
# 播放成功提示
try:
self.get_logger().info("[注册录音] 播放注册成功提示")
request = TTSRequest(text="声纹注册成功", voice=self.tts_voice)
self.tts_client.synthesize(request)
time.sleep(0.5)
except Exception as e:
self.get_logger().error(f"[注册录音] 播放提示失败: {e}")
self.stop_event.set()
else:
self.get_logger().error("[注册录音] 注册失败")
@@ -264,7 +301,7 @@ class RegisterSpeakerNode(Node):
# 使用更低的阈值来检测人声(降低阈值,避免误判静音)
# 阈值可以动态调整,或者使用自适应阈值
threshold = self.min_energy_threshold * 0.5 # 降低阈值到原来的50%
threshold = self.min_energy_threshold * 0.30 # 降低阈值到原来的30%
# 如果能量超过阈值,认为是人声
if energy >= threshold:

View File

@@ -165,7 +165,7 @@ class RobotSpeakerNode(Node):
self.sv_enabled = system['sv_enabled']
self.sv_model_path = os.path.expanduser(system['sv_model_path'])
self.sv_threshold = system['sv_threshold']
self.sv_speaker_db_path = system['sv_speaker_db_path']
self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path']) # 展开用户目录
self.sv_buffer_size = system['sv_buffer_size']
# 相机参数

View File

@@ -121,10 +121,14 @@ class _TTSCallback(ResultCallback):
'-fflags', 'nobuffer', # 减少缓冲
'-flags', 'low_delay', # 低延迟
'-avioflags', 'direct', # 尝试直通写入 ALSA减少延迟
'-thread_queue_size', str(self.tts_client.tts_ffmpeg_thread_queue_size), # 输入线程队列大小(从配置文件读取)
self.tts_client.alsa_device
]
# 将 -thread_queue_size 放到输入文件之前
insert_pos = ffmpeg_cmd.index('-i')
ffmpeg_cmd.insert(insert_pos, str(self.tts_client.tts_ffmpeg_thread_queue_size))
ffmpeg_cmd.insert(insert_pos, '-thread_queue_size')
# 添加音量调节filter如果音量不是1.0
if self.tts_client.output_volume != 1.0:
# 在输出编码前插入音量filter

View File

@@ -24,7 +24,7 @@ class AudioRecorder:
chunk: int, vad_detector: VADDetector,
audio_queue: queue.Queue, # 音频队列:录音线程 → ASR线程
silence_duration_ms: int = 1000,
min_energy_threshold: int = 300, # 音频能量 > 300有语音
min_energy_threshold: int = 200, # 音频能量 > 200有语音
heartbeat_interval: float = 2.0,
on_heartbeat=None,
is_playing=None,
@@ -59,6 +59,42 @@ class AudioRecorder:
self.get_silence_threshold = get_silence_threshold # 动态静音阈值回调
self.logger = logger
self.audio = pyaudio.PyAudio()
# 自动查找 iFLYTEK 麦克风设备
try:
count = self.audio.get_device_count()
found_index = -1
if self.logger:
self.logger.info(f"开始扫描音频设备 (总数: {count})...")
for i in range(count):
device_info = self.audio.get_device_info_by_index(i)
device_name = device_info.get('name', '')
max_input_channels = device_info.get('maxInputChannels', 0)
if self.logger:
try:
self.logger.info(f"扫描设备 [{i}]: Name='{device_name}', MaxInput={max_input_channels}, Rate={int(device_info.get('defaultSampleRate'))}")
except:
pass
# 检查是否包含 iFLYTEK 且支持录音(输入通道 > 0
if 'iFLYTEK' in device_name and max_input_channels > 0:
found_index = i
if self.logger:
self.logger.info(f"已自动定位到麦克风设备: {device_name} (Index: {i})")
break
if found_index != -1:
self.device_index = found_index
else:
if self.logger:
self.logger.warning(f"未自动检测到 iFLYTEK 设备,将继续使用配置的索引: {self.device_index}")
except Exception as e:
if self.logger:
self.logger.error(f"设备自动检测过程出错: {e}")
self.format = pyaudio.paInt16
self._debug_counter = 0
@@ -121,8 +157,7 @@ class AudioRecorder:
audio_buffer = [] # VAD 滑动窗口
last_active_time = time.time() # 静音计时基准
was_speaking = False # 上一窗口状态
is_speaking = False # 当前窗口状态
in_speech_segment = False # 是否处于语音段中(从检测到人声开始,直到静音超时结束)
try:
while not self.stop_flag():
@@ -171,13 +206,11 @@ class AudioRecorder:
self.logger.info(f"[VAD调试] 能量={energy:.1f}, 阈值={self.min_energy_threshold}, VAD结果={vad_result}")
self._debug_counter = 0
was_speaking = is_speaking
is_speaking = vad_result
if is_speaking:
if vad_result:
last_active_time = now
if not was_speaking: # 上一轮没说话,本轮开始说话
if not in_speech_segment: # 上一轮没说话,本轮开始说话
in_speech_segment = True
if self.on_speech_start:
self.on_speech_start()
@@ -185,8 +218,10 @@ class AudioRecorder:
if self.is_playing() and self.on_new_segment:
self.on_new_segment() # 打断 TTS的回调
else:
if was_speaking:
if in_speech_segment:
# 处于语音段中,但当前帧为静音,检查静音时长
silence_duration = now - last_active_time
# 动态获取静音阈值(如果提供回调函数)
if self.get_silence_threshold:
current_silence_ms = self.get_silence_threshold()
@@ -203,6 +238,7 @@ class AudioRecorder:
if self.logger:
self.logger.debug(f"[VAD] 触发speech_end: 静音持续时间 {silence_duration:.3f}秒 >= 阈值 {current_no_speech_threshold:.3f}")
self.on_speech_end() # 通知系统用户停止说话
in_speech_segment = False
if self.on_heartbeat and now - last_heartbeat_time >= self.heartbeat_interval:
self.on_heartbeat()
@@ -256,7 +292,7 @@ class AudioRecorder:
# 语音开头能量高, 中后段(拖音、尾音)能量下降
vad_result = num >= required
if vad_result and energy < self.min_energy_threshold * 0.5:
if vad_result and energy < self.min_energy_threshold * 0.3:
return False
return vad_result

View File

@@ -32,7 +32,8 @@ class SpeakerVerificationClient:
from funasr import AutoModel
model_path = os.path.expanduser(self.model_path)
self.model = AutoModel(model=model_path, device="cpu")
# 禁用自动更新检查,防止每次初始化都联网检查
self.model = AutoModel(model=model_path, device="cpu", disable_update=True)
if self.logger:
self.logger.info(f"声纹模型已加载: {model_path}, 阈值: {self.threshold}")
@@ -82,6 +83,21 @@ class SpeakerVerificationClient:
"""
提取说话人embedding低频调用一句话只调用一次
"""
# 降采样到 16000Hz (如果需要)
# Cam++ 等模型通常只支持 16k如果传入 48k 会导致内部重采样极慢或计算量剧增
target_sr = 16000
if sample_rate > target_sr:
if sample_rate % target_sr == 0:
step = sample_rate // target_sr
audio_data = audio_data[::step]
sample_rate = target_sr
else:
# 非整数倍采样率时退化为步长取整的简单抽取,可能引入混叠;
# 语音场景下常见的 48k->16k 是整数倍,通常不会走到这个分支
step = int(sample_rate / target_sr)
audio_data = audio_data[::step]
sample_rate = target_sr
if len(audio_data) < int(sample_rate * 0.5):
return None, False