fix: Optimize voice interaction pipeline

1. register_speaker_node: Enable AEC to match main node for better SV accuracy.
2. tts/dashscope: Fix ffmpeg argument order (input option thread_queue_size).
3. asr/dashscope: Keep WebSocket connection alive to reduce latency.
4. speaker_verifier: Force single-thread inference to avoid CPU contention.
This commit is contained in:
NuoDaJia02
2026-01-19 16:17:27 +08:00
parent 293e69e9f2
commit e5714e3a8b
2 changed files with 35 additions and 38 deletions

View File

@@ -116,56 +116,45 @@ class DashScopeASR(ASRClient):
def stop_current_recognition(self):
    """
    Stop the current recognition, force a final result, then restart.

    Triggers a commit so the backend emits the final result for the audio
    sent so far, then tears down and re-establishes the connection.

    Implementation notes:
    1. An event (instead of sleep-polling) waits for the final callback.
    2. A non-blocking lock prevents concurrent invocations.
    3. start() failure is handled so the ``running`` flag stays correct.
    4. A timeout bounds the wait for the final callback.

    Returns:
        bool: True if the commit succeeded and the connection was
        restarted; False if idle, already in progress, or restart failed.
    """
    # Fast-path check before taking the lock.
    if not self.running or not self.conversation:
        return False
    # Non-blocking acquire: if another call is in flight, skip this one.
    if not self._stop_lock.acquire(blocking=False):
        self._log("warning", "stop_current_recognition 正在执行,跳过本次调用")
        return False
    try:
        # Re-check under the lock: state may have changed in the meantime.
        if not self.running or not self.conversation:
            return False
        # Reset the event so we wait for the *next* final callback.
        self._final_result_event.clear()
        self._pending_commit = True
        # Trigger the commit; the backend will deliver a final result.
        self.conversation.commit()
        # Wait for the final callback (at most 1 second).
        if self._final_result_event.wait(timeout=1.0):
            self._log("debug", "已收到 final 回调,准备关闭连接")
        else:
            self._log("warning", "等待 final 回调超时,继续执行")
        # Stop the ASR thread from sending more audio before closing.
        self.running = False
        # Detach the connection first so send_audio cannot race the close.
        old_conversation = self.conversation
        self.conversation = None
        try:
            old_conversation.close()
        except Exception as e:
            self._log("warning", f"关闭连接时出错: {e}")
        # Brief pause to let the connection shut down completely.
        time.sleep(0.1)
        # Restart; on failure keep running=False so callers see dead state.
        if not self.start():
            self._log("error", "ASR重启失败running状态已重置")
            return False
        # start() sets running=True on success.
        return True
    except Exception as e:
        self._log("error", f"提交当前识别结果失败: {e}")
        # On error, tear down and attempt a clean restart.
        self.running = False
        try:
            if self.conversation:
                self.conversation.close()
        # Narrowed from a bare except: best-effort close, but never
        # swallow SystemExit/KeyboardInterrupt.
        except Exception:
            pass
        self.conversation = None
        time.sleep(0.1)
        return self.start()
    finally:
        self._pending_commit = False
        self._stop_lock.release()

View File

@@ -107,11 +107,19 @@ class SpeakerVerificationClient:
temp_wav_path = None
try:
temp_wav_path = self._write_temp_wav(audio_data, sample_rate)
result = self.model.generate(input=temp_wav_path)
# 限制Torch在推理时使用单线程避免在多任务环境下尤其是一边录音一边识别
# 出现的极端CPU竞争和上下文切换开销
import torch
embedding = result[0]['spk_embedding'].detach().cpu().numpy()[0] # shape [1, 192] -> [192]
with torch.inference_mode():
# 临时设置,虽然全局已经设置了,但在调用前再次确保
# 注意set_num_threads 是全局的,这里再次确认
if torch.get_num_threads() != 1:
torch.set_num_threads(1)
temp_wav_path = self._write_temp_wav(audio_data, sample_rate)
result = self.model.generate(input=temp_wav_path)
embedding = result[0]['spk_embedding'].detach().cpu().numpy()[0] # shape [1, 192] -> [192]
embedding_dim = len(embedding)
if embedding_dim == 0: