Merge branch 'feature-deploy' into develop
# Conflicts: # config/voice.yaml # robot_speaker/core/robot_speaker_node.py
This commit is contained in:
13
README.md
13
README.md
@@ -9,19 +9,28 @@ https://bailian.console.aliyun.com/?tab=model#/api-key
|
||||
1. 系统依赖
|
||||
```bash
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg
|
||||
sudo apt-get install -y python3-pyaudio portaudio19-dev alsa-utils ffmpeg swig meson ninja-build build-essential pkg-config libwebrtc-audio-processing-dev
|
||||
```
|
||||
|
||||
2. Python依赖
|
||||
```bash
|
||||
cd ~/ros_learn/hivecore_robot_voice
|
||||
# 在 Python 3.10 环境下,需要单独安装 aec-audio-processing 以跳过版本检查
|
||||
pip3 install aec-audio-processing --no-binary :all: --ignore-requires-python --break-system-packages
|
||||
pip3 install -r requirements.txt --break-system-packages
|
||||
```
|
||||
|
||||
## 编译启动
|
||||
1. 注册声纹
|
||||
- 启动节点后可以说:二狗今天天气真好开始注册声纹
|
||||
- 注意:要包含唤醒词,语句不要停顿,尽量大于3秒
|
||||
- 正确的注册姿势:
|
||||
方法A(推荐):唤醒后停顿一下,然后说一段长句子。
|
||||
用户:"二狗"
|
||||
机器:(日志提示等待声纹语音)
|
||||
用户:"我现在正在注册声纹,这是一段很长的测试语音,请把我的声音录进去。"(持续说 3-5 秒)
|
||||
方法B(连贯说):一口气说很长的一句话。
|
||||
用户:"二狗你好,我是你的主人,请记住我的声音,这是一段用来注册的长语音。"
|
||||
- 注意:要包含唤醒词,语句不要停顿,尽量大于1.5秒
|
||||
```bash
|
||||
cd ~/ros_learn/hivecore_robot_voice
|
||||
colcon build
|
||||
|
||||
@@ -18,8 +18,8 @@ dashscope:
|
||||
|
||||
audio:
|
||||
microphone:
|
||||
device_index: -1 # -1 表示使用默认设备
|
||||
sample_rate: 16000 # 输入采样率:16kHz(语音识别常用采样率)
|
||||
device_index: 3 # 指向 iFLYTEK-M2 (hw:1,0)
|
||||
sample_rate: 48000 # 尝试使用硬件原生采样率 48kHz,避免重采样可能导致的问题
|
||||
channels: 1 # 输入声道数:单声道(MONO,适合语音采集)
|
||||
chunk: 1024
|
||||
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
|
||||
@@ -28,11 +28,11 @@ audio:
|
||||
device_index: 0 # USB Audio [USB Audio] (device 0)
|
||||
# card_index: -1 # 使用默认声卡
|
||||
# device_index: -1 # 使用默认输出设备
|
||||
sample_rate: 44100 # 输出采样率:44.1kHz(支持48000或44100)
|
||||
sample_rate: 48000 # 输出采样率:48kHz(iFLYTEK 支持 48000)
|
||||
channels: 2 # 输出声道数:立体声(2声道,FL+FR)
|
||||
volume: 1.0 # 音量比例(0.0-1.0,0.2表示20%音量)
|
||||
echo_cancellation:
|
||||
enabled: false # 是否启用回声消除(true/false)
|
||||
enable: false # 是否启用回声消除
|
||||
max_duration_ms: 500 # 参考信号缓冲区最大时长(毫秒)
|
||||
tts:
|
||||
source_sample_rate: 22050 # TTS服务固定输出采样率(DashScope服务固定值,不可修改)
|
||||
@@ -52,10 +52,10 @@ system:
|
||||
shutup_keywords: "bi zui" # 闭嘴指令关键词(拼音,逗号分隔)
|
||||
interrupt_command_queue_depth: 10 # 中断命令订阅的队列深度(QoS)
|
||||
sv_enabled: true # 是否启用声纹识别
|
||||
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
|
||||
sv_model_path: "~/hivecore_robot_os1/voice_model" # 声纹模型路径
|
||||
sv_threshold: 0.55 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
|
||||
sv_speaker_db_path: "config/speakers.json" # 声纹数据库保存路径(JSON格式,相对于ROS2包share目录)
|
||||
sv_buffer_size: 64000 # 声纹验证录音缓冲区大小(样本数)
|
||||
sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json" # 声纹数据库保存路径(JSON格式,位于用户主目录下,加载时会展开 ~)
|
||||
sv_buffer_size: 240000 # 声纹验证录音缓冲区大小(样本数,48kHz下5秒=240000)
|
||||
sv_registration_silence_threshold_ms: 500 # 声纹注册状态下的静音阈值(毫秒)
|
||||
|
||||
camera:
|
||||
|
||||
@@ -10,6 +10,7 @@ numpy>=1.24.0
|
||||
PyYAML>=6.0
|
||||
aec-audio-processing
|
||||
modelscope>=1.33.0
|
||||
funasr>=1.0.0
|
||||
datasets==3.6.0
|
||||
|
||||
|
||||
|
||||
@@ -60,9 +60,9 @@ class NodeWorkers:
|
||||
current_seq = node.sv_result_seq
|
||||
if not node.sv_result_cv.wait_for(
|
||||
lambda: node.sv_result_seq > current_seq,
|
||||
timeout=2.0
|
||||
timeout=15.0
|
||||
):
|
||||
node.get_logger().warning("[主线程] CHECK_VOICE状态:声纹结果未ready(超时2秒),拒绝本轮")
|
||||
node.get_logger().warning("[主线程] CHECK_VOICE状态:声纹结果未ready(超时15秒),拒绝本轮")
|
||||
with node.sv_lock:
|
||||
node.sv_audio_buffer.clear()
|
||||
node._change_state(ConversationState.IDLE, "声纹结果未ready")
|
||||
@@ -119,7 +119,16 @@ class NodeWorkers:
|
||||
node = self.node
|
||||
node.get_logger().info("[声纹识别线程] 启动")
|
||||
|
||||
min_audio_samples = 8000
|
||||
# 动态计算最小音频样本数,确保降采样到16kHz后≥0.5秒
|
||||
target_sr = 16000 # CAM++模型目标采样率
|
||||
min_duration_seconds = 0.5
|
||||
min_samples_at_target_sr = int(target_sr * min_duration_seconds) # 8000样本@16kHz
|
||||
|
||||
if node.sample_rate >= target_sr:
|
||||
downsample_step = int(node.sample_rate / target_sr)
|
||||
min_audio_samples = min_samples_at_target_sr * downsample_step
|
||||
else:
|
||||
min_audio_samples = int(node.sample_rate * min_duration_seconds)
|
||||
|
||||
while not node.stop_event.is_set():
|
||||
try:
|
||||
|
||||
@@ -15,7 +15,10 @@ from ament_index_python.packages import get_package_share_directory
|
||||
|
||||
from robot_speaker.perception.audio_pipeline import VADDetector, AudioRecorder
|
||||
from robot_speaker.perception.speaker_verifier import SpeakerVerificationClient
|
||||
from robot_speaker.perception.echo_cancellation import ReferenceSignalBuffer
|
||||
from robot_speaker.models.asr.dashscope import DashScopeASR
|
||||
from robot_speaker.models.tts.dashscope import DashScopeTTSClient
|
||||
from robot_speaker.core.types import TTSRequest
|
||||
from pypinyin import pinyin, Style
|
||||
|
||||
|
||||
@@ -42,6 +45,13 @@ class RegisterSpeakerNode(Node):
|
||||
sample_rate=self.sample_rate
|
||||
)
|
||||
|
||||
# 创建参考信号缓冲区(用于回声消除)
|
||||
self.reference_signal_buffer = ReferenceSignalBuffer(
|
||||
max_duration_ms=self.audio_echo_cancellation_max_duration_ms,
|
||||
sample_rate=self.sample_rate,
|
||||
channels=self.output_channels
|
||||
) if self.audio_echo_cancellation_enabled else None
|
||||
|
||||
self.audio_recorder = AudioRecorder(
|
||||
device_index=self.input_device_index,
|
||||
sample_rate=self.sample_rate,
|
||||
@@ -61,8 +71,8 @@ class RegisterSpeakerNode(Node):
|
||||
on_audio_chunk=self._on_audio_chunk,
|
||||
should_put_to_queue=self._should_put_to_queue,
|
||||
get_silence_threshold=lambda: self.silence_duration_ms,
|
||||
enable_echo_cancellation=False,
|
||||
reference_signal_buffer=None,
|
||||
enable_echo_cancellation=self.audio_echo_cancellation_enabled, # 启用回声消除,保持与主程序一致
|
||||
reference_signal_buffer=self.reference_signal_buffer,
|
||||
logger=self.get_logger()
|
||||
)
|
||||
|
||||
@@ -100,6 +110,22 @@ class RegisterSpeakerNode(Node):
|
||||
logger=self.get_logger()
|
||||
)
|
||||
|
||||
self.tts_client = DashScopeTTSClient(
|
||||
api_key=self.dashscope_api_key,
|
||||
model=self.tts_model,
|
||||
voice=self.tts_voice,
|
||||
card_index=self.output_card_index,
|
||||
device_index=self.output_device_index,
|
||||
output_sample_rate=self.output_sample_rate,
|
||||
output_channels=self.output_channels,
|
||||
output_volume=self.output_volume,
|
||||
tts_source_sample_rate=self.audio_tts_source_sample_rate,
|
||||
tts_source_channels=self.audio_tts_source_channels,
|
||||
tts_ffmpeg_thread_queue_size=self.audio_tts_ffmpeg_thread_queue_size,
|
||||
reference_signal_buffer=self.reference_signal_buffer,
|
||||
logger=self.get_logger()
|
||||
)
|
||||
|
||||
self.get_logger().info("声纹注册节点启动,请说'er gou......'唤醒注册")
|
||||
self.recording_thread = threading.Thread(
|
||||
target=self.audio_recorder.record_with_vad,
|
||||
@@ -122,12 +148,15 @@ class RegisterSpeakerNode(Node):
|
||||
dashscope = config['dashscope']
|
||||
audio = config['audio']
|
||||
mic = audio['microphone']
|
||||
soundcard = audio['soundcard']
|
||||
vad = config['vad']
|
||||
system = config['system']
|
||||
|
||||
self.dashscope_api_key = dashscope['api_key']
|
||||
self.asr_model = dashscope['asr']['model']
|
||||
self.asr_url = dashscope['asr']['url']
|
||||
self.tts_model = dashscope['tts']['model']
|
||||
self.tts_voice = dashscope['tts']['voice']
|
||||
|
||||
self.input_device_index = mic['device_index']
|
||||
self.sample_rate = mic['sample_rate']
|
||||
@@ -135,13 +164,28 @@ class RegisterSpeakerNode(Node):
|
||||
self.chunk = mic['chunk']
|
||||
self.audio_microphone_heartbeat_interval = mic['heartbeat_interval']
|
||||
|
||||
self.output_card_index = soundcard['card_index']
|
||||
self.output_device_index = soundcard['device_index']
|
||||
self.output_sample_rate = soundcard['sample_rate']
|
||||
self.output_channels = soundcard['channels']
|
||||
self.output_volume = soundcard['volume']
|
||||
|
||||
echo = audio.get('echo_cancellation', {})
|
||||
self.audio_echo_cancellation_enabled = echo['enable']
|
||||
self.audio_echo_cancellation_max_duration_ms = echo.get('max_duration_ms', 200)
|
||||
|
||||
tts_audio = audio.get('tts', {})
|
||||
self.audio_tts_source_sample_rate = tts_audio.get('source_sample_rate', 22050)
|
||||
self.audio_tts_source_channels = tts_audio.get('source_channels', 1)
|
||||
self.audio_tts_ffmpeg_thread_queue_size = tts_audio.get('ffmpeg_thread_queue_size', 5)
|
||||
|
||||
self.vad_mode = vad['vad_mode']
|
||||
self.silence_duration_ms = vad['silence_duration_ms']
|
||||
self.min_energy_threshold = vad['min_energy_threshold']
|
||||
|
||||
self.sv_model_path = os.path.expanduser(system['sv_model_path'])
|
||||
self.sv_threshold = system['sv_threshold']
|
||||
self.sv_speaker_db_path = system['sv_speaker_db_path']
|
||||
self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path'])
|
||||
self.sv_buffer_size = system['sv_buffer_size']
|
||||
self.wake_word = system['wake_word']
|
||||
|
||||
@@ -183,7 +227,8 @@ class RegisterSpeakerNode(Node):
|
||||
# 等待声纹语音时,用户说话结束,使用当前音频(即使不足3秒)
|
||||
if self.waiting_for_voiceprint:
|
||||
self._process_voiceprint_audio(use_current_audio_if_short=True)
|
||||
|
||||
return # 处理完毕后直接返回,防止重复调用
|
||||
|
||||
def _process_voiceprint_audio(self, use_current_audio_if_short: bool = False):
|
||||
"""处理声纹音频:使用用户完整的第一段语音进行注册
|
||||
|
||||
@@ -200,8 +245,7 @@ class RegisterSpeakerNode(Node):
|
||||
buffer_sec = buffer_size / self.sample_rate
|
||||
self.get_logger().info(f"[注册录音] 当前音频长度: {buffer_sec:.2f}秒")
|
||||
|
||||
# 需要3秒音频
|
||||
required_samples = int(self.sample_rate * 3.0)
|
||||
required_samples = int(self.sample_rate * 3)
|
||||
|
||||
# 如果音频不足3秒
|
||||
if buffer_size < required_samples:
|
||||
@@ -215,9 +259,19 @@ class RegisterSpeakerNode(Node):
|
||||
self.processing = False
|
||||
return
|
||||
else:
|
||||
# 音频达到3秒,截取最后3秒
|
||||
audio_to_use = audio_list[-required_samples:]
|
||||
self.get_logger().info(f"[注册录音] 使用完整的第一段语音,截取最后3秒用于注册")
|
||||
# 策略说明:缓冲区超过3秒时只保留最近的3秒;因为唤醒词检测有延迟,
|
||||
# "er gou" 可能在缓冲区的中间偏后位置。
|
||||
# 为了防止截取到尾部的静音,并尽量包含完整唤醒词,
|
||||
# 我们截取最近的 3.0 秒(或者全部,如果不足3秒),
|
||||
# 这样能最大程度包含有效语音 "二狗"。
|
||||
target_samples = int(self.sample_rate * 3.0)
|
||||
if buffer_size > target_samples:
|
||||
audio_to_use = audio_list[-target_samples:]
|
||||
else:
|
||||
audio_to_use = audio_list
|
||||
|
||||
duration = len(audio_to_use) / self.sample_rate
|
||||
self.get_logger().info(f"[注册录音] 使用最近 {duration:.2f} 秒音频用于注册(覆盖唤醒词)")
|
||||
|
||||
# 清空缓冲区
|
||||
with self.buffer_lock:
|
||||
@@ -237,6 +291,16 @@ class RegisterSpeakerNode(Node):
|
||||
speaker_id = f"user_{int(time.time())}"
|
||||
if self.sv_client.register_speaker(speaker_id, embedding):
|
||||
self.get_logger().info(f"[注册录音] 注册成功,用户ID: {speaker_id},准备退出")
|
||||
|
||||
# 播放成功提示
|
||||
try:
|
||||
self.get_logger().info("[注册录音] 播放注册成功提示")
|
||||
request = TTSRequest(text="声纹注册成功", voice=self.tts_voice)
|
||||
self.tts_client.synthesize(request)
|
||||
time.sleep(5)
|
||||
except Exception as e:
|
||||
self.get_logger().error(f"[注册录音] 播放提示失败: {e}")
|
||||
|
||||
self.stop_event.set()
|
||||
else:
|
||||
self.get_logger().error("[注册录音] 注册失败")
|
||||
@@ -264,7 +328,7 @@ class RegisterSpeakerNode(Node):
|
||||
|
||||
# 使用更低的阈值来检测人声(降低阈值,避免误判静音)
|
||||
# 阈值可以动态调整,或者使用自适应阈值
|
||||
threshold = self.min_energy_threshold * 0.5 # 降低阈值到原来的50%
|
||||
threshold = self.min_energy_threshold * 0.50 # 降低阈值到原来的50%
|
||||
|
||||
# 如果能量超过阈值,认为是人声
|
||||
if energy >= threshold:
|
||||
|
||||
@@ -96,16 +96,6 @@ class RobotSpeakerNode(Node):
|
||||
self.interrupt_sub = self.create_subscription(
|
||||
String, 'interrupt_command', self.callbacks.handle_interrupt_command, self.system_interrupt_command_queue_depth
|
||||
)
|
||||
self.skill_sequence_pub = self.create_publisher(String, '/llm_skill_sequence', 10)
|
||||
self.skill_feedback_sub = self.create_subscription(
|
||||
String, '/skill_execution_feedback', self._on_skill_feedback, 10
|
||||
)
|
||||
self.skill_result_sub = self.create_subscription(
|
||||
String, '/skill_execution_result', self._on_skill_result, 10
|
||||
)
|
||||
|
||||
self.latest_skill_feedback = None
|
||||
self.latest_skill_result = None
|
||||
|
||||
# 启动线程
|
||||
self._start_threads()
|
||||
@@ -139,7 +129,7 @@ class RobotSpeakerNode(Node):
|
||||
self.output_sample_rate = soundcard['sample_rate']
|
||||
self.output_channels = soundcard['channels']
|
||||
self.output_volume = soundcard['volume']
|
||||
self.audio_echo_cancellation_enabled = echo.get('enabled', True) # 默认启用
|
||||
self.audio_echo_cancellation_enabled = echo['enable']
|
||||
self.audio_echo_cancellation_max_duration_ms = echo['max_duration_ms']
|
||||
self.audio_tts_source_sample_rate = tts_audio['source_sample_rate']
|
||||
self.audio_tts_source_channels = tts_audio['source_channels']
|
||||
@@ -176,7 +166,7 @@ class RobotSpeakerNode(Node):
|
||||
self.sv_enabled = system['sv_enabled']
|
||||
self.sv_model_path = os.path.expanduser(system['sv_model_path'])
|
||||
self.sv_threshold = system['sv_threshold']
|
||||
self.sv_speaker_db_path = system['sv_speaker_db_path']
|
||||
self.sv_speaker_db_path = os.path.expanduser(system['sv_speaker_db_path']) # 展开用户目录
|
||||
self.sv_buffer_size = system['sv_buffer_size']
|
||||
|
||||
# 相机参数
|
||||
@@ -253,7 +243,7 @@ class RobotSpeakerNode(Node):
|
||||
on_audio_chunk=self.callbacks.on_audio_chunk_for_sv if self.sv_enabled else None, # 声纹录音回调
|
||||
should_put_to_queue=self.callbacks.should_put_audio_to_queue, # 检查是否应该将音频放入队列
|
||||
get_silence_threshold=self.callbacks.get_silence_threshold, # 动态静音阈值回调
|
||||
enable_echo_cancellation=self.audio_echo_cancellation_enabled, # 从配置文件读取
|
||||
enable_echo_cancellation=self.audio_echo_cancellation_enabled, # 启用回声消除
|
||||
reference_signal_buffer=self.reference_signal_buffer, # 传递参考信号缓冲区
|
||||
logger=self.get_logger()
|
||||
)
|
||||
@@ -462,13 +452,7 @@ class RobotSpeakerNode(Node):
|
||||
self.get_logger().error(f"图像编码失败: {e}")
|
||||
return ""
|
||||
|
||||
def _llm_process_stream_with_camera(
|
||||
self,
|
||||
user_text: str,
|
||||
need_camera: bool,
|
||||
system_prompt: str | None = None,
|
||||
suppress_tts: bool = False
|
||||
) -> str:
|
||||
def _llm_process_stream_with_camera(self, user_text: str, need_camera: bool, system_prompt: str | None = None) -> str:
|
||||
"""LLM流式处理 - 支持多模态(文本+图像)"""
|
||||
if not self.llm_client or not self.history:
|
||||
return ""
|
||||
@@ -547,7 +531,7 @@ class RobotSpeakerNode(Node):
|
||||
tts_text_to_send = ""
|
||||
self.get_logger().warning("[流式TTS] reply和tts_text_buffer都为空,无法发送TTS文本")
|
||||
|
||||
if not self.interrupt_event.is_set() and tts_text_to_send and not suppress_tts:
|
||||
if not self.interrupt_event.is_set() and tts_text_to_send:
|
||||
text_len = len(tts_text_to_send)
|
||||
self.get_logger().info(
|
||||
f"[流式TTS] 发送完整文本到TTS队列: {tts_text_to_send[:100]}... (总长度: {text_len}字符)"
|
||||
@@ -555,8 +539,6 @@ class RobotSpeakerNode(Node):
|
||||
if text_len > 100:
|
||||
self.get_logger().debug(f"[流式TTS] 完整文本内容: {tts_text_to_send}")
|
||||
self._put_tts_text(tts_text_to_send)
|
||||
elif suppress_tts:
|
||||
self.get_logger().info("[流式TTS] suppress_tts开启,跳过TTS输出")
|
||||
|
||||
return reply.strip() if reply else ""
|
||||
|
||||
@@ -741,17 +723,11 @@ class RobotSpeakerNode(Node):
|
||||
reply = self._llm_process_stream_with_camera(
|
||||
text,
|
||||
need_camera=need_camera,
|
||||
system_prompt=system_prompt,
|
||||
suppress_tts=(intent == "skill_sequence")
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
if reply:
|
||||
if self.history:
|
||||
self.history.commit_turn(reply)
|
||||
if intent == "skill_sequence":
|
||||
skill_msg = String()
|
||||
skill_msg.data = reply.strip()
|
||||
self.skill_sequence_pub.publish(skill_msg)
|
||||
self.get_logger().info(f"[技能序列] 已发布: {skill_msg.data}")
|
||||
else:
|
||||
if self.history:
|
||||
self.history.cancel_turn()
|
||||
@@ -796,36 +772,6 @@ class RobotSpeakerNode(Node):
|
||||
|
||||
super().destroy_node()
|
||||
|
||||
def _on_skill_feedback(self, msg: String):
|
||||
try:
|
||||
feedback = json.loads(msg.data)
|
||||
self.latest_skill_feedback = feedback
|
||||
feedback_text = (
|
||||
f"【执行状态】阶段:{feedback.get('stage','')}, "
|
||||
f"技能:{feedback.get('current_skill','')}, "
|
||||
f"进度:{feedback.get('progress', 0):.1%}, "
|
||||
f"详情:{feedback.get('detail','')}"
|
||||
)
|
||||
if self.history:
|
||||
self.history.add_message("system", feedback_text)
|
||||
except Exception as e:
|
||||
self.get_logger().warning(f"[技能反馈] 解析失败: {e}")
|
||||
|
||||
def _on_skill_result(self, msg: String):
|
||||
try:
|
||||
result = json.loads(msg.data)
|
||||
self.latest_skill_result = result
|
||||
result_text = (
|
||||
f"【执行结果】{'成功' if result.get('success') else '失败'}, "
|
||||
f"总技能数:{result.get('total_skills', 0)}, "
|
||||
f"成功数:{result.get('succeeded_skills', 0)}, "
|
||||
f"消息:{result.get('message','')}"
|
||||
)
|
||||
if self.history:
|
||||
self.history.add_message("system", result_text)
|
||||
except Exception as e:
|
||||
self.get_logger().warning(f"[技能结果] 解析失败: {e}")
|
||||
|
||||
|
||||
def _init_ros(args):
|
||||
rclpy.init(args=args)
|
||||
|
||||
@@ -121,10 +121,14 @@ class _TTSCallback(ResultCallback):
|
||||
'-fflags', 'nobuffer', # 减少缓冲
|
||||
'-flags', 'low_delay', # 低延迟
|
||||
'-avioflags', 'direct', # 尝试直通写入 ALSA,减少延迟
|
||||
'-thread_queue_size', str(self.tts_client.tts_ffmpeg_thread_queue_size), # 输入线程队列大小(从配置文件读取)
|
||||
self.tts_client.alsa_device
|
||||
]
|
||||
|
||||
# 将 -thread_queue_size 放到输入文件之前
|
||||
insert_pos = ffmpeg_cmd.index('-i')
|
||||
ffmpeg_cmd.insert(insert_pos, str(self.tts_client.tts_ffmpeg_thread_queue_size))
|
||||
ffmpeg_cmd.insert(insert_pos, '-thread_queue_size')
|
||||
|
||||
# 添加音量调节filter(如果音量不是1.0)
|
||||
if self.tts_client.output_volume != 1.0:
|
||||
# 在输出编码前插入音量filter
|
||||
|
||||
@@ -59,6 +59,42 @@ class AudioRecorder:
|
||||
self.get_silence_threshold = get_silence_threshold # 动态静音阈值回调
|
||||
self.logger = logger
|
||||
self.audio = pyaudio.PyAudio()
|
||||
|
||||
# 自动查找 iFLYTEK 麦克风设备
|
||||
try:
|
||||
count = self.audio.get_device_count()
|
||||
found_index = -1
|
||||
if self.logger:
|
||||
self.logger.info(f"开始扫描音频设备 (总数: {count})...")
|
||||
|
||||
for i in range(count):
|
||||
device_info = self.audio.get_device_info_by_index(i)
|
||||
device_name = device_info.get('name', '')
|
||||
max_input_channels = device_info.get('maxInputChannels', 0)
|
||||
|
||||
if self.logger:
|
||||
try:
|
||||
self.logger.info(f"扫描设备 [{i}]: Name='{device_name}', MaxInput={max_input_channels}, Rate={int(device_info.get('defaultSampleRate'))}")
|
||||
except:
|
||||
pass
|
||||
|
||||
# 检查是否包含 iFLYTEK 且支持录音(输入通道 > 0)
|
||||
if 'iFLYTEK' in device_name and max_input_channels > 0:
|
||||
found_index = i
|
||||
if self.logger:
|
||||
self.logger.info(f"已自动定位到麦克风设备: {device_name} (Index: {i})")
|
||||
break
|
||||
|
||||
if found_index != -1:
|
||||
self.device_index = found_index
|
||||
else:
|
||||
if self.logger:
|
||||
self.logger.warning(f"未自动检测到 iFLYTEK 设备,将继续使用配置的索引: {self.device_index}")
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"设备自动检测过程出错: {e}")
|
||||
|
||||
self.format = pyaudio.paInt16
|
||||
self._debug_counter = 0
|
||||
|
||||
@@ -121,8 +157,7 @@ class AudioRecorder:
|
||||
|
||||
audio_buffer = [] # VAD 滑动窗口
|
||||
last_active_time = time.time() # 静音计时基准
|
||||
was_speaking = False # 上一窗口状态
|
||||
is_speaking = False # 当前窗口状态
|
||||
in_speech_segment = False # 是否处于语音段中(从检测到人声开始,直到静音超时结束)
|
||||
|
||||
try:
|
||||
while not self.stop_flag():
|
||||
@@ -171,13 +206,11 @@ class AudioRecorder:
|
||||
self.logger.info(f"[VAD调试] 能量={energy:.1f}, 阈值={self.min_energy_threshold}, VAD结果={vad_result}")
|
||||
self._debug_counter = 0
|
||||
|
||||
was_speaking = is_speaking
|
||||
is_speaking = vad_result
|
||||
|
||||
if is_speaking:
|
||||
if vad_result:
|
||||
last_active_time = now
|
||||
|
||||
if not was_speaking: # 上一轮没说话,本轮开始说话
|
||||
if not in_speech_segment: # 上一轮没说话,本轮开始说话
|
||||
in_speech_segment = True
|
||||
if self.on_speech_start:
|
||||
self.on_speech_start()
|
||||
|
||||
@@ -185,8 +218,10 @@ class AudioRecorder:
|
||||
if self.is_playing() and self.on_new_segment:
|
||||
self.on_new_segment() # 打断 TTS的回调
|
||||
else:
|
||||
if was_speaking:
|
||||
if in_speech_segment:
|
||||
# 处于语音段中,但当前帧为静音,检查静音时长
|
||||
silence_duration = now - last_active_time
|
||||
|
||||
# 动态获取静音阈值(如果提供回调函数)
|
||||
if self.get_silence_threshold:
|
||||
current_silence_ms = self.get_silence_threshold()
|
||||
@@ -203,6 +238,7 @@ class AudioRecorder:
|
||||
if self.logger:
|
||||
self.logger.debug(f"[VAD] 触发speech_end: 静音持续时间 {silence_duration:.3f}秒 >= 阈值 {current_no_speech_threshold:.3f}秒")
|
||||
self.on_speech_end() # 通知系统用户停止说话
|
||||
in_speech_segment = False
|
||||
|
||||
if self.on_heartbeat and now - last_heartbeat_time >= self.heartbeat_interval:
|
||||
self.on_heartbeat()
|
||||
|
||||
@@ -30,9 +30,14 @@ class SpeakerVerificationClient:
|
||||
self.speaker_db = {} # {speaker_id: {"embedding": np.ndarray, "env": str, "threshold": float, "registered_at": float}}
|
||||
self._lock = threading.Lock()
|
||||
|
||||
# 优化CPU性能:限制Torch使用的线程数,防止多线程竞争导致性能骤降
|
||||
import torch
|
||||
torch.set_num_threads(1)
|
||||
|
||||
from funasr import AutoModel
|
||||
model_path = os.path.expanduser(self.model_path)
|
||||
self.model = AutoModel(model=model_path, device="cpu")
|
||||
# 禁用自动更新检查,防止每次初始化都联网检查
|
||||
self.model = AutoModel(model=model_path, device="cpu", disable_update=True)
|
||||
if self.logger:
|
||||
self.logger.info(f"声纹模型已加载: {model_path}, 阈值: {self.threshold}")
|
||||
|
||||
@@ -82,6 +87,21 @@ class SpeakerVerificationClient:
|
||||
"""
|
||||
提取说话人embedding(低频调用,一句话只调用一次)
|
||||
"""
|
||||
# 降采样到 16000Hz (如果需要)
|
||||
# Cam++ 等模型通常只支持 16k,如果传入 48k 会导致内部重采样极慢或计算量剧增
|
||||
target_sr = 16000
|
||||
if sample_rate > target_sr:
|
||||
if sample_rate % target_sr == 0:
|
||||
step = sample_rate // target_sr
|
||||
audio_data = audio_data[::step]
|
||||
sample_rate = target_sr
|
||||
else:
|
||||
# 简单的非整数倍降采样可能导致问题,但对于语音验证通常 48k->16k 是整数倍
|
||||
# 如果不是整数倍,此处按取整后的步长抽取(例如 44100Hz 步长为 2,实际得到 22050Hz),结果仍被标记为 16k,只是近似值,存在精度风险
|
||||
step = int(sample_rate / target_sr)
|
||||
audio_data = audio_data[::step]
|
||||
sample_rate = target_sr
|
||||
|
||||
if len(audio_data) < int(sample_rate * 0.5):
|
||||
return None, False
|
||||
|
||||
|
||||
Reference in New Issue
Block a user