增加AEC
This commit is contained in:
51
README.md
51
README.md
@@ -28,18 +28,53 @@ ros2 launch robot_speaker voice.launch.py
|
||||
|
||||
## 架构说明
|
||||
[录音线程] - 唯一实时线程
|
||||
├─ 麦克风采集 PCM
|
||||
├─ VAD + 能量检测
|
||||
├─ 检测到人声 → 立即中断TTS
|
||||
└─ 音频chunk → 音频队列
|
||||
├─ 语音 PCM → ASR 音频队列
|
||||
└─ 语音 PCM → 声纹音频队列(旁路,不阻塞)
|
||||
|
||||
[ASR推理线程] - 只做 audio → text
|
||||
└─ 从音频队列取chunk → ASR识别 → sentence_end → 文本队列
|
||||
└─ 从 ASR 音频队列取音频→ 实时 / 流式 ASR → text → 文本队列
|
||||
|
||||
[声纹识别线程] - 非实时、低频(CAM++)
|
||||
├─ 通过回调函数接收音频chunk,写入缓冲区,等待 speech_end 事件触发处理
|
||||
├─ 累积 1~2 秒有效人声(VAD 后)
|
||||
├─ CAM++ 提取 speaker embedding
|
||||
├─ 声纹匹配 / 注册
|
||||
└─ 更新 current_speaker_id(共享状态,只写不控)
|
||||
声纹线程要求:不影响录音,不影响ASR,不控制TTS,只更新当前说话人是谁
|
||||
|
||||
[主线程/处理线程] - 处理业务逻辑
|
||||
├─ 从文本队列取文本
|
||||
├─ 唤醒词处理
|
||||
├─ LLM处理(流式)
|
||||
└─ TTS文本 → TTS队列
|
||||
├─ 从 文本队列 取 ASR 文本
|
||||
├─ 读取 current_speaker_id(只读)
|
||||
├─ 唤醒词处理(结合 speaker_id)
|
||||
├─ 权限 / 身份判断(是否允许继续)
|
||||
├─ VLM处理(文本 / 多模态)
|
||||
└─ TTS播放(启动TTS线程,不等待)
|
||||
|
||||
[TTS播放线程] - 只播放
|
||||
└─ 从TTS队列取文本 → 播放音频,响应中断标志
|
||||
[TTS播放线程] - 只播放(可被中断)
|
||||
├─ 接收 TTS 音频流
|
||||
├─ 播放到输出设备
|
||||
└─ 响应中断标志(由录音线程触发)
|
||||
|
||||
|
||||
## 用到的命令
|
||||
1. 音频设备
|
||||
```bash
|
||||
# 1. 查看所有音频设备
|
||||
cat /proc/asound/cards
|
||||
# 2. 查看 card(1)的流信息(设备参数)
|
||||
cat /proc/asound/card1/stream0
|
||||
```
|
||||
|
||||
2. 相机设备
|
||||
```bash
|
||||
# 1. 查看相机所有基础信息(型号、固件版本、序列号等)
|
||||
rs-enumerate-devices -c
|
||||
```
|
||||
|
||||
3. 模型下载
|
||||
```bash
|
||||
modelscope download --model iic/speech_campplus_sv_zh-cn_16k-common --local_dir [指定路径]
|
||||
```
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
# 相机配置文件
|
||||
# 相机默认一直运行,用户说拍照时自动捕获图像
|
||||
|
||||
camera:
|
||||
serial_number: null # 相机序列号(null表示使用第一个可用设备)
|
||||
|
||||
# RGB流配置
|
||||
rgb:
|
||||
width: 640 # 图像宽度
|
||||
height: 480 # 图像高度
|
||||
fps: 30 # 帧率(支持:6, 10, 15, 30, 60)
|
||||
format: "RGB8" # 图像格式:RGB8, BGR8
|
||||
|
||||
# 图像处理配置
|
||||
image:
|
||||
jpeg_quality: 85 # JPEG压缩质量(0-100,85是质量和大小平衡点)
|
||||
max_size: null # 最大尺寸(null表示不缩放,格式:"1280x720")
|
||||
|
||||
# 相机指令关键词(拼音)
|
||||
commands:
|
||||
capture_keywords: ["pai zhao", "pai ge zhao", "pai zhang", "da kai xiang ji", "kan zhe li", "zhao xiang"] # 拍照相关指令
|
||||
# capture_keywords: ["拍照", "拍张", "打开相机", "看这里", "照相"] # 中文指令(如果ASR直接输出中文)
|
||||
|
||||
201
config/speakers.json
Normal file
201
config/speakers.json
Normal file
@@ -0,0 +1,201 @@
|
||||
{
|
||||
"user_1768311644": {
|
||||
"embedding": [
|
||||
0.017083248123526573,
|
||||
-0.01032772846519947,
|
||||
0.0058503481559455395,
|
||||
0.11945011466741562,
|
||||
0.03864186629652977,
|
||||
-0.16047827899456024,
|
||||
0.008000967092812061,
|
||||
0.10669729858636856,
|
||||
0.13221754133701324,
|
||||
0.06365424394607544,
|
||||
-0.06943577527999878,
|
||||
0.08401959389448166,
|
||||
0.09903465211391449,
|
||||
0.0407508946955204,
|
||||
-0.07486417144536972,
|
||||
0.0010617832886055112,
|
||||
0.12097838521003723,
|
||||
-0.013734623789787292,
|
||||
-0.020789025351405144,
|
||||
-0.02113250270485878,
|
||||
0.008510188199579716,
|
||||
-0.05490498244762421,
|
||||
-0.17027714848518372,
|
||||
0.09569162130355835,
|
||||
-0.07379947602748871,
|
||||
0.05932804197072983,
|
||||
0.0839226171374321,
|
||||
0.004776939284056425,
|
||||
0.050190482288599014,
|
||||
-0.19962339103221893,
|
||||
-0.13987377285957336,
|
||||
0.041607797145843506,
|
||||
0.10067984461784363,
|
||||
0.0684289038181305,
|
||||
0.08163066953420639,
|
||||
-0.029243428260087967,
|
||||
-0.10118222236633301,
|
||||
-0.11619988083839417,
|
||||
-0.10121472179889679,
|
||||
-0.04290663078427315,
|
||||
-0.08373524248600006,
|
||||
0.03493887186050415,
|
||||
0.055566269904375076,
|
||||
-0.11284282803535461,
|
||||
-0.10970190167427063,
|
||||
0.03457016497850418,
|
||||
0.11647575348615646,
|
||||
-0.014930102974176407,
|
||||
-0.04663793370127678,
|
||||
0.0752566009759903,
|
||||
-0.06746217608451843,
|
||||
-0.07642832398414612,
|
||||
0.06518206000328064,
|
||||
0.07191824167966843,
|
||||
0.13557033240795135,
|
||||
0.04906972125172615,
|
||||
0.03679114207625389,
|
||||
0.07466751337051392,
|
||||
0.01071798987686634,
|
||||
-0.07979520410299301,
|
||||
-0.10039637982845306,
|
||||
0.004846179857850075,
|
||||
-0.07325125485658646,
|
||||
-0.08750395476818085,
|
||||
0.05332862585783005,
|
||||
0.10648373514413834,
|
||||
-0.035643525421619415,
|
||||
0.21233271062374115,
|
||||
0.011915713548660278,
|
||||
0.13632774353027344,
|
||||
0.10383394360542297,
|
||||
-0.053550489246845245,
|
||||
0.05719169229269028,
|
||||
0.04600509628653526,
|
||||
0.043678827583789825,
|
||||
-0.03646669536828995,
|
||||
0.08175459504127502,
|
||||
0.042513635009527206,
|
||||
-0.09215544164180756,
|
||||
-0.06402364373207092,
|
||||
-0.10830589383840561,
|
||||
0.03379691392183304,
|
||||
0.07699205726385117,
|
||||
-0.11046901345252991,
|
||||
-0.016612332314252853,
|
||||
-0.02984754927456379,
|
||||
0.00998819898813963,
|
||||
-0.05820641294121742,
|
||||
0.007753593847155571,
|
||||
-0.016712933778762817,
|
||||
0.0014505418948829174,
|
||||
-0.04807407408952713,
|
||||
-0.048170242458581924,
|
||||
-0.0531715452671051,
|
||||
0.019113507121801376,
|
||||
0.08439801633358002,
|
||||
0.010585008189082146,
|
||||
-0.07400234043598175,
|
||||
0.10156761854887009,
|
||||
-0.018891986459493637,
|
||||
-0.052156757563352585,
|
||||
0.1302887201309204,
|
||||
0.08590760082006454,
|
||||
0.13382190465927124,
|
||||
-0.1498136967420578,
|
||||
-0.030552342534065247,
|
||||
-0.09281301498413086,
|
||||
0.10279291868209839,
|
||||
0.015315898694097996,
|
||||
-0.014133274555206299,
|
||||
-0.01298056822270155,
|
||||
0.06241781264543533,
|
||||
0.017693962901830673,
|
||||
0.0007682808791287243,
|
||||
0.029756756499409676,
|
||||
0.12711282074451447,
|
||||
-0.0695323497056961,
|
||||
0.01649993099272251,
|
||||
0.08811338990926743,
|
||||
-0.06976141035556793,
|
||||
-0.0763985738158226,
|
||||
-0.10730905085802078,
|
||||
0.0256052203476429,
|
||||
0.05183263123035431,
|
||||
0.0947495624423027,
|
||||
0.007070058956742287,
|
||||
-0.0505177341401577,
|
||||
-0.009485805407166481,
|
||||
0.003954170271754265,
|
||||
0.014901814050972462,
|
||||
-0.08098141849040985,
|
||||
0.03615008667111397,
|
||||
-0.09673020988702774,
|
||||
0.06970252841711044,
|
||||
0.009914563037455082,
|
||||
-0.012040670961141586,
|
||||
-0.0008170561632141471,
|
||||
-0.06880783289670944,
|
||||
-0.053053151816129684,
|
||||
0.05272500216960907,
|
||||
0.021709589287638664,
|
||||
-0.09712725877761841,
|
||||
0.06947346031665802,
|
||||
-0.07973745465278625,
|
||||
-0.036861639469861984,
|
||||
-0.08714801073074341,
|
||||
0.05473816394805908,
|
||||
-0.006384482141584158,
|
||||
-0.03656519949436188,
|
||||
0.0605260394513607,
|
||||
0.0407724604010582,
|
||||
-0.1314084380865097,
|
||||
-0.05484895780682564,
|
||||
0.014381998218595982,
|
||||
-0.07414797693490982,
|
||||
-0.013259666971862316,
|
||||
-0.1076463982462883,
|
||||
-0.04896606504917145,
|
||||
0.050690483301877975,
|
||||
0.0719417929649353,
|
||||
0.04990950971841812,
|
||||
-0.049923382699489594,
|
||||
0.08706197887659073,
|
||||
-0.06278207153081894,
|
||||
-0.029196983203291893,
|
||||
-0.07312408834695816,
|
||||
0.01651231199502945,
|
||||
0.025062547996640205,
|
||||
-0.023919139057397842,
|
||||
0.05597180873155594,
|
||||
0.08446669578552246,
|
||||
-0.06616690754890442,
|
||||
0.011679486371576786,
|
||||
0.008357426151633263,
|
||||
-0.07388673722743988,
|
||||
0.03612314909696579,
|
||||
-0.055705588310956955,
|
||||
-0.008656222373247147,
|
||||
-0.06408344209194183,
|
||||
-0.05341912433505058,
|
||||
0.01561578270047903,
|
||||
0.002446901286020875,
|
||||
0.042539432644844055,
|
||||
0.12226217240095139,
|
||||
-0.03700198978185654,
|
||||
0.02393815666437149,
|
||||
-0.021217981353402138,
|
||||
0.04431416094303131,
|
||||
-0.09150857478380203,
|
||||
-0.004766684491187334,
|
||||
-0.06133556738495827,
|
||||
0.07721113413572311
|
||||
],
|
||||
"env": "near",
|
||||
"threshold": 0.4,
|
||||
"registered_at": 1768311644.5742264
|
||||
}
|
||||
}
|
||||
@@ -10,7 +10,7 @@ dashscope:
|
||||
base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
temperature: 0.7
|
||||
max_tokens: 4096
|
||||
max_history: 5
|
||||
max_history: 10
|
||||
tts:
|
||||
model: "cosyvoice-v3-flash"
|
||||
voice: "longanyang"
|
||||
@@ -21,12 +21,19 @@ audio:
|
||||
sample_rate: 16000 # 输入采样率:16kHz(语音识别常用采样率)
|
||||
channels: 1 # 输入声道数:单声道(MONO,适合语音采集)
|
||||
chunk: 1024
|
||||
heartbeat_interval: 2.0 # 心跳间隔(秒),用于定期输出录音状态
|
||||
soundcard:
|
||||
card_index: 1 # USB Audio Device (card 1)
|
||||
device_index: 0 # USB Audio [USB Audio] (device 0)
|
||||
sample_rate: 48000 # 输出采样率:48kHz(支持48000或44100)
|
||||
sample_rate: 44100 # 输出采样率:44.1kHz(支持48000或44100)
|
||||
channels: 2 # 输出声道数:立体声(2声道,FL+FR)
|
||||
volume: 0.3 # 音量比例(0.0-1.0,0.2表示20%音量)
|
||||
volume: 1.0 # 音量比例(0.0-1.0,0.2表示20%音量)
|
||||
echo_cancellation:
|
||||
max_duration_ms: 500 # 参考信号缓冲区最大时长(毫秒)
|
||||
tts:
|
||||
source_sample_rate: 22050 # TTS服务固定输出采样率(DashScope服务固定值,不可修改)
|
||||
source_channels: 1 # TTS服务固定输出声道数(DashScope服务固定值,不可修改)
|
||||
ffmpeg_thread_queue_size: 4096 # ffmpeg输入线程队列大小(增大以减少卡顿)
|
||||
|
||||
vad:
|
||||
vad_mode: 3 # VAD模式:0-3,3最严格
|
||||
@@ -37,8 +44,25 @@ system:
|
||||
use_llm: true # 是否使用LLM
|
||||
use_wake_word: true # 是否启用唤醒词检测
|
||||
wake_word: "er gou" # 唤醒词(拼音)
|
||||
session_timeout: 10.0 # 会话超时时间(秒)
|
||||
session_timeout: 3.0 # 会话超时时间(秒)
|
||||
shutup_keywords: "bi zui" # 闭嘴指令关键词(拼音,逗号分隔)
|
||||
interrupt_command_queue_depth: 10 # 中断命令订阅的队列深度(QoS)
|
||||
sv_enabled: true # 是否启用声纹识别
|
||||
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # 声纹模型路径
|
||||
sv_threshold: 0.35 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
|
||||
sv_threshold: 0.55 # 声纹识别阈值(0.0-1.0,值越小越宽松,值越大越严格)
|
||||
sv_speaker_db_path: "config/speakers.json" # 声纹数据库保存路径(JSON格式,相对于ROS2包share目录)
|
||||
sv_buffer_size: 64000 # 声纹验证录音缓冲区大小(样本数)
|
||||
sv_registration_silence_threshold_ms: 500 # 声纹注册状态下的静音阈值(毫秒)
|
||||
|
||||
camera:
|
||||
serial_number: "405622075404" # 相机序列号(Intel RealSense D435)
|
||||
rgb:
|
||||
width: 640 # 图像宽度
|
||||
height: 480 # 图像高度
|
||||
fps: 30 # 帧率(支持:6, 10, 15, 30, 60)
|
||||
format: "RGB8" # 图像格式:RGB8, BGR8
|
||||
image:
|
||||
jpeg_quality: 85 # JPEG压缩质量(0-100,85是质量和大小平衡点)
|
||||
max_size: "1280x720" # 最大尺寸
|
||||
commands:
|
||||
capture_keywords: "pai zhao,pai ge zhao,pai zhang zhao pian,pai zhang,da kai xiang ji,kan zhe li,zhao xiang" # 拍照相关指令(拼音,逗号分隔)
|
||||
|
||||
@@ -1,305 +1,14 @@
|
||||
from launch import LaunchDescription
|
||||
from launch_ros.actions import Node
|
||||
from launch.actions import DeclareLaunchArgument
|
||||
from launch.substitutions import LaunchConfiguration
|
||||
import os
|
||||
from ament_index_python.packages import get_package_share_directory
|
||||
import yaml
|
||||
|
||||
|
||||
def generate_launch_description():
    """Build the launch description for the voice-interaction node.

    Every launch-argument default is read from ``config/voice.yaml`` in the
    ``robot_speaker`` share directory; camera defaults come from the optional
    ``config/camera.yaml`` and fall back to built-in values when that file is
    missing or unreadable.

    Returns:
        LaunchDescription: one ``DeclareLaunchArgument`` per parameter,
        followed by the ``robot_speaker_node`` node, which receives every
        argument as a node parameter of the same name.

    Raises:
        OSError: if ``voice.yaml`` cannot be opened.
        KeyError: if a mandatory key is missing from ``voice.yaml``.
    """
    share_dir = get_package_share_directory('robot_speaker')
    voice_config_file = os.path.join(share_dir, 'config', 'voice.yaml')
    camera_config_file = os.path.join(share_dir, 'config', 'camera.yaml')

    # The YAML files contain non-ASCII comments, so do not depend on the
    # locale's default encoding.
    with open(voice_config_file, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # The camera configuration is optional: a broken file only produces a
    # warning and the built-in camera defaults below are used instead.
    camera_config = None
    if os.path.exists(camera_config_file):
        try:
            with open(camera_config_file, 'r', encoding='utf-8') as f:
                camera_config = yaml.safe_load(f)
        except Exception as e:
            print(f"警告: 无法加载相机配置文件: {e}")

    dashscope_config = config['dashscope']
    audio_config = config['audio']
    vad_config = config['vad']
    system_config = config['system']
    camera_config_data = camera_config.get('camera', {}) if camera_config else {}

    microphone = audio_config['microphone']
    soundcard = audio_config['soundcard']
    camera_rgb = camera_config_data.get('rgb', {})
    camera_serial = camera_config_data.get('serial_number')
    speaker_db_rel_path = system_config.get('sv_speaker_db_path')

    # (argument name, default value, description). The node parameter of the
    # same name is wired to each argument further down, so the two lists
    # cannot get out of sync.
    argument_specs = [
        # Audio input
        ('input_device_index', str(microphone['device_index']), '麦克风设备索引'),
        # Audio output
        ('output_card_index', str(soundcard['card_index']), '声卡 card index'),
        ('output_device_index', str(soundcard['device_index']), '声卡 device index'),
        ('output_sample_rate', str(soundcard.get('sample_rate', 48000)), '输出采样率'),
        ('output_channels', str(soundcard.get('channels', 2)), '输出声道数'),
        ('output_volume', str(soundcard.get('volume', 0.2)), '输出音量比例(0.0-1.0)'),
        # Capture format
        ('sample_rate', str(microphone['sample_rate']), '采样率'),
        ('channels', str(microphone['channels']), '声道数'),
        ('chunk', str(microphone['chunk']), '音频块大小'),
        # VAD
        ('vad_mode', str(vad_config['vad_mode']), 'VAD模式:0-3,3最严格'),
        ('silence_duration_ms', str(vad_config.get('silence_duration_ms', 1000)), '静音持续时长(毫秒)'),
        ('min_energy_threshold', str(vad_config.get('min_energy_threshold', 300)), '最小能量阈值'),
        # DashScope
        ('dashscope_api_key', dashscope_config['api_key'], 'DashScope API Key'),
        ('asr_model', dashscope_config['asr']['model'], 'ASR模型名称'),
        ('asr_url', dashscope_config['asr']['url'], 'ASR WebSocket URL'),
        ('llm_model', dashscope_config['llm']['model'], 'LLM模型名称'),
        ('llm_base_url', dashscope_config['llm']['base_url'], 'LLM API Base URL'),
        ('llm_temperature', str(dashscope_config['llm']['temperature']), 'LLM温度参数'),
        ('llm_max_tokens', str(dashscope_config['llm']['max_tokens']), 'LLM最大token数'),
        ('llm_max_history', str(dashscope_config['llm']['max_history']), 'LLM最大对话历史条数'),
        ('tts_model', dashscope_config['tts']['model'], 'TTS模型名称'),
        ('tts_voice', dashscope_config['tts']['voice'], 'TTS语音'),
        # System
        ('use_llm', str(system_config['use_llm']).lower(), '是否使用LLM'),
        ('use_wake_word', str(system_config['use_wake_word']).lower(), '是否启用唤醒词检测'),
        ('wake_word', system_config['wake_word'], '唤醒词'),
        ('session_timeout', str(system_config.get('session_timeout', 30.0)), '会话超时时间(秒)'),
        # Speaker verification
        ('sv_enabled', str(system_config.get('sv_enabled', True)).lower(), '是否启用声纹识别'),
        ('sv_model_path', os.path.expanduser(system_config.get('sv_model_path', '')), '声纹模型路径'),
        ('sv_threshold', str(system_config.get('sv_threshold', 0.45)), '声纹识别阈值'),
        # The database path is relative to the package share directory; an
        # unset or empty value is passed on as an empty string.
        ('sv_speaker_db_path',
         os.path.join(share_dir, speaker_db_rel_path) if speaker_db_rel_path else '',
         '声纹数据库路径'),
        # Camera
        ('camera_serial_number', str(camera_serial) if camera_serial else '', '相机序列号'),
        ('camera_width', str(camera_rgb.get('width', 640)), '相机图像宽度'),
        ('camera_height', str(camera_rgb.get('height', 480)), '相机图像高度'),
        ('camera_fps', str(camera_rgb.get('fps', 30)), '相机帧率'),
        ('camera_format', camera_rgb.get('format', 'RGB8'), '相机图像格式'),
        ('camera_jpeg_quality',
         str(camera_config_data.get('image', {}).get('jpeg_quality', 85)),
         'JPEG压缩质量'),
        ('camera_capture_keywords',
         ','.join(camera_config_data.get('commands', {}).get('capture_keywords', ['pai zhao'])),
         '相机拍照指令关键词(逗号分隔)'),
    ]

    declarations = [
        DeclareLaunchArgument(name, default_value=default, description=description)
        for name, default, description in argument_specs
    ]

    # Voice node: each declared argument becomes a node parameter with the
    # same name.
    speaker_node = Node(
        package='robot_speaker',
        executable='robot_speaker_node',
        name='robot_speaker_node',
        parameters=[{
            name: LaunchConfiguration(name) for name, _, _ in argument_specs
        }],
        output='screen'
    )

    return LaunchDescription([*declarations, speaker_node])
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
dashscope>=1.20.0
|
||||
openai>=1.0.0
|
||||
pyaudio>=0.2.11
|
||||
webrtcvad>=2.0.10
|
||||
webrtcvad>=2.0.10 # WebRTC VAD(语音活动检测),不包含回声消除
|
||||
pypinyin>=0.49.0
|
||||
rclpy>=3.0.0
|
||||
pyrealsense2>=2.54.0
|
||||
Pillow>=10.0.0
|
||||
numpy>=1.24.0
|
||||
# 回声消除库(robot_speaker/echo_cancellation.py 依赖):
|
||||
# aec-audio-processing - 专门用于回声消除的WebRTC库,API简单(推荐)
|
||||
# pip install aec-audio-processing
|
||||
# 注意:echo_cancellation.py 在模块顶层直接 import aec_audio_processing,未安装时导入会失败;该模块中没有内置的回退算法
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -66,14 +66,14 @@ class DashScopeASR:
|
||||
|
||||
self.conversation.connect()
|
||||
|
||||
# 自定义文本语料增强识别
|
||||
custom_text = "蜂核科技, 杭州蜂核科技有限公司, 西林瓶,瓶子"
|
||||
# 自定义文本语料增强识别,听不清的时候高概率说这个词
|
||||
# custom_text = "二狗"
|
||||
|
||||
transcription_params = TranscriptionParams(
|
||||
language='zh',
|
||||
sample_rate=self.sample_rate,
|
||||
input_audio_format="pcm",
|
||||
corpus_text=custom_text,
|
||||
# corpus_text=custom_text,
|
||||
)
|
||||
|
||||
# 本地 VAD → 只控制 TTS 打断
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
"""
|
||||
音频处理模块:录音 + VAD
|
||||
音频处理模块:录音 + VAD + 回声消除
|
||||
"""
|
||||
import time
|
||||
import pyaudio
|
||||
import webrtcvad
|
||||
import struct
|
||||
import queue
|
||||
from .echo_cancellation import EchoCanceller, ReferenceSignalBuffer
|
||||
|
||||
|
||||
class VADDetector:
|
||||
@@ -40,6 +41,8 @@ class AudioRecorder:
|
||||
on_audio_chunk=None, # 音频chunk回调(用于声纹录音等,可选)
|
||||
should_put_to_queue=None, # 检查是否应该将音频放入队列(用于阻止ASR,可选)
|
||||
get_silence_threshold=None, # 获取动态静音阈值(毫秒,可选)
|
||||
enable_echo_cancellation: bool = True, # 是否启用回声消除
|
||||
reference_signal_buffer: ReferenceSignalBuffer = None, # 参考信号缓冲区(可选)
|
||||
logger=None):
|
||||
self.device_index = device_index
|
||||
self.sample_rate = sample_rate
|
||||
@@ -64,6 +67,39 @@ class AudioRecorder:
|
||||
self.audio = pyaudio.PyAudio()
|
||||
self.format = pyaudio.paInt16
|
||||
self._debug_counter = 0
|
||||
|
||||
# 回声消除相关
|
||||
self.enable_echo_cancellation = enable_echo_cancellation
|
||||
self.reference_signal_buffer = reference_signal_buffer
|
||||
if enable_echo_cancellation:
|
||||
# 初始化回声消除器(在录音线程中同步处理,不是单独线程)
|
||||
# frame_size设置为chunk大小,确保每次处理一个chunk
|
||||
frame_size = chunk
|
||||
try:
|
||||
# 获取参考信号声道数(从reference_signal_buffer获取,因为它是根据播放声道数创建的)
|
||||
ref_channels = self.reference_signal_buffer.channels if self.reference_signal_buffer else 1
|
||||
self.echo_canceller = EchoCanceller(
|
||||
sample_rate=sample_rate,
|
||||
frame_size=frame_size,
|
||||
channels=self.channels, # 麦克风输入:1声道
|
||||
ref_channels=ref_channels, # 参考信号:播放声道数(2声道)
|
||||
logger=logger
|
||||
)
|
||||
if self.echo_canceller.aec is not None:
|
||||
if logger:
|
||||
logger.info(f"回声消除器已启用: sample_rate={sample_rate}, frame_size={frame_size}")
|
||||
else:
|
||||
if logger:
|
||||
logger.warning("回声消除器初始化失败,将禁用回声消除功能")
|
||||
self.enable_echo_cancellation = False
|
||||
self.echo_canceller = None
|
||||
except Exception as e:
|
||||
if logger:
|
||||
logger.warning(f"回声消除器初始化失败: {e},将禁用回声消除功能")
|
||||
self.enable_echo_cancellation = False
|
||||
self.echo_canceller = None
|
||||
else:
|
||||
self.echo_canceller = None
|
||||
|
||||
def record_with_vad(self):
|
||||
"""
|
||||
@@ -103,18 +139,34 @@ class AudioRecorder:
|
||||
# exception_on_overflow=False, 宁可丢帧,也不阻塞
|
||||
data = stream.read(self.chunk, exception_on_overflow=False)
|
||||
|
||||
# 回声消除处理
|
||||
processed_data = data
|
||||
if self.enable_echo_cancellation and self.echo_canceller and self.reference_signal_buffer:
|
||||
try:
|
||||
# 获取参考信号(长度与麦克风信号匹配)
|
||||
ref_signal = self.reference_signal_buffer.get_reference(num_samples=self.chunk)
|
||||
# 执行回声消除
|
||||
processed_data = self.echo_canceller.process(data, ref_signal)
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(f"回声消除处理失败: {e},使用原始音频")
|
||||
processed_data = data
|
||||
|
||||
# 检查是否应该将音频放入队列(用于阻止ASR,例如无声纹文件时需要注册)
|
||||
if self.should_put_to_queue():
|
||||
# 队列满时丢弃最旧的数据,ASR 跟不上时系统仍然听得见
|
||||
if self.audio_queue.full():
|
||||
self.audio_queue.get_nowait()
|
||||
self.audio_queue.put_nowait(data)
|
||||
# 使用处理后的音频数据(经过回声消除)
|
||||
self.audio_queue.put_nowait(processed_data)
|
||||
|
||||
# 音频chunk回调(用于声纹录音等,仅在需要时调用)
|
||||
if self.on_audio_chunk:
|
||||
self.on_audio_chunk(data)
|
||||
# 回调使用处理后的音频数据
|
||||
self.on_audio_chunk(processed_data)
|
||||
|
||||
audio_buffer.append(data) # 只用于 VAD,不用于 ASR
|
||||
# VAD检测使用处理后的音频(经过回声消除)
|
||||
audio_buffer.append(processed_data) # 只用于 VAD,不用于 ASR
|
||||
|
||||
# VAD检测窗口
|
||||
now = time.time()
|
||||
|
||||
@@ -7,17 +7,12 @@ import contextlib
|
||||
|
||||
|
||||
class CameraClient:
|
||||
"""
|
||||
相机客户端 - 封装RealSense相机操作
|
||||
相机初始化后一直运行,capture_rgb() 只负责从运行中的管道捕获一帧
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
serial_number: str | None = None,
|
||||
width: int = 640,
|
||||
height: int = 480,
|
||||
fps: int = 30,
|
||||
format: str = 'RGB8',
|
||||
serial_number: str | None,
|
||||
width: int,
|
||||
height: int,
|
||||
fps: int,
|
||||
format: str,
|
||||
logger=None):
|
||||
self.serial_number = serial_number
|
||||
self.width = width
|
||||
@@ -35,7 +30,7 @@ class CameraClient:
|
||||
if self.logger:
|
||||
getattr(self.logger, level, self.logger.info)(msg)
|
||||
else:
|
||||
print(f"[Camera] {msg}")
|
||||
print(f"[相机] {msg}")
|
||||
|
||||
def initialize(self) -> bool:
|
||||
"""
|
||||
@@ -68,9 +63,6 @@ class CameraClient:
|
||||
self._is_initialized = True
|
||||
self._log("info", f"相机已启动并保持运行: {self.width}x{self.height}@{self.fps}fps")
|
||||
return True
|
||||
except ImportError:
|
||||
self._log("error", "pyrealsense2库未安装,请运行: pip install pyrealsense2")
|
||||
return False
|
||||
except Exception as e:
|
||||
self._log("error", f"相机初始化失败: {e}")
|
||||
self.cleanup()
|
||||
@@ -98,10 +90,6 @@ class CameraClient:
|
||||
frames = self.pipeline.wait_for_frames()
|
||||
color_frame = frames.get_color_frame()
|
||||
|
||||
if not color_frame:
|
||||
self._log("warning", "未获取到颜色帧")
|
||||
return None
|
||||
|
||||
return np.asanyarray(color_frame.get_data())
|
||||
except Exception as e:
|
||||
self._log("error", f"捕获图像失败: {e}")
|
||||
|
||||
157
robot_speaker/echo_cancellation.py
Normal file
157
robot_speaker/echo_cancellation.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
回声消除模块
|
||||
mic = 人声 + 扬声器回声 + 环境噪声
|
||||
ref = 声卡原始播放音频
|
||||
AEC(mic, ref) → 去掉 ref 在 mic 中的那一部分
|
||||
"""
|
||||
import numpy as np
|
||||
import struct
|
||||
import threading
|
||||
from collections import deque
|
||||
import aec_audio_processing
|
||||
|
||||
|
||||
class EchoCanceller:
    """Acoustic echo canceller built on ``aec_audio_processing.AudioProcessor``.

    The microphone signal is the "stream" and the audio sent to the sound
    card is the "reverse stream" (reference). Both are 16-bit PCM byte
    strings; the reference is interleaved with ``ref_channels`` channels.

    If the underlying processor cannot be created, ``self.aec`` stays
    ``None`` and ``process`` returns its input unchanged.
    """

    def __init__(self, sample_rate: int, frame_size: int, channels: int,
                 ref_channels: int, logger=None):
        """Create the processor and configure both stream formats.

        Args:
            sample_rate: Sample rate in Hz, used for both the microphone
                stream and the reverse (reference) stream.
            frame_size: Caller's chunk size in samples. Stored only; the
                processor's own frame size is what ``process`` uses.
            channels: Channel count of the microphone stream.
            ref_channels: Channel count of the reference stream.
            logger: Optional logger with ``info``/``warning`` methods.
        """
        self.sample_rate = sample_rate
        self.frame_size = frame_size
        self.channels = channels
        self.ref_channels = ref_channels
        self.logger = logger
        self.aec = None
        # Frame size (in samples) reported by the processor.
        self.aec_frame_size = None

        try:
            self.aec = aec_audio_processing.AudioProcessor(
                enable_aec=True,   # echo cancellation
                enable_ns=False,   # noise suppression
                enable_agc=False   # automatic gain control
            )

            # Microphone stream: same format in and out.
            self.aec.set_stream_format(
                sample_rate_in=sample_rate,
                channel_count_in=channels,
                sample_rate_out=sample_rate,
                channel_count_out=channels
            )

            # Reference stream: playback audio already resampled to the
            # microphone sample rate.
            self.aec.set_reverse_stream_format(sample_rate, ref_channels)

            self.aec_frame_size = self.aec.get_frame_size()
            if logger:
                logger.info(f"AudioProcessor期望的帧大小: {self.aec_frame_size} 样本 ({self.aec_frame_size / sample_rate * 1000}ms)")
        except Exception as e:
            # Best effort: any failure here disables echo cancellation
            # instead of breaking the recorder.
            if logger:
                logger.warning(f"aec_audio_processing 初始化失败: {e},将禁用回声消除")
            self.aec = None

    def process(self, mic_signal: bytes, ref_signal: bytes = None) -> bytes:
        """Remove the reference signal's echo from one microphone chunk.

        Runs synchronously in the caller's thread. The chunk is cut into
        processor-sized frames; the last frame is zero-padded, and the
        result is trimmed back to the input length.

        Args:
            mic_signal: 16-bit PCM microphone audio.
            ref_signal: 16-bit PCM reference audio covering the same time
                span. It is zero-padded or truncated to the same number of
                frames as ``mic_signal``.

        Returns:
            Processed audio with the same length as ``mic_signal``. The
            input is returned unchanged when cancellation is unavailable
            or processing fails.
        """
        if self.aec is None or ref_signal is None or self.aec_frame_size is None:
            return mic_signal

        original_mic_len = len(mic_signal)
        mic_frame_bytes = self.aec_frame_size * self.channels * 2
        ref_frame_bytes = self.aec_frame_size * self.ref_channels * 2
        if mic_frame_bytes <= 0 or ref_frame_bytes <= 0:
            # A non-positive frame size cannot be used to slice frames.
            return mic_signal

        # NOTE(review): when the chunk is not a whole number of frames, the
        # zero padding of the last frame is fed to the processor on every
        # call — confirm this is acceptable for the adaptive filter.
        num_frames = -(-original_mic_len // mic_frame_bytes)  # ceiling division
        padded_mic = mic_signal.ljust(num_frames * mic_frame_bytes, b'\x00')
        # Give every microphone frame a full-sized reference frame, even
        # when the caller supplied less (or more) reference audio.
        ref_total = num_frames * ref_frame_bytes
        padded_ref = ref_signal[:ref_total].ljust(ref_total, b'\x00')

        try:
            output_chunks = []
            for i in range(num_frames):
                mic_chunk = padded_mic[i * mic_frame_bytes:(i + 1) * mic_frame_bytes]
                ref_chunk = padded_ref[i * ref_frame_bytes:(i + 1) * ref_frame_bytes]

                # The reference frame must be supplied before the matching
                # microphone frame is processed.
                self.aec.process_reverse_stream(ref_chunk)
                output_chunks.append(self.aec.process_stream(mic_chunk))

            return b''.join(output_chunks)[:original_mic_len]
        except Exception as e:
            if self.logger:
                self.logger.warning(f"回声消除处理失败: {e}")
            return mic_signal
|
||||
|
||||
|
||||
class ReferenceSignalBuffer:
    """Thread-safe ring buffer of the audio most recently sent to playback.

    Samples are stored as interleaved 16-bit integers at ``sample_rate`` with
    ``channels`` channels. When the buffer is full the oldest samples are
    discarded. Reading does not consume data.
    """

    def __init__(self, max_duration_ms: int, sample_rate: int, channels: int):
        """Size the buffer to hold ``max_duration_ms`` of audio.

        Args:
            max_duration_ms: Maximum buffered duration in milliseconds.
            sample_rate: Sample rate (Hz) of the stored reference audio.
            channels: Channel count of the stored reference audio.
        """
        max_samples = int(sample_rate * max_duration_ms / 1000)
        self.sample_rate = sample_rate
        self.channels = channels
        self.buffer = deque(maxlen=max_samples * channels)
        self.lock = threading.Lock()

    def add_reference(self, audio_data: bytes, source_sample_rate: int = None, source_channels: int = 1):
        """Append played-back audio to the buffer.

        Args:
            audio_data: Little-endian 16-bit PCM. A trailing odd byte (an
                incomplete sample) is ignored.
            source_sample_rate: Sample rate of ``audio_data``. When given
                and different from the buffer's rate, the audio is
                resampled first.
            source_channels: Channel count of ``audio_data``. Mono input is
                duplicated to both channels when the buffer is stereo; any
                other combination is stored as-is.
        """
        if not audio_data:
            return

        # Drop an incomplete trailing sample instead of failing to unpack.
        usable = len(audio_data) - (len(audio_data) % 2)
        if usable == 0:
            return
        audio_data = audio_data[:usable]

        # NOTE(review): resampling treats the data as a single channel —
        # confirm callers only pass mono audio when a rate conversion is
        # needed.
        if source_sample_rate and source_sample_rate != self.sample_rate:
            audio_data = self._resample(audio_data, source_sample_rate, self.sample_rate)

        samples = struct.unpack(f'<{len(audio_data) // 2}h', audio_data)
        if source_channels == 1 and self.channels == 2:
            # Mono -> stereo: repeat each sample for left and right.
            samples = [value for sample in samples for value in (sample, sample)]

        # Only the append needs the lock; the conversion above does not
        # touch shared state.
        with self.lock:
            self.buffer.extend(samples)

    def get_reference(self, num_samples: int) -> bytes:
        """Return the newest ``num_samples`` frames without consuming them.

        Args:
            num_samples: Number of sample frames (per channel) wanted.

        Returns:
            ``num_samples * channels`` little-endian 16-bit samples. When
            fewer are buffered, the result is left-padded with zeros. An
            empty byte string is returned for ``num_samples <= 0``.
        """
        total_samples_needed = num_samples * self.channels
        if total_samples_needed <= 0:
            # Guard: a slice of [-0:] would otherwise return the whole buffer.
            return b''

        with self.lock:
            snapshot = list(self.buffer)

        # NOTE(review): data stays in the buffer after it is read, so the
        # same audio is returned again until new audio arrives or clear()
        # is called — presumably the player clears it when playback ends.
        recent = snapshot[-total_samples_needed:]
        missing = total_samples_needed - len(recent)
        if missing:
            recent = [0] * missing + recent
        return struct.pack(f'<{total_samples_needed}h', *recent)

    def clear(self):
        """Discard all buffered reference audio."""
        with self.lock:
            self.buffer.clear()

    def _resample(self, audio_data: bytes, source_rate: int, target_rate: int) -> bytes:
        """Resample 16-bit PCM by linear interpolation.

        Args:
            audio_data: Little-endian 16-bit PCM.
            source_rate: Sample rate of ``audio_data`` in Hz.
            target_rate: Desired sample rate in Hz.

        Returns:
            The resampled audio as little-endian 16-bit PCM bytes.
        """
        if source_rate == target_rate:
            return audio_data
        # Explicit little-endian dtype to match the struct format used by
        # the rest of the class.
        samples = np.frombuffer(audio_data, dtype='<i2')
        if samples.size == 0:
            return b''
        ratio = target_rate / source_rate
        indices = np.linspace(0, len(samples) - 1, int(len(samples) * ratio))
        resampled = np.interp(indices, np.arange(len(samples)), samples.astype(np.float32))
        return resampled.astype('<i2').tobytes()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -34,9 +34,11 @@ class SpeakerVerificationClient:
|
||||
self._expected_embedding_dim = None # 只存储维度大小,不存储shape元组
|
||||
|
||||
from funasr import AutoModel
|
||||
self.model = AutoModel(model=self.model_path, device="cpu")
|
||||
# 确保模型路径是绝对路径(展开 ~)
|
||||
model_path = os.path.expanduser(self.model_path)
|
||||
self.model = AutoModel(model=model_path, device="cpu")
|
||||
if self.logger:
|
||||
self.logger.info(f"声纹模型已加载: {self.model_path}, 阈值: {self.threshold}")
|
||||
self.logger.info(f"声纹模型已加载: {model_path}, 阈值: {self.threshold}")
|
||||
|
||||
if self.speaker_db_path:
|
||||
self.load_speakers()
|
||||
@@ -176,23 +178,23 @@ class SpeakerVerificationClient:
|
||||
def match_speaker(self, embedding: np.ndarray):
|
||||
"""
|
||||
匹配说话人(一句话只调用一次)
|
||||
返回: (speaker_id: str | None, state: SpeakerState)
|
||||
返回: (speaker_id: str | None, state: SpeakerState, score: float, threshold: float)
|
||||
"""
|
||||
if not self.speaker_db:
|
||||
return None, SpeakerState.UNKNOWN
|
||||
return None, SpeakerState.UNKNOWN, 0.0, self.threshold
|
||||
|
||||
embedding_dim = len(embedding)
|
||||
if embedding_dim == 0:
|
||||
return None, SpeakerState.ERROR
|
||||
return None, SpeakerState.ERROR, 0.0, self.threshold
|
||||
|
||||
# 校验维度一致性
|
||||
if self._expected_embedding_dim is not None and embedding_dim != self._expected_embedding_dim:
|
||||
return None, SpeakerState.ERROR
|
||||
return None, SpeakerState.ERROR, 0.0, self.threshold
|
||||
|
||||
# 归一化当前embedding(注册时已归一化,这里只需要归一化当前embedding)
|
||||
embedding_norm = np.linalg.norm(embedding)
|
||||
if embedding_norm == 0:
|
||||
return None, SpeakerState.ERROR
|
||||
return None, SpeakerState.ERROR, 0.0, self.threshold
|
||||
embedding_normalized = embedding / embedding_norm
|
||||
|
||||
best_match = None
|
||||
@@ -213,9 +215,10 @@ class SpeakerVerificationClient:
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = speaker_id
|
||||
best_threshold = speaker_data.get("threshold", self.threshold)
|
||||
best_threshold = speaker_data["threshold"]
|
||||
|
||||
return (best_match, SpeakerState.VERIFIED) if best_score >= best_threshold else (None, SpeakerState.REJECTED)
|
||||
state = SpeakerState.VERIFIED if best_score >= best_threshold else SpeakerState.REJECTED
|
||||
return (best_match, state, best_score, best_threshold)
|
||||
|
||||
def is_available(self) -> bool:
|
||||
return self.model is not None
|
||||
@@ -272,16 +275,16 @@ class SpeakerVerificationClient:
|
||||
self._log("warning", f"跳过维度不匹配的声纹: {speaker_id} (期望{self._expected_embedding_dim}, 实际{embedding_dim})")
|
||||
continue
|
||||
|
||||
# 确保embedding已归一化(兼容旧数据)
|
||||
# 确保embedding已归一化
|
||||
embedding_norm = np.linalg.norm(embedding_array)
|
||||
if embedding_norm > 0:
|
||||
embedding_array = embedding_array / embedding_norm
|
||||
|
||||
self.speaker_db[speaker_id] = {
|
||||
"embedding": embedding_array,
|
||||
"env": speaker_data.get("env", "near"),
|
||||
"threshold": speaker_data.get("threshold", self.threshold),
|
||||
"registered_at": speaker_data.get("registered_at", time.time())
|
||||
"env": speaker_data["env"],
|
||||
"threshold": speaker_data["threshold"],
|
||||
"registered_at": speaker_data["registered_at"]
|
||||
}
|
||||
|
||||
count = len(self.speaker_db)
|
||||
@@ -290,47 +293,6 @@ class SpeakerVerificationClient:
|
||||
else:
|
||||
self._log("info", f"已加载 {count} 个已注册说话人")
|
||||
return True
|
||||
except json.JSONDecodeError as e:
|
||||
# 尝试兼容旧的pickle格式
|
||||
try:
|
||||
import pickle
|
||||
with open(self.speaker_db_path, 'rb') as f:
|
||||
old_data = pickle.load(f)
|
||||
self._log("warning", "检测到旧的pickle格式,正在迁移...")
|
||||
# 迁移逻辑:转换为新格式
|
||||
with self._lock:
|
||||
for speaker_id, speaker_info in old_data.items():
|
||||
if hasattr(speaker_info, 'embedding'):
|
||||
# 旧格式:SpeakerInfo对象
|
||||
embedding = speaker_info.embedding
|
||||
embedding_norm = np.linalg.norm(embedding)
|
||||
if embedding_norm > 0:
|
||||
embedding = embedding / embedding_norm
|
||||
self.speaker_db[speaker_id] = {
|
||||
"embedding": embedding,
|
||||
"env": getattr(speaker_info, 'env', 'near'),
|
||||
"threshold": getattr(speaker_info, 'threshold', self.threshold),
|
||||
"registered_at": getattr(speaker_info, 'registered_at', time.time())
|
||||
}
|
||||
else:
|
||||
# 可能是dict格式
|
||||
embedding = speaker_info.get("embedding")
|
||||
if embedding is not None:
|
||||
embedding_norm = np.linalg.norm(embedding)
|
||||
if embedding_norm > 0:
|
||||
embedding = embedding / embedding_norm
|
||||
self.speaker_db[speaker_id] = {
|
||||
"embedding": embedding,
|
||||
"env": speaker_info.get("env", "near"),
|
||||
"threshold": speaker_info.get("threshold", self.threshold),
|
||||
"registered_at": speaker_info.get("registered_at", time.time())
|
||||
}
|
||||
# 保存为新格式
|
||||
self.save_speakers()
|
||||
self._log("info", "已迁移到新格式")
|
||||
except Exception as e2:
|
||||
self._log("error", f"加载声纹数据库失败(JSON和pickle都失败): {e}, {e2}")
|
||||
return False
|
||||
except Exception as e:
|
||||
self._log("error", f"加载声纹数据库失败: {e}")
|
||||
return False
|
||||
@@ -355,9 +317,9 @@ class SpeakerVerificationClient:
|
||||
for speaker_id, speaker_data in self.speaker_db.items():
|
||||
json_data[speaker_id] = {
|
||||
"embedding": speaker_data["embedding"].tolist(), # numpy array -> list
|
||||
"env": speaker_data.get("env", "near"),
|
||||
"threshold": speaker_data.get("threshold", self.threshold),
|
||||
"registered_at": speaker_data.get("registered_at", time.time())
|
||||
"env": speaker_data["env"],
|
||||
"threshold": speaker_data["threshold"],
|
||||
"registered_at": speaker_data["registered_at"]
|
||||
}
|
||||
|
||||
# 使用临时文件 + 原子替换,避免写入过程中崩溃导致数据丢失
|
||||
|
||||
@@ -24,9 +24,13 @@ class DashScopeTTSClient(TTSClient):
|
||||
voice: str,
|
||||
card_index: int,
|
||||
device_index: int,
|
||||
output_sample_rate: int = 48000,
|
||||
output_sample_rate: int = 44100,
|
||||
output_channels: int = 2,
|
||||
output_volume: float = 1.0,
|
||||
tts_source_sample_rate: int = 22050, # TTS服务固定输出采样率
|
||||
tts_source_channels: int = 1, # TTS服务固定输出声道数
|
||||
tts_ffmpeg_thread_queue_size: int = 1024, # ffmpeg输入线程队列大小
|
||||
reference_signal_buffer=None, # 参考信号缓冲区(用于回声消除)
|
||||
logger=None):
|
||||
dashscope.api_key = api_key
|
||||
self.model = model
|
||||
@@ -36,7 +40,12 @@ class DashScopeTTSClient(TTSClient):
|
||||
self.output_sample_rate = output_sample_rate
|
||||
self.output_channels = output_channels
|
||||
self.output_volume = output_volume
|
||||
self.tts_source_sample_rate = tts_source_sample_rate
|
||||
self.tts_source_channels = tts_source_channels
|
||||
self.tts_ffmpeg_thread_queue_size = tts_ffmpeg_thread_queue_size
|
||||
self.reference_signal_buffer = reference_signal_buffer # 参考信号缓冲区
|
||||
self.logger = logger
|
||||
self.current_ffmpeg_pid = None # 当前ffmpeg进程的PID
|
||||
|
||||
# 构建ALSA设备, 允许 ffmpeg 自动重采样 / 重声道
|
||||
self.alsa_device = f"plughw:{card_index},{device_index}" if (
|
||||
@@ -64,7 +73,7 @@ class DashScopeTTSClient(TTSClient):
|
||||
on_chunk=None,
|
||||
interrupt_check=None) -> bool:
|
||||
"""主流程:流式合成并播放"""
|
||||
callback = _TTSCallback(self, interrupt_check, on_chunk)
|
||||
callback = _TTSCallback(self, interrupt_check, on_chunk, self.reference_signal_buffer)
|
||||
# 使用配置的voice,request.voice为None或空时使用self.voice
|
||||
voice_to_use = request.voice if request.voice and request.voice.strip() else self.voice
|
||||
|
||||
@@ -94,24 +103,24 @@ class _TTSCallback(ResultCallback):
|
||||
|
||||
def __init__(self, tts_client: DashScopeTTSClient,
|
||||
interrupt_check=None,
|
||||
on_chunk=None):
|
||||
on_chunk=None,
|
||||
reference_signal_buffer=None):
|
||||
self.tts_client = tts_client
|
||||
self.interrupt_check = interrupt_check
|
||||
self.on_chunk = on_chunk
|
||||
self.reference_signal_buffer = reference_signal_buffer # 参考信号缓冲区
|
||||
self._proc = None
|
||||
self._interrupted = False
|
||||
self._cleaned_up = False
|
||||
|
||||
def on_open(self):
|
||||
# 使用ffmpeg播放,自动处理采样率转换(22050 -> 设备采样率)
|
||||
# TTS服务输出固定为22050Hz单声道,ffmpeg会自动转换为设备采样率和声道数
|
||||
tts_output_rate = 22050 # TTS服务固定输出采样率
|
||||
tts_output_channels = 1 # TTS服务固定输出声道数(单声道)
|
||||
# 使用ffmpeg播放,自动处理采样率转换(TTS源采样率 -> 设备采样率)
|
||||
# TTS服务输出固定采样率和声道数,ffmpeg会自动转换为设备采样率和声道数
|
||||
ffmpeg_cmd = [
|
||||
'ffmpeg',
|
||||
'-f', 's16le', # 原始 PCM
|
||||
'-ar', str(tts_output_rate), # TTS输出采样率(固定22050)
|
||||
'-ac', str(tts_output_channels), # TTS输出声道数(固定单声道)
|
||||
'-ar', str(self.tts_client.tts_source_sample_rate), # TTS输出采样率(从配置文件读取)
|
||||
'-ac', str(self.tts_client.tts_source_channels), # TTS输出声道数(从配置文件读取)
|
||||
'-i', 'pipe:0', # stdin
|
||||
'-f', 'alsa', # 输出到 ALSA
|
||||
'-ar', str(self.tts_client.output_sample_rate), # 输出设备采样率(从配置文件读取)
|
||||
@@ -120,7 +129,7 @@ class _TTSCallback(ResultCallback):
|
||||
'-fflags', 'nobuffer', # 减少缓冲
|
||||
'-flags', 'low_delay', # 低延迟
|
||||
'-avioflags', 'direct', # 尝试直通写入 ALSA,减少延迟
|
||||
'-thread_queue_size', '1024', # 输入线程队列大小,防止丢帧
|
||||
'-thread_queue_size', str(self.tts_client.tts_ffmpeg_thread_queue_size), # 输入线程队列大小(从配置文件读取)
|
||||
self.tts_client.alsa_device
|
||||
]
|
||||
|
||||
@@ -142,6 +151,9 @@ class _TTSCallback(ResultCallback):
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.PIPE # 改为PIPE以便捕获错误
|
||||
)
|
||||
# 记录ffmpeg进程PID
|
||||
self.tts_client.current_ffmpeg_pid = self._proc.pid
|
||||
self.tts_client._log("debug", f"ffmpeg进程已启动,PID={self._proc.pid}")
|
||||
|
||||
def on_complete(self):
|
||||
pass
|
||||
@@ -167,6 +179,8 @@ class _TTSCallback(ResultCallback):
|
||||
self._proc.terminate()
|
||||
return
|
||||
|
||||
# 优先写入ffmpeg,避免阻塞播放
|
||||
# 优先写入ffmpeg,避免阻塞播放
|
||||
if self._proc and self._proc.stdin and not self._interrupted:
|
||||
try:
|
||||
self._proc.stdin.write(data)
|
||||
@@ -177,8 +191,22 @@ class _TTSCallback(ResultCallback):
|
||||
error_msg = self._proc.stderr.read().decode('utf-8', errors='ignore')
|
||||
self.tts_client._log("error", f"ffmpeg错误: {error_msg}")
|
||||
self._interrupted = True
|
||||
if self.on_chunk:
|
||||
self.on_chunk(data)
|
||||
|
||||
# 将音频数据添加到参考信号缓冲区(用于回声消除)
|
||||
# 在写入ffmpeg之后处理,避免阻塞播放
|
||||
if self.reference_signal_buffer and data:
|
||||
try:
|
||||
self.reference_signal_buffer.add_reference(
|
||||
data,
|
||||
source_sample_rate=self.tts_client.tts_source_sample_rate,
|
||||
source_channels=self.tts_client.tts_source_channels
|
||||
)
|
||||
except Exception as e:
|
||||
# 参考信号处理失败不应影响播放
|
||||
self.tts_client._log("warning", f"参考信号处理失败: {e}")
|
||||
|
||||
if self.on_chunk:
|
||||
self.on_chunk(data)
|
||||
|
||||
def cleanup(self):
|
||||
"""清理资源"""
|
||||
@@ -208,4 +236,8 @@ class _TTSCallback(ResultCallback):
|
||||
self._proc.wait(timeout=0.1)
|
||||
except:
|
||||
pass
|
||||
|
||||
# 清空PID记录
|
||||
if self.tts_client.current_ffmpeg_pid == self._proc.pid:
|
||||
self.tts_client.current_ffmpeg_pid = None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user