Add AEC

lxy
2026-01-13 22:14:46 +08:00
parent 838a4a357c
commit eb91e2f139
13 changed files with 878 additions and 905 deletions


@@ -28,18 +28,53 @@ ros2 launch robot_speaker voice.launch.py
## Architecture
[Recorder thread] - the only real-time thread
├─ Captures PCM from the microphone
├─ VAD + energy detection
├─ Speech detected → interrupt TTS immediately
├─ Speech PCM → ASR audio queue
└─ Speech PCM → voiceprint audio queue (side channel, non-blocking)
[ASR inference thread] - audio → text only
└─ Takes audio from the ASR audio queue → real-time / streaming ASR → text → text queue
[Speaker-ID thread] - non-real-time, low-frequency (CAM++)
├─ Receives audio chunks via a callback, writes them to a buffer, and processes on the speech_end event
├─ Accumulates 1-2 s of valid speech (after VAD)
├─ CAM++ extracts the speaker embedding
├─ Voiceprint matching / registration
└─ Updates current_speaker_id (shared state, write-only, no control)
(Requirements for the speaker-ID thread: do not affect recording or ASR, do not control TTS; only update who the current speaker is.)
[Main/processing thread] - business logic
├─ Takes ASR text from the text queue
├─ Reads current_speaker_id (read-only)
├─ Wake-word handling (combined with speaker_id)
├─ Permission / identity check (whether to continue)
├─ VLM processing (text / multimodal)
└─ TTS playback (starts the TTS thread, does not wait)
[TTS playback thread] - playback only (interruptible)
├─ Receives the TTS audio stream
├─ Plays it to the output device
└─ Responds to the interrupt flag (set by the recorder thread)
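The queue handoff between the recorder and its consumers can be sketched outside ROS2 (a minimal illustration; names like `on_mic_chunk` are made up for the sketch and are not the node's actual API):

```python
import queue
import threading

# Queues decouple the real-time recorder thread from its slower consumers.
asr_queue = queue.Queue(maxsize=2)   # recorder -> ASR thread (tiny size for the demo)
interrupt_flag = threading.Event()   # recorder sets this; the TTS player polls it

def on_mic_chunk(chunk: bytes, voice_detected: bool):
    """Recorder-side handoff: never blocks; drops the oldest chunk when ASR lags."""
    if voice_detected:
        interrupt_flag.set()         # interrupt TTS immediately
    if asr_queue.full():
        asr_queue.get_nowait()       # sacrifice the oldest chunk, keep listening
    asr_queue.put_nowait(chunk)

for i in range(3):                   # the third chunk evicts the first
    on_mic_chunk(bytes([i]) * 4, voice_detected=(i == 0))
print(asr_queue.qsize(), interrupt_flag.is_set())  # 2 True
```

Drop-oldest keeps the microphone path non-blocking, which is the invariant the architecture above depends on.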
## Useful commands
1. Audio devices
```bash
# 1. List all audio devices
cat /proc/asound/cards
# 2. Show the stream info (device parameters) for card 1
cat /proc/asound/card1/stream0
```
2. Camera devices
```bash
# 1. Show all basic camera info (model, firmware version, serial number, etc.)
rs-enumerate-devices -c
```
3. Model download
```bash
modelscope download --model iic/speech_campplus_sv_zh-cn_16k-common --local_dir [target path]
```
```


@@ -1,23 +0,0 @@
# Camera configuration file
# The camera runs continuously by default; an image is captured automatically when the user asks to take a photo
camera:
serial_number: null # Camera serial number; null means use the first available device
# RGB stream configuration
rgb:
width: 640 # Image width
height: 480 # Image height
fps: 30 # Frame rate (supports 6, 10, 15, 30, 60)
format: "RGB8" # Image format (RGB8, BGR8)
# Image processing configuration
image:
jpeg_quality: 85 # JPEG compression quality (0-100); 85 balances quality and size
max_size: null # Maximum size; null means no scaling; format: "1280x720"
# Camera command keywords (pinyin)
commands:
capture_keywords: ["pai zhao", "pai ge zhao", "pai zhang", "da kai xiang ji", "kan zhe li", "zhao xiang"] # Photo-capture commands
# capture_keywords: ["拍照", "拍张", "打开相机", "看这里", "照相"] # Chinese commands (if ASR outputs Chinese directly)

config/speakers.json Normal file

@@ -0,0 +1,201 @@
{
"user_1768311644": {
"embedding": [
0.017083248123526573,
-0.01032772846519947,
0.0058503481559455395,
0.11945011466741562,
0.03864186629652977,
-0.16047827899456024,
0.008000967092812061,
0.10669729858636856,
0.13221754133701324,
0.06365424394607544,
-0.06943577527999878,
0.08401959389448166,
0.09903465211391449,
0.0407508946955204,
-0.07486417144536972,
0.0010617832886055112,
0.12097838521003723,
-0.013734623789787292,
-0.020789025351405144,
-0.02113250270485878,
0.008510188199579716,
-0.05490498244762421,
-0.17027714848518372,
0.09569162130355835,
-0.07379947602748871,
0.05932804197072983,
0.0839226171374321,
0.004776939284056425,
0.050190482288599014,
-0.19962339103221893,
-0.13987377285957336,
0.041607797145843506,
0.10067984461784363,
0.0684289038181305,
0.08163066953420639,
-0.029243428260087967,
-0.10118222236633301,
-0.11619988083839417,
-0.10121472179889679,
-0.04290663078427315,
-0.08373524248600006,
0.03493887186050415,
0.055566269904375076,
-0.11284282803535461,
-0.10970190167427063,
0.03457016497850418,
0.11647575348615646,
-0.014930102974176407,
-0.04663793370127678,
0.0752566009759903,
-0.06746217608451843,
-0.07642832398414612,
0.06518206000328064,
0.07191824167966843,
0.13557033240795135,
0.04906972125172615,
0.03679114207625389,
0.07466751337051392,
0.01071798987686634,
-0.07979520410299301,
-0.10039637982845306,
0.004846179857850075,
-0.07325125485658646,
-0.08750395476818085,
0.05332862585783005,
0.10648373514413834,
-0.035643525421619415,
0.21233271062374115,
0.011915713548660278,
0.13632774353027344,
0.10383394360542297,
-0.053550489246845245,
0.05719169229269028,
0.04600509628653526,
0.043678827583789825,
-0.03646669536828995,
0.08175459504127502,
0.042513635009527206,
-0.09215544164180756,
-0.06402364373207092,
-0.10830589383840561,
0.03379691392183304,
0.07699205726385117,
-0.11046901345252991,
-0.016612332314252853,
-0.02984754927456379,
0.00998819898813963,
-0.05820641294121742,
0.007753593847155571,
-0.016712933778762817,
0.0014505418948829174,
-0.04807407408952713,
-0.048170242458581924,
-0.0531715452671051,
0.019113507121801376,
0.08439801633358002,
0.010585008189082146,
-0.07400234043598175,
0.10156761854887009,
-0.018891986459493637,
-0.052156757563352585,
0.1302887201309204,
0.08590760082006454,
0.13382190465927124,
-0.1498136967420578,
-0.030552342534065247,
-0.09281301498413086,
0.10279291868209839,
0.015315898694097996,
-0.014133274555206299,
-0.01298056822270155,
0.06241781264543533,
0.017693962901830673,
0.0007682808791287243,
0.029756756499409676,
0.12711282074451447,
-0.0695323497056961,
0.01649993099272251,
0.08811338990926743,
-0.06976141035556793,
-0.0763985738158226,
-0.10730905085802078,
0.0256052203476429,
0.05183263123035431,
0.0947495624423027,
0.007070058956742287,
-0.0505177341401577,
-0.009485805407166481,
0.003954170271754265,
0.014901814050972462,
-0.08098141849040985,
0.03615008667111397,
-0.09673020988702774,
0.06970252841711044,
0.009914563037455082,
-0.012040670961141586,
-0.0008170561632141471,
-0.06880783289670944,
-0.053053151816129684,
0.05272500216960907,
0.021709589287638664,
-0.09712725877761841,
0.06947346031665802,
-0.07973745465278625,
-0.036861639469861984,
-0.08714801073074341,
0.05473816394805908,
-0.006384482141584158,
-0.03656519949436188,
0.0605260394513607,
0.0407724604010582,
-0.1314084380865097,
-0.05484895780682564,
0.014381998218595982,
-0.07414797693490982,
-0.013259666971862316,
-0.1076463982462883,
-0.04896606504917145,
0.050690483301877975,
0.0719417929649353,
0.04990950971841812,
-0.049923382699489594,
0.08706197887659073,
-0.06278207153081894,
-0.029196983203291893,
-0.07312408834695816,
0.01651231199502945,
0.025062547996640205,
-0.023919139057397842,
0.05597180873155594,
0.08446669578552246,
-0.06616690754890442,
0.011679486371576786,
0.008357426151633263,
-0.07388673722743988,
0.03612314909696579,
-0.055705588310956955,
-0.008656222373247147,
-0.06408344209194183,
-0.05341912433505058,
0.01561578270047903,
0.002446901286020875,
0.042539432644844055,
0.12226217240095139,
-0.03700198978185654,
0.02393815666437149,
-0.021217981353402138,
0.04431416094303131,
-0.09150857478380203,
-0.004766684491187334,
-0.06133556738495827,
0.07721113413572311
],
"env": "near",
"threshold": 0.4,
"registered_at": 1768311644.5742264
}
}


@@ -10,7 +10,7 @@ dashscope:
base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"
temperature: 0.7
max_tokens: 4096
max_history: 5
max_history: 10
tts:
model: "cosyvoice-v3-flash"
voice: "longanyang"
@@ -21,12 +21,19 @@ audio:
sample_rate: 16000 # Input sample rate (16 kHz, a common rate for speech recognition)
channels: 1 # Input channel count (mono, suited to voice capture)
chunk: 1024
heartbeat_interval: 2.0 # Heartbeat interval (s), used to log recorder status periodically
soundcard:
card_index: 1 # USB Audio Device (card 1)
device_index: 0 # USB Audio [USB Audio] (device 0)
sample_rate: 48000 # Output sample rate (48 kHz; supports 48000 or 44100)
sample_rate: 44100 # Output sample rate (44.1 kHz; supports 48000 or 44100)
channels: 2 # Output channel count (stereo, 2 channels, FL+FR)
volume: 0.3 # Volume ratio (0.0-1.0; 0.2 means 20% volume)
volume: 1.0 # Volume ratio (0.0-1.0; 0.2 means 20% volume)
echo_cancellation:
max_duration_ms: 500 # Maximum reference-signal buffer duration (ms)
tts:
source_sample_rate: 22050 # Fixed TTS service output sample rate (DashScope fixed value; do not change)
source_channels: 1 # Fixed TTS service output channel count (DashScope fixed value; do not change)
ffmpeg_thread_queue_size: 4096 # ffmpeg input thread queue size (increase to reduce stutter)
vad:
vad_mode: 3 # VAD mode (0-3); 3 is strictest
@@ -37,8 +44,25 @@ system:
use_llm: true # Whether to use the LLM
use_wake_word: true # Whether to enable wake-word detection
wake_word: "er gou" # Wake word (pinyin)
session_timeout: 10.0 # Session timeout (s)
session_timeout: 3.0 # Session timeout (s)
shutup_keywords: "bi zui" # Shut-up command keywords (pinyin, comma-separated)
interrupt_command_queue_depth: 10 # Queue depth (QoS) for the interrupt-command subscription
sv_enabled: true # Whether to enable speaker verification
sv_model_path: "~/ros_learn/speech_campplus_sv_zh-cn_16k-common" # Speaker verification model path
sv_threshold: 0.35 # Speaker verification threshold (0.0-1.0; lower is more permissive, higher is stricter)
sv_threshold: 0.55 # Speaker verification threshold (0.0-1.0; lower is more permissive, higher is stricter)
sv_speaker_db_path: "config/speakers.json" # Speaker database save path (JSON, relative to the ROS2 package share directory)
sv_buffer_size: 64000 # Speaker verification recording buffer size (samples)
sv_registration_silence_threshold_ms: 500 # Silence threshold during speaker registration (ms)
camera:
serial_number: "405622075404" # Camera serial number (Intel RealSense D435)
rgb:
width: 640 # Image width
height: 480 # Image height
fps: 30 # Frame rate (supports 6, 10, 15, 30, 60)
format: "RGB8" # Image format (RGB8, BGR8)
image:
jpeg_quality: 85 # JPEG compression quality (0-100); 85 balances quality and size
max_size: "1280x720" # Maximum size
commands:
capture_keywords: "pai zhao,pai ge zhao,pai zhang zhao pian,pai zhang,da kai xiang ji,kan zhe li,zhao xiang" # Photo-capture commands (pinyin, comma-separated)


@@ -1,305 +1,14 @@
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import DeclareLaunchArgument
from launch.substitutions import LaunchConfiguration
import os
from ament_index_python.packages import get_package_share_directory
import yaml
def generate_launch_description():
# Load the config files
voice_config_file = os.path.join(
get_package_share_directory('robot_speaker'),
'config',
'voice.yaml'
)
camera_config_file = os.path.join(
get_package_share_directory('robot_speaker'),
'config',
'camera.yaml'
)
with open(voice_config_file, 'r') as f:
config = yaml.safe_load(f)
# Load the camera config (if the file exists)
camera_config = None
if os.path.exists(camera_config_file):
try:
with open(camera_config_file, 'r') as f:
camera_config = yaml.safe_load(f)
except Exception as e:
print(f"Warning: failed to load the camera config file: {e}")
# Extract parameters from the config file
dashscope_config = config['dashscope']
audio_config = config['audio']
vad_config = config['vad']
system_config = config['system']
camera_config_data = camera_config.get('camera', {}) if camera_config else {}
"""Launch the voice interaction node; all parameters are read from voice.yaml"""
return LaunchDescription([
# Audio input parameters
DeclareLaunchArgument(
'input_device_index',
default_value=str(audio_config['microphone']['device_index']),
description='Microphone device index'
),
# Audio output parameters
DeclareLaunchArgument(
'output_card_index',
default_value=str(audio_config['soundcard']['card_index']),
description='Sound card card index'
),
DeclareLaunchArgument(
'output_device_index',
default_value=str(audio_config['soundcard']['device_index']),
description='Sound card device index'
),
DeclareLaunchArgument(
'output_sample_rate',
default_value=str(audio_config['soundcard'].get('sample_rate', 48000)),
description='Output sample rate'
),
DeclareLaunchArgument(
'output_channels',
default_value=str(audio_config['soundcard'].get('channels', 2)),
description='Output channel count'
),
DeclareLaunchArgument(
'output_volume',
default_value=str(audio_config['soundcard'].get('volume', 0.2)),
description='Output volume ratio (0.0-1.0)'
),
# Audio parameters
DeclareLaunchArgument(
'sample_rate',
default_value=str(audio_config['microphone']['sample_rate']),
description='Sample rate'
),
DeclareLaunchArgument(
'channels',
default_value=str(audio_config['microphone']['channels']),
description='Channel count'
),
DeclareLaunchArgument(
'chunk',
default_value=str(audio_config['microphone']['chunk']),
description='Audio chunk size'
),
# VAD parameters
DeclareLaunchArgument(
'vad_mode',
default_value=str(vad_config['vad_mode']),
description='VAD mode (0-3); 3 is strictest'
),
DeclareLaunchArgument(
'silence_duration_ms',
default_value=str(vad_config.get('silence_duration_ms', 1000)),
description='Silence duration (ms)'
),
DeclareLaunchArgument(
'min_energy_threshold',
default_value=str(vad_config.get('min_energy_threshold', 300)),
description='Minimum energy threshold'
),
# DashScope parameters
DeclareLaunchArgument(
'dashscope_api_key',
default_value=dashscope_config['api_key'],
description='DashScope API Key'
),
DeclareLaunchArgument(
'asr_model',
default_value=dashscope_config['asr']['model'],
description='ASR model name'
),
DeclareLaunchArgument(
'asr_url',
default_value=dashscope_config['asr']['url'],
description='ASR WebSocket URL'
),
DeclareLaunchArgument(
'llm_model',
default_value=dashscope_config['llm']['model'],
description='LLM model name'
),
DeclareLaunchArgument(
'llm_base_url',
default_value=dashscope_config['llm']['base_url'],
description='LLM API Base URL'
),
DeclareLaunchArgument(
'llm_temperature',
default_value=str(dashscope_config['llm']['temperature']),
description='LLM temperature'
),
DeclareLaunchArgument(
'llm_max_tokens',
default_value=str(dashscope_config['llm']['max_tokens']),
description='LLM max token count'
),
DeclareLaunchArgument(
'llm_max_history',
default_value=str(dashscope_config['llm']['max_history']),
description='LLM max conversation-history entries'
),
DeclareLaunchArgument(
'tts_model',
default_value=dashscope_config['tts']['model'],
description='TTS model name'
),
DeclareLaunchArgument(
'tts_voice',
default_value=dashscope_config['tts']['voice'],
description='TTS voice'
),
# System parameters
DeclareLaunchArgument(
'use_llm',
default_value=str(system_config['use_llm']).lower(),
description='Whether to use the LLM'
),
DeclareLaunchArgument(
'use_wake_word',
default_value=str(system_config['use_wake_word']).lower(),
description='Whether to enable wake-word detection'
),
DeclareLaunchArgument(
'wake_word',
default_value=system_config['wake_word'],
description='Wake word'
),
DeclareLaunchArgument(
'session_timeout',
default_value=str(system_config.get('session_timeout', 30.0)),
description='Session timeout (s)'
),
# Speaker verification parameters
DeclareLaunchArgument(
'sv_enabled',
default_value=str(system_config.get('sv_enabled', True)).lower(),
description='Whether to enable speaker verification'
),
DeclareLaunchArgument(
'sv_model_path',
default_value=os.path.expanduser(system_config.get('sv_model_path', '')),
description='Speaker verification model path'
),
DeclareLaunchArgument(
'sv_threshold',
default_value=str(system_config.get('sv_threshold', 0.45)),
description='Speaker verification threshold'
),
DeclareLaunchArgument(
'sv_speaker_db_path',
default_value=os.path.join(
get_package_share_directory('robot_speaker'),
system_config.get('sv_speaker_db_path', 'config/speakers.json')
) if system_config.get('sv_speaker_db_path') else '',
description='Speaker database path'
),
# Camera parameters
DeclareLaunchArgument(
'camera_serial_number',
default_value=str(camera_config_data.get('serial_number', '')) if camera_config_data.get('serial_number') else '',
description='Camera serial number'
),
DeclareLaunchArgument(
'camera_width',
default_value=str(camera_config_data.get('rgb', {}).get('width', 640)),
description='Camera image width'
),
DeclareLaunchArgument(
'camera_height',
default_value=str(camera_config_data.get('rgb', {}).get('height', 480)),
description='Camera image height'
),
DeclareLaunchArgument(
'camera_fps',
default_value=str(camera_config_data.get('rgb', {}).get('fps', 30)),
description='Camera frame rate'
),
DeclareLaunchArgument(
'camera_format',
default_value=camera_config_data.get('rgb', {}).get('format', 'RGB8'),
description='Camera image format'
),
DeclareLaunchArgument(
'camera_jpeg_quality',
default_value=str(camera_config_data.get('image', {}).get('jpeg_quality', 85)),
description='JPEG compression quality'
),
DeclareLaunchArgument(
'camera_capture_keywords',
default_value=','.join(camera_config_data.get('commands', {}).get('capture_keywords', ['pai zhao'])),
description='Camera photo-capture command keywords (comma-separated)'
),
# Voice node
Node(
package='robot_speaker',
executable='robot_speaker_node',
name='robot_speaker_node',
parameters=[{
# Audio parameters
'input_device_index': LaunchConfiguration('input_device_index'),
'output_card_index': LaunchConfiguration('output_card_index'),
'output_device_index': LaunchConfiguration('output_device_index'),
'sample_rate': LaunchConfiguration('sample_rate'),
'channels': LaunchConfiguration('channels'),
'chunk': LaunchConfiguration('chunk'),
'output_sample_rate': LaunchConfiguration('output_sample_rate'),
'output_channels': LaunchConfiguration('output_channels'),
'output_volume': LaunchConfiguration('output_volume'),
# VAD parameters
'vad_mode': LaunchConfiguration('vad_mode'),
'silence_duration_ms': LaunchConfiguration('silence_duration_ms'),
'min_energy_threshold': LaunchConfiguration('min_energy_threshold'),
# DashScope parameters
'dashscope_api_key': LaunchConfiguration('dashscope_api_key'),
'asr_model': LaunchConfiguration('asr_model'),
'asr_url': LaunchConfiguration('asr_url'),
'llm_model': LaunchConfiguration('llm_model'),
'llm_base_url': LaunchConfiguration('llm_base_url'),
'llm_temperature': LaunchConfiguration('llm_temperature'),
'llm_max_tokens': LaunchConfiguration('llm_max_tokens'),
'llm_max_history': LaunchConfiguration('llm_max_history'),
'tts_model': LaunchConfiguration('tts_model'),
'tts_voice': LaunchConfiguration('tts_voice'),
# System parameters
'use_llm': LaunchConfiguration('use_llm'),
'use_wake_word': LaunchConfiguration('use_wake_word'),
'wake_word': LaunchConfiguration('wake_word'),
'session_timeout': LaunchConfiguration('session_timeout'),
# Speaker verification parameters
'sv_enabled': LaunchConfiguration('sv_enabled'),
'sv_model_path': LaunchConfiguration('sv_model_path'),
'sv_threshold': LaunchConfiguration('sv_threshold'),
'sv_speaker_db_path': LaunchConfiguration('sv_speaker_db_path'),
# Camera parameters
'camera_serial_number': LaunchConfiguration('camera_serial_number'),
'camera_width': LaunchConfiguration('camera_width'),
'camera_height': LaunchConfiguration('camera_height'),
'camera_fps': LaunchConfiguration('camera_fps'),
'camera_format': LaunchConfiguration('camera_format'),
'camera_jpeg_quality': LaunchConfiguration('camera_jpeg_quality'),
'camera_capture_keywords': LaunchConfiguration('camera_capture_keywords'),
}],
output='screen'
),
])


@@ -1,12 +1,16 @@
dashscope>=1.20.0
openai>=1.0.0
pyaudio>=0.2.11
webrtcvad>=2.0.10
webrtcvad>=2.0.10 # WebRTC VAD (voice activity detection); does not include echo cancellation
pypinyin>=0.49.0
rclpy>=3.0.0
pyrealsense2>=2.54.0
Pillow>=10.0.0
numpy>=1.24.0
# Echo cancellation library (optional):
# aec-audio-processing - a WebRTC library dedicated to echo cancellation with a simple API (recommended)
# pip install aec-audio-processing
# If it is not installed, the built-in simple adaptive algorithm is used instead
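The fallback noted above can be probed at import time; a sketch of the optional-dependency pattern (the `AudioProcessor` constructor flags mirror how this commit's echo_cancellation module calls the library):

```python
# Optional-import pattern: prefer the WebRTC-based AEC, fall back gracefully.
try:
    import aec_audio_processing
    HAS_AEC = True
except ImportError:
    aec_audio_processing = None
    HAS_AEC = False

def make_aec():
    """Return an AudioProcessor when the library is present, else None
    (the caller then falls back to the simple adaptive algorithm)."""
    if not HAS_AEC:
        return None
    return aec_audio_processing.AudioProcessor(
        enable_aec=True, enable_ns=False, enable_agc=False)
```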


@@ -66,14 +66,14 @@ class DashScopeASR:
self.conversation.connect()
# Custom text corpus to strengthen recognition
custom_text = "蜂核科技, 杭州蜂核科技有限公司, 西林瓶,瓶子"
# Custom text corpus to strengthen recognition; when speech is unclear, this word becomes the high-probability guess
# custom_text = "二狗"
transcription_params = TranscriptionParams(
language='zh',
sample_rate=self.sample_rate,
input_audio_format="pcm",
corpus_text=custom_text,
# corpus_text=custom_text,
)
# Local VAD → only controls TTS interruption


@@ -1,11 +1,12 @@
"""
Audio processing module: recording + VAD
Audio processing module: recording + VAD + echo cancellation
"""
import time
import pyaudio
import webrtcvad
import struct
import queue
from .echo_cancellation import EchoCanceller, ReferenceSignalBuffer
class VADDetector:
@@ -40,6 +41,8 @@ class AudioRecorder:
on_audio_chunk=None, # audio-chunk callback (for speaker-ID recording etc., optional)
should_put_to_queue=None, # checks whether audio should be queued (used to block ASR, optional)
get_silence_threshold=None, # returns the dynamic silence threshold (ms, optional)
enable_echo_cancellation: bool = True, # whether to enable echo cancellation
reference_signal_buffer: ReferenceSignalBuffer = None, # reference-signal buffer (optional)
logger=None):
self.device_index = device_index
self.sample_rate = sample_rate
@@ -64,6 +67,39 @@ class AudioRecorder:
self.audio = pyaudio.PyAudio()
self.format = pyaudio.paInt16
self._debug_counter = 0
# Echo cancellation
self.enable_echo_cancellation = enable_echo_cancellation
self.reference_signal_buffer = reference_signal_buffer
if enable_echo_cancellation:
# Initialize the echo canceller (runs synchronously in the recorder thread, not a separate thread)
# frame_size is set to the chunk size so each call processes exactly one chunk
frame_size = chunk
try:
# The reference-signal channel count comes from reference_signal_buffer, which is created with the playback channel count
ref_channels = self.reference_signal_buffer.channels if self.reference_signal_buffer else 1
self.echo_canceller = EchoCanceller(
sample_rate=sample_rate,
frame_size=frame_size,
channels=self.channels, # microphone input (1 channel)
ref_channels=ref_channels, # reference-signal playback channels (2)
logger=logger
)
if self.echo_canceller.aec is not None:
if logger:
logger.info(f"Echo canceller enabled: sample_rate={sample_rate}, frame_size={frame_size}")
else:
if logger:
logger.warning("Echo canceller initialization failed; echo cancellation disabled")
self.enable_echo_cancellation = False
self.echo_canceller = None
except Exception as e:
if logger:
logger.warning(f"Echo canceller initialization failed: {e}; echo cancellation disabled")
self.enable_echo_cancellation = False
self.echo_canceller = None
else:
self.echo_canceller = None
def record_with_vad(self):
"""
@@ -103,18 +139,34 @@ class AudioRecorder:
# exception_on_overflow=False: better to drop frames than to block
data = stream.read(self.chunk, exception_on_overflow=False)
# Echo cancellation
processed_data = data
if self.enable_echo_cancellation and self.echo_canceller and self.reference_signal_buffer:
try:
# Fetch the reference signal (length matched to the mic signal)
ref_signal = self.reference_signal_buffer.get_reference(num_samples=self.chunk)
# Run echo cancellation
processed_data = self.echo_canceller.process(data, ref_signal)
except Exception as e:
if self.logger:
self.logger.warning(f"Echo cancellation failed: {e}; using the raw audio")
processed_data = data
# Check whether the audio should be queued (used to block ASR, e.g. registration is required when no voiceprint file exists)
if self.should_put_to_queue():
# When the queue is full, drop the oldest data (the system can still hear even if ASR falls behind)
if self.audio_queue.full():
self.audio_queue.get_nowait()
self.audio_queue.put_nowait(data)
# Use the processed audio (after echo cancellation)
self.audio_queue.put_nowait(processed_data)
# Audio-chunk callback (for speaker-ID recording etc., called only when needed)
if self.on_audio_chunk:
self.on_audio_chunk(data)
# The callback receives the processed audio
self.on_audio_chunk(processed_data)
audio_buffer.append(data) # used only for VAD, not ASR
# VAD uses the processed audio (after echo cancellation)
audio_buffer.append(processed_data) # used only for VAD, not ASR
# VAD detection window
now = time.time()


@@ -7,17 +7,12 @@ import contextlib
class CameraClient:
"""
Camera client - wraps RealSense camera operations
The camera keeps running once initialized; capture_rgb() only grabs one frame from the running pipeline
"""
def __init__(self,
serial_number: str | None = None,
width: int = 640,
height: int = 480,
fps: int = 30,
format: str = 'RGB8',
serial_number: str | None,
width: int,
height: int,
fps: int,
format: str,
logger=None):
self.serial_number = serial_number
self.width = width
@@ -35,7 +30,7 @@ class CameraClient:
if self.logger:
getattr(self.logger, level, self.logger.info)(msg)
else:
print(f"[Camera] {msg}")
print(f"[相机] {msg}")
def initialize(self) -> bool:
"""
@@ -68,9 +63,6 @@ class CameraClient:
self._is_initialized = True
self._log("info", f"Camera started and kept running: {self.width}x{self.height}@{self.fps}fps")
return True
except ImportError:
self._log("error", "pyrealsense2 library is not installed; run: pip install pyrealsense2")
return False
except Exception as e:
self._log("error", f"Camera initialization failed: {e}")
self.cleanup()
@@ -98,10 +90,6 @@ class CameraClient:
frames = self.pipeline.wait_for_frames()
color_frame = frames.get_color_frame()
if not color_frame:
self._log("warning", "No color frame received")
return None
return np.asanyarray(color_frame.get_data())
except Exception as e:
self._log("error", f"Image capture failed: {e}")


@@ -0,0 +1,157 @@
"""
Echo cancellation module
mic = voice + speaker echo + ambient noise
ref = the raw audio the sound card plays
AEC(mic, ref) → removes the ref component from mic
"""
import numpy as np
import struct
import threading
from collections import deque
import aec_audio_processing
class EchoCanceller:
"""Echo canceller"""
def __init__(self, sample_rate: int, frame_size: int, channels: int,
ref_channels: int, logger=None):
self.sample_rate = sample_rate
self.frame_size = frame_size
self.channels = channels
self.ref_channels = ref_channels
self.logger = logger
self.aec = None
self.aec_frame_size = None # frame size expected by AudioProcessor (fixed 10 ms = 160 samples)
# Initialize the AudioProcessor from aec-audio-processing
try:
self.aec = aec_audio_processing.AudioProcessor(
enable_aec=True, # echo cancellation
enable_ns=False, # noise suppression
enable_agc=False # automatic gain control
)
# Set the stream format (microphone input: 1 channel, 16 kHz)
self.aec.set_stream_format(
sample_rate_in=sample_rate,
channel_count_in=channels,
sample_rate_out=sample_rate,
channel_count_out=channels
)
# Set the reverse-stream format (reference signal: 2-channel playback, resampled to 16 kHz)
# The reference signal is the audio the sound card plays (2 channels, resampled to 16 kHz), used for echo cancellation
self.aec.set_reverse_stream_format(sample_rate, ref_channels)
# Get the frame size AudioProcessor expects (fixed 10 ms)
self.aec_frame_size = self.aec.get_frame_size()
if logger:
logger.info(f"AudioProcessor expected frame size: {self.aec_frame_size} samples ({self.aec_frame_size / sample_rate * 1000}ms)")
except Exception as e:
if logger:
logger.warning(f"aec_audio_processing initialization failed: {e}; echo cancellation disabled")
self.aec = None
def process(self, mic_signal: bytes, ref_signal: bytes = None) -> bytes:
"""Cancel the echo in the audio data (runs synchronously in the recorder thread)"""
if self.aec is None or ref_signal is None or self.aec_frame_size is None:
return mic_signal
# Remember the original length
original_mic_len = len(mic_signal)
# AudioProcessor expects fixed 10 ms frames, so a large chunk must be split into smaller blocks
# Microphone (1 channel): 160 samples * 1 channel * 2 bytes = 320 bytes
mic_frame_bytes = self.aec_frame_size * self.channels * 2
# Reference signal (2 channels): 160 samples * 2 channels * 2 bytes = 640 bytes
ref_frame_bytes = self.aec_frame_size * self.ref_channels * 2
# Make sure the input lengths are whole multiples of the frame size
if len(mic_signal) % mic_frame_bytes != 0:
padding = mic_frame_bytes - (len(mic_signal) % mic_frame_bytes)
mic_signal = mic_signal + b'\x00' * padding
if len(ref_signal) % ref_frame_bytes != 0:
padding = ref_frame_bytes - (len(ref_signal) % ref_frame_bytes)
ref_signal = ref_signal + b'\x00' * padding
# Process in blocks: split the large chunk (1024 samples) into 10 ms blocks (160 samples)
try:
num_frames = len(mic_signal) // mic_frame_bytes
output_chunks = []
for i in range(num_frames):
mic_chunk = mic_signal[i * mic_frame_bytes:(i + 1) * mic_frame_bytes]
ref_chunk = ref_signal[i * ref_frame_bytes:(i + 1) * ref_frame_bytes]
self.aec.process_reverse_stream(ref_chunk)
output_chunk = self.aec.process_stream(mic_chunk)
# AudioProcessor.process_stream returns bytes
output_chunks.append(output_chunk)
result = b''.join(output_chunks)
return result[:original_mic_len]
except Exception as e:
if self.logger:
self.logger.warning(f"Echo cancellation failed: {e}")
return mic_signal[:original_mic_len]
class ReferenceSignalBuffer:
"""Buffers the reference audio played by the sound card (for AEC)"""
def __init__(self, max_duration_ms: int, sample_rate: int, channels: int):
max_samples = int(sample_rate * max_duration_ms / 1000)
self.sample_rate = sample_rate
self.channels = channels # reference-signal channel count (playback channels, 2)
self.buffer = deque(maxlen=max_samples * channels)
self.lock = threading.Lock()
def add_reference(self, audio_data: bytes, source_sample_rate: int = None, source_channels: int = 1):
"""
Add a reference signal
"""
if not audio_data:
return
with self.lock:
# Resample: TTS source rate -> microphone rate
if source_sample_rate and source_sample_rate != self.sample_rate:
audio_data = self._resample(audio_data, source_sample_rate, self.sample_rate)
# Channel conversion: 1 channel -> 2 channels (match the playback channel count)
samples = struct.unpack(f'<{len(audio_data) // 2}h', audio_data)
if source_channels == 1 and self.channels == 2:
# Mono to stereo: duplicate each sample into the left and right channels
stereo_samples = [s for sample in samples for s in [sample, sample]]
samples = stereo_samples
self.buffer.extend(samples)
def get_reference(self, num_samples: int) -> bytes:
"""Fetch the reference signal (given sample count, accounting for channels)"""
with self.lock:
if not self.buffer:
return b'\x00' * (num_samples * self.channels * 2)
# Total samples needed (accounting for channels)
total_samples_needed = num_samples * self.channels
samples = list(self.buffer)[-total_samples_needed:] if len(self.buffer) >= total_samples_needed else list(self.buffer)
if len(samples) < total_samples_needed:
samples = [0] * (total_samples_needed - len(samples)) + samples
return struct.pack(f'<{len(samples)}h', *samples)
def clear(self):
"""Clear the buffer"""
with self.lock:
self.buffer.clear()
def _resample(self, audio_data: bytes, source_rate: int, target_rate: int) -> bytes:
"""Simple linear resampling"""
if source_rate == target_rate:
return audio_data
samples = np.frombuffer(audio_data, dtype=np.int16)
ratio = target_rate / source_rate
indices = np.linspace(0, len(samples) - 1, int(len(samples) * ratio))
resampled = np.interp(indices, np.arange(len(samples)), samples.astype(np.float32))
return resampled.astype(np.int16).tobytes()
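The 10 ms framing arithmetic in `process()` and the linear resampler can be checked standalone (a sketch mirroring the code above):

```python
import numpy as np

# 10 ms framing used by process(): at 16 kHz one WebRTC frame is 160 samples.
sample_rate, aec_frame_size, chunk = 16000, 160, 1024
mic_frame_bytes = aec_frame_size * 1 * 2   # mono int16 -> 320 bytes per frame
ref_frame_bytes = aec_frame_size * 2 * 2   # stereo int16 -> 640 bytes per frame

# A 1024-sample chunk (2048 bytes) is not a multiple of 320 bytes, so
# process() zero-pads it up to 7 frames and trims the output afterwards.
mic_bytes = chunk * 2
padding = mic_frame_bytes - (mic_bytes % mic_frame_bytes)
num_frames = (mic_bytes + padding) // mic_frame_bytes

# Same linear interpolation as ReferenceSignalBuffer._resample.
def resample(audio: bytes, src: int, dst: int) -> bytes:
    samples = np.frombuffer(audio, dtype=np.int16)
    idx = np.linspace(0, len(samples) - 1, int(len(samples) * dst / src))
    out = np.interp(idx, np.arange(len(samples)), samples.astype(np.float32))
    return out.astype(np.int16).tobytes()

# 441 samples of 22050 Hz TTS audio shrink to 320 samples at 16 kHz.
out = resample(b'\x01\x00' * 441, 22050, 16000)
print(num_frames, len(out) // 2)  # 7 320
```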

File diff suppressed because it is too large.


@@ -34,9 +34,11 @@ class SpeakerVerificationClient:
self._expected_embedding_dim = None # store only the dimension size, not the shape tuple
from funasr import AutoModel
self.model = AutoModel(model=self.model_path, device="cpu")
# Make sure the model path is absolute (expand ~)
model_path = os.path.expanduser(self.model_path)
self.model = AutoModel(model=model_path, device="cpu")
if self.logger:
self.logger.info(f"Speaker model loaded: {self.model_path}, threshold: {self.threshold}")
self.logger.info(f"Speaker model loaded: {model_path}, threshold: {self.threshold}")
if self.speaker_db_path:
self.load_speakers()
@@ -176,23 +178,23 @@ class SpeakerVerificationClient:
def match_speaker(self, embedding: np.ndarray):
"""
Match the speaker (called once per utterance)
Returns: (speaker_id: str | None, state: SpeakerState)
Returns: (speaker_id: str | None, state: SpeakerState, score: float, threshold: float)
"""
if not self.speaker_db:
return None, SpeakerState.UNKNOWN
return None, SpeakerState.UNKNOWN, 0.0, self.threshold
embedding_dim = len(embedding)
if embedding_dim == 0:
return None, SpeakerState.ERROR
return None, SpeakerState.ERROR, 0.0, self.threshold
# Validate dimension consistency
if self._expected_embedding_dim is not None and embedding_dim != self._expected_embedding_dim:
return None, SpeakerState.ERROR
return None, SpeakerState.ERROR, 0.0, self.threshold
# Normalize the current embedding (stored embeddings were normalized at registration; only the query needs normalizing)
embedding_norm = np.linalg.norm(embedding)
if embedding_norm == 0:
return None, SpeakerState.ERROR
return None, SpeakerState.ERROR, 0.0, self.threshold
embedding_normalized = embedding / embedding_norm
best_match = None
@@ -213,9 +215,10 @@ class SpeakerVerificationClient:
if score > best_score:
best_score = score
best_match = speaker_id
best_threshold = speaker_data.get("threshold", self.threshold)
best_threshold = speaker_data["threshold"]
return (best_match, SpeakerState.VERIFIED) if best_score >= best_threshold else (None, SpeakerState.REJECTED)
state = SpeakerState.VERIFIED if best_score >= best_threshold else SpeakerState.REJECTED
return (best_match, state, best_score, best_threshold)
def is_available(self) -> bool:
return self.model is not None
@@ -272,16 +275,16 @@ class SpeakerVerificationClient:
self._log("warning", f"Skipping voiceprint with mismatched dimensions: {speaker_id} (expected {self._expected_embedding_dim}, got {embedding_dim})")
continue
# Make sure the embedding is normalized (compat with old data)
# Make sure the embedding is normalized
embedding_norm = np.linalg.norm(embedding_array)
if embedding_norm > 0:
embedding_array = embedding_array / embedding_norm
self.speaker_db[speaker_id] = {
"embedding": embedding_array,
"env": speaker_data.get("env", "near"),
"threshold": speaker_data.get("threshold", self.threshold),
"registered_at": speaker_data.get("registered_at", time.time())
"env": speaker_data["env"],
"threshold": speaker_data["threshold"],
"registered_at": speaker_data["registered_at"]
}
count = len(self.speaker_db)
@@ -290,47 +293,6 @@ class SpeakerVerificationClient:
else:
self._log("info", f"Loaded {count} registered speakers")
return True
except json.JSONDecodeError as e:
# Try the legacy pickle format for compatibility
try:
import pickle
with open(self.speaker_db_path, 'rb') as f:
old_data = pickle.load(f)
self._log("warning", "Legacy pickle format detected; migrating...")
# Migration: convert to the new format
with self._lock:
for speaker_id, speaker_info in old_data.items():
if hasattr(speaker_info, 'embedding'):
# Old format: a SpeakerInfo object
embedding = speaker_info.embedding
embedding_norm = np.linalg.norm(embedding)
if embedding_norm > 0:
embedding = embedding / embedding_norm
self.speaker_db[speaker_id] = {
"embedding": embedding,
"env": getattr(speaker_info, 'env', 'near'),
"threshold": getattr(speaker_info, 'threshold', self.threshold),
"registered_at": getattr(speaker_info, 'registered_at', time.time())
}
else:
# May be a dict
embedding = speaker_info.get("embedding")
if embedding is not None:
embedding_norm = np.linalg.norm(embedding)
if embedding_norm > 0:
embedding = embedding / embedding_norm
self.speaker_db[speaker_id] = {
"embedding": embedding,
"env": speaker_info.get("env", "near"),
"threshold": speaker_info.get("threshold", self.threshold),
"registered_at": speaker_info.get("registered_at", time.time())
}
# Save in the new format
self.save_speakers()
self._log("info", "Migrated to the new format")
except Exception as e2:
self._log("error", f"Failed to load the speaker database (both JSON and pickle failed): {e}, {e2}")
return False
except Exception as e:
self._log("error", f"Failed to load the speaker database: {e}")
return False
@@ -355,9 +317,9 @@ class SpeakerVerificationClient:
for speaker_id, speaker_data in self.speaker_db.items():
json_data[speaker_id] = {
"embedding": speaker_data["embedding"].tolist(), # numpy array -> list
"env": speaker_data.get("env", "near"),
"threshold": speaker_data.get("threshold", self.threshold),
"registered_at": speaker_data.get("registered_at", time.time())
"env": speaker_data["env"],
"threshold": speaker_data["threshold"],
"registered_at": speaker_data["registered_at"]
}
# Use a temp file + atomic replace so a crash mid-write cannot lose data
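Because stored embeddings are normalized once at load/registration time, the score in `match_speaker()` reduces to a dot product; a minimal check of that identity:

```python
import numpy as np

def cosine_score(query: np.ndarray, stored_normalized: np.ndarray) -> float:
    """Cosine similarity as used above: normalize only the query,
    then take the dot product with the pre-normalized stored vector."""
    q = query / np.linalg.norm(query)
    return float(np.dot(q, stored_normalized))

stored = np.array([3.0, 4.0])
stored = stored / np.linalg.norm(stored)   # done once at registration / load time
print(round(cosine_score(np.array([6.0, 8.0]), stored), 6))  # 1.0 (same direction)
```

Normalizing once at load time keeps the per-utterance matching loop to a single dot product per registered speaker.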


@@ -24,9 +24,13 @@ class DashScopeTTSClient(TTSClient):
voice: str,
card_index: int,
device_index: int,
output_sample_rate: int = 48000,
output_sample_rate: int = 44100,
output_channels: int = 2,
output_volume: float = 1.0,
tts_source_sample_rate: int = 22050, # fixed TTS service output sample rate
tts_source_channels: int = 1, # fixed TTS service output channel count
tts_ffmpeg_thread_queue_size: int = 1024, # ffmpeg input thread queue size
reference_signal_buffer=None, # reference-signal buffer (for echo cancellation)
logger=None):
dashscope.api_key = api_key
self.model = model
@@ -36,7 +40,12 @@ class DashScopeTTSClient(TTSClient):
self.output_sample_rate = output_sample_rate
self.output_channels = output_channels
self.output_volume = output_volume
self.tts_source_sample_rate = tts_source_sample_rate
self.tts_source_channels = tts_source_channels
self.tts_ffmpeg_thread_queue_size = tts_ffmpeg_thread_queue_size
self.reference_signal_buffer = reference_signal_buffer # reference-signal buffer
self.logger = logger
self.current_ffmpeg_pid = None # PID of the current ffmpeg process
# Build the ALSA device string; let ffmpeg resample / remap channels automatically
self.alsa_device = f"plughw:{card_index},{device_index}" if (
@@ -64,7 +73,7 @@ class DashScopeTTSClient(TTSClient):
on_chunk=None,
interrupt_check=None) -> bool:
"""Main flow: synthesize the stream and play it"""
callback = _TTSCallback(self, interrupt_check, on_chunk)
callback = _TTSCallback(self, interrupt_check, on_chunk, self.reference_signal_buffer)
# Use the configured voice; fall back to self.voice when request.voice is None or empty
voice_to_use = request.voice if request.voice and request.voice.strip() else self.voice
@@ -94,24 +103,24 @@ class _TTSCallback(ResultCallback):
def __init__(self, tts_client: DashScopeTTSClient,
interrupt_check=None,
on_chunk=None):
on_chunk=None,
reference_signal_buffer=None):
self.tts_client = tts_client
self.interrupt_check = interrupt_check
self.on_chunk = on_chunk
self.reference_signal_buffer = reference_signal_buffer # reference-signal buffer
self._proc = None
self._interrupted = False
self._cleaned_up = False
def on_open(self):
# Play through ffmpeg (handles the sample-rate conversion automatically: 22050 -> device rate)
# The TTS service outputs fixed 22050 Hz mono; ffmpeg converts it to the device rate and channels
tts_output_rate = 22050 # fixed TTS service output sample rate
tts_output_channels = 1 # fixed TTS service output channel count (mono)
# Play through ffmpeg (handles the sample-rate conversion automatically: TTS source rate -> device rate)
# The TTS service outputs a fixed rate and channel count; ffmpeg converts them to the device rate and channels
ffmpeg_cmd = [
'ffmpeg',
'-f', 's16le', # raw PCM
'-ar', str(tts_output_rate), # TTS output sample rate (fixed 22050)
'-ac', str(tts_output_channels), # TTS output channel count (fixed mono)
'-ar', str(self.tts_client.tts_source_sample_rate), # TTS output sample rate (read from config)
'-ac', str(self.tts_client.tts_source_channels), # TTS output channel count (read from config)
'-i', 'pipe:0', # stdin
'-f', 'alsa', # output to ALSA
'-ar', str(self.tts_client.output_sample_rate), # output device sample rate (read from config)
@@ -120,7 +129,7 @@ class _TTSCallback(ResultCallback):
'-fflags', 'nobuffer', # reduce buffering
'-flags', 'low_delay', # low latency
'-avioflags', 'direct', # try direct writes to ALSA to cut latency
'-thread_queue_size', '1024', # input thread queue size, prevents dropped frames
'-thread_queue_size', str(self.tts_client.tts_ffmpeg_thread_queue_size), # input thread queue size (read from config)
self.tts_client.alsa_device
]
@@ -142,6 +151,9 @@ class _TTSCallback(ResultCallback):
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE # PIPE so errors can be captured
)
# Record the ffmpeg process PID
self.tts_client.current_ffmpeg_pid = self._proc.pid
self.tts_client._log("debug", f"ffmpeg process started, PID={self._proc.pid}")
def on_complete(self):
pass
@@ -167,6 +179,8 @@ class _TTSCallback(ResultCallback):
self._proc.terminate()
return
# Write to ffmpeg first to avoid blocking playback
if self._proc and self._proc.stdin and not self._interrupted:
try:
self._proc.stdin.write(data)
@@ -177,8 +191,22 @@ class _TTSCallback(ResultCallback):
error_msg = self._proc.stderr.read().decode('utf-8', errors='ignore')
self.tts_client._log("error", f"ffmpeg error: {error_msg}")
self._interrupted = True
if self.on_chunk:
self.on_chunk(data)
# Append the audio data to the reference-signal buffer (for echo cancellation)
# Done after writing to ffmpeg so playback is not blocked
if self.reference_signal_buffer and data:
try:
self.reference_signal_buffer.add_reference(
data,
source_sample_rate=self.tts_client.tts_source_sample_rate,
source_channels=self.tts_client.tts_source_channels
)
except Exception as e:
# A reference-signal failure must not affect playback
self.tts_client._log("warning", f"Reference-signal handling failed: {e}")
if self.on_chunk:
self.on_chunk(data)
def cleanup(self):
"""Clean up resources"""
@@ -208,4 +236,8 @@ class _TTSCallback(ResultCallback):
self._proc.wait(timeout=0.1)
except:
pass
# Clear the recorded PID
if self.tts_client.current_ffmpeg_pid == self._proc.pid:
self.tts_client.current_ffmpeg_pid = None
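For debugging playback outside the node, the ffmpeg invocation assembled in `on_open()` can be rebuilt standalone (a sketch assuming the config values above: 22.05 kHz mono TTS source, 44.1 kHz stereo output on `plughw:1,0`):

```python
def build_ffmpeg_cmd(src_rate=22050, src_ch=1, out_rate=44100, out_ch=2,
                     queue_size=4096, alsa_device="plughw:1,0"):
    """Mirror of the command list built in on_open(): raw s16le PCM on
    stdin, resampled and remapped by ffmpeg, written directly to ALSA."""
    return ['ffmpeg',
            '-f', 's16le', '-ar', str(src_rate), '-ac', str(src_ch),
            '-i', 'pipe:0',
            '-f', 'alsa', '-ar', str(out_rate), '-ac', str(out_ch),
            '-fflags', 'nobuffer', '-flags', 'low_delay',
            '-avioflags', 'direct',
            '-thread_queue_size', str(queue_size),
            alsa_device]

cmd = build_ffmpeg_cmd()
print(' '.join(cmd))
```

Piping raw PCM into this command line reproduces the playback path without the DashScope callback machinery.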