refactor: 删除回声消除相关代码，支持从hivecore_robot_drivers/img_dev获取图片

2026-01-20 09:28:57 +08:00
parent 71062701e1
commit 98c0eb5ca5
24 changed files with 925 additions and 726 deletions
--- a/config/knowledge.json
+++ b/config/knowledge.json
@@ -3,7 +3,7 @@
    {
      "id": "robot_identity",
      "patterns": [
-        "ni shi shei"
+        "ni shi shui"
      ],
      "answer": "我叫二狗，是蜂核科技的机器人，很高兴为你服务"
    },
--- a/config/speakers.json
+++ b/config/speakers.json
@@ -595,5 +595,204 @@
    "env": "near",
    "threshold": 0.55,
    "registered_at": 1768530001.2158406
+  },
+  "user_1768845491": {
+    "embedding": [
+      0.015937702730298042,
+      0.001559161813929677,
+      -0.049822624772787094,
+      0.09989305585622787,
+      0.018279563635587692,
+      -0.2004263699054718,
+      0.006144546903669834,
+      0.005965661257505417,
+      0.012017739936709404,
+      0.020486492663621902,
+      -0.040278736501932144,
+      -0.05860017612576485,
+      0.04651434347033501,
+      0.1633484661579132,
+      -0.03308954834938049,
+      -0.022089917212724686,
+      0.12966565787792206,
+      -0.001390158082358539,
+      -0.05404090881347656,
+      -0.048333823680877686,
+      0.023206135258078575,
+      -0.05033773183822632,
+      -0.10477420687675476,
+      0.10657669603824615,
+      -0.09571000933647156,
+      -0.05239453166723251,
+      0.03246476873755455,
+      0.04507458209991455,
+      0.027843689545989037,
+      -0.15640732645988464,
+      -0.01717425137758255,
+      0.053287796676158905,
+      0.07642859220504761,
+      0.16121289134025574,
+      -0.034773923456668854,
+      0.042213018983602524,
+      0.03897469863295555,
+      -0.0613938644528389,
+      -0.01999823749065399,
+      -0.03844919428229332,
+      -0.08077485114336014,
+      0.11703582853078842,
+      -0.01661379635334015,
+      -0.11473247408866882,
+      -0.021961240097880363,
+      -0.150223046541214,
+      0.08799541741609573,
+      0.019122444093227386,
+      -0.04347413778305054,
+      0.054035451263189316,
+      -0.05809119716286659,
+      -0.05327044054865837,
+      0.026238180696964264,
+      -0.08530712872743607,
+      -0.005645385477691889,
+      0.09096740186214447,
+      0.08434951305389404,
+      0.17141343653202057,
+      -0.005146870389580727,
+      -0.08602679520845413,
+      -0.07365834712982178,
+      0.05543521046638489,
+      -0.1374669075012207,
+      -0.10697560012340546,
+      -0.009140565991401672,
+      0.004680712707340717,
+      0.12675900757312775,
+      0.0848039835691452,
+      -0.04435611888766289,
+      0.19222386181354523,
+      0.16802501678466797,
+      0.006006766576319933,
+      -0.002293711993843317,
+      0.031131887808442116,
+      0.0426473505795002,
+      -0.0454414002597332,
+      0.0455852746963501,
+      0.01862845942378044,
+      -0.056163739413022995,
+      -0.030417200177907944,
+      -0.08624715358018875,
+      0.01799696497619152,
+      0.05537595972418785,
+      0.02721775881946087,
+      -0.024693293496966362,
+      -0.09453072398900986,
+      0.03656083345413208,
+      -0.019727006554603577,
+      -0.04557542875409126,
+      0.026594148948788643,
+      0.06597225368022919,
+      -0.07168523222208023,
+      -0.17211276292800903,
+      -0.06902605295181274,
+      -0.01896323822438717,
+      -0.03277217224240303,
+      0.10336172580718994,
+      0.028470057994127274,
+      0.10734961926937103,
+      -0.09156577289104462,
+      0.06228775903582573,
+      0.07215031236410141,
+      0.0239338967949152,
+      0.158150315284729,
+      -0.10234662145376205,
+      -0.02522525191307068,
+      -0.051316920667886734,
+      0.0021482903975993395,
+      0.07491917163133621,
+      -0.02104146033525467,
+      -0.07901310920715332,
+      0.07640012353658676,
+      -0.06093406304717064,
+      -0.13202868402004242,
+      -0.06267445534467697,
+      0.08388402312994003,
+      -0.05089619383215904,
+      -0.04823632910847664,
+      0.01031999196857214,
+      -0.023580649867653847,
+      -0.09613402187824249,
+      0.02805117331445217,
+      0.07453802227973938,
+      0.12510700523853302,
+      0.14248521625995636,
+      -0.042960282415151596,
+      0.037714891135692596,
+      -0.07789590954780579,
+      0.013859922997653484,
+      0.059469543397426605,
+      -0.06383980810642242,
+      0.029322469606995583,
+      0.044372908771038055,
+      0.012625147588551044,
+      0.015539717860519886,
+      0.019373703747987747,
+      0.02928899973630905,
+      0.015136508271098137,
+      0.018850654363632202,
+      0.10889417678117752,
+      -0.026799343526363373,
+      -0.0837407261133194,
+      0.055622730404138565,
+      -0.05373093858361244,
+      -0.07665497809648514,
+      -0.060409802943468094,
+      0.1106465607881546,
+      -0.13180992007255554,
+      0.05790461599826813,
+      -0.006277923472225666,
+      0.016103282570838928,
+      -0.0385354720056057,
+      -0.032628193497657776,
+      -0.07809191197156906,
+      0.024083232507109642,
+      -0.08718746900558472,
+      -0.0539533905684948,
+      -0.11702725291252136,
+      0.027705105021595955,
+      0.06656485795974731,
+      -0.05842652544379234,
+      0.03137844428420067,
+      0.11062013357877731,
+      -0.002389072673395276,
+      -0.040558233857154846,
+      -0.02512812428176403,
+      -0.00023564467846881598,
+      0.04711990803480148,
+      0.022769151255488396,
+      -0.013735070824623108,
+      0.07807290554046631,
+      -0.047492094337940216,
+      -0.04897252842783928,
+      0.006663929205387831,
+      -0.11178303509950638,
+      -0.008013523183763027,
+      0.06803164631128311,
+      0.008022366091609001,
+      -0.04196283593773842,
+      -0.025105053558945656,
+      0.0431133434176445,
+      -0.07424937933683395,
+      0.0432509183883667,
+      0.09608350694179535,
+      -0.15923553705215454,
+      -0.028727376833558083,
+      0.01354081928730011,
+      0.01657080464065075,
+      -0.02491777203977108,
+      -0.008332896046340466,
+      0.06449767202138901,
+      0.10712931305170059
+    ],
+    "env": "near",
+    "threshold": 0.55,
+    "registered_at": 1768845491.8086796
  }
 }
--- a/config/voice.yaml
+++ b/config/voice.yaml
@@ -20,20 +20,20 @@ audio:
  microphone:
    device_index: 3  # 指向 iFLYTEK-M2 (hw:1,0)
    sample_rate: 48000  # 尝试使用硬件原生采样率 48kHz，避免重采样可能导致的问题
+    # device_index: -1  # 使用系统默认输入设备
+    # sample_rate: 16000
    channels: 1  # 输入声道数：单声道（MONO，适合语音采集）
    chunk: 1024
    heartbeat_interval: 2.0  # 心跳间隔（秒），用于定期输出录音状态
  soundcard:
    card_index: 1  # USB Audio Device (card 1)
    device_index: 0  # USB Audio [USB Audio] (device 0)
+    sample_rate: 48000  # 输出采样率：48kHz（iFLYTEK 支持 48000）
    # card_index: -1  # 使用默认声卡
    # device_index: -1  # 使用默认输出设备
-    sample_rate: 48000  # 输出采样率：48kHz（iFLYTEK 支持 48000）
+    # sample_rate: 44100  # 输出采样率：默认 44100
    channels: 2  # 输出声道数：立体声（2声道，FL+FR）
    volume: 1.0  # 音量比例（0.0-1.0，0.2表示20%音量）
-  echo_cancellation:
-    enable: false # 是否启用回声消除
-    max_duration_ms: 500  # 参考信号缓冲区最大时长（毫秒）
  tts:
    source_sample_rate: 22050  # TTS服务固定输出采样率（DashScope服务固定值，不可修改）
    source_channels: 1  # TTS服务固定输出声道数（DashScope服务固定值，不可修改）
@@ -56,15 +56,7 @@ system:
  sv_threshold: 0.55  # 声纹识别阈值（0.0-1.0，值越小越宽松，值越大越严格）
  sv_speaker_db_path: "~/hivecore_robot_os1/config/speakers.json"  # 声纹数据库保存路径（JSON格式，相对于ROS2包share目录）
  sv_buffer_size: 240000  # 声纹验证录音缓冲区大小（样本数，48kHz下5秒=240000）
-  sv_registration_silence_threshold_ms: 500  # 声纹注册状态下的静音阈值（毫秒）

 camera:
-  serial_number: "405622075404"  # 相机序列号（Intel RealSense D435）
-  rgb:
-    width: 640   # 图像宽度
-    height: 480  # 图像高度
-    fps: 30      # 帧率（支持：6, 10, 15, 30, 60）
-    format: "RGB8"  # 图像格式：RGB8, BGR8
  image:
    jpeg_quality: 85  # JPEG压缩质量（0-100，85是质量和大小平衡点）
-    max_size: "1280x720"  # 最大尺寸
--- a/launch/voice.launch.py
+++ b/launch/voice.launch.py
@@ -1,10 +1,24 @@
 from launch import LaunchDescription
 from launch_ros.actions import Node
+from launch.actions import SetEnvironmentVariable
+import os


 def generate_launch_description():
    """启动语音交互节点，所有参数从 voice.yaml 读取"""
+    # 获取interfaces包的install路径
+    interfaces_install_path = os.path.expanduser('~/ros_learn/hivecore_robot_interfaces/install')
+    
+    # 设置AMENT_PREFIX_PATH，确保能找到interfaces包的消息类型
+    ament_prefix_path = os.environ.get('AMENT_PREFIX_PATH', '')
+    if interfaces_install_path not in ament_prefix_path:
+        if ament_prefix_path:
+            ament_prefix_path = f'{ament_prefix_path}:{interfaces_install_path}'
+        else:
+            ament_prefix_path = interfaces_install_path
+    
    return LaunchDescription([
+        SetEnvironmentVariable('AMENT_PREFIX_PATH', ament_prefix_path),
        Node(
            package='robot_speaker',
            executable='robot_speaker_node',
--- a/package.xml
+++ b/package.xml
@@ -9,6 +9,8 @@

  <depend>rclpy</depend>
  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>cv_bridge</depend>
  <depend>ament_index_python</depend>
  <depend>interfaces</depend>

--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ pypinyin>=0.49.0
 rclpy>=3.0.0
 pyrealsense2>=2.54.0
 Pillow>=10.0.0
-numpy>=1.24.0
+numpy>=1.24.0,<2.0.0  # cv_bridge需要NumPy 1.x，NumPy 2.x会导致段错误
 PyYAML>=6.0
 aec-audio-processing
 modelscope>=1.33.0
--- a/robot_speaker/bridge/init.py
+++ b/robot_speaker/bridge/init.py
@@ -1,2 +1,4 @@
 # Bridge package for connecting LLM outputs to brain execution.

+
+
--- a/robot_speaker/core/init.py
+++ b/robot_speaker/core/init.py
@@ -3,3 +3,5 @@



+
+
--- a/robot_speaker/core/intent_router.py
+++ b/robot_speaker/core/intent_router.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 from typing import Optional
 import os
 import yaml
+import json
 from ament_index_python.packages import get_package_share_directory

 from pypinyin import pinyin, Style
@@ -12,7 +13,7 @@ class IntentResult:
    intent: str  # "skill_sequence" | "kb_qa" | "chat_text" | "chat_camera"
    text: str
    need_camera: bool
-    camera_mode: Optional[str]  # "head" | "left_hand" | "right_hand" | None
+    camera_mode: Optional[str]  # "top" | "left" | "right" | None
    system_prompt: Optional[str]


@@ -21,13 +22,24 @@ class IntentRouter:
        self.camera_capture_keywords = [
            "pai zhao", "pai ge zhao", "pai zhang zhao"
        ]
-        self.skill_keywords = [
-            "ban xiang zi"
+        # 动作词列表（拼音）- 用于检测技能序列意图
+        self.action_verbs = [
+            "zou", "zou liang bu", "zou ji bu",  # 走、走两步、走几步
+            "na", "na qi", "na zhu",  # 拿、拿起、拿住
+            "ban", "ban yun",  # 搬、搬运
+            "zhua", "zhua qu",  # 抓、抓取
+            "tui", "tui dong",  # 推、推动
+            "la", "la dong",  # 拉、拉动
+            "yi dong", "qian jin", "hou tui",  # 移动、前进、后退
+            "kong zhi", "cao zuo",  # 控制、操作
+            "fang xia", "fang zhi",  # 放下、放置
+            "ju qi", "sheng qi",  # 举起、升起
        ]
        self.kb_keywords = [
-            "ni shi shei", "ni de ming zi"
+            "ni shi shui", "ni de ming zi"
        ]
        self._cached_skill_names: list[str] | None = None
+        self._cached_kb_data: list[dict] | None = None

    def _load_brain_skill_names(self) -> list[str]:
        if self._cached_skill_names is not None:
@@ -53,35 +65,36 @@ class IntentRouter:
        py_list = pinyin(''.join(chars), style=Style.NORMAL)
        return ' '.join([item[0] for item in py_list]).lower().strip()

-    def is_skill_sequence_intent(self, text: str) -> bool:
-        text_pinyin = self.to_pinyin(text)
-        return any(k in text_pinyin for k in self.skill_keywords)
+    def is_skill_sequence_intent(self, text: str, text_pinyin: str | None = None) -> bool:
+        if text_pinyin is None:
+            text_pinyin = self.to_pinyin(text)
+        
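+        # Check action verbs with a plain substring test on the pinyin string (not an exact/whole-syllable match: "na" also hits "nan", "la" also hits "lai")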
+        return any(action in text_pinyin for action in self.action_verbs)


-    def check_camera_command(self, text: str) -> tuple[bool, Optional[str]]:
+    def check_camera_command(self, text: str, text_pinyin: str | None = None) -> tuple[bool, Optional[str]]:
+        """检查是否包含拍照指令，返回(是否需要相机, 相机模式)"""
        if not text:
            return False, None
-        text_pinyin = self.to_pinyin(text)
-        for keyword in self.camera_capture_keywords:
-            if keyword in text_pinyin:
-                return True, self.detect_camera_mode(text)
+        if text_pinyin is None:
+            text_pinyin = self.to_pinyin(text)
+        # Substring test: a keyword matches wherever it occurs contiguously in the pinyin string; syllable boundaries are not checked
+        if any(keyword in text_pinyin for keyword in self.camera_capture_keywords):
+            return True, self.detect_camera_mode(text, text_pinyin)
        return False, None

-    def detect_camera_mode(self, text: str) -> str:
-        text_pinyin = self.to_pinyin(text)
-        left_keys = ["zuo shou", "zuo bi", "zuo bian"]
-        right_keys = ["you shou", "you bi", "you bian"]
-        head_keys = ["tou", "nao dai"]
-        for kw in left_keys:
-            if kw in text_pinyin:
-                return "left_hand"
-        for kw in right_keys:
-            if kw in text_pinyin:
-                return "right_hand"
-        for kw in head_keys:
-            if kw in text_pinyin:
-                return "head"
-        return "head"
+    def detect_camera_mode(self, text: str, text_pinyin: str | None = None) -> str:
+        """检测相机模式，返回与相机驱动匹配的position值：left/right/top"""
+        if text_pinyin is None:
+            text_pinyin = self.to_pinyin(text)
+        if any(kw in text_pinyin for kw in ["zuo shou", "zuo bi", "zuo bian"]):
+            return "left"
+        if any(kw in text_pinyin for kw in ["you shou", "you bi", "you bian"]):
+            return "right"
+        if any(kw in text_pinyin for kw in ["tou", "nao dai"]):
+            return "top"
+        return "top"  # 默认头部相机

    def build_skill_prompt(self) -> str:
        skills = self._load_brain_skill_names()
@@ -93,7 +106,7 @@ class IntentRouter:
        )
        return (
            "你是机器人任务规划器。\n"
-            "本任务必须拍照。请根据用户请求选择使用哪个相机拍照（默认头部相机），并结合当前环境信息生成简洁、可执行的技能序列。\n"
+            "本任务必须拍照。请根据用户请求选择使用哪个相机拍照，并结合当前环境信息生成简洁、可执行的技能序列。\n"
            "【重要】如果对话历史中包含【执行结果】或【执行状态】，请参考上一轮技能序列的执行情况，根据成功/失败信息调整本次技能序列。\n"
            "【输出格式要求】只输出逗号分隔的技能名称，不要任何解释说明。\n"
            + skill_guard
@@ -109,12 +122,38 @@ class IntentRouter:
            "你是一个智能语音助手。\n"
            "请自然、简短地与用户对话。"
        )
-
-    def build_kb_prompt(self) -> str:
-        return (
-            "你是蜂核科技的员工。\n"
-            "请基于知识库信息回答用户问题，回答要准确简洁。"
-        )
+    
+    def _load_kb_data(self) -> list[dict]:
+        """加载知识库数据"""
+        if self._cached_kb_data is not None:
+            return self._cached_kb_data
+        kb_data = []
+        try:
+            robot_speaker_share = get_package_share_directory("robot_speaker")
+            kb_path = os.path.join(robot_speaker_share, "config", "knowledge.json")
+            with open(kb_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                kb_data = data["entries"]
+        except Exception as e:
+            kb_data = []
+        self._cached_kb_data = kb_data
+        return kb_data
+    
+    def search_kb(self, text: str) -> Optional[str]:
+        """检索知识库，返回匹配的答案"""
+        if not text:
+            return None
+        text_pinyin = self.to_pinyin(text)
+        kb_data = self._load_kb_data()
+       
+        for entry in kb_data:
+            patterns = entry["patterns"]
+            for pattern in patterns:
+                if pattern in text_pinyin:
+                    answer = entry["answer"]
+                    if answer:
+                        return answer
+        return None

    def build_default_system_prompt(self) -> str:
        return (
@@ -125,12 +164,11 @@ class IntentRouter:
        )

    def route(self, text: str) -> IntentResult:
-        need_camera, camera_mode = self.check_camera_command(text)
        text_pinyin = self.to_pinyin(text)
+        need_camera, camera_mode = self.check_camera_command(text, text_pinyin)

-        if self.is_skill_sequence_intent(text):
-            if camera_mode is None:
-                camera_mode = "head"
+        if self.is_skill_sequence_intent(text, text_pinyin):
+            # 用户没有指定相机模式时，保持 None，使用第一个收到的消息
            return IntentResult(
                intent="skill_sequence",
                text=text,
@@ -139,13 +177,14 @@ class IntentRouter:
                system_prompt=self.build_skill_prompt()
            )

-        if any(k in text_pinyin for k in self.kb_keywords):
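+        # Substring test on the pinyin string (not an exact match): any kb keyword occurring contiguously anywhere in it routes to kb_qa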
+        if any(keyword in text_pinyin for keyword in self.kb_keywords):
            return IntentResult(
                intent="kb_qa",
                text=text,
                need_camera=False,
                camera_mode=None,
-                system_prompt=self.build_kb_prompt()
+                system_prompt=None  # kb_qa不走LLM，不需要system_prompt
            )

        return IntentResult(
--- a/robot_speaker/core/register_speaker_node.py
+++ b/robot_speaker/core/register_speaker_node.py
@@ -15,7 +15,6 @@ from ament_index_python.packages import get_package_share_directory

 from robot_speaker.perception.audio_pipeline import VADDetector, AudioRecorder
 from robot_speaker.perception.speaker_verifier import SpeakerVerificationClient
-from robot_speaker.perception.echo_cancellation import ReferenceSignalBuffer
 from robot_speaker.models.asr.dashscope import DashScopeASR
 from robot_speaker.models.tts.dashscope import DashScopeTTSClient
 from robot_speaker.core.types import TTSRequest
@@ -45,13 +44,6 @@ class RegisterSpeakerNode(Node):
            sample_rate=self.sample_rate
        )

-        # 创建参考信号缓冲区（用于回声消除）
-        self.reference_signal_buffer = ReferenceSignalBuffer(
-            max_duration_ms=self.audio_echo_cancellation_max_duration_ms,
-            sample_rate=self.sample_rate,
-            channels=self.output_channels
-        ) if self.audio_echo_cancellation_enabled else None
-
        self.audio_recorder = AudioRecorder(
            device_index=self.input_device_index,
            sample_rate=self.sample_rate,
@@ -71,8 +63,6 @@ class RegisterSpeakerNode(Node):
            on_audio_chunk=self._on_audio_chunk,
            should_put_to_queue=self._should_put_to_queue,
            get_silence_threshold=lambda: self.silence_duration_ms,
-            enable_echo_cancellation=self.audio_echo_cancellation_enabled,  # 启用回声消除，保持与主程序一致
-            reference_signal_buffer=self.reference_signal_buffer,
            logger=self.get_logger()
        )
        
@@ -122,7 +112,6 @@ class RegisterSpeakerNode(Node):
            tts_source_sample_rate=self.audio_tts_source_sample_rate,
            tts_source_channels=self.audio_tts_source_channels,
            tts_ffmpeg_thread_queue_size=self.audio_tts_ffmpeg_thread_queue_size,
-            reference_signal_buffer=self.reference_signal_buffer,
            logger=self.get_logger()
        )

@@ -170,9 +159,6 @@ class RegisterSpeakerNode(Node):
        self.output_channels = soundcard['channels']
        self.output_volume = soundcard['volume']

-        echo = audio.get('echo_cancellation', {})
-        self.audio_echo_cancellation_enabled = echo['enable']
-        self.audio_echo_cancellation_max_duration_ms = echo.get('max_duration_ms', 200)

        tts_audio = audio.get('tts', {})
        self.audio_tts_source_sample_rate = tts_audio.get('source_sample_rate', 22050)
--- a/robot_speaker/core/robot_speaker_node.py
+++ b/robot_speaker/core/robot_speaker_node.py
--- a/robot_speaker/models/init.py
+++ b/robot_speaker/models/init.py
@@ -3,3 +3,5 @@



+
+
--- a/robot_speaker/models/asr/init.py
+++ b/robot_speaker/models/asr/init.py
@@ -3,3 +3,5 @@



+
+
--- a/robot_speaker/models/asr/base.py
+++ b/robot_speaker/models/asr/base.py
@@ -11,3 +11,5 @@ class ASRClient:



+
+
--- a/robot_speaker/models/llm/init.py
+++ b/robot_speaker/models/llm/init.py
@@ -3,3 +3,5 @@



+
+
--- a/robot_speaker/models/llm/base.py
+++ b/robot_speaker/models/llm/base.py
@@ -13,3 +13,5 @@ class LLMClient:



+
+
--- a/robot_speaker/models/tts/init.py
+++ b/robot_speaker/models/tts/init.py
@@ -3,3 +3,5 @@



+
+
--- a/robot_speaker/models/tts/base.py
+++ b/robot_speaker/models/tts/base.py
@@ -12,3 +12,5 @@ class TTSClient:



+
+
--- a/robot_speaker/perception/init.py
+++ b/robot_speaker/perception/init.py
@@ -3,3 +3,5 @@



+
+
--- a/robot_speaker/perception/audio_pipeline.py
+++ b/robot_speaker/perception/audio_pipeline.py
@@ -1,12 +1,11 @@
 """
-音频处理模块：录音 + VAD + 回声消除
+音频处理模块：录音 + VAD
 """
 import time
 import pyaudio
 import webrtcvad
 import struct
 import queue
-from .echo_cancellation import EchoCanceller, ReferenceSignalBuffer


 class VADDetector:
@@ -35,8 +34,6 @@ class AudioRecorder:
                 on_audio_chunk=None,  # 音频chunk回调（用于声纹录音等，可选）
                 should_put_to_queue=None,  # 检查是否应该将音频放入队列（用于阻止ASR，可选）
                 get_silence_threshold=None,  # 获取动态静音阈值（毫秒，可选）
-                 enable_echo_cancellation: bool = True,  # 是否启用回声消除
-                 reference_signal_buffer: ReferenceSignalBuffer = None,  # 参考信号缓冲区（可选）
                 logger=None):
        self.device_index = device_index
        self.sample_rate = sample_rate
@@ -97,39 +94,6 @@ class AudioRecorder:

        self.format = pyaudio.paInt16
        self._debug_counter = 0
-        
-        # 回声消除相关
-        self.enable_echo_cancellation = enable_echo_cancellation
-        self.reference_signal_buffer = reference_signal_buffer
-        if enable_echo_cancellation:
-            # 初始化回声消除器（在录音线程中同步处理，不是单独线程）
-            # frame_size设置为chunk大小，确保每次处理一个chunk
-            frame_size = chunk
-            try:
-                # 获取参考信号声道数（从reference_signal_buffer获取，因为它是根据播放声道数创建的）
-                ref_channels = self.reference_signal_buffer.channels if self.reference_signal_buffer else 1
-                self.echo_canceller = EchoCanceller(
-                    sample_rate=sample_rate,
-                    frame_size=frame_size,
-                    channels=self.channels,  # 麦克风输入：1声道
-                    ref_channels=ref_channels,  # 参考信号：播放声道数（2声道）
-                    logger=logger
-                )
-                if self.echo_canceller.aec is not None:
-                    if logger:
-                        logger.info(f"回声消除器已启用: sample_rate={sample_rate}, frame_size={frame_size}")
-                else:
-                    if logger:
-                        logger.warning("回声消除器初始化失败，将禁用回声消除功能")
-                    self.enable_echo_cancellation = False
-                    self.echo_canceller = None
-            except Exception as e:
-                if logger:
-                    logger.warning(f"回声消除器初始化失败: {e}，将禁用回声消除功能")
-                self.enable_echo_cancellation = False
-                self.echo_canceller = None
-        else:
-            self.echo_canceller = None
    
    def record_with_vad(self):
        """录音线程：VAD + 能量检测"""
@@ -163,19 +127,7 @@ class AudioRecorder:
            while not self.stop_flag():
                # exception_on_overflow=False, 宁可丢帧，也不阻塞
                data = stream.read(self.chunk, exception_on_overflow=False)
-                
-                # 回声消除处理
                processed_data = data
-                if self.enable_echo_cancellation and self.echo_canceller and self.reference_signal_buffer:
-                    try:
-                        # 获取参考信号（长度与麦克风信号匹配）
-                        ref_signal = self.reference_signal_buffer.get_reference(num_samples=self.chunk)
-                        # 执行回声消除
-                        processed_data = self.echo_canceller.process(data, ref_signal)
-                    except Exception as e:
-                        if self.logger:
-                            self.logger.warning(f"回声消除处理失败: {e}，使用原始音频")
-                        processed_data = data
                
                # 检查是否应该将音频放入队列（用于阻止ASR，例如无声纹文件时需要注册）
                if self.should_put_to_queue():
--- a/robot_speaker/perception/camera_client.py
+++ b/robot_speaker/perception/camera_client.py
@@ -1,131 +0,0 @@
-"""
-相机模块 - RealSense相机封装
-"""
-import numpy as np
-import contextlib
-
-
-class CameraClient:
-    def __init__(self, 
-                 serial_number: str | None,
-                 width: int,
-                 height: int,
-                 fps: int,
-                 format: str,
-                 logger=None):
-        self.serial_number = serial_number
-        self.width = width
-        self.height = height
-        self.fps = fps
-        self.format = format
-        self.logger = logger
-        
-        self.pipeline = None
-        self.config = None
-        self._is_initialized = False
-        self._rs = None
-    
-    def _log(self, level: str, msg: str):
-        if self.logger:
-            getattr(self.logger, level, self.logger.info)(msg)
-        else:
-            print(f"[相机] {msg}")
-    
-    def initialize(self) -> bool:
-        """
-        初始化并启动相机管道
-        """
-        if self._is_initialized:
-            return True
-        
-        try:
-            import pyrealsense2 as rs
-            self._rs = rs
-            
-            self.pipeline = rs.pipeline()
-            self.config = rs.config()
-            
-            if self.serial_number:
-                self.config.enable_device(self.serial_number)
-            
-            self.config.enable_stream(
-                rs.stream.color, 
-                self.width, 
-                self.height, 
-                rs.format.rgb8 if self.format == 'RGB8' else rs.format.bgr8,
-                self.fps
-            )
-            
-            self.pipeline.start(self.config)
-            self._is_initialized = True
-            self._log("info", f"相机已启动并保持运行: {self.width}x{self.height}@{self.fps}fps")
-            return True
-        except Exception as e:
-            self._log("error", f"相机初始化失败: {e}")
-            self.cleanup()
-            return False
-    
-    def cleanup(self):
-        """停止相机管道，释放资源"""
-        if self.pipeline:
-            self.pipeline.stop()
-            self._log("info", "相机已停止")
-        self.pipeline = None
-        self.config = None
-        self._is_initialized = False
-    
-    def capture_rgb(self) -> np.ndarray | None:
-        """
-        从运行中的相机管道捕获一帧RGB图像
-        """
-        if not self._is_initialized:
-            self._log("error", "相机未初始化，无法捕获图像")
-            return None
-        
-        try:
-            frames = self.pipeline.wait_for_frames()
-            color_frame = frames.get_color_frame()
-            
-            return np.asanyarray(color_frame.get_data())
-        except Exception as e:
-            self._log("error", f"捕获图像失败: {e}")
-            return None
-    
-    @contextlib.contextmanager
-    def capture_context(self):
-        """
-        上下文管理器：拍照并自动清理资源
-        """
-        image_data = self.capture_rgb()
-        try:
-            yield image_data
-        finally:
-            if image_data is not None:
-                del image_data
-    
-    def capture_multiple(self, count: int = 1) -> list[np.ndarray]:
-        """
-        捕获多张图像（为未来扩展准备）
-        """
-        images = []
-        for i in range(count):
-            img = self.capture_rgb()
-            if img is not None:
-                images.append(img)
-            else:
-                self._log("warning", f"第{i+1}张图像捕获失败")
-        return images
-    
-    @contextlib.contextmanager
-    def capture_multiple_context(self, count: int = 1):
-        """
-        上下文管理器：捕获多张图像并自动清理资源
-        """
-        images = self.capture_multiple(count)
-        try:
-            yield images
-        finally:
-            for img in images:
-                del img
-            images.clear()
-
--- a/robot_speaker/perception/echo_cancellation.py
+++ b/robot_speaker/perception/echo_cancellation.py
@@ -1,98 +0,0 @@
-import collections
-import numpy as np
-
-
-class ReferenceSignalBuffer:
-    """参考信号缓冲区"""
-    
-    def __init__(self, sample_rate: int, channels: int, max_duration_ms: int | None = None,
-                 buffer_seconds: float = 5.0):
-        self.sample_rate = int(sample_rate)
-        self.channels = int(channels)
-        if max_duration_ms is not None:
-            buffer_seconds = max(float(max_duration_ms) / 1000.0, 0.1)
-        self.max_samples = int(self.sample_rate * buffer_seconds)
-        self._buffer = collections.deque(maxlen=self.max_samples * self.channels)
-    
-    def add_reference(self, data: bytes, source_sample_rate: int, source_channels: int):
-        if source_sample_rate != self.sample_rate or source_channels != self.channels:
-            return
-        samples = np.frombuffer(data, dtype=np.int16)
-        self._buffer.extend(samples.tolist())
-    
-    def get_reference(self, num_samples: int) -> bytes:
-        needed = int(num_samples) * self.channels
-        if needed <= 0:
-            return b""
-        if len(self._buffer) < needed:
-            data = list(self._buffer) + [0] * (needed - len(self._buffer))
-        else:
-            data = list(self._buffer)[-needed:]
-        return np.array(data, dtype=np.int16).tobytes()
-
-
-class EchoCanceller:
-    """回声消除器（基于 aec-audio-processing）"""
-    
-    def __init__(self, sample_rate: int, frame_size: int, channels: int, ref_channels: int, logger=None):
-        self.sample_rate = int(sample_rate)
-        self.frame_size = int(frame_size)
-        self.channels = int(channels)
-        self.ref_channels = int(ref_channels)
-        self.logger = logger
-        self.aec = None
-        self._process_reverse = None
-        self._frame_bytes = int(self.sample_rate / 100) * self.channels * 2  # 10ms, int16
-        self._ref_frame_bytes = int(self.sample_rate / 100) * self.ref_channels * 2
-        try:
-            from aec_audio_processing import AudioProcessor
-            self.aec = AudioProcessor(enable_aec=True, enable_ns=False, enable_agc=False)
-            self.aec.set_stream_format(self.sample_rate, self.channels)
-            if hasattr(self.aec, "set_reverse_stream_format"):
-                self.aec.set_reverse_stream_format(self.sample_rate, self.ref_channels)
-            if hasattr(self.aec, "set_stream_delay"):
-                self.aec.set_stream_delay(0)
-            if hasattr(self.aec, "process_reverse_stream"):
-                self._process_reverse = self.aec.process_reverse_stream
-            elif hasattr(self.aec, "process_reverse"):
-                self._process_reverse = self.aec.process_reverse
-        except Exception:
-            self.aec = None
-    
-    def process(self, mic_data: bytes, ref_data: bytes) -> bytes:
-        if not self.aec:
-            return mic_data
-        if not mic_data:
-            return mic_data
-
-        try:
-            out_chunks = []
-            total_len = len(mic_data)
-            frame_bytes = self._frame_bytes
-            ref_frame_bytes = self._ref_frame_bytes
-
-            frame_count = (total_len + frame_bytes - 1) // frame_bytes
-            for i in range(frame_count):
-                m_start = i * frame_bytes
-                m_end = m_start + frame_bytes
-                mic_frame = mic_data[m_start:m_end]
-                if len(mic_frame) < frame_bytes:
-                    mic_frame = mic_frame + b"\x00" * (frame_bytes - len(mic_frame))
-
-                if ref_data:
-                    r_start = i * ref_frame_bytes
-                    r_end = r_start + ref_frame_bytes
-                    ref_frame = ref_data[r_start:r_end]
-                    if len(ref_frame) < ref_frame_bytes:
-                        ref_frame = ref_frame + b"\x00" * (ref_frame_bytes - len(ref_frame))
-                    if self._process_reverse:
-                        self._process_reverse(ref_frame)
-
-                processed = self.aec.process_stream(mic_frame)
-                out_chunks.append(processed if processed is not None else mic_frame)
-
-            return b"".join(out_chunks)[:total_len]
-        except Exception as e:
-            if self.logger:
-                self.logger.warning(f"回声消除处理失败: {e}，使用原始音频")
-            return mic_data
--- a/robot_speaker/understanding/init.py
+++ b/robot_speaker/understanding/init.py
@@ -3,3 +3,5 @@



+
+
--- a/view_camera.py
+++ b/view_camera.py
@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-"""
-查看相机画面的简单脚本
-按空格键保存当前帧，按'q'键退出
-"""
-import sys
-import cv2
-import numpy as np
-try:
-    import pyrealsense2 as rs
-except ImportError:
-    print("错误: 未安装pyrealsense2，请运行: pip install pyrealsense2")
-    sys.exit(1)
-
-def main():
-    # 配置相机
-    pipeline = rs.pipeline()
-    config = rs.config()
-    
-    # 启用彩色流
-    config.enable_stream(rs.stream.color, 640, 480, rs.format.rgb8, 30)
-    
-    # 启动管道
-    pipeline.start(config)
-    print("相机已启动，按空格键保存图片，按'q'键退出")
-    
-    frame_count = 0
-    try:
-        while True:
-            # 等待一帧
-            frames = pipeline.wait_for_frames()
-            color_frame = frames.get_color_frame()
-            
-            if not color_frame:
-                continue
-            
-            # 转换为numpy数组 (RGB格式)
-            color_image = np.asanyarray(color_frame.get_data())
-            
-            # OpenCV使用BGR格式，需要转换
-            bgr_image = cv2.cvtColor(color_image, cv2.COLOR_RGB2BGR)
-            
-            # 显示图像
-            cv2.imshow('Camera View', bgr_image)
-            
-            # 等待按键
-            key = cv2.waitKey(1) & 0xFF
-            
-            if key == ord('q'):
-                print("退出...")
-                break
-            elif key == ord(' '):  # 空格键保存
-                frame_count += 1
-                filename = f'camera_frame_{frame_count:04d}.jpg'
-                cv2.imwrite(filename, bgr_image)
-                print(f"已保存: {filename}")
-    
-    except KeyboardInterrupt:
-        print("\n中断...")
-    finally:
-        pipeline.stop()
-        cv2.destroyAllWindows()
-        print("相机已关闭")
-
-if __name__ == '__main__':
-    main()
-
-
				`@@ -1,2 +1,4 @@`
				`# Bridge package for connecting LLM outputs to brain execution.`