feat: improved AEC, keyboard shortcuts, dedup participants, dev-fast profile

AEC improvements: - Reduce echo tail from 100ms to 30ms (3.3x faster, suited for laptops) - Add double-talk detection: freeze adaptation when near-end speaks - Add residual echo suppression - Disable AEC by default in --android mode (macOS has built-in AEC) CLI features: - Keyboard shortcuts: m=mic mute, s=speaker mute, q=quit (raw terminal mode) - Dedup participants in RoomUpdate display (same fingerprint+alias shown once) - Add dev-fast profile (opt-level 2 with incremental compilation) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 10:15:23 +04:00
parent cfb48df1ef
commit 1b00b5e2a4
6 changed files with 428 additions and 69 deletions
--- a/crates/wzp-codec/src/aec.rs
+++ b/crates/wzp-codec/src/aec.rs
@@ -1,25 +1,40 @@
 //! Acoustic Echo Cancellation using NLMS adaptive filter.
-//! Processes 480-sample (10ms) sub-frames at 48kHz.
+//!
+//! Improvements over naive NLMS:
+//! - Double-talk detection: freezes adaptation when near-end speech dominates,
+//!   preventing the filter from cancelling the local speaker's voice.
+//! - Short default tail (30ms) tuned for laptops/phones where speaker and mic
+//!   are close together.
+//! - Residual suppression: attenuates output when echo estimate is confident.

-/// NLMS (Normalized Least Mean Squares) adaptive filter echo canceller.
-///
-/// Removes acoustic echo by modelling the echo path between the far-end
-/// (speaker) signal and the near-end (microphone) signal, then subtracting
-/// the estimated echo from the near-end in real time.
+/// NLMS (Normalized Least Mean Squares) adaptive filter echo canceller
+/// with double-talk detection.
 pub struct EchoCanceller {
    filter_coeffs: Vec<f32>,
    filter_len: usize,
    far_end_buf: Vec<f32>,
    far_end_pos: usize,
+    /// NLMS step size (adaptation rate).
    mu: f32,
    enabled: bool,
+    /// Running far-end power estimate (for double-talk detection).
+    far_power_avg: f32,
+    /// Running near-end power estimate (for double-talk detection).
+    near_power_avg: f32,
+    /// Smoothing factor for power estimates.
+    power_alpha: f32,
+    /// Double-talk threshold: if near/far power ratio exceeds this,
+    /// freeze adaptation to protect near-end speech.
+    dt_threshold: f32,
+    /// Residual echo suppression factor (0.0 = none, 1.0 = full).
+    suppress: f32,
 }

 impl EchoCanceller {
    /// Create a new echo canceller.
    ///
    /// * `sample_rate` — typically 48000
-    /// * `filter_ms`   — echo-tail length in milliseconds (e.g. 100 for 100 ms)
+    /// * `filter_ms`   — echo-tail length in milliseconds (30ms recommended for laptops)
    pub fn new(sample_rate: u32, filter_ms: u32) -> Self {
        let filter_len = (sample_rate as usize) * (filter_ms as usize) / 1000;
        Self {
@@ -27,8 +42,13 @@ impl EchoCanceller {
            filter_len,
            far_end_buf: vec![0.0f32; filter_len],
            far_end_pos: 0,
-            mu: 0.01,
+            mu: 0.005,
            enabled: true,
+            far_power_avg: 0.0,
+            near_power_avg: 0.0,
+            power_alpha: 0.01,
+            dt_threshold: 4.0,
+            suppress: 0.6,
        }
    }

@@ -45,9 +65,7 @@ impl EchoCanceller {

    /// Process a near-end (microphone) frame, removing the estimated echo.
    ///
-    /// Returns the echo-return-loss enhancement (ERLE) as a ratio: the RMS of
-    /// the original near-end divided by the RMS of the residual.  Values > 1.0
-    /// mean echo was reduced.
+    /// Returns the echo-return-loss enhancement (ERLE) as a ratio.
    pub fn process_frame(&mut self, nearend: &mut [i16]) -> f32 {
        if !self.enabled {
            return 1.0;
@@ -56,34 +74,44 @@ impl EchoCanceller {
        let n = nearend.len();
        let fl = self.filter_len;

+        // Compute frame-level power for double-talk detection.
+        let near_power: f32 = nearend.iter().map(|&s| {
+            let f = s as f32;
+            f * f
+        }).sum::<f32>() / n as f32;
+
+        let far_start = (self.far_end_pos + fl * ((n / fl) + 1) - n) % fl;
+        let far_power: f32 = (0..n).map(|i| {
+            let fe = self.far_end_buf[(far_start + i) % fl];
+            fe * fe
+        }).sum::<f32>() / n as f32;
+
+        // Smooth power estimates
+        self.far_power_avg += self.power_alpha * (far_power - self.far_power_avg);
+        self.near_power_avg += self.power_alpha * (near_power - self.near_power_avg);
+
+        // Double-talk detection: if near-end is much louder than far-end,
+        // the local speaker is active — freeze adaptation.
+        let adapt = if self.far_power_avg < 1.0 {
+            // No far-end signal — nothing to cancel, skip adaptation
+            false
+        } else {
+            let ratio = self.near_power_avg / (self.far_power_avg + 1.0);
+            ratio < self.dt_threshold
+        };
+
        let mut sum_near_sq: f64 = 0.0;
        let mut sum_err_sq: f64 = 0.0;

        for i in 0..n {
            let near_f = nearend[i] as f32;

-            // --- estimate echo as dot(coeffs, farend_window) ---
-            // The far-end window for this sample starts at
-            //   (far_end_pos - 1 - i) mod filter_len   (most recent)
-            // and goes back filter_len samples.
+            // Estimate echo: dot(coeffs, farend_window)
+            let base = (self.far_end_pos + fl * ((n / fl) + 2) + i - n) % fl;
+
            let mut echo_est: f32 = 0.0;
            let mut power: f32 = 0.0;

-            // Position of the most-recent far-end sample for this near-end sample.
-            // far_end_pos points to the *next write* position, so the most-recent
-            // sample written is at far_end_pos - 1.  We have already called
-            // feed_farend for this block, so the relevant samples are the last
-            // filter_len entries ending just before the current write position,
-            // offset by how far we are into this near-end frame.
-            //
-            // For sample i of the near-end frame, the corresponding far-end
-            // "now" is far_end_pos - n + i  (wrapping).
-            // far_end_pos points to next-write, so most recent sample is at
-            // far_end_pos - 1.  For the i-th near-end sample we want the
-            // far-end "now" to be at (far_end_pos - n + i).  We add fl
-            // repeatedly to avoid underflow on the usize subtraction.
-            let base = (self.far_end_pos + fl * ((n / fl) + 2) + i - n) % fl;
-
            for k in 0..fl {
                let fe_idx = (base + fl - k) % fl;
                let fe = self.far_end_buf[fe_idx];
@@ -93,27 +121,40 @@ impl EchoCanceller {

            let error = near_f - echo_est;

-            // --- NLMS coefficient update ---
-            let norm = power + 1.0; // +1 regularisation to avoid div-by-zero
-            let step = self.mu * error / norm;
+            // NLMS coefficient update — only when not in double-talk
+            if adapt && power > 1.0 {
+                let norm = power + 1.0;
+                let step = self.mu * error / norm;

-            for k in 0..fl {
-                let fe_idx = (base + fl - k) % fl;
-                let fe = self.far_end_buf[fe_idx];
-                self.filter_coeffs[k] += step * fe;
+                for k in 0..fl {
+                    let fe_idx = (base + fl - k) % fl;
+                    let fe = self.far_end_buf[fe_idx];
+                    self.filter_coeffs[k] += step * fe;
+                }
            }

-            // Clamp output
-            let out = error.max(-32768.0).min(32767.0);
+            // Residual echo suppression: when far-end is active, attenuate
+            // the residual to reduce perceptible echo.
+            let out = if self.far_power_avg > 100.0 && !adapt {
+                // Double-talk: pass through near-end with minimal suppression
+                error
+            } else if self.far_power_avg > 100.0 {
+                // Far-end active, not double-talk: apply suppression
+                error * (1.0 - self.suppress * (echo_est.abs() / (near_f.abs() + 1.0)).min(1.0))
+            } else {
+                // No far-end: pass through
+                error
+            };
+
+            let out = out.max(-32768.0).min(32767.0);
            nearend[i] = out as i16;

            sum_near_sq += (near_f as f64) * (near_f as f64);
            sum_err_sq += (out as f64) * (out as f64);
        }

-        // ERLE ratio
        if sum_err_sq < 1.0 {
-            return 100.0; // near-perfect cancellation
+            return 100.0;
        }
        (sum_near_sq / sum_err_sq).sqrt() as f32
    }
@@ -129,12 +170,12 @@ impl EchoCanceller {
    }

    /// Reset the adaptive filter to its initial state.
-    ///
-    /// Zeroes out all filter coefficients and the far-end circular buffer.
    pub fn reset(&mut self) {
        self.filter_coeffs.iter_mut().for_each(|c| *c = 0.0);
        self.far_end_buf.iter_mut().for_each(|s| *s = 0.0);
        self.far_end_pos = 0;
+        self.far_power_avg = 0.0;
+        self.near_power_avg = 0.0;
    }
 }

@@ -144,15 +185,15 @@ mod tests {

    #[test]
    fn aec_creates_with_correct_filter_len() {
-        let aec = EchoCanceller::new(48000, 100);
-        assert_eq!(aec.filter_len, 4800);
-        assert_eq!(aec.filter_coeffs.len(), 4800);
-        assert_eq!(aec.far_end_buf.len(), 4800);
+        let aec = EchoCanceller::new(48000, 30);
+        assert_eq!(aec.filter_len, 1440);
+        assert_eq!(aec.filter_coeffs.len(), 1440);
+        assert_eq!(aec.far_end_buf.len(), 1440);
    }

    #[test]
    fn aec_passthrough_when_disabled() {
-        let mut aec = EchoCanceller::new(48000, 100);
+        let mut aec = EchoCanceller::new(48000, 30);
        aec.set_enabled(false);
        assert!(!aec.is_enabled());

@@ -165,7 +206,7 @@ mod tests {

    #[test]
    fn aec_reset_zeroes_state() {
-        let mut aec = EchoCanceller::new(48000, 10); // short for test speed
+        let mut aec = EchoCanceller::new(48000, 10);
        let farend: Vec<i16> = (0..480).map(|i| ((i * 37) % 1000) as i16).collect();
        aec.feed_farend(&farend);

@@ -178,13 +219,9 @@ mod tests {

    #[test]
    fn aec_reduces_echo_of_known_signal() {
-        // Use a small filter for speed.  Feed a known far-end signal, then
-        // present the *same* signal as near-end (perfect echo, no room).
-        // After adaptation the output energy should drop.
-        let filter_ms = 5; // 240 taps at 48 kHz
+        let filter_ms = 5;
        let mut aec = EchoCanceller::new(48000, filter_ms);

-        // Generate a simple repeating pattern.
        let frame_len = 480usize;
        let make_frame = |offset: usize| -> Vec<i16> {
            (0..frame_len)
@@ -195,18 +232,15 @@ mod tests {
                .collect()
        };

-        // Warm up the adaptive filter with several frames.
        let mut last_erle = 1.0f32;
        for frame_idx in 0..40 {
            let farend = make_frame(frame_idx * frame_len);
            aec.feed_farend(&farend);

-            // Near-end = exact copy of far-end (pure echo).
            let mut nearend = farend.clone();
            last_erle = aec.process_frame(&mut nearend);
        }

-        // After 40 frames the ERLE should be meaningfully > 1.
        assert!(
            last_erle > 1.0,
            "expected ERLE > 1.0 after adaptation, got {last_erle}"
@@ -216,13 +250,41 @@ mod tests {
    #[test]
    fn aec_silence_passthrough() {
        let mut aec = EchoCanceller::new(48000, 10);
-        // Feed silence far-end
        aec.feed_farend(&vec![0i16; 480]);
-        // Near-end is silence too
        let mut frame = vec![0i16; 480];
        let erle = aec.process_frame(&mut frame);
        assert!(erle >= 1.0);
-        // Output should still be silence
        assert!(frame.iter().all(|&s| s == 0));
    }
+
+    #[test]
+    fn aec_preserves_nearend_during_doubletalk() {
+        // When only near-end is active (no far-end), output should
+        // closely match input — the AEC should not suppress speech.
+        let mut aec = EchoCanceller::new(48000, 30);
+
+        let frame_len = 960;
+        let nearend_signal: Vec<i16> = (0..frame_len)
+            .map(|i| {
+                let t = i as f64 / 48000.0;
+                (10000.0 * (2.0 * std::f64::consts::PI * 440.0 * t).sin()) as i16
+            })
+            .collect();
+
+        // Feed silence as far-end
+        aec.feed_farend(&vec![0i16; frame_len]);
+
+        let mut frame = nearend_signal.clone();
+        aec.process_frame(&mut frame);
+
+        // Output energy should be close to input energy (not suppressed)
+        let input_energy: f64 = nearend_signal.iter().map(|&s| (s as f64).powi(2)).sum();
+        let output_energy: f64 = frame.iter().map(|&s| (s as f64).powi(2)).sum();
+        let ratio = output_energy / input_energy;
+
+        assert!(
+            ratio > 0.8,
+            "near-end speech should be preserved, energy ratio = {ratio:.3}"
+        );
+    }
 }