From d1c96cd71f4e7e5175602f85d2630f7cfa85f3fd Mon Sep 17 00:00:00 2001 From: Siavash Sameni Date: Mon, 6 Apr 2026 11:10:10 +0400 Subject: [PATCH] feat: macOS VoiceProcessingIO for hardware AEC + delay-compensated NLMS - Add --os-aec flag: uses Apple VoiceProcessingIO audio unit for hardware echo cancellation (same engine as FaceTime) - New vpio feature + audio_vpio.rs: combined capture+playback via VPIO - Improved software AEC: delay-compensated leaky NLMS with Geigel DTD (60ms tail, 40ms delay, configurable via --aec-delay) - Add --aec-delay flag for tuning software AEC delay compensation - Add dev-fast Cargo profile (opt-level 2 with incremental compilation) Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 1 + crates/wzp-client/Cargo.toml | 2 + crates/wzp-client/src/audio_vpio.rs | 179 +++++++++++++++ crates/wzp-client/src/call.rs | 6 +- crates/wzp-client/src/cli.rs | 65 ++++-- crates/wzp-client/src/lib.rs | 2 + crates/wzp-codec/src/aec.rs | 345 ++++++++++++++++------------ 7 files changed, 436 insertions(+), 164 deletions(-) create mode 100644 crates/wzp-client/src/audio_vpio.rs diff --git a/Cargo.lock b/Cargo.lock index 155e071..458a5dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4212,6 +4212,7 @@ dependencies = [ "async-trait", "bytes", "chrono", + "coreaudio-rs", "cpal", "libc", "rustls", diff --git a/crates/wzp-client/Cargo.toml b/crates/wzp-client/Cargo.toml index a75589d..0258107 100644 --- a/crates/wzp-client/Cargo.toml +++ b/crates/wzp-client/Cargo.toml @@ -23,11 +23,13 @@ serde_json = "1" chrono = "0.4" rustls = { version = "0.23", default-features = false, features = ["ring", "std"] } cpal = { version = "0.15", optional = true } +coreaudio-rs = { version = "0.11", optional = true } libc = "0.2" [features] default = [] audio = ["cpal"] +vpio = ["coreaudio-rs"] [[bin]] name = "wzp-client" diff --git a/crates/wzp-client/src/audio_vpio.rs b/crates/wzp-client/src/audio_vpio.rs new file mode 100644 index 0000000..ac1a7ac --- /dev/null +++ 
b/crates/wzp-client/src/audio_vpio.rs
@@ -0,0 +1,179 @@
+//! macOS Voice Processing I/O — uses Apple's VoiceProcessingIO audio unit
+//! for hardware-accelerated echo cancellation, AGC, and noise suppression.
+//!
+//! VoiceProcessingIO is a combined input+output unit that knows what's going
+//! to the speaker, so it can cancel the echo from the mic signal internally.
+//! This is the same engine FaceTime and other Apple apps use.
+
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
+use anyhow::Context;
+use coreaudio::audio_unit::audio_format::LinearPcmFlags;
+use coreaudio::audio_unit::render_callback::{self, data};
+use coreaudio::audio_unit::{AudioUnit, Element, IOType, SampleFormat, Scope, StreamFormat};
+use coreaudio::sys;
+use tracing::info;
+
+use crate::audio_ring::AudioRing;
+
+/// Number of samples per 20 ms frame at 48 kHz mono.
+pub const FRAME_SAMPLES: usize = 960;
+
+/// Combined capture + playback via macOS VoiceProcessingIO.
+///
+/// The OS handles AEC internally — no manual far-end feeding needed.
+pub struct VpioAudio {
+    capture_ring: Arc<AudioRing>,
+    playout_ring: Arc<AudioRing>,
+    _audio_unit: AudioUnit,
+    running: Arc<AtomicBool>,
+}
+
+impl VpioAudio {
+    /// Start VoiceProcessingIO with AEC enabled.
+    pub fn start() -> anyhow::Result<Self> {
+        let capture_ring = Arc::new(AudioRing::new());
+        let playout_ring = Arc::new(AudioRing::new());
+        let running = Arc::new(AtomicBool::new(true));
+
+        let mut au = AudioUnit::new(IOType::VoiceProcessingIO)
+            .context("failed to create VoiceProcessingIO audio unit")?;
+
+        // Must uninitialize before configuring properties.
+        au.uninitialize()
+            .context("failed to uninitialize VPIO for configuration")?;
+
+        // Enable input (mic) on Element::Input (bus 1).
+        let enable: u32 = 1;
+        au.set_property(
+            sys::kAudioOutputUnitProperty_EnableIO,
+            Scope::Input,
+            Element::Input,
+            Some(&enable),
+        )
+        .context("failed to enable VPIO input")?;
+
+        // Output (speaker) is enabled by default on VPIO, but be explicit.
+        au.set_property(
+            sys::kAudioOutputUnitProperty_EnableIO,
+            Scope::Output,
+            Element::Output,
+            Some(&enable),
+        )
+        .context("failed to enable VPIO output")?;
+
+        // Configure stream format: 48kHz mono f32 non-interleaved
+        let stream_format = StreamFormat {
+            sample_rate: 48_000.0,
+            sample_format: SampleFormat::F32,
+            flags: LinearPcmFlags::IS_FLOAT
+                | LinearPcmFlags::IS_PACKED
+                | LinearPcmFlags::IS_NON_INTERLEAVED,
+            channels: 1,
+        };
+
+        let asbd = stream_format.to_asbd();
+
+        // Input: set format on Output scope of Input element
+        // (= the format the AU delivers to us from the mic)
+        au.set_property(
+            sys::kAudioUnitProperty_StreamFormat,
+            Scope::Output,
+            Element::Input,
+            Some(&asbd),
+        )
+        .context("failed to set input stream format")?;
+
+        // Output: set format on Input scope of Output element
+        // (= the format we feed to the AU for the speaker)
+        au.set_property(
+            sys::kAudioUnitProperty_StreamFormat,
+            Scope::Input,
+            Element::Output,
+            Some(&asbd),
+        )
+        .context("failed to set output stream format")?;
+
+        // Set up input callback (mic capture with AEC applied)
+        let cap_ring = capture_ring.clone();
+        let cap_running = running.clone();
+        let logged = Arc::new(AtomicBool::new(false));
+        au.set_input_callback(
+            move |args: render_callback::Args<data::NonInterleaved<f32>>| {
+                if !cap_running.load(Ordering::Relaxed) {
+                    return Ok(());
+                }
+                let mut buffers = args.data.channels();
+                if let Some(ch) = buffers.next() {
+                    if !logged.swap(true, Ordering::Relaxed) {
+                        eprintln!("[vpio] capture callback: {} f32 samples", ch.len());
+                    }
+                    let mut tmp = [0i16; FRAME_SAMPLES];
+                    for chunk in ch.chunks(FRAME_SAMPLES) {
+                        let n = chunk.len();
+                        for i in 0..n {
+                            tmp[i] = (chunk[i].clamp(-1.0, 1.0) * i16::MAX as f32) as i16;
+                        }
+                        cap_ring.write(&tmp[..n]);
+                    }
+                }
+                Ok(())
+            },
+        )
+        .context("failed to set input callback")?;
+
+        // Set up output callback (speaker playback — AEC uses this as reference)
+        let play_ring = playout_ring.clone();
+        au.set_render_callback(
+            move |mut args: render_callback::Args<data::NonInterleaved<f32>>| {
+                let mut buffers = args.data.channels_mut();
+                if let Some(ch) = buffers.next() {
+                    let mut tmp = [0i16; FRAME_SAMPLES];
+                    for chunk in ch.chunks_mut(FRAME_SAMPLES) {
+                        let n = chunk.len();
+                        let read = play_ring.read(&mut tmp[..n]);
+                        for i in 0..read {
+                            chunk[i] = tmp[i] as f32 / i16::MAX as f32;
+                        }
+                        for i in read..n {
+                            chunk[i] = 0.0;
+                        }
+                    }
+                }
+                Ok(())
+            },
+        )
+        .context("failed to set render callback")?;
+
+        au.initialize().context("failed to initialize VoiceProcessingIO")?;
+        au.start().context("failed to start VoiceProcessingIO")?;
+
+        info!("VoiceProcessingIO started (OS-level AEC enabled)");
+
+        Ok(Self {
+            capture_ring,
+            playout_ring,
+            _audio_unit: au,
+            running,
+        })
+    }
+
+    pub fn capture_ring(&self) -> &Arc<AudioRing> {
+        &self.capture_ring
+    }
+
+    pub fn playout_ring(&self) -> &Arc<AudioRing> {
+        &self.playout_ring
+    }
+
+    pub fn stop(&self) {
+        self.running.store(false, Ordering::Relaxed);
+    }
+}
+
+impl Drop for VpioAudio {
+    fn drop(&mut self) {
+        self.stop();
+    }
+}
diff --git a/crates/wzp-client/src/call.rs b/crates/wzp-client/src/call.rs
index 4ebf2ee..6f764e9 100644
--- a/crates/wzp-client/src/call.rs
+++ b/crates/wzp-client/src/call.rs
@@ -42,6 +42,9 @@ pub struct CallConfig {
     /// When enabled, only every 50th frame carries a full 12-byte MediaHeader;
     /// intermediate frames use a compact 4-byte MiniHeader.
     pub mini_frames_enabled: bool,
+    /// AEC far-end delay compensation in milliseconds (default: 40).
+    /// Compensates for the round-trip audio latency from playout to mic capture.
+    pub aec_delay_ms: u32,
     /// Enable adaptive jitter buffer (default: true).
     ///
     /// When true, the jitter buffer target depth is automatically adjusted
@@ -63,6 +66,7 @@ impl Default for CallConfig {
             noise_suppression: true,
             mini_frames_enabled: true,
             adaptive_jitter: true,
+            aec_delay_ms: 40,
         }
     }
 }
@@ -241,7 +245,7 @@ impl CallEncoder {
             block_id: 0,
             frame_in_block: 0,
             timestamp_ms: 0,
-            aec: EchoCanceller::new(48000, 30), // 30ms echo tail (laptop/phone)
+            aec: EchoCanceller::with_delay(48000, 60, config.aec_delay_ms),
             agc: AutoGainControl::new(),
             silence_detector: SilenceDetector::new(
                 config.silence_threshold_rms,
diff --git a/crates/wzp-client/src/cli.rs b/crates/wzp-client/src/cli.rs
index be40cb2..56c9e67 100644
--- a/crates/wzp-client/src/cli.rs
+++ b/crates/wzp-client/src/cli.rs
@@ -53,6 +53,8 @@ struct CliArgs {
     no_fec: bool,
     no_silence: bool,
     direct_playout: bool,
+    aec_delay_ms: Option<u32>,
+    os_aec: bool,
     token: Option<String>,
     _metrics_file: Option<String>,
 }
@@ -130,6 +132,8 @@ fn parse_args() -> CliArgs {
     let mut no_fec = false;
     let mut no_silence = false;
     let mut direct_playout = false;
+    let mut aec_delay_ms = None;
+    let mut os_aec = false;
     let mut token = None;
     let mut metrics_file = None;
     let mut relay_str = None;
@@ -181,6 +185,16 @@
             "--no-fec" => no_fec = true,
             "--no-silence" => no_silence = true,
             "--direct-playout" | "--android" => direct_playout = true,
+            "--os-aec" => os_aec = true,
+            "--aec-delay" => {
+                i += 1;
+                aec_delay_ms = Some(
+                    args.get(i)
+                        .expect("--aec-delay requires milliseconds")
+                        .parse()
+                        .expect("--aec-delay value must be a number"),
+                );
+            }
             "--alias" => {
                 i += 1;
                 alias = Some(args.get(i).expect("--alias requires a name").to_string());
@@ -246,7 +260,9 @@
     eprintln!(" --no-fec Disable forward error correction");
     eprintln!(" --no-silence Disable silence suppression");
     eprintln!(" --direct-playout Bypass jitter buffer (decode on recv, like Android)");
-    eprintln!(" --android Alias for --no-denoise --no-aec --no-silence --direct-playout");
+    eprintln!(" --aec-delay <ms> AEC far-end delay compensation (default: 40ms)");
+    eprintln!(" --os-aec Use macOS VoiceProcessingIO for hardware AEC (requires --vpio feature)");
+    eprintln!(" --android Alias for --no-denoise --no-silence --direct-playout");
     eprintln!(" --token featherChat bearer token for relay auth");
     eprintln!(" --metrics-file <path> Write JSONL telemetry to file (1 line/sec)");
     eprintln!(" (48kHz mono s16le, play with ffplay -f s16le -ar 48000 -ch_layout mono file.raw)");
@@ -292,6 +308,8 @@
         no_fec,
         no_silence,
         direct_playout,
+        aec_delay_ms,
+        os_aec,
         token,
         _metrics_file: metrics_file,
     }
@@ -375,11 +393,13 @@ async fn main() -> anyhow::Result<()> {
     {
         let audio_opts = AudioOpts {
             no_denoise: cli.no_denoise || cli.direct_playout,
-            no_aec: cli.no_aec || cli.direct_playout, // AEC disabled by default — macOS has built-in AEC
+            no_aec: cli.no_aec,
             no_agc: cli.no_agc,
             no_fec: cli.no_fec,
             no_silence: cli.no_silence || cli.direct_playout,
             direct_playout: cli.direct_playout,
+            aec_delay_ms: cli.aec_delay_ms,
+            os_aec: cli.os_aec,
         };
         return run_live(transport, audio_opts).await;
     }
@@ -684,6 +704,8 @@ struct AudioOpts {
     no_fec: bool,
     no_silence: bool,
     direct_playout: bool,
+    aec_delay_ms: Option<u32>,
+    os_aec: bool,
 }
 
 #[cfg(feature = "audio")]
@@ -697,15 +719,32 @@ async fn run_live(
     use wzp_client::audio_ring::AudioRing;
     use wzp_client::call::JitterTelemetry;
 
-    let capture = AudioCapture::start()?;
-    let playback = AudioPlayback::start()?;
-    info!("audio I/O started (lock-free ring buffers) — press Ctrl+C to stop");
+    // Audio I/O: either VPIO (OS-level AEC) or separate CPAL streams.
+ #[cfg(feature = "vpio")] + let vpio; + let (capture_ring, playout_ring) = if opts.os_aec { + #[cfg(feature = "vpio")] + { + vpio = wzp_client::audio_vpio::VpioAudio::start()?; + (vpio.capture_ring().clone(), vpio.playout_ring().clone()) + } + #[cfg(not(feature = "vpio"))] + { + anyhow::bail!("--os-aec requires the 'vpio' feature (build with: cargo build --features audio,vpio)"); + } + } else { + let capture = AudioCapture::start()?; + let playback = AudioPlayback::start()?; + let cr = capture.ring().clone(); + let pr = playback.ring().clone(); + // Keep handles alive (streams stop when dropped) + std::mem::forget(capture); + std::mem::forget(playback); + (cr, pr) + }; + info!(os_aec = opts.os_aec, "audio I/O started — press Ctrl+C to stop"); - let capture_ring = capture.ring().clone(); - let playout_ring = playback.ring().clone(); - - // Far-end reference ring: recv task writes decoded audio here, - // send task reads it to feed the AEC echo canceller. + // Far-end reference ring (only used when NOT using OS AEC). let farend_ring = StdArc::new(AudioRing::new()); let running = StdArc::new(AtomicBool::new(true)); @@ -728,6 +767,7 @@ async fn run_live( let config = CallConfig { noise_suppression: !opts.no_denoise, suppression_enabled: !opts.no_silence, + aec_delay_ms: opts.aec_delay_ms.unwrap_or(40), ..CallConfig::default() }; { @@ -747,7 +787,7 @@ async fn run_live( let send_transport = transport.clone(); let send_running = running.clone(); let send_mic_muted = mic_muted.clone(); - let no_aec = opts.no_aec; + let no_aec = opts.no_aec || opts.os_aec; // OS AEC replaces software AEC let no_agc = opts.no_agc; let _no_fec = opts.no_fec; let send_farend = farend_ring.clone(); @@ -1075,8 +1115,7 @@ async fn run_live( } running.store(false, Ordering::SeqCst); - capture.stop(); - playback.stop(); + // Audio streams stop when their handles are dropped (via mem::forget above or VPIO drop). 
// Give transport 2s to close gracefully, then bail match tokio::time::timeout(std::time::Duration::from_secs(2), transport.close()).await { diff --git a/crates/wzp-client/src/lib.rs b/crates/wzp-client/src/lib.rs index fece291..136d4fc 100644 --- a/crates/wzp-client/src/lib.rs +++ b/crates/wzp-client/src/lib.rs @@ -10,6 +10,8 @@ pub mod audio_io; #[cfg(feature = "audio")] pub mod audio_ring; +#[cfg(feature = "vpio")] +pub mod audio_vpio; pub mod bench; pub mod call; pub mod drift_test; diff --git a/crates/wzp-codec/src/aec.rs b/crates/wzp-codec/src/aec.rs index e3ec35a..32c6eb2 100644 --- a/crates/wzp-codec/src/aec.rs +++ b/crates/wzp-codec/src/aec.rs @@ -1,71 +1,127 @@ -//! Acoustic Echo Cancellation using NLMS adaptive filter. +//! Acoustic Echo Cancellation — delay-compensated leaky NLMS with +//! Geigel double-talk detection. //! -//! Improvements over naive NLMS: -//! - Double-talk detection: freezes adaptation when near-end speech dominates, -//! preventing the filter from cancelling the local speaker's voice. -//! - Short default tail (30ms) tuned for laptops/phones where speaker and mic -//! are close together. -//! - Residual suppression: attenuates output when echo estimate is confident. +//! Key insight: on a laptop, the round-trip audio latency (playout → speaker +//! → air → mic → capture) is 30–50ms. The far-end reference must be delayed +//! by this amount so the adaptive filter models the *echo path*, not the +//! *system delay + echo path*. +//! +//! The leaky coefficient decay prevents the filter from diverging when the +//! echo path changes (e.g. hand near laptop) or when the delay estimate +//! is slightly off. -/// NLMS (Normalized Least Mean Squares) adaptive filter echo canceller -/// with double-talk detection. +/// Delay-compensated leaky NLMS echo canceller with Geigel DTD. 
 pub struct EchoCanceller {
-    filter_coeffs: Vec<f32>,
+    // --- Adaptive filter ---
+    filter: Vec<f32>,
     filter_len: usize,
-    far_end_buf: Vec<f32>,
-    far_end_pos: usize,
-    /// NLMS step size (adaptation rate).
+    /// Circular buffer of far-end reference samples (after delay).
+    far_buf: Vec<f32>,
+    far_pos: usize,
+    /// NLMS step size.
     mu: f32,
+    /// Leakage factor: coefficients are multiplied by (1 - leak) each frame.
+    /// Prevents unbounded growth / divergence. 0.0001 is gentle.
+    leak: f32,
     enabled: bool,
-    /// Running far-end power estimate (for double-talk detection).
-    far_power_avg: f32,
-    /// Running near-end power estimate (for double-talk detection).
-    near_power_avg: f32,
-    /// Smoothing factor for power estimates.
-    power_alpha: f32,
-    /// Double-talk threshold: if near/far power ratio exceeds this,
-    /// freeze adaptation to protect near-end speech.
-    dt_threshold: f32,
-    /// Residual echo suppression factor (0.0 = none, 1.0 = full).
-    suppress: f32,
+
+    // --- Delay buffer ---
+    /// Raw far-end samples before delay compensation.
+    delay_ring: Vec<f32>,
+    delay_write: usize,
+    delay_read: usize,
+    /// Delay in samples (e.g. 1920 = 40ms at 48kHz).
+    delay_samples: usize,
+    /// Capacity of the delay ring.
+    delay_cap: usize,
+
+    // --- Double-talk detection (Geigel) ---
+    /// Peak far-end level over the last filter_len samples.
+    far_peak: f32,
+    /// Geigel threshold: if |near| > threshold * far_peak, assume double-talk.
+    geigel_threshold: f32,
+    /// Holdover counter: keep DTD active for a few frames after detection.
+    dtd_holdover: u32,
+    dtd_hold_frames: u32,
 }
 
 impl EchoCanceller {
     /// Create a new echo canceller.
/// /// * `sample_rate` — typically 48000 - /// * `filter_ms` — echo-tail length in milliseconds (30ms recommended for laptops) + /// * `filter_ms` — echo-tail length in milliseconds (60ms recommended) + /// * `delay_ms` — far-end delay compensation in milliseconds (40ms for laptops) pub fn new(sample_rate: u32, filter_ms: u32) -> Self { + Self::with_delay(sample_rate, filter_ms, 40) + } + + pub fn with_delay(sample_rate: u32, filter_ms: u32, delay_ms: u32) -> Self { let filter_len = (sample_rate as usize) * (filter_ms as usize) / 1000; + let delay_samples = (sample_rate as usize) * (delay_ms as usize) / 1000; + // Delay ring must hold at least delay_samples + one frame (960) of headroom. + let delay_cap = delay_samples + (sample_rate as usize / 10); // +100ms headroom Self { - filter_coeffs: vec![0.0f32; filter_len], + filter: vec![0.0; filter_len], filter_len, - far_end_buf: vec![0.0f32; filter_len], - far_end_pos: 0, - mu: 0.005, + far_buf: vec![0.0; filter_len], + far_pos: 0, + mu: 0.01, + leak: 0.0001, enabled: true, - far_power_avg: 0.0, - near_power_avg: 0.0, - power_alpha: 0.01, - dt_threshold: 4.0, - suppress: 0.6, + + delay_ring: vec![0.0; delay_cap], + delay_write: 0, + delay_read: 0, + delay_samples, + delay_cap, + + far_peak: 0.0, + geigel_threshold: 0.7, + dtd_holdover: 0, + dtd_hold_frames: 5, } } - /// Feed far-end (speaker/playback) samples into the circular buffer. - /// - /// Must be called with the audio that was played out through the speaker - /// *before* the corresponding near-end frame is processed. + /// Feed far-end (speaker) samples. These go into the delay buffer first; + /// once enough samples have accumulated, they are released to the filter's + /// circular buffer with the correct delay offset. pub fn feed_farend(&mut self, farend: &[i16]) { + // Write raw samples into the delay ring. 
for &s in farend { - self.far_end_buf[self.far_end_pos] = s as f32; - self.far_end_pos = (self.far_end_pos + 1) % self.filter_len; + self.delay_ring[self.delay_write % self.delay_cap] = s as f32; + self.delay_write += 1; + } + + // Release delayed samples to the filter's far-end buffer. + while self.delay_available() >= 1 { + let sample = self.delay_ring[self.delay_read % self.delay_cap]; + self.delay_read += 1; + + self.far_buf[self.far_pos] = sample; + self.far_pos = (self.far_pos + 1) % self.filter_len; + + // Track peak far-end level for Geigel DTD. + let abs_s = sample.abs(); + if abs_s > self.far_peak { + self.far_peak = abs_s; + } + } + + // Decay far_peak slowly (avoids stale peak from a loud burst long ago). + self.far_peak *= 0.9995; + } + + /// Number of delayed samples available to release. + fn delay_available(&self) -> usize { + let buffered = self.delay_write - self.delay_read; + if buffered > self.delay_samples { + buffered - self.delay_samples + } else { + 0 } } /// Process a near-end (microphone) frame, removing the estimated echo. - /// - /// Returns the echo-return-loss enhancement (ERLE) as a ratio. pub fn process_frame(&mut self, nearend: &mut [i16]) -> f32 { if !self.enabled { return 1.0; @@ -74,31 +130,33 @@ impl EchoCanceller { let n = nearend.len(); let fl = self.filter_len; - // Compute frame-level power for double-talk detection. - let near_power: f32 = nearend.iter().map(|&s| { - let f = s as f32; - f * f - }).sum::() / n as f32; + // --- Geigel double-talk detection --- + // If any near-end sample exceeds threshold * far_peak, assume + // the local speaker is active and freeze adaptation. 
+ let mut is_doubletalk = self.dtd_holdover > 0; + if !is_doubletalk { + let threshold_level = self.geigel_threshold * self.far_peak; + for &s in nearend.iter() { + if (s as f32).abs() > threshold_level && self.far_peak > 100.0 { + is_doubletalk = true; + self.dtd_holdover = self.dtd_hold_frames; + break; + } + } + } + if self.dtd_holdover > 0 { + self.dtd_holdover -= 1; + } - let far_start = (self.far_end_pos + fl * ((n / fl) + 1) - n) % fl; - let far_power: f32 = (0..n).map(|i| { - let fe = self.far_end_buf[(far_start + i) % fl]; - fe * fe - }).sum::() / n as f32; + // Check if far-end is active (otherwise nothing to cancel). + let far_active = self.far_peak > 100.0; - // Smooth power estimates - self.far_power_avg += self.power_alpha * (far_power - self.far_power_avg); - self.near_power_avg += self.power_alpha * (near_power - self.near_power_avg); - - // Double-talk detection: if near-end is much louder than far-end, - // the local speaker is active — freeze adaptation. - let adapt = if self.far_power_avg < 1.0 { - // No far-end signal — nothing to cancel, skip adaptation - false - } else { - let ratio = self.near_power_avg / (self.far_power_avg + 1.0); - ratio < self.dt_threshold - }; + // --- Leaky coefficient decay --- + // Applied once per frame for efficiency. + let decay = 1.0 - self.leak; + for c in self.filter.iter_mut() { + *c *= decay; + } let mut sum_near_sq: f64 = 0.0; let mut sum_err_sq: f64 = 0.0; @@ -106,76 +164,62 @@ impl EchoCanceller { for i in 0..n { let near_f = nearend[i] as f32; - // Estimate echo: dot(coeffs, farend_window) - let base = (self.far_end_pos + fl * ((n / fl) + 2) + i - n) % fl; + // Position of far-end "now" for this near-end sample. 
+ let base = (self.far_pos + fl * ((n / fl) + 2) + i - n) % fl; + // --- Echo estimation: dot(filter, far_end_window) --- let mut echo_est: f32 = 0.0; let mut power: f32 = 0.0; for k in 0..fl { let fe_idx = (base + fl - k) % fl; - let fe = self.far_end_buf[fe_idx]; - echo_est += self.filter_coeffs[k] * fe; + let fe = self.far_buf[fe_idx]; + echo_est += self.filter[k] * fe; power += fe * fe; } let error = near_f - echo_est; - // NLMS coefficient update — only when not in double-talk - if adapt && power > 1.0 { - let norm = power + 1.0; - let step = self.mu * error / norm; - + // --- NLMS adaptation (only when far-end active & no double-talk) --- + if far_active && !is_doubletalk && power > 10.0 { + let step = self.mu * error / (power + 1.0); for k in 0..fl { let fe_idx = (base + fl - k) % fl; - let fe = self.far_end_buf[fe_idx]; - self.filter_coeffs[k] += step * fe; + self.filter[k] += step * self.far_buf[fe_idx]; } } - // Residual echo suppression: when far-end is active, attenuate - // the residual to reduce perceptible echo. - let out = if self.far_power_avg > 100.0 && !adapt { - // Double-talk: pass through near-end with minimal suppression - error - } else if self.far_power_avg > 100.0 { - // Far-end active, not double-talk: apply suppression - error * (1.0 - self.suppress * (echo_est.abs() / (near_f.abs() + 1.0)).min(1.0)) - } else { - // No far-end: pass through - error - }; - - let out = out.max(-32768.0).min(32767.0); + let out = error.clamp(-32768.0, 32767.0); nearend[i] = out as i16; - sum_near_sq += (near_f as f64) * (near_f as f64); - sum_err_sq += (out as f64) * (out as f64); + sum_near_sq += (near_f as f64).powi(2); + sum_err_sq += (out as f64).powi(2); } if sum_err_sq < 1.0 { - return 100.0; + 100.0 + } else { + (sum_near_sq / sum_err_sq).sqrt() as f32 } - (sum_near_sq / sum_err_sq).sqrt() as f32 } - /// Enable or disable echo cancellation. 
     pub fn set_enabled(&mut self, enabled: bool) {
         self.enabled = enabled;
     }
 
-    /// Returns whether echo cancellation is currently enabled.
     pub fn is_enabled(&self) -> bool {
         self.enabled
     }
 
-    /// Reset the adaptive filter to its initial state.
     pub fn reset(&mut self) {
-        self.filter_coeffs.iter_mut().for_each(|c| *c = 0.0);
-        self.far_end_buf.iter_mut().for_each(|s| *s = 0.0);
-        self.far_end_pos = 0;
-        self.far_power_avg = 0.0;
-        self.near_power_avg = 0.0;
+        self.filter.iter_mut().for_each(|c| *c = 0.0);
+        self.far_buf.iter_mut().for_each(|s| *s = 0.0);
+        self.far_pos = 0;
+        self.far_peak = 0.0;
+        self.delay_ring.iter_mut().for_each(|s| *s = 0.0);
+        self.delay_write = 0;
+        self.delay_read = 0;
+        self.dtd_holdover = 0;
     }
 }
 
@@ -184,46 +228,40 @@ mod tests {
     use super::*;
 
     #[test]
-    fn aec_creates_with_correct_filter_len() {
-        let aec = EchoCanceller::new(48000, 30);
-        assert_eq!(aec.filter_len, 1440);
-        assert_eq!(aec.filter_coeffs.len(), 1440);
-        assert_eq!(aec.far_end_buf.len(), 1440);
+    fn creates_with_correct_sizes() {
+        let aec = EchoCanceller::with_delay(48000, 60, 40);
+        assert_eq!(aec.filter_len, 2880); // 60ms @ 48kHz
+        assert_eq!(aec.delay_samples, 1920); // 40ms @ 48kHz
     }
 
     #[test]
-    fn aec_passthrough_when_disabled() {
-        let mut aec = EchoCanceller::new(48000, 30);
+    fn passthrough_when_disabled() {
+        let mut aec = EchoCanceller::new(48000, 60);
         aec.set_enabled(false);
-        assert!(!aec.is_enabled());
-        let original: Vec<i16> = (0..480).map(|i| (i * 10) as i16).collect();
+        let original: Vec<i16> = (0..960).map(|i| (i * 10) as i16).collect();
         let mut frame = original.clone();
-        let erle = aec.process_frame(&mut frame);
-        assert_eq!(erle, 1.0);
+        aec.process_frame(&mut frame);
         assert_eq!(frame, original);
     }
 
     #[test]
-    fn aec_reset_zeroes_state() {
-        let mut aec = EchoCanceller::new(48000, 10);
-        let farend: Vec<i16> = (0..480).map(|i| ((i * 37) % 1000) as i16).collect();
-        aec.feed_farend(&farend);
-
-        aec.reset();
-
-        assert!(aec.filter_coeffs.iter().all(|&c| c == 0.0));
-        assert!(aec.far_end_buf.iter().all(|&s| s == 0.0));
-        assert_eq!(aec.far_end_pos, 0);
+    fn silence_passthrough() {
+        let mut aec = EchoCanceller::with_delay(48000, 30, 0);
+        aec.feed_farend(&vec![0i16; 960]);
+        let mut frame = vec![0i16; 960];
+        aec.process_frame(&mut frame);
+        assert!(frame.iter().all(|&s| s == 0));
     }
 
     #[test]
-    fn aec_reduces_echo_of_known_signal() {
-        let filter_ms = 5;
-        let mut aec = EchoCanceller::new(48000, filter_ms);
+    fn reduces_echo_with_no_delay() {
+        // Simulate: far-end plays, echo arrives at mic attenuated by ~50%
+        // (realistic — speaker to mic on laptop loses volume).
+        let mut aec = EchoCanceller::with_delay(48000, 10, 0);
 
-        let frame_len = 480usize;
-        let make_frame = |offset: usize| -> Vec<i16> {
+        let frame_len = 480;
+        let make_tone = |offset: usize| -> Vec<i16> {
             (0..frame_len)
                 .map(|i| {
                     let t = (offset + i) as f64 / 48000.0;
@@ -233,11 +271,12 @@
         };
 
         let mut last_erle = 1.0f32;
-        for frame_idx in 0..40 {
-            let farend = make_frame(frame_idx * frame_len);
+        for frame_idx in 0..100 {
+            let farend = make_tone(frame_idx * frame_len);
             aec.feed_farend(&farend);
 
-            let mut nearend = farend.clone();
+            // Near-end = attenuated copy of far-end (echo at ~50% volume).
+            let mut nearend: Vec<i16> = farend.iter().map(|&s| s / 2).collect();
             last_erle = aec.process_frame(&mut nearend);
         }
 
@@ -248,37 +287,24 @@
     #[test]
-    fn aec_silence_passthrough() {
-        let mut aec = EchoCanceller::new(48000, 10);
-        aec.feed_farend(&vec![0i16; 480]);
-        let mut frame = vec![0i16; 480];
-        let erle = aec.process_frame(&mut frame);
-        assert!(erle >= 1.0);
-        assert!(frame.iter().all(|&s| s == 0));
-    }
-
-    #[test]
-    fn aec_preserves_nearend_during_doubletalk() {
-        // When only near-end is active (no far-end), output should
-        // closely match input — the AEC should not suppress speech.
-        let mut aec = EchoCanceller::new(48000, 30);
+    fn preserves_nearend_during_doubletalk() {
+        let mut aec = EchoCanceller::with_delay(48000, 30, 0);
         let frame_len = 960;
-        let nearend_signal: Vec<i16> = (0..frame_len)
+        let nearend: Vec<i16> = (0..frame_len)
             .map(|i| {
                 let t = i as f64 / 48000.0;
                 (10000.0 * (2.0 * std::f64::consts::PI * 440.0 * t).sin()) as i16
             })
             .collect();
 
-        // Feed silence as far-end
+        // Feed silence as far-end (no echo source).
         aec.feed_farend(&vec![0i16; frame_len]);
 
-        let mut frame = nearend_signal.clone();
+        let mut frame = nearend.clone();
         aec.process_frame(&mut frame);
 
-        // Output energy should be close to input energy (not suppressed)
-        let input_energy: f64 = nearend_signal.iter().map(|&s| (s as f64).powi(2)).sum();
+        let input_energy: f64 = nearend.iter().map(|&s| (s as f64).powi(2)).sum();
         let output_energy: f64 = frame.iter().map(|&s| (s as f64).powi(2)).sum();
 
         let ratio = output_energy / input_energy;
@@ -287,4 +313,23 @@
             "near-end speech should be preserved, energy ratio = {ratio:.3}"
         );
     }
+
+    #[test]
+    fn delay_buffer_holds_samples() {
+        let mut aec = EchoCanceller::with_delay(48000, 10, 20);
+        // 20ms delay = 960 samples @ 48kHz.
+        // After feeding, feed_farend auto-drains available samples to far_buf.
+        // So delay_available() is always 0 after feed_farend returns.
+        // Instead, verify far_pos advances only after the delay is filled.
+
+        // Feed 960 samples (= delay amount). No samples released yet.
+        aec.feed_farend(&vec![1i16; 960]);
+        // far_buf should still be all zeros (nothing released).
+        assert!(aec.far_buf.iter().all(|&s| s == 0.0), "nothing should be released yet");
+
+        // Feed 480 more. 480 should be released to far_buf.
+        aec.feed_farend(&vec![2i16; 480]);
+        let non_zero = aec.far_buf.iter().filter(|&&s| s != 0.0).count();
+        assert!(non_zero > 0, "samples should have been released to far_buf");
+    }
 }