feat: macOS VoiceProcessingIO for hardware AEC + delay-compensated NLMS
Some checks failed
Build Release Binaries / build-amd64 (push) Failing after 3m33s

- Add --os-aec flag: uses Apple VoiceProcessingIO audio unit for
  hardware echo cancellation (same engine as FaceTime)
- New vpio feature + audio_vpio.rs: combined capture+playback via VPIO
- Improved software AEC: delay-compensated leaky NLMS with Geigel DTD
  (60ms tail, 40ms delay, configurable via --aec-delay)
- Add --aec-delay flag for tuning software AEC delay compensation
- Add dev-fast Cargo profile (opt-level 2 with incremental compilation)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Siavash Sameni
2026-04-06 11:10:10 +04:00
parent 1b00b5e2a4
commit d1c96cd71f
7 changed files with 436 additions and 164 deletions

View File

@@ -0,0 +1,179 @@
//! macOS Voice Processing I/O — uses Apple's VoiceProcessingIO audio unit
//! for hardware-accelerated echo cancellation, AGC, and noise suppression.
//!
//! VoiceProcessingIO is a combined input+output unit that knows what's going
//! to the speaker, so it can cancel the echo from the mic signal internally.
//! This is the same engine FaceTime and other Apple apps use.
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use anyhow::Context;
use coreaudio::audio_unit::audio_format::LinearPcmFlags;
use coreaudio::audio_unit::render_callback::{self, data};
use coreaudio::audio_unit::{AudioUnit, Element, IOType, SampleFormat, Scope, StreamFormat};
use coreaudio::sys;
use tracing::info;
use crate::audio_ring::AudioRing;
/// Number of samples per 20 ms frame at 48 kHz mono (48_000 × 0.020 = 960).
/// Both callbacks below chunk their buffers to this size before hitting the rings.
pub const FRAME_SAMPLES: usize = 960;
/// Combined capture + playback via macOS VoiceProcessingIO.
///
/// The OS handles AEC internally — no manual far-end feeding needed.
pub struct VpioAudio {
    /// Mic samples after VPIO processing, written by the input callback
    /// as s16 (converted from the unit's f32 stream), 48 kHz mono.
    capture_ring: Arc<AudioRing>,
    /// Samples queued for the speaker; drained by the render callback,
    /// which also serves as the AEC far-end reference inside the OS.
    playout_ring: Arc<AudioRing>,
    /// Owned so the audio unit stays alive (and audible) for the lifetime
    /// of this struct; the unit stops when this handle is dropped.
    _audio_unit: AudioUnit,
    /// Gate for the capture callback: once false, no more frames are
    /// written into `capture_ring`. The render callback does not check it.
    running: Arc<AtomicBool>,
}
impl VpioAudio {
    /// Start VoiceProcessingIO with AEC enabled.
    ///
    /// Creates the audio unit, enables the mic bus and the speaker bus,
    /// pins both to 48 kHz mono f32, installs the capture and render
    /// callbacks, then initializes and starts the unit. The configuration
    /// order matters: properties must be set while the unit is
    /// uninitialized.
    ///
    /// # Errors
    ///
    /// Returns an error if any CoreAudio call (unit creation, property
    /// set, callback install, initialize, or start) fails.
    pub fn start() -> Result<Self, anyhow::Error> {
        let capture_ring = Arc::new(AudioRing::new());
        let playout_ring = Arc::new(AudioRing::new());
        let running = Arc::new(AtomicBool::new(true));
        let mut au = AudioUnit::new(IOType::VoiceProcessingIO)
            .context("failed to create VoiceProcessingIO audio unit")?;
        // Must uninitialize before configuring properties.
        au.uninitialize()
            .context("failed to uninitialize VPIO for configuration")?;
        // Enable input (mic) on Element::Input (bus 1).
        let enable: u32 = 1;
        au.set_property(
            sys::kAudioOutputUnitProperty_EnableIO,
            Scope::Input,
            Element::Input,
            Some(&enable),
        )
        .context("failed to enable VPIO input")?;
        // Output (speaker) is enabled by default on VPIO, but be explicit.
        au.set_property(
            sys::kAudioOutputUnitProperty_EnableIO,
            Scope::Output,
            Element::Output,
            Some(&enable),
        )
        .context("failed to enable VPIO output")?;
        // Configure stream format: 48kHz mono f32 non-interleaved.
        // Same format on both app-facing sides so the callbacks below can
        // assume one f32 channel.
        let stream_format = StreamFormat {
            sample_rate: 48_000.0,
            sample_format: SampleFormat::F32,
            flags: LinearPcmFlags::IS_FLOAT
                | LinearPcmFlags::IS_PACKED
                | LinearPcmFlags::IS_NON_INTERLEAVED,
            channels: 1,
        };
        let asbd = stream_format.to_asbd();
        // Input: set format on Output scope of Input element
        // (= the format the AU delivers to us from the mic).
        // The scope/element pairing is deliberately "crossed": we configure
        // the app-facing side of each bus, not the hardware-facing side.
        au.set_property(
            sys::kAudioUnitProperty_StreamFormat,
            Scope::Output,
            Element::Input,
            Some(&asbd),
        )
        .context("failed to set input stream format")?;
        // Output: set format on Input scope of Output element
        // (= the format we feed to the AU for the speaker)
        au.set_property(
            sys::kAudioUnitProperty_StreamFormat,
            Scope::Input,
            Element::Output,
            Some(&asbd),
        )
        .context("failed to set output stream format")?;
        // Set up input callback (mic capture with AEC applied).
        // Runs on a real-time audio thread: only atomics, a stack buffer,
        // and lock-free ring writes — no allocation after the first log.
        let cap_ring = capture_ring.clone();
        let cap_running = running.clone();
        // One-shot flag so the buffer-size diagnostic prints exactly once.
        let logged = Arc::new(AtomicBool::new(false));
        au.set_input_callback(
            move |args: render_callback::Args<data::NonInterleaved<f32>>| {
                if !cap_running.load(Ordering::Relaxed) {
                    return Ok(());
                }
                // Mono format was forced above, so only the first channel
                // is consumed; any extra channels would be ignored here.
                let mut buffers = args.data.channels();
                if let Some(ch) = buffers.next() {
                    if !logged.swap(true, Ordering::Relaxed) {
                        eprintln!("[vpio] capture callback: {} f32 samples", ch.len());
                    }
                    // Convert f32 [-1, 1] → s16 in FRAME_SAMPLES-sized
                    // chunks; clamp first so out-of-range floats can't
                    // wrap when cast.
                    let mut tmp = [0i16; FRAME_SAMPLES];
                    for chunk in ch.chunks(FRAME_SAMPLES) {
                        let n = chunk.len();
                        for i in 0..n {
                            tmp[i] = (chunk[i].clamp(-1.0, 1.0) * i16::MAX as f32) as i16;
                        }
                        cap_ring.write(&tmp[..n]);
                    }
                }
                Ok(())
            },
        )
        .context("failed to set input callback")?;
        // Set up output callback (speaker playback — AEC uses this as reference)
        let play_ring = playout_ring.clone();
        au.set_render_callback(
            move |mut args: render_callback::Args<data::NonInterleaved<f32>>| {
                let mut buffers = args.data.channels_mut();
                if let Some(ch) = buffers.next() {
                    let mut tmp = [0i16; FRAME_SAMPLES];
                    for chunk in ch.chunks_mut(FRAME_SAMPLES) {
                        let n = chunk.len();
                        // `read` may return fewer samples than requested
                        // (ring underrun); the shortfall is zero-filled
                        // below so the speaker gets silence, not garbage.
                        let read = play_ring.read(&mut tmp[..n]);
                        for i in 0..read {
                            chunk[i] = tmp[i] as f32 / i16::MAX as f32;
                        }
                        for i in read..n {
                            chunk[i] = 0.0;
                        }
                    }
                }
                Ok(())
            },
        )
        .context("failed to set render callback")?;
        au.initialize().context("failed to initialize VoiceProcessingIO")?;
        au.start().context("failed to start VoiceProcessingIO")?;
        info!("VoiceProcessingIO started (OS-level AEC enabled)");
        Ok(Self {
            capture_ring,
            playout_ring,
            _audio_unit: au,
            running,
        })
    }
    /// Ring carrying processed mic audio (s16, 48 kHz mono) for the caller
    /// to drain.
    pub fn capture_ring(&self) -> &Arc<AudioRing> {
        &self.capture_ring
    }
    /// Ring the caller fills with decoded far-end audio for playback.
    pub fn playout_ring(&self) -> &Arc<AudioRing> {
        &self.playout_ring
    }
    /// Stop delivering captured frames.
    ///
    /// NOTE(review): this only gates the capture callback — the render
    /// callback ignores `running`, and the audio unit itself keeps running
    /// until `self` (and `_audio_unit`) is dropped.
    pub fn stop(&self) {
        self.running.store(false, Ordering::Relaxed);
    }
}
impl Drop for VpioAudio {
fn drop(&mut self) {
self.stop();
}
}

View File

@@ -42,6 +42,9 @@ pub struct CallConfig {
/// When enabled, only every 50th frame carries a full 12-byte MediaHeader;
/// intermediate frames use a compact 4-byte MiniHeader.
pub mini_frames_enabled: bool,
/// AEC far-end delay compensation in milliseconds (default: 40).
/// Compensates for the round-trip audio latency from playout to mic capture.
pub aec_delay_ms: u32,
/// Enable adaptive jitter buffer (default: true).
///
/// When true, the jitter buffer target depth is automatically adjusted
@@ -63,6 +66,7 @@ impl Default for CallConfig {
noise_suppression: true,
mini_frames_enabled: true,
adaptive_jitter: true,
aec_delay_ms: 40,
}
}
}
@@ -241,7 +245,7 @@ impl CallEncoder {
block_id: 0,
frame_in_block: 0,
timestamp_ms: 0,
aec: EchoCanceller::new(48000, 30), // 30ms echo tail (laptop/phone)
aec: EchoCanceller::with_delay(48000, 60, config.aec_delay_ms),
agc: AutoGainControl::new(),
silence_detector: SilenceDetector::new(
config.silence_threshold_rms,

View File

@@ -53,6 +53,8 @@ struct CliArgs {
no_fec: bool,
no_silence: bool,
direct_playout: bool,
aec_delay_ms: Option<u32>,
os_aec: bool,
token: Option<String>,
_metrics_file: Option<String>,
}
@@ -130,6 +132,8 @@ fn parse_args() -> CliArgs {
let mut no_fec = false;
let mut no_silence = false;
let mut direct_playout = false;
let mut aec_delay_ms = None;
let mut os_aec = false;
let mut token = None;
let mut metrics_file = None;
let mut relay_str = None;
@@ -181,6 +185,16 @@ fn parse_args() -> CliArgs {
"--no-fec" => no_fec = true,
"--no-silence" => no_silence = true,
"--direct-playout" | "--android" => direct_playout = true,
"--os-aec" => os_aec = true,
"--aec-delay" => {
i += 1;
aec_delay_ms = Some(
args.get(i)
.expect("--aec-delay requires milliseconds")
.parse()
.expect("--aec-delay value must be a number"),
);
}
"--alias" => {
i += 1;
alias = Some(args.get(i).expect("--alias requires a name").to_string());
@@ -246,7 +260,9 @@ fn parse_args() -> CliArgs {
eprintln!(" --no-fec Disable forward error correction");
eprintln!(" --no-silence Disable silence suppression");
eprintln!(" --direct-playout Bypass jitter buffer (decode on recv, like Android)");
eprintln!(" --android Alias for --no-denoise --no-aec --no-silence --direct-playout");
eprintln!(" --aec-delay <ms> AEC far-end delay compensation (default: 40ms)");
eprintln!(" --os-aec Use macOS VoiceProcessingIO for hardware AEC (requires --vpio feature)");
eprintln!(" --android Alias for --no-denoise --no-silence --direct-playout");
eprintln!(" --token <token> featherChat bearer token for relay auth");
eprintln!(" --metrics-file <path> Write JSONL telemetry to file (1 line/sec)");
eprintln!(" (48kHz mono s16le, play with ffplay -f s16le -ar 48000 -ch_layout mono file.raw)");
@@ -292,6 +308,8 @@ fn parse_args() -> CliArgs {
no_fec,
no_silence,
direct_playout,
aec_delay_ms,
os_aec,
token,
_metrics_file: metrics_file,
}
@@ -375,11 +393,13 @@ async fn main() -> anyhow::Result<()> {
{
let audio_opts = AudioOpts {
no_denoise: cli.no_denoise || cli.direct_playout,
no_aec: cli.no_aec || cli.direct_playout, // AEC disabled by default — macOS has built-in AEC
no_aec: cli.no_aec,
no_agc: cli.no_agc,
no_fec: cli.no_fec,
no_silence: cli.no_silence || cli.direct_playout,
direct_playout: cli.direct_playout,
aec_delay_ms: cli.aec_delay_ms,
os_aec: cli.os_aec,
};
return run_live(transport, audio_opts).await;
}
@@ -684,6 +704,8 @@ struct AudioOpts {
no_fec: bool,
no_silence: bool,
direct_playout: bool,
aec_delay_ms: Option<u32>,
os_aec: bool,
}
#[cfg(feature = "audio")]
@@ -697,15 +719,32 @@ async fn run_live(
use wzp_client::audio_ring::AudioRing;
use wzp_client::call::JitterTelemetry;
let capture = AudioCapture::start()?;
let playback = AudioPlayback::start()?;
info!("audio I/O started (lock-free ring buffers) — press Ctrl+C to stop");
// Audio I/O: either VPIO (OS-level AEC) or separate CPAL streams.
#[cfg(feature = "vpio")]
let vpio;
let (capture_ring, playout_ring) = if opts.os_aec {
#[cfg(feature = "vpio")]
{
vpio = wzp_client::audio_vpio::VpioAudio::start()?;
(vpio.capture_ring().clone(), vpio.playout_ring().clone())
}
#[cfg(not(feature = "vpio"))]
{
anyhow::bail!("--os-aec requires the 'vpio' feature (build with: cargo build --features audio,vpio)");
}
} else {
let capture = AudioCapture::start()?;
let playback = AudioPlayback::start()?;
let cr = capture.ring().clone();
let pr = playback.ring().clone();
// Keep handles alive (streams stop when dropped)
std::mem::forget(capture);
std::mem::forget(playback);
(cr, pr)
};
info!(os_aec = opts.os_aec, "audio I/O started — press Ctrl+C to stop");
let capture_ring = capture.ring().clone();
let playout_ring = playback.ring().clone();
// Far-end reference ring: recv task writes decoded audio here,
// send task reads it to feed the AEC echo canceller.
// Far-end reference ring (only used when NOT using OS AEC).
let farend_ring = StdArc::new(AudioRing::new());
let running = StdArc::new(AtomicBool::new(true));
@@ -728,6 +767,7 @@ async fn run_live(
let config = CallConfig {
noise_suppression: !opts.no_denoise,
suppression_enabled: !opts.no_silence,
aec_delay_ms: opts.aec_delay_ms.unwrap_or(40),
..CallConfig::default()
};
{
@@ -747,7 +787,7 @@ async fn run_live(
let send_transport = transport.clone();
let send_running = running.clone();
let send_mic_muted = mic_muted.clone();
let no_aec = opts.no_aec;
let no_aec = opts.no_aec || opts.os_aec; // OS AEC replaces software AEC
let no_agc = opts.no_agc;
let _no_fec = opts.no_fec;
let send_farend = farend_ring.clone();
@@ -1075,8 +1115,7 @@ async fn run_live(
}
running.store(false, Ordering::SeqCst);
capture.stop();
playback.stop();
// Audio streams stop when their handles are dropped (via mem::forget above or VPIO drop).
// Give transport 2s to close gracefully, then bail
match tokio::time::timeout(std::time::Duration::from_secs(2), transport.close()).await {

View File

@@ -10,6 +10,8 @@
pub mod audio_io;
#[cfg(feature = "audio")]
pub mod audio_ring;
#[cfg(feature = "vpio")]
pub mod audio_vpio;
pub mod bench;
pub mod call;
pub mod drift_test;