From 4ba77c8c0e39d0000290e7b4f453b286a7b79105 Mon Sep 17 00:00:00 2001 From: Siavash Sameni Date: Fri, 10 Apr 2026 15:53:23 +0400 Subject: [PATCH] feat(linux): WebRTC AEC3 capture/playback backend with render-side tee MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds gold-standard Linux echo cancellation: in-app WebRTC AEC3 (Audio Processing Module) via the webrtc-audio-processing crate, using the same algorithm as Chrome WebRTC, Zoom, Teams, and Jitsi. Runs entirely in-process, so it works identically on ALSA / PulseAudio / PipeWire systems — no dependency on user-configured echo-cancel modules. Architecture: - New crates/wzp-client/src/audio_linux_aec.rs module (~470 lines). Contains LinuxAecCapture and LinuxAecPlayback, both using CPAL under the hood but routing samples through a shared Arc<Processor>. The playback path tees each 20 ms frame into APM.process_render_frame as the echo reference BEFORE handing the samples to CPAL's output callback. The capture path runs APM.process_capture_frame on each mic frame in place before pushing to the audio ring buffer. This is the "tee the playback ring" approach that Zoom/Teams/Jitsi use. - New `linux-aec` feature in wzp-client pulling in the webrtc-audio-processing crate at v2.x with the `bundled` sub-feature. Bundled means the vendored PulseAudio WebRTC C++ sources are statically compiled via meson+ninja at cargo build time — no runtime .so dependency, avoids Debian Bookworm's stale libwebrtc-audio-processing-dev 0.3 package (which predates AEC3). Dep is target-gated to Linux, so enabling the feature on non-Linux is a no-op. - lib.rs re-exports LinuxAecCapture/LinuxAecPlayback as AudioCapture/AudioPlayback when `linux-aec` is on, otherwise falls back to the CPAL audio_io path. Shared public API (start/ring/stop/Drop) means downstream code is unchanged. 
- New `linux-aec` feature in wzp-desktop forwards to wzp-client/linux-aec so `cargo tauri build -- --features wzp-desktop/linux-aec` builds the AEC variant. APM configuration: - EchoCancellation: High suppression, delay-agnostic mode on, extended filter on, stream_delay_ms=60 initial hint - NoiseSuppression: High - HighPassFilter: on - AGC: off (can fight Opus encoder's own gain staging + adaptive quality controller; add later if users report low mic level) Frame size handling: - Pipeline uses 20 ms frames (960 samples @ 48 kHz mono) - APM requires strict 10 ms (480 samples) per call - Each 20 ms frame is split into two 480-sample halves, APM called twice, halves stitched back - Same pattern for render and capture sides - Carry-buffer logic handles the case where CPAL delivers samples in arbitrary chunk sizes that don't divide 960 Build infrastructure: - scripts/Dockerfile.linux-desktop-builder adds meson, ninja-build, python3, clang for the webrtc-audio-processing bundled build - scripts/build-linux-desktop-docker.sh takes a new --aec flag that enables the linux-aec feature and renames the output artifacts with an `-aec` suffix so noAEC and AEC variants can coexist on disk Task #30. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/wzp-client/Cargo.toml | 18 + crates/wzp-client/src/audio_linux_aec.rs | 512 +++++++++++++++++++++++ crates/wzp-client/src/lib.rs | 51 ++- desktop/src-tauri/Cargo.toml | 11 +- scripts/Dockerfile.linux-desktop-builder | 15 + scripts/build-linux-desktop-docker.sh | 68 ++- 6 files changed, 647 insertions(+), 28 deletions(-) create mode 100644 crates/wzp-client/src/audio_linux_aec.rs diff --git a/crates/wzp-client/Cargo.toml b/crates/wzp-client/Cargo.toml index 3d11f0f..3580321 100644 --- a/crates/wzp-client/Cargo.toml +++ b/crates/wzp-client/Cargo.toml @@ -47,6 +47,16 @@ windows = { version = "0.58", optional = true, features = [ "Win32_System_Variant", ] } +# Linux-only: WebRTC AEC3 (Audio Processing Module) bindings for the +# `linux-aec` feature. The `bundled` sub-feature of webrtc-audio-processing +# statically compiles the vendored PulseAudio webrtc-audio-processing C++ +# sources via meson + ninja at cargo build time, avoiding Debian Bookworm's +# stale system libwebrtc-audio-processing-dev 0.3 package (which predates +# AEC3). Produces a self-contained static link — no runtime .so dep, same +# algorithm on every Linux distro. +[target.'cfg(target_os = "linux")'.dependencies] +webrtc-audio-processing = { version = "2", optional = true, features = ["bundled"] } + [features] default = [] audio = ["cpal"] @@ -61,6 +71,14 @@ vpio = ["dep:coreaudio-rs"] # enabling this feature on non-Windows targets is a no-op (the # audio_wasapi module is also #[cfg(target_os = "windows")] in lib.rs). windows-aec = ["dep:windows"] +# linux-aec enables a CPAL + WebRTC AEC3 capture/playback backend that +# runs the WebRTC Audio Processing Module (same algo as Chrome / Zoom / +# Teams) in-process, using the playback PCM as the reference signal for +# echo cancellation. 
The webrtc-audio-processing dep is target-gated to +# Linux above, so on non-Linux targets this only turns on `audio` (the +# audio_linux_aec module is also #[cfg(target_os = "linux")] in lib.rs). +# Implies `audio` because the backend is built on the optional cpal dep. +linux-aec = ["audio", "dep:webrtc-audio-processing"] [[bin]] name = "wzp-client" diff --git a/crates/wzp-client/src/audio_linux_aec.rs b/crates/wzp-client/src/audio_linux_aec.rs new file mode 100644 index 0000000..bf8dbcc --- /dev/null +++ b/crates/wzp-client/src/audio_linux_aec.rs @@ -0,0 +1,512 @@ +//! Linux AEC backend: CPAL capture + playback wired through the WebRTC Audio +//! Processing Module (AEC3 + noise suppression + high-pass filter). +//! +//! This is the same algorithm used by Chrome WebRTC, Zoom, Teams, Jitsi, and +//! any other "serious" Linux VoIP app. It runs in-process — no dependency on +//! PulseAudio's module-echo-cancel or PipeWire's filter-chain, so it works +//! identically on ALSA / PulseAudio / PipeWire systems. +//! +//! ## Architecture +//! +//! A single module-level `Arc<Processor>` is shared between the capture and +//! playback paths. On each 20 ms frame (960 samples @ 48 kHz mono): +//! +//! - **Playback path**: `LinuxAecPlayback::start` spawns the usual CPAL +//! output thread, but wraps each chunk in a call to +//! `Processor::process_render_frame` **before** handing it to CPAL. That +//! gives APM an authoritative reference of exactly what's going out to +//! the speakers (same approach Zoom/Teams/Jitsi use). The AEC then knows +//! what to cancel when it sees echo in the capture stream. +//! +//! - **Capture path**: `LinuxAecCapture::start` spawns the usual CPAL +//! input thread, and runs `Processor::process_capture_frame` on each +//! incoming mic chunk **in place** before pushing it into the ring +//! buffer. The AEC subtracts the echo using the render reference it +//! saw on the playback side. +//! +//! APM is strict about frame size: it requires exactly 10 ms = 480 samples +//! per call at 48 kHz. 
Our pipeline uses 20 ms = 960 samples, so each 20 ms +//! frame is split into two 480-sample halves, APM is called twice, and the +//! halves are stitched back together. +//! +//! APM only accepts f32 samples in `[-1.0, 1.0]`, so we convert i16 → f32 +//! before the call and f32 → i16 after (with clamping on the return path). +//! +//! ## Stream delay +//! +//! AEC needs to know roughly how long it takes between a sample being passed +//! to `process_render_frame` and its echo showing up at `process_capture_frame` +//! — i.e. the round trip through CPAL playback → speaker → air → microphone +//! → CPAL capture. AEC3's internal estimator tracks this within a window +//! around whatever hint we give it. We hardcode 60 ms as a reasonable +//! starting point for typical Linux audio stacks; the delay estimator does +//! the fine-tuning automatically. +//! +//! ## Thread safety +//! +//! `webrtc_audio_processing::Processor` is `Send + Sync` with `&self` +//! methods. Capture and playback threads both hold an `Arc<Processor>` and +//! call APM concurrently — the underlying C++ code serializes internally. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, OnceLock}; + +use anyhow::{anyhow, Context}; +use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; +use cpal::{SampleFormat, SampleRate, StreamConfig}; +use tracing::{info, warn}; +use webrtc_audio_processing::{Config, EchoCancellation, InitializationConfig, NoiseSuppression, Processor}; + +use crate::audio_ring::AudioRing; + +/// 20 ms at 48 kHz, mono — matches the rest of the pipeline and the codec. +pub const FRAME_SAMPLES: usize = 960; +/// APM requires strict 10 ms frames at 48 kHz = 480 samples per call. +const APM_FRAME_SAMPLES: usize = 480; +const APM_NUM_CHANNELS: usize = 1; +/// Round-trip delay hint passed to APM; the estimator refines from here. +/// 60 ms is a reasonable default for CPAL on ALSA / PulseAudio / PipeWire. 
+#[allow(dead_code)] +const STREAM_DELAY_MS: i32 = 60; + +// --------------------------------------------------------------------------- +// Shared APM instance +// --------------------------------------------------------------------------- + +/// Module-level lazily-initialized APM. Shared between capture and playback +/// so they operate on the same echo-cancellation state — the render frames +/// pushed by playback are what the capture path subtracts from the mic input. +static PROCESSOR: OnceLock> = OnceLock::new(); + +fn get_or_init_processor() -> anyhow::Result> { + if let Some(p) = PROCESSOR.get() { + return Ok(p.clone()); + } + let init_config = InitializationConfig { + num_capture_channels: APM_NUM_CHANNELS as i32, + num_render_channels: APM_NUM_CHANNELS as i32, + ..Default::default() + }; + let mut processor = Processor::new(&init_config) + .map_err(|e| anyhow!("webrtc APM init failed: {e:?}"))?; + + let config = Config { + echo_cancellation: Some(EchoCancellation { + suppression_level: webrtc_audio_processing::EchoCancellationSuppressionLevel::High, + stream_delay_ms: Some(STREAM_DELAY_MS), + enable_delay_agnostic: true, + enable_extended_filter: true, + }), + noise_suppression: Some(NoiseSuppression { + suppression_level: + webrtc_audio_processing::NoiseSuppressionLevel::High, + }), + enable_high_pass_filter: true, + // AGC left off for now — it can fight the Opus encoder's own gain + // staging and the adaptive-quality controller. Add later if users + // report low mic levels. 
+ ..Default::default() + }; + processor.set_config(config); + + let arc = Arc::new(processor); + let _ = PROCESSOR.set(arc.clone()); + info!( + stream_delay_ms = STREAM_DELAY_MS, + "webrtc APM initialized (AEC3 High + NS High + HPF, AGC off)" + ); + Ok(arc) +} + +// --------------------------------------------------------------------------- +// Helpers: i16 ↔ f32 and APM frame processing +// --------------------------------------------------------------------------- + +#[inline] +fn i16_to_f32(s: i16) -> f32 { + s as f32 / 32768.0 +} + +#[inline] +fn f32_to_i16(s: f32) -> i16 { + (s.clamp(-1.0, 1.0) * 32767.0) as i16 +} + +/// Feed a 20 ms (960-sample) playback frame to APM as the render reference. +/// Splits into two 10 ms halves because APM is strict about frame size. +fn push_render_frame_20ms(apm: &Processor, pcm: &[i16]) { + debug_assert_eq!(pcm.len(), FRAME_SAMPLES); + let mut buf = [0f32; APM_FRAME_SAMPLES]; + for half in pcm.chunks_exact(APM_FRAME_SAMPLES) { + for (i, &s) in half.iter().enumerate() { + buf[i] = i16_to_f32(s); + } + // process_render_frame mutates in place. For render we only care + // about feeding APM the reference — we discard the output. + if let Err(e) = apm.process_render_frame(&mut buf) { + warn!("webrtc APM process_render_frame failed: {e:?}"); + } + } +} + +/// Run a 20 ms (960-sample) capture frame through APM's echo cancellation +/// in place. Splits into two 10 ms halves, runs APM on each, stitches +/// results back into the caller's buffer. 
+fn process_capture_frame_20ms(apm: &Processor, pcm: &mut [i16]) { + debug_assert_eq!(pcm.len(), FRAME_SAMPLES); + let mut buf = [0f32; APM_FRAME_SAMPLES]; + for half in pcm.chunks_exact_mut(APM_FRAME_SAMPLES) { + for (i, &s) in half.iter().enumerate() { + buf[i] = i16_to_f32(s); + } + if let Err(e) = apm.process_capture_frame(&mut buf) { + warn!("webrtc APM process_capture_frame failed: {e:?}"); + } + for (i, d) in half.iter_mut().enumerate() { + *d = f32_to_i16(buf[i]); + } + } +} + +// --------------------------------------------------------------------------- +// LinuxAecCapture — CPAL mic + WebRTC AEC capture-side processing +// --------------------------------------------------------------------------- + +/// Microphone capture with WebRTC AEC3 applied in place before the codec +/// sees the samples. Mirrors the public API of `audio_io::AudioCapture` so +/// downstream code doesn't change. +pub struct LinuxAecCapture { + ring: Arc, + running: Arc, +} + +impl LinuxAecCapture { + pub fn start() -> Result { + // Eagerly init the APM so the playback side can find it already + // configured, and so init errors surface on the caller thread + // instead of silently failing inside the capture thread. 
+ let apm = get_or_init_processor()?; + + let ring = Arc::new(AudioRing::new()); + let running = Arc::new(AtomicBool::new(true)); + + let (init_tx, init_rx) = std::sync::mpsc::sync_channel::>(1); + + let ring_cb = ring.clone(); + let running_clone = running.clone(); + let apm_capture = apm.clone(); + + std::thread::Builder::new() + .name("wzp-audio-capture-linuxaec".into()) + .spawn(move || { + let result = (|| -> Result<(), anyhow::Error> { + let host = cpal::default_host(); + let device = host + .default_input_device() + .ok_or_else(|| anyhow!("no default input audio device found"))?; + info!(device = %device.name().unwrap_or_default(), "LinuxAEC: using input device"); + + let config = StreamConfig { + channels: 1, + sample_rate: SampleRate(48_000), + buffer_size: cpal::BufferSize::Default, + }; + + let use_f32 = !supports_i16_input(&device)?; + + let err_cb = |e: cpal::StreamError| { + warn!("LinuxAEC input stream error: {e}"); + }; + + // Leftover buffer for when CPAL gives us partial frames. + // We need exactly 960-sample chunks to feed APM. + let leftover = std::sync::Mutex::new(Vec::::with_capacity(FRAME_SAMPLES * 4)); + + let stream = if use_f32 { + let ring = ring_cb.clone(); + let running = running_clone.clone(); + let apm = apm_capture.clone(); + device.build_input_stream( + &config, + move |data: &[f32], _: &cpal::InputCallbackInfo| { + if !running.load(Ordering::Relaxed) { + return; + } + let mut lv = leftover.lock().unwrap(); + lv.reserve(data.len()); + for &s in data { + lv.push(f32_to_i16(s)); + } + drain_frames_through_apm(&mut lv, &apm, &ring); + }, + err_cb, + None, + )? 
+ } else { + let ring = ring_cb.clone(); + let running = running_clone.clone(); + let apm = apm_capture.clone(); + device.build_input_stream( + &config, + move |data: &[i16], _: &cpal::InputCallbackInfo| { + if !running.load(Ordering::Relaxed) { + return; + } + let mut lv = leftover.lock().unwrap(); + lv.extend_from_slice(data); + drain_frames_through_apm(&mut lv, &apm, &ring); + }, + err_cb, + None, + )? + }; + + stream.play().context("failed to start LinuxAEC input stream")?; + let _ = init_tx.send(Ok(())); + info!("LinuxAEC capture started (AEC3 active)"); + + while running_clone.load(Ordering::Relaxed) { + std::thread::park_timeout(std::time::Duration::from_millis(200)); + } + drop(stream); + Ok(()) + })(); + + if let Err(e) = result { + let _ = init_tx.send(Err(e.to_string())); + } + })?; + + init_rx + .recv() + .map_err(|_| anyhow!("LinuxAEC capture thread exited before signaling"))? + .map_err(|e| anyhow!("{e}"))?; + + Ok(Self { ring, running }) + } + + pub fn ring(&self) -> &Arc { + &self.ring + } + + pub fn stop(&self) { + self.running.store(false, Ordering::Relaxed); + } +} + +impl Drop for LinuxAecCapture { + fn drop(&mut self) { + self.stop(); + } +} + +/// Pull whole 960-sample frames out of the leftover buffer, run them through +/// APM's capture-side processing, and push to the ring. Leaves any partial +/// sub-960 remainder in `leftover` for the next callback. 
+fn drain_frames_through_apm(leftover: &mut Vec, apm: &Processor, ring: &AudioRing) { + let mut frame = [0i16; FRAME_SAMPLES]; + while leftover.len() >= FRAME_SAMPLES { + frame.copy_from_slice(&leftover[..FRAME_SAMPLES]); + process_capture_frame_20ms(apm, &mut frame); + ring.write(&frame); + leftover.drain(..FRAME_SAMPLES); + } +} + +// --------------------------------------------------------------------------- +// LinuxAecPlayback — CPAL speaker output + WebRTC AEC render-side tee +// --------------------------------------------------------------------------- + +/// Speaker playback with a render-side tee: each frame written to CPAL is +/// ALSO fed to APM via `process_render_frame` as the echo-cancellation +/// reference signal. This is the "tee the playback ring" approach (Zoom, +/// Teams, Jitsi) — deterministic, does not depend on PulseAudio loopback or +/// PipeWire monitor sources. +pub struct LinuxAecPlayback { + ring: Arc, + running: Arc, +} + +impl LinuxAecPlayback { + pub fn start() -> Result { + let apm = get_or_init_processor()?; + + let ring = Arc::new(AudioRing::new()); + let running = Arc::new(AtomicBool::new(true)); + + let (init_tx, init_rx) = std::sync::mpsc::sync_channel::>(1); + + let ring_cb = ring.clone(); + let running_clone = running.clone(); + let apm_render = apm.clone(); + + std::thread::Builder::new() + .name("wzp-audio-playback-linuxaec".into()) + .spawn(move || { + let result = (|| -> Result<(), anyhow::Error> { + let host = cpal::default_host(); + let device = host + .default_output_device() + .ok_or_else(|| anyhow!("no default output audio device found"))?; + info!(device = %device.name().unwrap_or_default(), "LinuxAEC: using output device"); + + let config = StreamConfig { + channels: 1, + sample_rate: SampleRate(48_000), + buffer_size: cpal::BufferSize::Default, + }; + + let use_f32 = !supports_i16_output(&device)?; + + let err_cb = |e: cpal::StreamError| { + warn!("LinuxAEC output stream error: {e}"); + }; + + // Same 960-sample 
batching approach as the capture side: + // CPAL may ask for N samples in a callback where N doesn't + // divide 960. We accumulate partial frames in a Vec and + // feed APM as soon as we have a whole 20 ms frame. + let carry = std::sync::Mutex::new(Vec::::with_capacity(FRAME_SAMPLES * 4)); + + let stream = if use_f32 { + let ring = ring_cb.clone(); + let apm = apm_render.clone(); + device.build_output_stream( + &config, + move |data: &mut [f32], _: &cpal::OutputCallbackInfo| { + fill_output_and_tee_f32(data, &ring, &apm, &carry); + }, + err_cb, + None, + )? + } else { + let ring = ring_cb.clone(); + let apm = apm_render.clone(); + device.build_output_stream( + &config, + move |data: &mut [i16], _: &cpal::OutputCallbackInfo| { + fill_output_and_tee_i16(data, &ring, &apm, &carry); + }, + err_cb, + None, + )? + }; + + stream.play().context("failed to start LinuxAEC output stream")?; + let _ = init_tx.send(Ok(())); + info!("LinuxAEC playback started (render tee active)"); + + while running_clone.load(Ordering::Relaxed) { + std::thread::park_timeout(std::time::Duration::from_millis(200)); + } + drop(stream); + Ok(()) + })(); + + if let Err(e) = result { + let _ = init_tx.send(Err(e.to_string())); + } + })?; + + init_rx + .recv() + .map_err(|_| anyhow!("LinuxAEC playback thread exited before signaling"))? + .map_err(|e| anyhow!("{e}"))?; + + Ok(Self { ring, running }) + } + + pub fn ring(&self) -> &Arc { + &self.ring + } + + pub fn stop(&self) { + self.running.store(false, Ordering::Relaxed); + } +} + +impl Drop for LinuxAecPlayback { + fn drop(&mut self) { + self.stop(); + } +} + +fn fill_output_and_tee_i16( + data: &mut [i16], + ring: &AudioRing, + apm: &Processor, + carry: &std::sync::Mutex>, +) { + let read = ring.read(data); + for s in &mut data[read..] 
{ + *s = 0; + } + tee_render_samples(data, apm, carry); +} + +fn fill_output_and_tee_f32( + data: &mut [f32], + ring: &AudioRing, + apm: &Processor, + carry: &std::sync::Mutex>, +) { + let mut tmp = vec![0i16; data.len()]; + let read = ring.read(&mut tmp); + for s in &mut tmp[read..] { + *s = 0; + } + for (d, &s) in data.iter_mut().zip(tmp.iter()) { + *d = i16_to_f32(s); + } + tee_render_samples(&tmp, apm, carry); +} + +/// Push CPAL-bound samples into APM's render-side input for echo cancellation. +/// Uses a carry buffer to batch into exact 960-sample (20 ms) frames. +fn tee_render_samples(samples: &[i16], apm: &Processor, carry: &std::sync::Mutex>) { + let mut lv = carry.lock().unwrap(); + lv.extend_from_slice(samples); + while lv.len() >= FRAME_SAMPLES { + let mut frame = [0i16; FRAME_SAMPLES]; + frame.copy_from_slice(&lv[..FRAME_SAMPLES]); + push_render_frame_20ms(apm, &frame); + lv.drain(..FRAME_SAMPLES); + } +} + +// --------------------------------------------------------------------------- +// CPAL format helpers (duplicated from audio_io.rs to keep the modules +// independent — each backend file is a self-contained unit) +// --------------------------------------------------------------------------- + +fn supports_i16_input(device: &cpal::Device) -> Result { + let supported = device + .supported_input_configs() + .context("failed to query input configs")?; + for cfg in supported { + if cfg.sample_format() == SampleFormat::I16 + && cfg.min_sample_rate() <= SampleRate(48_000) + && cfg.max_sample_rate() >= SampleRate(48_000) + && cfg.channels() >= 1 + { + return Ok(true); + } + } + Ok(false) +} + +fn supports_i16_output(device: &cpal::Device) -> Result { + let supported = device + .supported_output_configs() + .context("failed to query output configs")?; + for cfg in supported { + if cfg.sample_format() == SampleFormat::I16 + && cfg.min_sample_rate() <= SampleRate(48_000) + && cfg.max_sample_rate() >= SampleRate(48_000) + && cfg.channels() >= 1 + { + return 
Ok(true); + } + } + Ok(false) +} diff --git a/crates/wzp-client/src/lib.rs b/crates/wzp-client/src/lib.rs index 9a1309e..a9179bc 100644 --- a/crates/wzp-client/src/lib.rs +++ b/crates/wzp-client/src/lib.rs @@ -21,6 +21,11 @@ pub mod audio_vpio; // this feature on non-Windows targets is a no-op. #[cfg(all(feature = "windows-aec", target_os = "windows"))] pub mod audio_wasapi; +// WebRTC AEC3 (Audio Processing Module) wrapper around CPAL capture + playback +// on Linux. Only compiled when `linux-aec` feature is on AND target is Linux. +// The webrtc-audio-processing dep is itself gated to Linux in Cargo.toml. +#[cfg(all(feature = "linux-aec", target_os = "linux"))] +pub mod audio_linux_aec; pub mod bench; pub mod call; pub mod drift_test; @@ -30,24 +35,48 @@ pub mod handshake; pub mod metrics; pub mod sweep; -// AudioPlayback always comes from the CPAL path (`audio_io`). We do not -// need OS-level processing on the playback side because Windows's -// communications AEC, once engaged on the capture stream, uses the system -// render mix as the reference signal — it cancels echo from CPAL playback -// (and any other app's audio) without special handling. -#[cfg(feature = "audio")] -pub use audio_io::AudioPlayback; +// AudioPlayback: two possible backends depending on feature flags. +// 1. Default CPAL (`audio_io::AudioPlayback`) — baseline on every platform. +// 2. Linux AEC (`audio_linux_aec::LinuxAecPlayback`) — CPAL + WebRTC APM +// render-side tee, so echo from speakers gets cancelled from the mic. +// +// On macOS and Windows we always use the default CPAL playback because: +// - macOS: VoiceProcessingIO handles AEC at the capture side (Apple's +// native hardware AEC uses its own reference signal handling). +// - Windows: WASAPI AudioCategory_Communications AEC uses the system +// render mix as reference — no per-process plumbing needed. 
+// +// Linux is the only platform where the in-app approach is necessary, so +// the AEC playback path is gated to target_os = "linux". -// AudioCapture: two possible backends. Windows-AEC path when compiled in, -// otherwise the plain CPAL path. The two types share the same public API -// (`start`, `ring`, `stop`, `Drop`) so downstream code is identical. #[cfg(all( feature = "audio", - any(not(feature = "windows-aec"), not(target_os = "windows")) + any(not(feature = "linux-aec"), not(target_os = "linux")) +))] +pub use audio_io::AudioPlayback; + +#[cfg(all(feature = "linux-aec", target_os = "linux"))] +pub use audio_linux_aec::LinuxAecPlayback as AudioPlayback; + +// AudioCapture: three possible backends depending on feature flags. +// 1. Default CPAL (`audio_io::AudioCapture`) — baseline on every platform. +// 2. Windows AEC (`audio_wasapi::WasapiAudioCapture`) — direct WASAPI +// with AudioCategory_Communications, OS APO chain does AEC. +// 3. Linux AEC (`audio_linux_aec::LinuxAecCapture`) — CPAL + WebRTC APM +// capture-side echo cancellation using the playback tee as reference. +// All three expose the same public API (`start`, `ring`, `stop`, `Drop`). 
+ +#[cfg(all( + feature = "audio", + any(not(feature = "windows-aec"), not(target_os = "windows")), + any(not(feature = "linux-aec"), not(target_os = "linux")) ))] pub use audio_io::AudioCapture; #[cfg(all(feature = "windows-aec", target_os = "windows"))] pub use audio_wasapi::WasapiAudioCapture as AudioCapture; + +#[cfg(all(feature = "linux-aec", target_os = "linux"))] +pub use audio_linux_aec::LinuxAecCapture as AudioCapture; pub use call::{CallConfig, CallDecoder, CallEncoder}; pub use handshake::perform_handshake; diff --git a/desktop/src-tauri/Cargo.toml b/desktop/src-tauri/Cargo.toml index 4785d78..5f7947f 100644 --- a/desktop/src-tauri/Cargo.toml +++ b/desktop/src-tauri/Cargo.toml @@ -72,7 +72,12 @@ wzp-client = { path = "../../crates/wzp-client", features = ["audio", "vpio"] } [target.'cfg(target_os = "windows")'.dependencies] wzp-client = { path = "../../crates/wzp-client", features = ["audio", "windows-aec"] } -# Linux: same as Windows for now — plain CPAL. +# Linux: CPAL playback+capture baseline. AEC is enabled via the top-level +# `linux-aec` feature in wzp-desktop, which forwards to wzp-client/linux-aec. +# Keeping it opt-in at the wzp-desktop level (rather than forcing it always +# on here) lets `cargo tauri build` produce two variants from the same +# source tree — a noAEC baseline and an AEC build — by toggling the feature +# at build time: `cargo tauri build -- --features wzp-desktop/linux-aec`. [target.'cfg(target_os = "linux")'.dependencies] wzp-client = { path = "../../crates/wzp-client", features = ["audio"] } @@ -96,3 +101,7 @@ ndk-context = "0.1" [features] default = ["custom-protocol"] custom-protocol = ["tauri/custom-protocol"] +# linux-aec: forwards to wzp-client/linux-aec so `cargo tauri build -- --features +# wzp-desktop/linux-aec` enables the WebRTC AEC3 backend on Linux. No-op on +# other targets because wzp-client/linux-aec is itself cfg(target_os = "linux"). 
+linux-aec = ["wzp-client/linux-aec"] diff --git a/scripts/Dockerfile.linux-desktop-builder b/scripts/Dockerfile.linux-desktop-builder index 4318da6..9dd742c 100644 --- a/scripts/Dockerfile.linux-desktop-builder +++ b/scripts/Dockerfile.linux-desktop-builder @@ -33,7 +33,22 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ librsvg2-dev \ libglib2.0-dev \ patchelf \ + meson \ + ninja-build \ + python3 \ + clang \ && rm -rf /var/lib/apt/lists/* +# ── webrtc-audio-processing build requirements ────────────────────────────── +# The `webrtc-audio-processing` Rust crate with the `bundled` feature vendors +# the PulseAudio webrtc-audio-processing C++ library and builds it via meson +# + ninja at `cargo build` time. That avoids Debian Bookworm's stale +# libwebrtc-audio-processing-dev 0.3-1 package (which predates AEC3) and gives +# us a self-contained static link — no runtime .so dependency, same algorithm +# on every Linux distro regardless of what apt ships. +# +# apt deps for the bundled build: meson, ninja-build, python3, clang, +# build-essential (already present via android-builder base). + USER builder WORKDIR /build/source diff --git a/scripts/build-linux-desktop-docker.sh b/scripts/build-linux-desktop-docker.sh index dca491a..ccd20c4 100755 --- a/scripts/build-linux-desktop-docker.sh +++ b/scripts/build-linux-desktop-docker.sh @@ -31,12 +31,17 @@ SSH_OPTS="-o ConnectTimeout=15 -o ServerAliveInterval=15 -o ServerAliveCountMax= REBUILD_RUST=0 DO_PULL=1 IMAGE_BUILD=0 +# WITH_AEC=1 enables the wzp-client `linux-aec` feature (WebRTC AEC3 via +# webrtc-audio-processing) and renames the output artifacts with an `-aec` +# suffix so both variants can coexist on disk. 
+WITH_AEC=0 for arg in "$@"; do case "$arg" in --rust) REBUILD_RUST=1 ;; --pull) DO_PULL=1 ;; --no-pull) DO_PULL=0 ;; --image-build) IMAGE_BUILD=1 ;; + --aec) WITH_AEC=1 ;; -h|--help) sed -n '3,25p' "$0" exit 0 @@ -80,11 +85,21 @@ NTFY_TOPIC="https://ntfy.sh/wzp" BRANCH="${1:-feat/desktop-audio-rewrite}" DO_PULL="${2:-1}" REBUILD_RUST="${3:-0}" +WITH_AEC="${4:-0}" LOG_FILE=/tmp/wzp-linux-desktop-build.log GIT_HASH="unknown" ENV_FILE="$BASE_DIR/.env" +# Variant suffix for artifact filenames so the noAEC baseline and the AEC +# build can coexist on the host. Applied after the build to the downloaded +# files (we can't easily rename during the cargo tauri build itself). +if [ "$WITH_AEC" = "1" ]; then + VARIANT="aec" +else + VARIANT="noAEC" +fi + notify() { curl -s -d "$1" "$NTFY_TOPIC" > /dev/null 2>&1 || true; } # Upload to rustypaste; print URL on stdout (or empty on failure). @@ -155,8 +170,11 @@ mkdir -p "$BASE_DIR/data/cache/cargo-registry" \ "$BASE_DIR/data/cache-linux-desktop/target" chown -R 1000:1000 "$BASE_DIR/data/cache-linux-desktop/target" 2>/dev/null || true +# Pass WITH_AEC into the docker container so the inner build script can +# decide whether to enable the wzp-client `linux-aec` feature. docker run --rm \ --user 1000:1000 \ + -e WITH_AEC="$WITH_AEC" \ -v "$BASE_DIR/data/source:/build/source" \ -v "$BASE_DIR/data/cache/cargo-registry:/home/builder/.cargo/registry" \ -v "$BASE_DIR/data/cache/cargo-git:/home/builder/.cargo/git" \ @@ -173,12 +191,25 @@ npm install --silent 2>&1 | tail -5 || npm install 2>&1 | tail -20 echo ">>> npm run build" npm run build 2>&1 | tail -5 -echo ">>> cargo tauri build (produces .deb + .AppImage + raw binary)" -cd src-tauri -# tauri-cli is already installed in the base image via the Android -# builder RUN step. It produces target/release/wzp-desktop (raw ELF) -# plus bundles under target/release/bundle/{deb,appimage}/. 
-cargo tauri build 2>&1 | tail -40 +# The linux-aec feature enables a WebRTC AEC3 capture backend in +# wzp-client. Opt in only when the caller asked for it; noAEC baseline +# builds keep the plain CPAL path for comparison. `cargo tauri build` +# invokes cargo underneath, so cargo-level flags are passed after the +# `--` separator: +# `cargo tauri build -- --features wzp-desktop/linux-aec`. +# wzp-desktop is the bin crate, so the `linux-aec` feature is defined +# there as well (desktop/src-tauri/Cargo.toml); it does nothing but +# forward to wzp-client/linux-aec, where the backend actually lives. +# The container receives WITH_AEC via `docker run -e WITH_AEC=...`. +if [ "${WITH_AEC:-0}" = "1" ]; then + echo ">>> cargo tauri build WITH linux-aec feature" + cd src-tauri + cargo tauri build -- --features wzp-desktop/linux-aec 2>&1 | tail -40 +else + echo ">>> cargo tauri build (noAEC baseline)" + cd src-tauri + cargo tauri build 2>&1 | tail -40 +fi echo "" echo ">>> Build artifacts:" @@ -236,7 +267,7 @@ notify_local "WZP Linux desktop build dispatched (branch=$BRANCH)" log "Triggering remote build (branch=$BRANCH)..." # Run; last lines are *_REMOTE_PATH=... -REMOTE_OUTPUT=$(ssh_cmd "/tmp/wzp-linux-desktop-build.sh '$BRANCH' '$DO_PULL' '$REBUILD_RUST'" || true) +REMOTE_OUTPUT=$(ssh_cmd "/tmp/wzp-linux-desktop-build.sh '$BRANCH' '$DO_PULL' '$REBUILD_RUST' '$WITH_AEC'" || true) echo "$REMOTE_OUTPUT" | tail -80 BIN_REMOTE=$(echo "$REMOTE_OUTPUT" | grep '^BIN_REMOTE_PATH=' | tail -1 | cut -d= -f2-) @@ -244,21 +275,26 @@ DEB_REMOTE=$(echo "$REMOTE_OUTPUT" | grep '^DEB_REMOTE_PATH=' | tail -1 | cut -d APPIMAGE_REMOTE=$(echo "$REMOTE_OUTPUT" | grep '^APPIMAGE_REMOTE_PATH=' | tail -1 | cut -d= -f2-) if [ -n "$BIN_REMOTE" ]; then - log "Downloading wzp-desktop binary to $LOCAL_OUTPUT/..." 
- scp $SSH_OPTS "$REMOTE_HOST:$BIN_REMOTE" "$LOCAL_OUTPUT/wzp-desktop" - echo " $LOCAL_OUTPUT/wzp-desktop ($(du -h "$LOCAL_OUTPUT/wzp-desktop" | cut -f1))" + log "Downloading wzp-desktop binary to $LOCAL_OUTPUT/wzp-desktop-$VARIANT ..." + scp $SSH_OPTS "$REMOTE_HOST:$BIN_REMOTE" "$LOCAL_OUTPUT/wzp-desktop-$VARIANT" + echo " $LOCAL_OUTPUT/wzp-desktop-$VARIANT ($(du -h "$LOCAL_OUTPUT/wzp-desktop-$VARIANT" | cut -f1))" fi if [ -n "$DEB_REMOTE" ]; then - log "Downloading .deb to $LOCAL_OUTPUT/..." - scp $SSH_OPTS "$REMOTE_HOST:$DEB_REMOTE" "$LOCAL_OUTPUT/" - ls -lh "$LOCAL_OUTPUT"/*.deb + # Apply the variant suffix to the downloaded .deb: cargo-tauri names the + # file WarzonePhone__amd64.deb regardless of what we built, so + # the variant lives only in our chosen filename. + DEB_BASENAME=$(basename "$DEB_REMOTE" .deb) + log "Downloading .deb to $LOCAL_OUTPUT/${DEB_BASENAME}-$VARIANT.deb ..." + scp $SSH_OPTS "$REMOTE_HOST:$DEB_REMOTE" "$LOCAL_OUTPUT/${DEB_BASENAME}-$VARIANT.deb" + ls -lh "$LOCAL_OUTPUT/${DEB_BASENAME}-$VARIANT.deb" fi if [ -n "$APPIMAGE_REMOTE" ]; then - log "Downloading .AppImage to $LOCAL_OUTPUT/..." - scp $SSH_OPTS "$REMOTE_HOST:$APPIMAGE_REMOTE" "$LOCAL_OUTPUT/" - ls -lh "$LOCAL_OUTPUT"/*.AppImage + APPIMG_BASENAME=$(basename "$APPIMAGE_REMOTE" .AppImage) + log "Downloading .AppImage to $LOCAL_OUTPUT/${APPIMG_BASENAME}-$VARIANT.AppImage ..." + scp $SSH_OPTS "$REMOTE_HOST:$APPIMAGE_REMOTE" "$LOCAL_OUTPUT/${APPIMG_BASENAME}-$VARIANT.AppImage" + ls -lh "$LOCAL_OUTPUT/${APPIMG_BASENAME}-$VARIANT.AppImage" fi if [ -z "$BIN_REMOTE" ]; then