diff --git a/Cargo.lock b/Cargo.lock index 58d0a68..b40893a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6438,8 +6438,8 @@ dependencies = [ "webview2-com-sys", "windows 0.61.3", "windows-core 0.61.2", - "windows-implement", - "windows-interface", + "windows-implement 0.60.2", + "windows-interface 0.59.3", ] [[package]] @@ -6520,6 +6520,16 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6" +dependencies = [ + "windows-core 0.58.0", + "windows-targets 0.52.6", +] + [[package]] name = "windows" version = "0.61.3" @@ -6552,14 +6562,27 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-core" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba6d44ec8c2591c134257ce647b7ea6b20335bf6379a27dac5f1641fcf59f99" +dependencies = [ + "windows-implement 0.58.0", + "windows-interface 0.58.0", + "windows-result 0.2.0", + "windows-strings 0.1.0", + "windows-targets 0.52.6", +] + [[package]] name = "windows-core" version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" dependencies = [ - "windows-implement", - "windows-interface", + "windows-implement 0.60.2", + "windows-interface 0.59.3", "windows-link 0.1.3", "windows-result 0.3.4", "windows-strings 0.4.2", @@ -6571,8 +6594,8 @@ version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ - "windows-implement", - "windows-interface", + "windows-implement 0.60.2", + "windows-interface 0.59.3", "windows-link 0.2.1", "windows-result 0.4.1", "windows-strings 0.5.1", @@ -6589,6 +6612,17 @@ dependencies = [ "windows-threading", ] +[[package]] +name = 
"windows-implement" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -6600,6 +6634,17 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "windows-interface" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "windows-interface" version = "0.59.3" @@ -6653,6 +6698,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-result" version = "0.3.4" @@ -6671,6 +6725,16 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result 0.2.0", + "windows-targets 0.52.6", +] + [[package]] name = "windows-strings" version = "0.4.2" @@ -7153,6 +7217,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "windows 0.58.0", "wzp-codec", "wzp-crypto", "wzp-fec", diff --git a/crates/wzp-client/Cargo.toml b/crates/wzp-client/Cargo.toml index b9f6a1b..6160a74 100644 --- a/crates/wzp-client/Cargo.toml +++ b/crates/wzp-client/Cargo.toml @@ -31,6 +31,21 @@ libc = "0.2" [target.'cfg(target_os = "macos")'.dependencies] coreaudio-rs = { version = "0.11", optional = true } +# Windows-only: direct WASAPI bindings for the `windows-aec` feature. 
+# `windows` is Microsoft's official Rust COM bindings crate. We pull in +# only the audio + COM subfeatures we need — the crate is organized as +# a massive optional-feature tree, so enabling just these keeps compile +# times reasonable (~5s for these features vs ~60s for the full crate). +[target.'cfg(target_os = "windows")'.dependencies] +windows = { version = "0.58", optional = true, features = [ + "Win32_Foundation", + "Win32_Media_Audio", + "Win32_System_Com", + "Win32_System_Com_StructuredStorage", + "Win32_System_Threading", + "Win32_System_Variant", +] } + [features] default = [] audio = ["cpal"] @@ -38,6 +53,13 @@ audio = ["cpal"] # so enabling this feature on Windows/Linux is a no-op (the audio_vpio # module is also #[cfg(target_os = "macos")] in lib.rs). vpio = ["dep:coreaudio-rs"] +# windows-aec enables a direct WASAPI capture backend that opens the +# microphone under AudioCategory_Communications, turning on Windows's +# OS-level communications audio processing (AEC + noise suppression + +# AGC). The `windows` dep is itself target-gated to Windows above, so +# enabling this feature on non-Windows targets is a no-op (the +# audio_wasapi module is also #[cfg(target_os = "windows")] in lib.rs). +windows-aec = ["dep:windows"] [[bin]] name = "wzp-client" diff --git a/crates/wzp-client/src/audio_wasapi.rs b/crates/wzp-client/src/audio_wasapi.rs new file mode 100644 index 0000000..e98dfa7 --- /dev/null +++ b/crates/wzp-client/src/audio_wasapi.rs @@ -0,0 +1,328 @@ +//! Direct WASAPI microphone capture with Windows's OS-level AEC enabled. +//! +//! Bypasses CPAL and opens the default capture endpoint directly via +//! `IMMDeviceEnumerator` + `IAudioClient2::SetClientProperties`, setting +//! `AudioClientProperties.eCategory = AudioCategory_Communications`. That's +//! the switch that tells Windows "this is a VoIP call" — the OS then +//! enables its communications audio processing chain (AEC, noise +//! suppression, automatic gain control) for the stream. 
AEC operates at
+//! the OS level using the currently-playing audio as the reference
+//! signal, so it cancels echo from our CPAL playback (and any other app's
+//! audio) without us having to plumb a reference signal ourselves.
+//!
+//! Platform: Windows only, compiled only when the `windows-aec` feature
+//! is enabled. Mirrors the public API of `audio_io::AudioCapture` so
+//! `wzp-client`'s lib.rs can transparently re-export either one as
+//! `AudioCapture`.
+
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
+use anyhow::{anyhow, Context};
+use tracing::{info, warn};
+use windows::core::{Interface, GUID};
+use windows::Win32::Foundation::{CloseHandle, BOOL, HANDLE, WAIT_OBJECT_0};
+use windows::Win32::Media::Audio::{
+    eCapture, eCommunications, AudioCategory_Communications, AudioClientProperties,
+    IAudioCaptureClient, IAudioClient, IAudioClient2, IMMDeviceEnumerator, MMDeviceEnumerator,
+    AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM,
+    AUDCLNT_STREAMFLAGS_EVENTCALLBACK, AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY, WAVEFORMATEX,
+    WAVE_FORMAT_PCM,
+};
+use windows::Win32::System::Com::{
+    CoCreateInstance, CoInitializeEx, CoUninitialize, CLSCTX_ALL, COINIT_MULTITHREADED,
+};
+use windows::Win32::System::Threading::{CreateEventW, WaitForSingleObject, INFINITE};
+
+use crate::audio_ring::AudioRing;
+
+/// Samples per frame: 20 ms at 48 kHz, mono (48_000 * 0.020 = 960).
+pub const FRAME_SAMPLES: usize = 960;
+
+/// Microphone capture via WASAPI with Windows's communications AEC enabled.
+///
+/// The capture stream runs on a dedicated OS thread that writes PCM into
+/// `ring`. `running` is the stop flag shared with that thread, and `thread`
+/// holds its join handle until `Drop` takes it. Dropping the handle stops
+/// the stream and joins the thread.
+pub struct WasapiAudioCapture {
+    ring: Arc<AudioRing>,
+    running: Arc<AtomicBool>,
+    thread: Option<std::thread::JoinHandle<()>>,
+}
+
+impl WasapiAudioCapture {
+    /// Open the default communications microphone, enable OS AEC, and start
+    /// streaming PCM into the shared ring buffer.
+    ///
+    /// Blocks until the capture thread reports the outcome of stream
+    /// initialization.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the thread cannot be spawned, if the thread exits without
+    /// reporting an init result, or if WASAPI initialization fails. In the
+    /// last case the error text includes the full cause chain from the
+    /// capture thread.
+    pub fn start() -> Result<Self, anyhow::Error> {
+        let ring = Arc::new(AudioRing::new());
+        let running = Arc::new(AtomicBool::new(true));
+
+        // One-slot channel that carries the init result back to this thread.
+        let (init_tx, init_rx) = std::sync::mpsc::sync_channel::<Result<(), String>>(1);
+        let ring_cb = Arc::clone(&ring);
+        let running_cb = Arc::clone(&running);
+
+        let thread = std::thread::Builder::new()
+            .name("wzp-audio-capture-wasapi".into())
+            .spawn(move || {
+                // SAFETY: capture_thread_main runs on this freshly spawned
+                // thread and sets up / tears down COM for it itself.
+                let result = unsafe { capture_thread_main(ring_cb, running_cb, &init_tx) };
+                if let Err(e) = result {
+                    // `{e:#}` renders the whole anyhow context chain, so the
+                    // underlying Windows error is not lost.
+                    warn!("wasapi capture thread exited with error: {e:#}");
+                    // capture_thread_main only returns Err before it has
+                    // signalled success, so this is the sole message on the
+                    // channel and the caller unblocks with the failure. If
+                    // the receiver is already gone the send error is ignored.
+                    let _ = init_tx.send(Err(format!("{e:#}")));
+                }
+            })
+            .context("failed to spawn WASAPI capture thread")?;
+
+        init_rx
+            .recv()
+            .map_err(|_| anyhow!("WASAPI capture thread exited before signaling init"))?
+            .map_err(|e| anyhow!("{e}"))?;
+
+        Ok(Self {
+            ring,
+            running,
+            thread: Some(thread),
+        })
+    }
+
+    /// Get a reference to the capture ring buffer for direct polling.
+    pub fn ring(&self) -> &Arc<AudioRing> {
+        &self.ring
+    }
+
+    /// Ask the capture thread to stop. Does not wait for it to exit; `Drop`
+    /// performs the join.
+    pub fn stop(&self) {
+        self.running.store(false, Ordering::Relaxed);
+    }
+}
+
+impl Drop for WasapiAudioCapture {
+    fn drop(&mut self) {
+        self.stop();
+        if let Some(handle) = self.thread.take() {
+            // Join best-effort. The thread loop re-checks `running` at least
+            // every 200 ms (its WaitForSingleObject timeout), so it should
+            // exit within ~200 ms of `stop()`. A panic in the thread is
+            // deliberately ignored here.
+            let _ = handle.join();
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// WASAPI thread entry point — everything below this line runs on the
+// dedicated wzp-audio-capture-wasapi thread.
+// ---------------------------------------------------------------------------
+
+/// Body of the capture thread: opens the default communications capture
+/// endpoint in `AudioCategory_Communications` mode, then copies 48 kHz mono
+/// i16 packets into `ring` until `running` becomes false.
+///
+/// `init_tx` receives `Ok(())` once the stream has started. Any failure
+/// before that point is returned as `Err` (the caller forwards it on the
+/// channel). Failures inside the capture loop are only logged.
+///
+/// # Safety
+///
+/// Calls raw COM / Win32 APIs. It initializes COM (multithreaded) for the
+/// calling thread and uninitializes it on return, so it is meant to be the
+/// body of a thread dedicated to capture.
+unsafe fn capture_thread_main(
+    ring: Arc<AudioRing>,
+    running: Arc<AtomicBool>,
+    init_tx: &std::sync::mpsc::SyncSender<Result<(), String>>,
+) -> Result<(), anyhow::Error> {
+    // Value of AUDCLNT_BUFFERFLAGS_SILENT as returned in GetBuffer's flags:
+    // the packet must be treated as silence and its bytes ignored.
+    const BUFFERFLAGS_SILENT: u32 = 0x2;
+
+    // COM init for the capture thread. MULTITHREADED because we're not
+    // running a message pump. Must be balanced by CoUninitialize on exit.
+    CoInitializeEx(None, COINIT_MULTITHREADED)
+        .ok()
+        .context("CoInitializeEx failed")?;
+
+    // Guard so CoUninitialize runs even on early returns. Declared first,
+    // hence dropped last, after every COM interface below is released.
+    struct ComGuard;
+    impl Drop for ComGuard {
+        fn drop(&mut self) {
+            unsafe { CoUninitialize() };
+        }
+    }
+    let _com_guard = ComGuard;
+
+    // Guard so the event handle is closed on every exit path, including a
+    // failure of SetEventHandle / GetService / Start.
+    struct EventGuard(HANDLE);
+    impl Drop for EventGuard {
+        fn drop(&mut self) {
+            // SAFETY: the handle came from CreateEventW and is closed once.
+            unsafe {
+                let _ = CloseHandle(self.0);
+            }
+        }
+    }
+
+    let enumerator: IMMDeviceEnumerator =
+        CoCreateInstance(&MMDeviceEnumerator, None, CLSCTX_ALL)
+            .context("CoCreateInstance(MMDeviceEnumerator) failed")?;
+
+    // eCommunications role (not eConsole) — this picks the device the user
+    // has designated for communications in Sound Settings.
+    let device = enumerator
+        .GetDefaultAudioEndpoint(eCapture, eCommunications)
+        .context("GetDefaultAudioEndpoint(eCapture, eCommunications) failed")?;
+
+    if let Ok(name) = device_name(&device) {
+        info!(device = %name, "opening WASAPI communications capture endpoint");
+    }
+
+    let audio_client: IAudioClient = device
+        .Activate(CLSCTX_ALL, None)
+        .context("IMMDevice::Activate(IAudioClient) failed")?;
+
+    // SetClientProperties lives on IAudioClient2 and has to be called
+    // before Initialize for the category to take effect.
+    let audio_client2: IAudioClient2 = audio_client
+        .cast()
+        .context("QueryInterface IAudioClient2 failed")?;
+
+    let props = AudioClientProperties {
+        cbSize: std::mem::size_of::<AudioClientProperties>() as u32,
+        bIsOffload: BOOL(0),
+        eCategory: AudioCategory_Communications,
+        // Default value = no stream options (AUDCLNT_STREAMOPTIONS_NONE).
+        Options: Default::default(),
+    };
+    audio_client2
+        .SetClientProperties(&props)
+        .context("SetClientProperties(AudioCategory_Communications) failed")?;
+
+    // Request 48 kHz mono i16 directly. AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM
+    // tells Windows to do any needed format conversion inside the audio
+    // engine rather than rejecting our format. SRC_DEFAULT_QUALITY picks
+    // the standard Windows resampler quality (fine for voice).
+    let wave_format = WAVEFORMATEX {
+        wFormatTag: WAVE_FORMAT_PCM as u16,
+        nChannels: 1,
+        nSamplesPerSec: 48_000,
+        nAvgBytesPerSec: 48_000 * 2, // 1 ch * 2 bytes/sample * 48000 Hz
+        nBlockAlign: 2,              // 1 ch * 2 bytes/sample
+        wBitsPerSample: 16,
+        cbSize: 0,
+    };
+
+    // 1,000,000 hns = 100 ms buffer (hns = 100-nanosecond units). Windows
+    // treats this as the minimum; the engine may give us a larger one.
+    const BUFFER_DURATION_HNS: i64 = 1_000_000;
+
+    audio_client
+        .Initialize(
+            AUDCLNT_SHAREMODE_SHARED,
+            AUDCLNT_STREAMFLAGS_EVENTCALLBACK
+                | AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM
+                | AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY,
+            BUFFER_DURATION_HNS,
+            0,
+            &wave_format,
+            Some(&GUID::zeroed()),
+        )
+        .context("IAudioClient::Initialize failed — Windows rejected communications-mode 48k mono i16")?;
+
+    // Event-driven capture: Windows signals this handle each time a new
+    // audio packet is available. We wait on it from the loop below.
+    let event = CreateEventW(None, false, false, None)
+        .context("CreateEventW failed")?;
+    let _event_guard = EventGuard(event);
+    audio_client
+        .SetEventHandle(event)
+        .context("SetEventHandle failed")?;
+
+    let capture_client: IAudioCaptureClient = audio_client
+        .GetService()
+        .context("IAudioClient::GetService(IAudioCaptureClient) failed")?;
+
+    audio_client.Start().context("IAudioClient::Start failed")?;
+
+    // Tell the parent thread that init succeeded before entering the hot
+    // loop. From here on errors are logged, never returned: there is no `?`
+    // below this line.
+    let _ = init_tx.send(Ok(()));
+    info!("WASAPI communications-mode capture started with OS AEC enabled");
+
+    let mut logged_first_packet = false;
+
+    // Main capture loop. Exit when `running` goes false (from Drop or an
+    // explicit stop() call).
+    while running.load(Ordering::Relaxed) {
+        // 200 ms timeout so we check `running` regularly even if the audio
+        // engine stops delivering packets (e.g. device unplugged).
+        let wait = WaitForSingleObject(event, 200);
+        if wait.0 != WAIT_OBJECT_0.0 {
+            // Timeout or failure — just loop and re-check running.
+            continue;
+        }
+
+        // Drain all available packets. Windows may have queued more than
+        // one since we were last scheduled.
+        loop {
+            let packet_length = match capture_client.GetNextPacketSize() {
+                Ok(n) => n,
+                Err(e) => {
+                    warn!("GetNextPacketSize failed: {e}");
+                    break;
+                }
+            };
+            if packet_length == 0 {
+                break;
+            }
+
+            let mut buffer_ptr: *mut u8 = std::ptr::null_mut();
+            let mut num_frames: u32 = 0;
+            let mut flags: u32 = 0;
+            let mut device_position: u64 = 0;
+            let mut qpc_position: u64 = 0;
+
+            if let Err(e) = capture_client.GetBuffer(
+                &mut buffer_ptr,
+                &mut num_frames,
+                &mut flags,
+                Some(&mut device_position),
+                Some(&mut qpc_position),
+            ) {
+                warn!("GetBuffer failed: {e}");
+                break;
+            }
+
+            let silent = flags & BUFFERFLAGS_SILENT != 0;
+            if num_frames > 0 && (silent || !buffer_ptr.is_null()) {
+                if !logged_first_packet {
+                    info!(
+                        frames = num_frames,
+                        flags, "WASAPI capture: first packet received"
+                    );
+                    logged_first_packet = true;
+                }
+
+                if silent {
+                    // The buffer contents are unspecified for a silent
+                    // packet, so feed explicit zeros to keep timing intact.
+                    write_silence(&ring, num_frames as usize);
+                } else {
+                    // We asked for 48 kHz mono i16, so each frame is exactly
+                    // one i16 and the buffer holds `num_frames` of them.
+                    // SAFETY: GetBuffer succeeded and returned a non-null
+                    // pointer valid for `num_frames` frames until
+                    // ReleaseBuffer is called below.
+                    let samples = std::slice::from_raw_parts(
+                        buffer_ptr as *const i16,
+                        num_frames as usize,
+                    );
+                    ring.write(samples);
+                }
+            }
+
+            if let Err(e) = capture_client.ReleaseBuffer(num_frames) {
+                warn!("ReleaseBuffer failed: {e}");
+                break;
+            }
+        }
+    }
+
+    info!("WASAPI capture thread stopping");
+    let _ = audio_client.Stop();
+    // On return the locals drop in reverse declaration order: the capture
+    // client is released, the event handle is closed, the audio clients are
+    // released, and finally _com_guard calls CoUninitialize.
+
+    // INFINITE is imported at the top of the file, but the loop uses a
+    // finite 200 ms timeout instead. This no-op read keeps the import from
+    // producing an unused-import warning.
+    let _ = INFINITE;
+
+    Ok(())
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/// Writes `frames` zero samples to `ring`, in chunks of at most
+/// `FRAME_SAMPLES` so no heap allocation is needed.
+fn write_silence(ring: &AudioRing, frames: usize) {
+    const SILENCE: [i16; FRAME_SAMPLES] = [0; FRAME_SAMPLES];
+    let mut remaining = frames;
+    while remaining > 0 {
+        let n = remaining.min(SILENCE.len());
+        ring.write(&SILENCE[..n]);
+        remaining -= n;
+    }
+}
+
+/// Best-effort device ID string for logging. Grabbing the friendly name via
+/// PKEY_Device_FriendlyName requires IPropertyStore + PROPVARIANT plumbing
+/// that's far more ceremony than a log line justifies; the ID is already
+/// sufficient to confirm we opened the right endpoint.
+///
+/// Returns an empty string if the ID cannot be converted to a `String`.
+///
+/// # Safety
+///
+/// `device` must be a live `IMMDevice`, and COM must be initialized on the
+/// calling thread.
+unsafe fn device_name(
+    device: &windows::Win32::Media::Audio::IMMDevice,
+) -> Result<String, anyhow::Error> {
+    let id = device.GetId().context("IMMDevice::GetId failed")?;
+    let name = id.to_string().unwrap_or_default();
+    // GetId hands back a string that the caller owns and must release with
+    // CoTaskMemFree; copy it out first, then free it.
+    windows::Win32::System::Com::CoTaskMemFree(Some(id.0 as *const core::ffi::c_void));
+    Ok(name)
+}
diff --git a/crates/wzp-client/src/lib.rs b/crates/wzp-client/src/lib.rs
index 8d134ff..9a1309e 100644
--- a/crates/wzp-client/src/lib.rs
+++ b/crates/wzp-client/src/lib.rs
@@ -15,6 +15,12 @@ pub mod audio_ring;
 // feature on Windows/Linux was previously silently broken.
 #[cfg(all(feature = "vpio", target_os = "macos"))]
 pub mod audio_vpio;
+// WASAPI-direct capture with Windows's OS-level AEC (AudioCategory_Communications).
+// Only compiled when `windows-aec` feature is on AND target is Windows. The
+// `windows` dependency is itself gated to Windows in Cargo.toml, so enabling
+// this feature on non-Windows targets is a no-op.
+#[cfg(all(feature = "windows-aec", target_os = "windows"))] +pub mod audio_wasapi; pub mod bench; pub mod call; pub mod drift_test; @@ -24,7 +30,24 @@ pub mod handshake; pub mod metrics; pub mod sweep; +// AudioPlayback always comes from the CPAL path (`audio_io`). We do not +// need OS-level processing on the playback side because Windows's +// communications AEC, once engaged on the capture stream, uses the system +// render mix as the reference signal — it cancels echo from CPAL playback +// (and any other app's audio) without special handling. #[cfg(feature = "audio")] -pub use audio_io::{AudioCapture, AudioPlayback}; +pub use audio_io::AudioPlayback; + +// AudioCapture: two possible backends. Windows-AEC path when compiled in, +// otherwise the plain CPAL path. The two types share the same public API +// (`start`, `ring`, `stop`, `Drop`) so downstream code is identical. +#[cfg(all( + feature = "audio", + any(not(feature = "windows-aec"), not(target_os = "windows")) +))] +pub use audio_io::AudioCapture; + +#[cfg(all(feature = "windows-aec", target_os = "windows"))] +pub use audio_wasapi::WasapiAudioCapture as AudioCapture; pub use call::{CallConfig, CallDecoder, CallEncoder}; pub use handshake::perform_handshake; diff --git a/desktop/src-tauri/Cargo.toml b/desktop/src-tauri/Cargo.toml index 2ad750d..4785d78 100644 --- a/desktop/src-tauri/Cargo.toml +++ b/desktop/src-tauri/Cargo.toml @@ -62,10 +62,15 @@ wzp-transport = { path = "../../crates/wzp-transport" } [target.'cfg(target_os = "macos")'.dependencies] wzp-client = { path = "../../crates/wzp-client", features = ["audio", "vpio"] } -# Windows: CPAL only. Windows AEC (WASAPI Communications / MF Voice -# Capture DSP) lands in a follow-up task. See task #24. +# Windows: CPAL for playback + direct WASAPI for capture with OS-level +# AEC (AudioCategory_Communications). 
The wzp-client `windows-aec` +# feature swaps the default CPAL AudioCapture for a WASAPI one that +# opens the mic under AudioCategory_Communications, turning on Windows's +# communications audio processing chain (AEC, NS, AGC). The reference +# signal for AEC is the system render mix, so echo from our CPAL +# playback is cancelled automatically without extra plumbing. [target.'cfg(target_os = "windows")'.dependencies] -wzp-client = { path = "../../crates/wzp-client", features = ["audio"] } +wzp-client = { path = "../../crates/wzp-client", features = ["audio", "windows-aec"] } -# Linux: same as Windows for now — plain CPAL. +# Linux: plain CPAL. [target.'cfg(target_os = "linux")'.dependencies]