feat: macOS VoiceProcessingIO for hardware AEC + delay-compensated NLMS
Some checks failed
Build Release Binaries / build-amd64 (push) Failing after 3m33s
Some checks failed
Build Release Binaries / build-amd64 (push) Failing after 3m33s
- Add --os-aec flag: uses Apple VoiceProcessingIO audio unit for hardware echo cancellation (same engine as FaceTime) - New vpio feature + audio_vpio.rs: combined capture+playback via VPIO - Improved software AEC: delay-compensated leaky NLMS with Geigel DTD (60ms tail, 40ms delay, configurable via --aec-delay) - Add --aec-delay flag for tuning software AEC delay compensation - Add dev-fast Cargo profile (opt-level 2 with incremental compilation) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,11 +23,13 @@ serde_json = "1"
|
||||
chrono = "0.4"
|
||||
rustls = { version = "0.23", default-features = false, features = ["ring", "std"] }
|
||||
cpal = { version = "0.15", optional = true }
|
||||
coreaudio-rs = { version = "0.11", optional = true }
|
||||
libc = "0.2"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
audio = ["cpal"]
|
||||
vpio = ["coreaudio-rs"]
|
||||
|
||||
[[bin]]
|
||||
name = "wzp-client"
|
||||
|
||||
179
crates/wzp-client/src/audio_vpio.rs
Normal file
179
crates/wzp-client/src/audio_vpio.rs
Normal file
@@ -0,0 +1,179 @@
|
||||
//! macOS Voice Processing I/O — uses Apple's VoiceProcessingIO audio unit
|
||||
//! for hardware-accelerated echo cancellation, AGC, and noise suppression.
|
||||
//!
|
||||
//! VoiceProcessingIO is a combined input+output unit that knows what's going
|
||||
//! to the speaker, so it can cancel the echo from the mic signal internally.
|
||||
//! This is the same engine FaceTime and other Apple apps use.
|
||||
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use coreaudio::audio_unit::audio_format::LinearPcmFlags;
|
||||
use coreaudio::audio_unit::render_callback::{self, data};
|
||||
use coreaudio::audio_unit::{AudioUnit, Element, IOType, SampleFormat, Scope, StreamFormat};
|
||||
use coreaudio::sys;
|
||||
use tracing::info;
|
||||
|
||||
use crate::audio_ring::AudioRing;
|
||||
|
||||
/// Number of samples per 20 ms frame at 48 kHz mono.
|
||||
pub const FRAME_SAMPLES: usize = 960;
|
||||
|
||||
/// Combined capture + playback via macOS VoiceProcessingIO.
|
||||
///
|
||||
/// The OS handles AEC internally — no manual far-end feeding needed.
|
||||
pub struct VpioAudio {
|
||||
capture_ring: Arc<AudioRing>,
|
||||
playout_ring: Arc<AudioRing>,
|
||||
_audio_unit: AudioUnit,
|
||||
running: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
impl VpioAudio {
|
||||
/// Start VoiceProcessingIO with AEC enabled.
|
||||
pub fn start() -> Result<Self, anyhow::Error> {
|
||||
let capture_ring = Arc::new(AudioRing::new());
|
||||
let playout_ring = Arc::new(AudioRing::new());
|
||||
let running = Arc::new(AtomicBool::new(true));
|
||||
|
||||
let mut au = AudioUnit::new(IOType::VoiceProcessingIO)
|
||||
.context("failed to create VoiceProcessingIO audio unit")?;
|
||||
|
||||
// Must uninitialize before configuring properties.
|
||||
au.uninitialize()
|
||||
.context("failed to uninitialize VPIO for configuration")?;
|
||||
|
||||
// Enable input (mic) on Element::Input (bus 1).
|
||||
let enable: u32 = 1;
|
||||
au.set_property(
|
||||
sys::kAudioOutputUnitProperty_EnableIO,
|
||||
Scope::Input,
|
||||
Element::Input,
|
||||
Some(&enable),
|
||||
)
|
||||
.context("failed to enable VPIO input")?;
|
||||
|
||||
// Output (speaker) is enabled by default on VPIO, but be explicit.
|
||||
au.set_property(
|
||||
sys::kAudioOutputUnitProperty_EnableIO,
|
||||
Scope::Output,
|
||||
Element::Output,
|
||||
Some(&enable),
|
||||
)
|
||||
.context("failed to enable VPIO output")?;
|
||||
|
||||
// Configure stream format: 48kHz mono f32 non-interleaved
|
||||
let stream_format = StreamFormat {
|
||||
sample_rate: 48_000.0,
|
||||
sample_format: SampleFormat::F32,
|
||||
flags: LinearPcmFlags::IS_FLOAT
|
||||
| LinearPcmFlags::IS_PACKED
|
||||
| LinearPcmFlags::IS_NON_INTERLEAVED,
|
||||
channels: 1,
|
||||
};
|
||||
|
||||
let asbd = stream_format.to_asbd();
|
||||
|
||||
// Input: set format on Output scope of Input element
|
||||
// (= the format the AU delivers to us from the mic)
|
||||
au.set_property(
|
||||
sys::kAudioUnitProperty_StreamFormat,
|
||||
Scope::Output,
|
||||
Element::Input,
|
||||
Some(&asbd),
|
||||
)
|
||||
.context("failed to set input stream format")?;
|
||||
|
||||
// Output: set format on Input scope of Output element
|
||||
// (= the format we feed to the AU for the speaker)
|
||||
au.set_property(
|
||||
sys::kAudioUnitProperty_StreamFormat,
|
||||
Scope::Input,
|
||||
Element::Output,
|
||||
Some(&asbd),
|
||||
)
|
||||
.context("failed to set output stream format")?;
|
||||
|
||||
// Set up input callback (mic capture with AEC applied)
|
||||
let cap_ring = capture_ring.clone();
|
||||
let cap_running = running.clone();
|
||||
let logged = Arc::new(AtomicBool::new(false));
|
||||
au.set_input_callback(
|
||||
move |args: render_callback::Args<data::NonInterleaved<f32>>| {
|
||||
if !cap_running.load(Ordering::Relaxed) {
|
||||
return Ok(());
|
||||
}
|
||||
let mut buffers = args.data.channels();
|
||||
if let Some(ch) = buffers.next() {
|
||||
if !logged.swap(true, Ordering::Relaxed) {
|
||||
eprintln!("[vpio] capture callback: {} f32 samples", ch.len());
|
||||
}
|
||||
let mut tmp = [0i16; FRAME_SAMPLES];
|
||||
for chunk in ch.chunks(FRAME_SAMPLES) {
|
||||
let n = chunk.len();
|
||||
for i in 0..n {
|
||||
tmp[i] = (chunk[i].clamp(-1.0, 1.0) * i16::MAX as f32) as i16;
|
||||
}
|
||||
cap_ring.write(&tmp[..n]);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.context("failed to set input callback")?;
|
||||
|
||||
// Set up output callback (speaker playback — AEC uses this as reference)
|
||||
let play_ring = playout_ring.clone();
|
||||
au.set_render_callback(
|
||||
move |mut args: render_callback::Args<data::NonInterleaved<f32>>| {
|
||||
let mut buffers = args.data.channels_mut();
|
||||
if let Some(ch) = buffers.next() {
|
||||
let mut tmp = [0i16; FRAME_SAMPLES];
|
||||
for chunk in ch.chunks_mut(FRAME_SAMPLES) {
|
||||
let n = chunk.len();
|
||||
let read = play_ring.read(&mut tmp[..n]);
|
||||
for i in 0..read {
|
||||
chunk[i] = tmp[i] as f32 / i16::MAX as f32;
|
||||
}
|
||||
for i in read..n {
|
||||
chunk[i] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.context("failed to set render callback")?;
|
||||
|
||||
au.initialize().context("failed to initialize VoiceProcessingIO")?;
|
||||
au.start().context("failed to start VoiceProcessingIO")?;
|
||||
|
||||
info!("VoiceProcessingIO started (OS-level AEC enabled)");
|
||||
|
||||
Ok(Self {
|
||||
capture_ring,
|
||||
playout_ring,
|
||||
_audio_unit: au,
|
||||
running,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn capture_ring(&self) -> &Arc<AudioRing> {
|
||||
&self.capture_ring
|
||||
}
|
||||
|
||||
pub fn playout_ring(&self) -> &Arc<AudioRing> {
|
||||
&self.playout_ring
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.running.store(false, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VpioAudio {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
@@ -42,6 +42,9 @@ pub struct CallConfig {
|
||||
/// When enabled, only every 50th frame carries a full 12-byte MediaHeader;
|
||||
/// intermediate frames use a compact 4-byte MiniHeader.
|
||||
pub mini_frames_enabled: bool,
|
||||
/// AEC far-end delay compensation in milliseconds (default: 40).
|
||||
/// Compensates for the round-trip audio latency from playout to mic capture.
|
||||
pub aec_delay_ms: u32,
|
||||
/// Enable adaptive jitter buffer (default: true).
|
||||
///
|
||||
/// When true, the jitter buffer target depth is automatically adjusted
|
||||
@@ -63,6 +66,7 @@ impl Default for CallConfig {
|
||||
noise_suppression: true,
|
||||
mini_frames_enabled: true,
|
||||
adaptive_jitter: true,
|
||||
aec_delay_ms: 40,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -241,7 +245,7 @@ impl CallEncoder {
|
||||
block_id: 0,
|
||||
frame_in_block: 0,
|
||||
timestamp_ms: 0,
|
||||
aec: EchoCanceller::new(48000, 30), // 30ms echo tail (laptop/phone)
|
||||
aec: EchoCanceller::with_delay(48000, 60, config.aec_delay_ms),
|
||||
agc: AutoGainControl::new(),
|
||||
silence_detector: SilenceDetector::new(
|
||||
config.silence_threshold_rms,
|
||||
|
||||
@@ -53,6 +53,8 @@ struct CliArgs {
|
||||
no_fec: bool,
|
||||
no_silence: bool,
|
||||
direct_playout: bool,
|
||||
aec_delay_ms: Option<u32>,
|
||||
os_aec: bool,
|
||||
token: Option<String>,
|
||||
_metrics_file: Option<String>,
|
||||
}
|
||||
@@ -130,6 +132,8 @@ fn parse_args() -> CliArgs {
|
||||
let mut no_fec = false;
|
||||
let mut no_silence = false;
|
||||
let mut direct_playout = false;
|
||||
let mut aec_delay_ms = None;
|
||||
let mut os_aec = false;
|
||||
let mut token = None;
|
||||
let mut metrics_file = None;
|
||||
let mut relay_str = None;
|
||||
@@ -181,6 +185,16 @@ fn parse_args() -> CliArgs {
|
||||
"--no-fec" => no_fec = true,
|
||||
"--no-silence" => no_silence = true,
|
||||
"--direct-playout" | "--android" => direct_playout = true,
|
||||
"--os-aec" => os_aec = true,
|
||||
"--aec-delay" => {
|
||||
i += 1;
|
||||
aec_delay_ms = Some(
|
||||
args.get(i)
|
||||
.expect("--aec-delay requires milliseconds")
|
||||
.parse()
|
||||
.expect("--aec-delay value must be a number"),
|
||||
);
|
||||
}
|
||||
"--alias" => {
|
||||
i += 1;
|
||||
alias = Some(args.get(i).expect("--alias requires a name").to_string());
|
||||
@@ -246,7 +260,9 @@ fn parse_args() -> CliArgs {
|
||||
eprintln!(" --no-fec Disable forward error correction");
|
||||
eprintln!(" --no-silence Disable silence suppression");
|
||||
eprintln!(" --direct-playout Bypass jitter buffer (decode on recv, like Android)");
|
||||
eprintln!(" --android Alias for --no-denoise --no-aec --no-silence --direct-playout");
|
||||
eprintln!(" --aec-delay <ms> AEC far-end delay compensation (default: 40ms)");
|
||||
eprintln!(" --os-aec Use macOS VoiceProcessingIO for hardware AEC (requires --vpio feature)");
|
||||
eprintln!(" --android Alias for --no-denoise --no-silence --direct-playout");
|
||||
eprintln!(" --token <token> featherChat bearer token for relay auth");
|
||||
eprintln!(" --metrics-file <path> Write JSONL telemetry to file (1 line/sec)");
|
||||
eprintln!(" (48kHz mono s16le, play with ffplay -f s16le -ar 48000 -ch_layout mono file.raw)");
|
||||
@@ -292,6 +308,8 @@ fn parse_args() -> CliArgs {
|
||||
no_fec,
|
||||
no_silence,
|
||||
direct_playout,
|
||||
aec_delay_ms,
|
||||
os_aec,
|
||||
token,
|
||||
_metrics_file: metrics_file,
|
||||
}
|
||||
@@ -375,11 +393,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
{
|
||||
let audio_opts = AudioOpts {
|
||||
no_denoise: cli.no_denoise || cli.direct_playout,
|
||||
no_aec: cli.no_aec || cli.direct_playout, // AEC disabled by default — macOS has built-in AEC
|
||||
no_aec: cli.no_aec,
|
||||
no_agc: cli.no_agc,
|
||||
no_fec: cli.no_fec,
|
||||
no_silence: cli.no_silence || cli.direct_playout,
|
||||
direct_playout: cli.direct_playout,
|
||||
aec_delay_ms: cli.aec_delay_ms,
|
||||
os_aec: cli.os_aec,
|
||||
};
|
||||
return run_live(transport, audio_opts).await;
|
||||
}
|
||||
@@ -684,6 +704,8 @@ struct AudioOpts {
|
||||
no_fec: bool,
|
||||
no_silence: bool,
|
||||
direct_playout: bool,
|
||||
aec_delay_ms: Option<u32>,
|
||||
os_aec: bool,
|
||||
}
|
||||
|
||||
#[cfg(feature = "audio")]
|
||||
@@ -697,15 +719,32 @@ async fn run_live(
|
||||
use wzp_client::audio_ring::AudioRing;
|
||||
use wzp_client::call::JitterTelemetry;
|
||||
|
||||
let capture = AudioCapture::start()?;
|
||||
let playback = AudioPlayback::start()?;
|
||||
info!("audio I/O started (lock-free ring buffers) — press Ctrl+C to stop");
|
||||
// Audio I/O: either VPIO (OS-level AEC) or separate CPAL streams.
|
||||
#[cfg(feature = "vpio")]
|
||||
let vpio;
|
||||
let (capture_ring, playout_ring) = if opts.os_aec {
|
||||
#[cfg(feature = "vpio")]
|
||||
{
|
||||
vpio = wzp_client::audio_vpio::VpioAudio::start()?;
|
||||
(vpio.capture_ring().clone(), vpio.playout_ring().clone())
|
||||
}
|
||||
#[cfg(not(feature = "vpio"))]
|
||||
{
|
||||
anyhow::bail!("--os-aec requires the 'vpio' feature (build with: cargo build --features audio,vpio)");
|
||||
}
|
||||
} else {
|
||||
let capture = AudioCapture::start()?;
|
||||
let playback = AudioPlayback::start()?;
|
||||
let cr = capture.ring().clone();
|
||||
let pr = playback.ring().clone();
|
||||
// Keep handles alive (streams stop when dropped)
|
||||
std::mem::forget(capture);
|
||||
std::mem::forget(playback);
|
||||
(cr, pr)
|
||||
};
|
||||
info!(os_aec = opts.os_aec, "audio I/O started — press Ctrl+C to stop");
|
||||
|
||||
let capture_ring = capture.ring().clone();
|
||||
let playout_ring = playback.ring().clone();
|
||||
|
||||
// Far-end reference ring: recv task writes decoded audio here,
|
||||
// send task reads it to feed the AEC echo canceller.
|
||||
// Far-end reference ring (only used when NOT using OS AEC).
|
||||
let farend_ring = StdArc::new(AudioRing::new());
|
||||
|
||||
let running = StdArc::new(AtomicBool::new(true));
|
||||
@@ -728,6 +767,7 @@ async fn run_live(
|
||||
let config = CallConfig {
|
||||
noise_suppression: !opts.no_denoise,
|
||||
suppression_enabled: !opts.no_silence,
|
||||
aec_delay_ms: opts.aec_delay_ms.unwrap_or(40),
|
||||
..CallConfig::default()
|
||||
};
|
||||
{
|
||||
@@ -747,7 +787,7 @@ async fn run_live(
|
||||
let send_transport = transport.clone();
|
||||
let send_running = running.clone();
|
||||
let send_mic_muted = mic_muted.clone();
|
||||
let no_aec = opts.no_aec;
|
||||
let no_aec = opts.no_aec || opts.os_aec; // OS AEC replaces software AEC
|
||||
let no_agc = opts.no_agc;
|
||||
let _no_fec = opts.no_fec;
|
||||
let send_farend = farend_ring.clone();
|
||||
@@ -1075,8 +1115,7 @@ async fn run_live(
|
||||
}
|
||||
|
||||
running.store(false, Ordering::SeqCst);
|
||||
capture.stop();
|
||||
playback.stop();
|
||||
// Audio streams stop when their handles are dropped (via mem::forget above or VPIO drop).
|
||||
|
||||
// Give transport 2s to close gracefully, then bail
|
||||
match tokio::time::timeout(std::time::Duration::from_secs(2), transport.close()).await {
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
pub mod audio_io;
|
||||
#[cfg(feature = "audio")]
|
||||
pub mod audio_ring;
|
||||
#[cfg(feature = "vpio")]
|
||||
pub mod audio_vpio;
|
||||
pub mod bench;
|
||||
pub mod call;
|
||||
pub mod drift_test;
|
||||
|
||||
Reference in New Issue
Block a user