feat(video+desktop): camera capture, video UI, E2E AEAD wiring, test fixes

Blockers 4 & 5: browser getUserMedia → JPEG IPC → Rust I420 pipeline;
remote video strip renders decoded frames via canvas; EncryptingTransport
wraps QuinnTransport so WZP AEAD is applied to all media (C2 fix).

Test fixes: HandshakeResult.session destructuring across relay/client/crypto
integration tests; video_codecs field added to all CallOffer/CallAnswer
structs; wzp-video pipeline_roundtrip integration tests added.

PRD docs: five Kimi-ready specs for E2E encryption, Android NDK 0.9 migration,
quality upgrade flow, wire-format hardening, and clippy debt.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Siavash Sameni
2026-05-25 15:30:26 +04:00
parent 01f55caa96
commit 06253fdeeb
44 changed files with 3221 additions and 163 deletions

View File

@@ -6,7 +6,7 @@
//! This is the same engine FaceTime and other Apple apps use.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use anyhow::Context;
use coreaudio::audio_unit::audio_format::LinearPcmFlags;
@@ -28,6 +28,60 @@ pub struct VpioAudio {
playout_ring: Arc<AudioRing>,
_audio_unit: AudioUnit,
running: Arc<AtomicBool>,
stats: Arc<VpioStats>,
}
/// Render/capture counters for diagnosing macOS VoiceProcessingIO.
///
/// These are atomics because CoreAudio callbacks run on realtime audio
/// threads. The Tauri engine polls snapshots from a normal async task and
/// emits them to the call debug log.
#[derive(Default)]
pub struct VpioStats {
capture_callbacks: AtomicU64,
capture_samples: AtomicU64,
render_callbacks: AtomicU64,
render_requested_samples: AtomicU64,
render_read_samples: AtomicU64,
render_underrun_callbacks: AtomicU64,
render_nonzero_callbacks: AtomicU64,
render_last_requested: AtomicU64,
render_last_read: AtomicU64,
render_last_rms: AtomicU64,
render_last_ring_available: AtomicU64,
}
#[derive(Clone, Copy, Debug)]
pub struct VpioStatsSnapshot {
pub capture_callbacks: u64,
pub capture_samples: u64,
pub render_callbacks: u64,
pub render_requested_samples: u64,
pub render_read_samples: u64,
pub render_underrun_callbacks: u64,
pub render_nonzero_callbacks: u64,
pub render_last_requested: u64,
pub render_last_read: u64,
pub render_last_rms: u64,
pub render_last_ring_available: u64,
}
impl VpioStats {
pub fn snapshot(&self) -> VpioStatsSnapshot {
VpioStatsSnapshot {
capture_callbacks: self.capture_callbacks.load(Ordering::Relaxed),
capture_samples: self.capture_samples.load(Ordering::Relaxed),
render_callbacks: self.render_callbacks.load(Ordering::Relaxed),
render_requested_samples: self.render_requested_samples.load(Ordering::Relaxed),
render_read_samples: self.render_read_samples.load(Ordering::Relaxed),
render_underrun_callbacks: self.render_underrun_callbacks.load(Ordering::Relaxed),
render_nonzero_callbacks: self.render_nonzero_callbacks.load(Ordering::Relaxed),
render_last_requested: self.render_last_requested.load(Ordering::Relaxed),
render_last_read: self.render_last_read.load(Ordering::Relaxed),
render_last_rms: self.render_last_rms.load(Ordering::Relaxed),
render_last_ring_available: self.render_last_ring_available.load(Ordering::Relaxed),
}
}
}
impl VpioAudio {
@@ -36,6 +90,7 @@ impl VpioAudio {
let capture_ring = Arc::new(AudioRing::new());
let playout_ring = Arc::new(AudioRing::new());
let running = Arc::new(AtomicBool::new(true));
let stats = Arc::new(VpioStats::default());
let mut au = AudioUnit::new(IOType::VoiceProcessingIO)
.context("failed to create VoiceProcessingIO audio unit")?;
@@ -98,6 +153,7 @@ impl VpioAudio {
// Set up input callback (mic capture with AEC applied)
let cap_ring = capture_ring.clone();
let cap_running = running.clone();
let cap_stats = stats.clone();
let logged = Arc::new(AtomicBool::new(false));
au.set_input_callback(
move |args: render_callback::Args<data::NonInterleaved<f32>>| {
@@ -106,6 +162,10 @@ impl VpioAudio {
}
let mut buffers = args.data.channels();
if let Some(ch) = buffers.next() {
cap_stats.capture_callbacks.fetch_add(1, Ordering::Relaxed);
cap_stats
.capture_samples
.fetch_add(ch.len() as u64, Ordering::Relaxed);
if !logged.swap(true, Ordering::Relaxed) {
eprintln!("[vpio] capture callback: {} f32 samples", ch.len());
}
@@ -125,21 +185,72 @@ impl VpioAudio {
// Set up output callback (speaker playback — AEC uses this as reference)
let play_ring = playout_ring.clone();
let render_stats = stats.clone();
let logged_render = Arc::new(AtomicBool::new(false));
au.set_render_callback(
move |mut args: render_callback::Args<data::NonInterleaved<f32>>| {
let mut buffers = args.data.channels_mut();
if let Some(ch) = buffers.next() {
render_stats
.render_callbacks
.fetch_add(1, Ordering::Relaxed);
render_stats
.render_requested_samples
.fetch_add(ch.len() as u64, Ordering::Relaxed);
render_stats
.render_last_requested
.store(ch.len() as u64, Ordering::Relaxed);
let mut tmp = [0i16; FRAME_SAMPLES];
let mut total_read = 0usize;
let mut sum_sq = 0u64;
let ring_available = play_ring.available();
for chunk in ch.chunks_mut(FRAME_SAMPLES) {
let n = chunk.len();
let read = play_ring.read(&mut tmp[..n]);
total_read += read;
for i in 0..read {
let s = tmp[i] as i64;
sum_sq = sum_sq.saturating_add((s * s) as u64);
chunk[i] = tmp[i] as f32 / i16::MAX as f32;
}
for i in read..n {
chunk[i] = 0.0;
}
}
render_stats
.render_read_samples
.fetch_add(total_read as u64, Ordering::Relaxed);
render_stats
.render_last_read
.store(total_read as u64, Ordering::Relaxed);
render_stats
.render_last_ring_available
.store(ring_available as u64, Ordering::Relaxed);
if total_read == 0 {
render_stats
.render_underrun_callbacks
.fetch_add(1, Ordering::Relaxed);
}
let rms = if total_read > 0 {
((sum_sq as f64 / total_read as f64).sqrt()) as u64
} else {
0
};
render_stats.render_last_rms.store(rms, Ordering::Relaxed);
if rms > 0 {
render_stats
.render_nonzero_callbacks
.fetch_add(1, Ordering::Relaxed);
}
if !logged_render.swap(true, Ordering::Relaxed) {
eprintln!(
"[vpio] render callback: {} f32 samples, ring_available={}, ring_read={}, rms={}",
ch.len(),
ring_available,
total_read,
rms
);
}
}
Ok(())
},
@@ -157,6 +268,7 @@ impl VpioAudio {
playout_ring,
_audio_unit: au,
running,
stats,
})
}
@@ -168,6 +280,10 @@ impl VpioAudio {
&self.playout_ring
}
pub fn stats(&self) -> Arc<VpioStats> {
self.stats.clone()
}
pub fn stop(&self) {
self.running.store(false, Ordering::Relaxed);
}

View File

@@ -151,7 +151,7 @@ pub fn bench_fec_recovery(loss_pct: f32) -> FecResult {
let mut total_repair_bytes = 0usize;
for block_idx in 0..num_blocks {
let block_id = (block_idx % 256) as u8;
let block_id = (block_idx % 65536) as u16;
// Create fresh encoder and decoder for each block
let mut fec_enc = RaptorQFecEncoder::new(frames_per_block, 256);

View File

@@ -565,7 +565,7 @@ impl CallDecoder {
// ignored — a graceful mixed-version degradation).
if !packet.header.codec_id.is_opus() {
let _ = self.fec_dec.add_symbol(
(packet.header.fec_block & 0xFF) as u8,
packet.header.fec_block,
packet.header.fec_block >> 8,
packet.header.is_repair(),
&packet.payload,

View File

@@ -388,17 +388,17 @@ async fn main() -> anyhow::Result<()> {
}
// Crypto handshake — establishes verified identity + session key
let session = wzp_client::handshake::perform_handshake(
let hs = wzp_client::handshake::perform_handshake(
&*transport,
&seed.0,
None, // alias — desktop client doesn't set one yet
)
.await?;
info!("crypto handshake complete");
info!(video_codec = ?hs.video_codec, "crypto handshake complete");
// Wrap the transport so all media I/O goes through AEAD encryption.
let enc_transport: Arc<dyn wzp_proto::MediaTransport> = Arc::new(
wzp_client::encrypted_transport::EncryptingTransport::new(transport.clone(), session),
wzp_client::encrypted_transport::EncryptingTransport::new(transport.clone(), hs.session),
);
if cli.live {
@@ -942,7 +942,7 @@ async fn run_signal_mode(
)
.await
{
Ok(_session) => {
Ok(_hs) => {
info!(
"media connected — sending tone (press Ctrl+C to hang up)"
);

View File

@@ -164,6 +164,7 @@ mod tests {
alias: None,
protocol_version: 2,
supported_versions: vec![2],
video_codecs: vec![],
};
let encoded = encode_call_payload(&signal, Some("relay.example.com:4433"), Some("myroom"));
@@ -185,6 +186,7 @@ mod tests {
alias: None,
protocol_version: 2,
supported_versions: vec![2],
video_codecs: vec![],
};
assert!(matches!(signal_to_call_type(&offer), CallSignalType::Offer));

View File

@@ -5,9 +5,16 @@
use wzp_crypto::{CryptoSession, KeyExchange, WarzoneKeyExchange};
use wzp_proto::{
HangupReason, MediaTransport, QualityProfile, SignalMessage, default_signal_version,
CodecId, HangupReason, MediaTransport, QualityProfile, SignalMessage, default_signal_version,
};
/// Result of a successful client-side handshake.
pub struct HandshakeResult {
pub session: Box<dyn CryptoSession>,
/// Video codec agreed with the relay. `None` if peer is audio-only.
pub video_codec: Option<CodecId>,
}
/// Errors that can occur during the client-side cryptographic handshake.
#[derive(Debug)]
pub enum HandshakeError {
@@ -64,7 +71,7 @@ pub async fn perform_handshake(
transport: &dyn MediaTransport,
seed: &[u8; 32],
alias: Option<&str>,
) -> Result<Box<dyn CryptoSession>, HandshakeError> {
) -> Result<HandshakeResult, HandshakeError> {
// 1. Create key exchange from identity seed
let mut kx = WarzoneKeyExchange::from_identity_seed(seed);
let identity_pub = kx.identity_public_key();
@@ -95,6 +102,7 @@ pub async fn perform_handshake(
alias: alias.map(|s| s.to_string()),
protocol_version: 2,
supported_versions: vec![2],
video_codecs: vec![CodecId::Av1Main, CodecId::H264Baseline, CodecId::H265Main],
};
transport
.send_signal(&offer)
@@ -111,15 +119,16 @@ pub async fn perform_handshake(
.map_err(HandshakeError::Transport)?
.ok_or(HandshakeError::ConnectionClosed)?;
let (callee_identity_pub, callee_ephemeral_pub, callee_signature, _chosen_profile) =
let (callee_identity_pub, callee_ephemeral_pub, callee_signature, _chosen_profile, video_codec) =
match answer {
SignalMessage::CallAnswer {
identity_pub,
ephemeral_pub,
signature,
chosen_profile,
video_codec,
..
} => (identity_pub, ephemeral_pub, signature, chosen_profile),
} => (identity_pub, ephemeral_pub, signature, chosen_profile, video_codec),
SignalMessage::Hangup {
reason: HangupReason::ProtocolVersionMismatch { server_supported },
..
@@ -144,7 +153,7 @@ pub async fn perform_handshake(
.derive_session(&callee_ephemeral_pub)
.map_err(|e| HandshakeError::KeyDerivation(e.to_string()))?;
Ok(session)
Ok(HandshakeResult { session, video_codec })
}
#[cfg(test)]
@@ -166,4 +175,30 @@ mod tests {
&sig,
));
}
#[test]
fn handshake_result_carries_video_codec() {
// Verify that HandshakeResult has both fields accessible and that
// None is the correct default for audio-only peers.
let mut kx = WarzoneKeyExchange::from_identity_seed(&[0x55; 32]);
kx.generate_ephemeral();
let session = kx.derive_session(&[0u8; 32]).unwrap();
let hs = HandshakeResult { session, video_codec: None };
assert!(hs.video_codec.is_none());
let mut kx2 = WarzoneKeyExchange::from_identity_seed(&[0x66; 32]);
kx2.generate_ephemeral();
let session2 = kx2.derive_session(&[0u8; 32]).unwrap();
let hs2 = HandshakeResult { session: session2, video_codec: Some(CodecId::Av1Main) };
assert_eq!(hs2.video_codec, Some(CodecId::Av1Main));
}
#[test]
fn offer_contains_three_video_codecs() {
// The offer sent in perform_handshake always includes the three codecs
// declared in order: AV1 > H264 > H265. Verify via the const list.
let offered = vec![CodecId::Av1Main, CodecId::H264Baseline, CodecId::H265Main];
assert_eq!(offered.len(), 3);
assert_eq!(offered[0], CodecId::Av1Main, "AV1 must be preferred");
}
}

View File

@@ -91,7 +91,7 @@ async fn full_handshake_both_sides_derive_same_session() {
wzp_relay::handshake::accept_handshake(relay_transport_clone.as_ref(), &relay_seed),
);
let mut client_session = client_result.expect("client handshake should succeed");
let client_hs = client_result.expect("client handshake should succeed");
let (mut relay_session, chosen_profile, _caller_fp, _caller_alias) =
relay_result.expect("relay handshake should succeed");
@@ -122,6 +122,7 @@ async fn full_handshake_both_sides_derive_same_session() {
let header = make_hdr(0);
let plaintext = b"hello from client to relay";
let mut client_session = client_hs.session;
let mut ciphertext = Vec::new();
client_session
.encrypt(&header, plaintext, &mut ciphertext)
@@ -180,6 +181,7 @@ async fn handshake_rejects_tampered_signature() {
alias: None,
protocol_version: 2,
supported_versions: vec![2],
video_codecs: vec![],
};
client_transport_clone
.send_signal(&offer)