fix(audio+net): revert dual-stack [::]:0, add Oboe playout stall auto-restart

Two fixes:

## Revert [::]:0 dual-stack sockets → back to 0.0.0.0:0

Some Android kernels default to IPV6_V6ONLY=1 (confirmed on
Nothing Phone), which makes a socket bound to [::]:0 IPv6-only
and silently drops ALL IPv4 traffic. This broke P2P direct
calls: IPv4 LAN candidates (172.16.81.x) couldn't complete QUIC
handshakes through the IPv6-only socket, causing
local_direct_ok=false and relay fallback on every call after
the first.

Reverted all bind sites to 0.0.0.0:0 (reliable IPv4). IPv6 host
candidates are disabled in local_host_candidates() until a
proper dual-socket approach (one IPv4 + one IPv6 endpoint,
Phase 7) is implemented.

## Fix A (task #35): Oboe playout callback stall auto-restart

The Nothing Phone's Oboe playout callback fires once (cb#0) and
then stops draining the ring on ~50% of cold-launch calls. Fix
D+C (stop+prime from previous commit) didn't help because
audio_stop is a no-op on cold launch.

New approach: a self-healing watchdog in audio_write_playout
that tracks the playout ring's read_idx across writes. If
read_idx hasn't advanced for more than 50 consecutive writes
(~1 second at 50 writes/sec), the Oboe playout callback is
assumed to have stopped, and the watchdog:

1. Log "playout STALL detected"
2. Call wzp_oboe_stop() to tear down the stuck streams
3. Clear both ring buffers (prevent stale data reads)
4. Call wzp_oboe_start() to rebuild fresh streams
5. Log success/failure
6. Return 0 (caller retries on next frame)

This is the same teardown+rebuild that "rejoin" does — but
triggered automatically during the first stalled call instead
of requiring the user to hang up and redial. The watchdog runs
on every write, so it fires within about 1s of the stall
starting.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Siavash Sameni
2026-04-12 11:24:16 +04:00
parent 9fb92967eb
commit aee41a638d
4 changed files with 114 additions and 44 deletions

View File

@@ -170,14 +170,14 @@ pub async fn race(
}
None => {
let (sc, _cert_der) = wzp_transport::server_config();
// [::]:0 = dual-stack socket — handles both IPv4 (via
// v4-mapped addrs) and IPv6 natively. Pre-Phase-5.5
// used 0.0.0.0:0 (IPv4-only) which silently made
// all IPv6 host candidates non-functional: dials
// to [2a0d:...] failed or hung, accepts from IPv6
// peers never arrived, and the JoinSet wasted time
// on dead candidates before the IPv4 one won.
let bind: SocketAddr = "[::]:0".parse().unwrap();
// 0.0.0.0:0 = IPv4 socket. [::]:0 dual-stack was
// tried but breaks on Android devices where
// IPV6_V6ONLY=1 (default on some kernels) —
// IPv4 candidates silently fail. IPv6 host
// candidates are skipped for now; they need a
// dedicated IPv6 socket alongside the v4 one
// (like WebRTC's dual-socket approach).
let bind: SocketAddr = "0.0.0.0:0".parse().unwrap();
let fresh = wzp_transport::create_endpoint(bind, Some(sc))?;
tracing::info!(
local_addr = ?fresh.local_addr().ok(),
@@ -213,14 +213,14 @@ pub async fn race(
ep
}
None => {
// [::]:0 = dual-stack socket — handles both IPv4 (via
// v4-mapped addrs) and IPv6 natively. Pre-Phase-5.5
// used 0.0.0.0:0 (IPv4-only) which silently made
// all IPv6 host candidates non-functional: dials
// to [2a0d:...] failed or hung, accepts from IPv6
// peers never arrived, and the JoinSet wasted time
// on dead candidates before the IPv4 one won.
let bind: SocketAddr = "[::]:0".parse().unwrap();
// 0.0.0.0:0 = IPv4 socket. [::]:0 dual-stack was
// tried but breaks on Android devices where
// IPV6_V6ONLY=1 (default on some kernels) —
// IPv4 candidates silently fail. IPv6 host
// candidates are skipped for now; they need a
// dedicated IPv6 socket alongside the v4 one
// (like WebRTC's dual-socket approach).
let bind: SocketAddr = "0.0.0.0:0".parse().unwrap();
let fresh = wzp_transport::create_endpoint(bind, None)?;
tracing::info!(
local_addr = ?fresh.local_addr().ok(),

View File

@@ -102,8 +102,7 @@ pub async fn probe_reflect_addr(
let endpoint = match existing_endpoint {
Some(ep) => ep,
None => {
// [::]:0 = dual-stack socket for both IPv4 + IPv6
let bind: SocketAddr = "[::]:0".parse().unwrap();
let bind: SocketAddr = "0.0.0.0:0".parse().unwrap();
create_endpoint(bind, None).map_err(|e| format!("endpoint: {e}"))?
}
};
@@ -320,25 +319,20 @@ pub fn local_host_candidates(port: u16) -> Vec<SocketAddr> {
out.push(SocketAddr::new(std::net::IpAddr::V4(v4), port));
}
}
std::net::IpAddr::V6(v6) => {
if v6.is_loopback() || v6.is_unspecified() {
continue;
}
// Link-local (fe80::/10) — skip because it needs
// a zone/scope ID to be usable and that scope is
// meaningless to the peer.
let first = v6.segments()[0];
if (first & 0xffc0) == 0xfe80 {
continue;
}
// Include everything else: ULA (fc00::/7, high
// bits 0xfc00/0xfd00) and global unicast
// (2000::/3, first segment 0x2000-0x3fff). Both
// are directly dialable from a peer on the same
// dual-stack LAN, and on Starlink / most modern
// ISPs the IPv6 path usually has no CGNAT and
// works even when the v4 path doesn't hairpin.
out.push(SocketAddr::new(std::net::IpAddr::V6(v6), port));
std::net::IpAddr::V6(_v6) => {
// IPv6 host candidates are disabled until we add
// a dedicated IPv6 socket alongside the IPv4 one.
// Android's IPV6_V6ONLY=1 default on some kernels
// makes [::]:0 dual-stack unreliable — IPv4 dials
// silently fail. Advertising IPv6 addrs from an
// IPv4-only socket wastes JoinSet slots and adds
// timeout delays before the working IPv4 candidate
// gets picked.
//
// TODO: Phase 7 — create a second quinn::Endpoint
// on [::]:0 for IPv6-only dials, run them alongside
// the IPv4 JoinSet. This gives true dual-stack ICE
// without the v4-mapped-address fragility.
}
}
}

View File

@@ -174,6 +174,13 @@ struct AudioBackend {
started: std::sync::Mutex<bool>,
/// Per-write logging throttle counter for wzp_native_audio_write_playout.
playout_write_log_count: std::sync::atomic::AtomicU64,
/// Fix A (task #35): the playout ring's read_idx at the last
/// check. If audio_write_playout observes read_idx hasn't
/// advanced after N writes, the Oboe playout callback has
/// stopped firing → restart the streams.
playout_last_read_idx: std::sync::atomic::AtomicI32,
/// Number of writes since the last read_idx advance.
playout_stall_writes: std::sync::atomic::AtomicU32,
}
static BACKEND: OnceLock<&'static AudioBackend> = OnceLock::new();
@@ -185,6 +192,8 @@ fn backend() -> &'static AudioBackend {
playout: RingBuffer::new(RING_CAPACITY),
started: std::sync::Mutex::new(false),
playout_write_log_count: std::sync::atomic::AtomicU64::new(0),
playout_last_read_idx: std::sync::atomic::AtomicI32::new(0),
playout_stall_writes: std::sync::atomic::AtomicU32::new(0),
}))
})
}
@@ -262,6 +271,76 @@ pub unsafe extern "C" fn wzp_native_audio_write_playout(input: *const i16, in_le
}
let slice = unsafe { std::slice::from_raw_parts(input, in_len) };
let b = backend();
// Fix A (task #35): detect playout callback stall. If the
// playout ring's read_idx hasn't advanced in 50+ writes
// (~1 second at 50 writes/sec), the Oboe playout callback
// has stopped firing → restart the streams. This is the
// self-healing behavior that makes rejoin work: teardown +
// rebuild clears whatever HAL state locked up the callback.
let current_read_idx = b.playout.read_idx.load(std::sync::atomic::Ordering::Relaxed);
let last_read_idx = b.playout_last_read_idx.load(std::sync::atomic::Ordering::Relaxed);
if current_read_idx == last_read_idx {
let stall = b.playout_stall_writes.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if stall >= 50 {
// Callback hasn't drained anything in ~1 second.
// Force a stream restart.
unsafe {
android_log("playout STALL detected (50 writes, read_idx unchanged) — restarting Oboe streams");
}
b.playout_stall_writes.store(0, std::sync::atomic::Ordering::Relaxed);
// Release the started lock, stop, re-start.
// This is the same logic as the Rust-side
// audio_stop() + audio_start() but done inline
// because we can't call the extern "C" fns
// recursively. Just call the C++ side directly.
{
if let Ok(mut started) = b.started.lock() {
if *started {
unsafe { wzp_oboe_stop() };
*started = false;
}
}
}
// Clear the rings so the restart doesn't read stale data
b.playout.write_idx.store(0, std::sync::atomic::Ordering::Relaxed);
b.playout.read_idx.store(0, std::sync::atomic::Ordering::Relaxed);
b.capture.write_idx.store(0, std::sync::atomic::Ordering::Relaxed);
b.capture.read_idx.store(0, std::sync::atomic::Ordering::Relaxed);
// Re-start
let config = WzpOboeConfig {
sample_rate: 48_000,
frames_per_burst: FRAME_SAMPLES as i32,
channel_count: 1,
};
let rings = WzpOboeRings {
capture_buf: b.capture.buf_ptr(),
capture_capacity: b.capture.capacity as i32,
capture_write_idx: b.capture.write_idx_ptr(),
capture_read_idx: b.capture.read_idx_ptr(),
playout_buf: b.playout.buf_ptr(),
playout_capacity: b.playout.capacity as i32,
playout_write_idx: b.playout.write_idx_ptr(),
playout_read_idx: b.playout.read_idx_ptr(),
};
let ret = unsafe { wzp_oboe_start(&config, &rings) };
if ret == 0 {
if let Ok(mut started) = b.started.lock() {
*started = true;
}
unsafe { android_log("playout restart OK — Oboe streams rebuilt"); }
} else {
unsafe { android_log(&format!("playout restart FAILED: {ret}")); }
}
b.playout_last_read_idx.store(0, std::sync::atomic::Ordering::Relaxed);
return 0; // caller will retry on next frame
}
} else {
// read_idx advanced — callback is alive, reset counter
b.playout_stall_writes.store(0, std::sync::atomic::Ordering::Relaxed);
b.playout_last_read_idx.store(current_read_idx, std::sync::atomic::Ordering::Relaxed);
}
let before_w = b.playout.write_idx.load(std::sync::atomic::Ordering::Relaxed);
let before_r = b.playout.read_idx.load(std::sync::atomic::Ordering::Relaxed);
let written = b.playout.write(slice);