fix(audio+net): revert dual-stack [::]:0, add Oboe playout stall auto-restart
Two fixes:

## Revert [::]:0 dual-stack sockets → back to 0.0.0.0:0

Android's IPV6_V6ONLY=1 default on some kernels (confirmed on Nothing Phone) makes [::]:0 IPv6-only, silently killing ALL IPv4 traffic. This broke P2P direct calls: IPv4 LAN candidates (172.16.81.x) couldn't complete QUIC handshakes through the IPv6-only socket, causing local_direct_ok=false and relay fallback on every call after the first.

Reverted all bind sites to 0.0.0.0:0 (reliable IPv4). IPv6 host candidates are disabled in local_host_candidates() until a proper dual-socket approach (one IPv4 + one IPv6 endpoint, Phase 7) is implemented.

## Fix A (task #35): Oboe playout callback stall auto-restart

The Nothing Phone's Oboe playout callback fires once (cb#0) and then stops draining the ring on ~50% of cold-launch calls. Fix D+C (stop+prime from previous commit) didn't help because audio_stop is a no-op on cold launch.

New approach: a self-healing watchdog in audio_write_playout. It tracks the playout ring's read_idx across writes. If read_idx hasn't advanced in 50 consecutive writes (~1 second), the Oboe playout callback has stopped, and the watchdog does the following:

1. Log "playout STALL detected"
2. Call wzp_oboe_stop() to tear down the stuck streams
3. Clear both ring buffers (prevent stale data reads)
4. Call wzp_oboe_start() to rebuild fresh streams
5. Log success/failure
6. Return 0 (caller retries on next frame)

This is the same teardown+rebuild that "rejoin" does — but triggered automatically from the first stalled call instead of requiring the user to hang up and redial. The watchdog runs on every write so it fires within 1s of the stall starting.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -170,14 +170,14 @@ pub async fn race(
|
||||
}
|
||||
None => {
|
||||
let (sc, _cert_der) = wzp_transport::server_config();
|
||||
// [::]:0 = dual-stack socket — handles both IPv4 (via
|
||||
// v4-mapped addrs) and IPv6 natively. Pre-Phase-5.5
|
||||
// used 0.0.0.0:0 (IPv4-only) which silently made
|
||||
// all IPv6 host candidates non-functional: dials
|
||||
// to [2a0d:...] failed or hung, accepts from IPv6
|
||||
// peers never arrived, and the JoinSet wasted time
|
||||
// on dead candidates before the IPv4 one won.
|
||||
let bind: SocketAddr = "[::]:0".parse().unwrap();
|
||||
// 0.0.0.0:0 = IPv4 socket. [::]:0 dual-stack was
|
||||
// tried but breaks on Android devices where
|
||||
// IPV6_V6ONLY=1 (default on some kernels) —
|
||||
// IPv4 candidates silently fail. IPv6 host
|
||||
// candidates are skipped for now; they need a
|
||||
// dedicated IPv6 socket alongside the v4 one
|
||||
// (like WebRTC's dual-socket approach).
|
||||
let bind: SocketAddr = "0.0.0.0:0".parse().unwrap();
|
||||
let fresh = wzp_transport::create_endpoint(bind, Some(sc))?;
|
||||
tracing::info!(
|
||||
local_addr = ?fresh.local_addr().ok(),
|
||||
@@ -213,14 +213,14 @@ pub async fn race(
|
||||
ep
|
||||
}
|
||||
None => {
|
||||
// [::]:0 = dual-stack socket — handles both IPv4 (via
|
||||
// v4-mapped addrs) and IPv6 natively. Pre-Phase-5.5
|
||||
// used 0.0.0.0:0 (IPv4-only) which silently made
|
||||
// all IPv6 host candidates non-functional: dials
|
||||
// to [2a0d:...] failed or hung, accepts from IPv6
|
||||
// peers never arrived, and the JoinSet wasted time
|
||||
// on dead candidates before the IPv4 one won.
|
||||
let bind: SocketAddr = "[::]:0".parse().unwrap();
|
||||
// 0.0.0.0:0 = IPv4 socket. [::]:0 dual-stack was
|
||||
// tried but breaks on Android devices where
|
||||
// IPV6_V6ONLY=1 (default on some kernels) —
|
||||
// IPv4 candidates silently fail. IPv6 host
|
||||
// candidates are skipped for now; they need a
|
||||
// dedicated IPv6 socket alongside the v4 one
|
||||
// (like WebRTC's dual-socket approach).
|
||||
let bind: SocketAddr = "0.0.0.0:0".parse().unwrap();
|
||||
let fresh = wzp_transport::create_endpoint(bind, None)?;
|
||||
tracing::info!(
|
||||
local_addr = ?fresh.local_addr().ok(),
|
||||
|
||||
@@ -102,8 +102,7 @@ pub async fn probe_reflect_addr(
|
||||
let endpoint = match existing_endpoint {
|
||||
Some(ep) => ep,
|
||||
None => {
|
||||
// [::]:0 = dual-stack socket for both IPv4 + IPv6
|
||||
let bind: SocketAddr = "[::]:0".parse().unwrap();
|
||||
let bind: SocketAddr = "0.0.0.0:0".parse().unwrap();
|
||||
create_endpoint(bind, None).map_err(|e| format!("endpoint: {e}"))?
|
||||
}
|
||||
};
|
||||
@@ -320,25 +319,20 @@ pub fn local_host_candidates(port: u16) -> Vec<SocketAddr> {
|
||||
out.push(SocketAddr::new(std::net::IpAddr::V4(v4), port));
|
||||
}
|
||||
}
|
||||
std::net::IpAddr::V6(v6) => {
|
||||
if v6.is_loopback() || v6.is_unspecified() {
|
||||
continue;
|
||||
}
|
||||
// Link-local (fe80::/10) — skip because it needs
|
||||
// a zone/scope ID to be usable and that scope is
|
||||
// meaningless to the peer.
|
||||
let first = v6.segments()[0];
|
||||
if (first & 0xffc0) == 0xfe80 {
|
||||
continue;
|
||||
}
|
||||
// Include everything else: ULA (fc00::/7, high
|
||||
// bits 0xfc00/0xfd00) and global unicast
|
||||
// (2000::/3, first segment 0x2000-0x3fff). Both
|
||||
// are directly dialable from a peer on the same
|
||||
// dual-stack LAN, and on Starlink / most modern
|
||||
// ISPs the IPv6 path usually has no CGNAT and
|
||||
// works even when the v4 path doesn't hairpin.
|
||||
out.push(SocketAddr::new(std::net::IpAddr::V6(v6), port));
|
||||
std::net::IpAddr::V6(_v6) => {
|
||||
// IPv6 host candidates are disabled until we add
|
||||
// a dedicated IPv6 socket alongside the IPv4 one.
|
||||
// Android's IPV6_V6ONLY=1 default on some kernels
|
||||
// makes [::]:0 dual-stack unreliable — IPv4 dials
|
||||
// silently fail. Advertising IPv6 addrs from an
|
||||
// IPv4-only socket wastes JoinSet slots and adds
|
||||
// timeout delays before the working IPv4 candidate
|
||||
// gets picked.
|
||||
//
|
||||
// TODO: Phase 7 — create a second quinn::Endpoint
|
||||
// on [::]:0 for IPv6-only dials, run them alongside
|
||||
// the IPv4 JoinSet. This gives true dual-stack ICE
|
||||
// without the v4-mapped-address fragility.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,6 +174,13 @@ struct AudioBackend {
|
||||
started: std::sync::Mutex<bool>,
|
||||
/// Per-write logging throttle counter for wzp_native_audio_write_playout.
|
||||
playout_write_log_count: std::sync::atomic::AtomicU64,
|
||||
/// Fix A (task #35): the playout ring's read_idx at the last
|
||||
/// check. If audio_write_playout observes read_idx hasn't
|
||||
/// advanced after N writes, the Oboe playout callback has
|
||||
/// stopped firing → restart the streams.
|
||||
playout_last_read_idx: std::sync::atomic::AtomicI32,
|
||||
/// Number of writes since the last read_idx advance.
|
||||
playout_stall_writes: std::sync::atomic::AtomicU32,
|
||||
}
|
||||
|
||||
static BACKEND: OnceLock<&'static AudioBackend> = OnceLock::new();
|
||||
@@ -185,6 +192,8 @@ fn backend() -> &'static AudioBackend {
|
||||
playout: RingBuffer::new(RING_CAPACITY),
|
||||
started: std::sync::Mutex::new(false),
|
||||
playout_write_log_count: std::sync::atomic::AtomicU64::new(0),
|
||||
playout_last_read_idx: std::sync::atomic::AtomicI32::new(0),
|
||||
playout_stall_writes: std::sync::atomic::AtomicU32::new(0),
|
||||
}))
|
||||
})
|
||||
}
|
||||
@@ -262,6 +271,76 @@ pub unsafe extern "C" fn wzp_native_audio_write_playout(input: *const i16, in_le
|
||||
}
|
||||
let slice = unsafe { std::slice::from_raw_parts(input, in_len) };
|
||||
let b = backend();
|
||||
|
||||
// Fix A (task #35): detect playout callback stall. If the
|
||||
// playout ring's read_idx hasn't advanced in 50+ writes
|
||||
// (~1 second at 50 writes/sec), the Oboe playout callback
|
||||
// has stopped firing → restart the streams. This is the
|
||||
// self-healing behavior that makes rejoin work: teardown +
|
||||
// rebuild clears whatever HAL state locked up the callback.
|
||||
let current_read_idx = b.playout.read_idx.load(std::sync::atomic::Ordering::Relaxed);
|
||||
let last_read_idx = b.playout_last_read_idx.load(std::sync::atomic::Ordering::Relaxed);
|
||||
if current_read_idx == last_read_idx {
|
||||
let stall = b.playout_stall_writes.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
if stall >= 50 {
|
||||
// Callback hasn't drained anything in ~1 second.
|
||||
// Force a stream restart.
|
||||
unsafe {
|
||||
android_log("playout STALL detected (50 writes, read_idx unchanged) — restarting Oboe streams");
|
||||
}
|
||||
b.playout_stall_writes.store(0, std::sync::atomic::Ordering::Relaxed);
|
||||
// Release the started lock, stop, re-start.
|
||||
// This is the same logic as the Rust-side
|
||||
// audio_stop() + audio_start() but done inline
|
||||
// because we can't call the extern "C" fns
|
||||
// recursively. Just call the C++ side directly.
|
||||
{
|
||||
if let Ok(mut started) = b.started.lock() {
|
||||
if *started {
|
||||
unsafe { wzp_oboe_stop() };
|
||||
*started = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Clear the rings so the restart doesn't read stale data
|
||||
b.playout.write_idx.store(0, std::sync::atomic::Ordering::Relaxed);
|
||||
b.playout.read_idx.store(0, std::sync::atomic::Ordering::Relaxed);
|
||||
b.capture.write_idx.store(0, std::sync::atomic::Ordering::Relaxed);
|
||||
b.capture.read_idx.store(0, std::sync::atomic::Ordering::Relaxed);
|
||||
// Re-start
|
||||
let config = WzpOboeConfig {
|
||||
sample_rate: 48_000,
|
||||
frames_per_burst: FRAME_SAMPLES as i32,
|
||||
channel_count: 1,
|
||||
};
|
||||
let rings = WzpOboeRings {
|
||||
capture_buf: b.capture.buf_ptr(),
|
||||
capture_capacity: b.capture.capacity as i32,
|
||||
capture_write_idx: b.capture.write_idx_ptr(),
|
||||
capture_read_idx: b.capture.read_idx_ptr(),
|
||||
playout_buf: b.playout.buf_ptr(),
|
||||
playout_capacity: b.playout.capacity as i32,
|
||||
playout_write_idx: b.playout.write_idx_ptr(),
|
||||
playout_read_idx: b.playout.read_idx_ptr(),
|
||||
};
|
||||
let ret = unsafe { wzp_oboe_start(&config, &rings) };
|
||||
if ret == 0 {
|
||||
if let Ok(mut started) = b.started.lock() {
|
||||
*started = true;
|
||||
}
|
||||
unsafe { android_log("playout restart OK — Oboe streams rebuilt"); }
|
||||
} else {
|
||||
unsafe { android_log(&format!("playout restart FAILED: {ret}")); }
|
||||
}
|
||||
b.playout_last_read_idx.store(0, std::sync::atomic::Ordering::Relaxed);
|
||||
return 0; // caller will retry on next frame
|
||||
}
|
||||
} else {
|
||||
// read_idx advanced — callback is alive, reset counter
|
||||
b.playout_stall_writes.store(0, std::sync::atomic::Ordering::Relaxed);
|
||||
b.playout_last_read_idx.store(current_read_idx, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
let before_w = b.playout.write_idx.load(std::sync::atomic::Ordering::Relaxed);
|
||||
let before_r = b.playout.read_idx.load(std::sync::atomic::Ordering::Relaxed);
|
||||
let written = b.playout.write(slice);
|
||||
|
||||
@@ -914,13 +914,10 @@ fn do_register_signal(
|
||||
// endpoints, which made MikroTik look symmetric and broke direct
|
||||
// P2P because the advertised reflex port was not the listening
|
||||
// port.
|
||||
// [::]:0 = dual-stack socket — handles IPv4 (via ::ffff:x.x.x.x
|
||||
// mapped addresses) AND native IPv6 on one socket. Critical for
|
||||
// Phase 5.5 ICE host candidates: without dual-stack, the IPv6
|
||||
// candidates advertised in DirectCallOffer/Answer are dead on
|
||||
// arrival — the Dialer can't send to them and the Acceptor can't
|
||||
// receive from them.
|
||||
let bind: std::net::SocketAddr = "[::]:0".parse().unwrap();
|
||||
// 0.0.0.0:0 = IPv4. [::]:0 dual-stack was tried but breaks on
|
||||
// Android (IPV6_V6ONLY=1 on some kernels kills IPv4). IPv6
|
||||
// host candidates need a separate dedicated socket (future).
|
||||
let bind: std::net::SocketAddr = "0.0.0.0:0".parse().unwrap();
|
||||
let (server_cfg, _cert_der) = wzp_transport::server_config();
|
||||
let endpoint = wzp_transport::create_endpoint(bind, Some(server_cfg))
|
||||
.map_err(|e| format!("{e}"))?;
|
||||
|
||||
Reference in New Issue
Block a user