From 49f101d78508557d4754d483db0c5609f4b8b200 Mon Sep 17 00:00:00 2001 From: Siavash Sameni Date: Thu, 9 Apr 2026 20:29:51 +0400 Subject: [PATCH] fix(android): reuse signal endpoint for direct-call media connection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Direct-call accept hangs forever at the QUIC handshake on Android. Logs from d7b37a5 showed: CallEngine::start (android) invoked relay=172.16.81.172:4433 room=call-… resolved relay addr identity loaded endpoint created, dialing relay ← reached ← nothing, 90s+, no error The "connect failed" and "QUIC connection established" log lines never fire, meaning endpoint.connect_with(…).await never makes progress. Repro is 100%: SFU room join (one endpoint) works perfectly; direct call (opens a SECOND quinn::Endpoint on top of the signal one) hangs in the QUIC handshake. Creating two quinn::Endpoints on Android's AAudio-adjacent UDP stack apparently causes the second one's datagrams to never reach the relay (the server never sees the Initial packet). Rather than fight the platform, we rely on the fact that quinn is happy to multiplex multiple Connections on a single Endpoint, and reuse the signal endpoint for the media connection. - SignalState now stores the quinn::Endpoint alongside the QuinnTransport. register_signal populates both at the same time. - CallEngine::start (both android and desktop branches) takes an Option<Endpoint>. Some → reuse (direct-call path, after register_signal). None → create fresh (SFU room join path). - The connect tauri command reads state.signal.endpoint and threads it through to CallEngine::start, so the direct-call auto-connect (fired by the "setup" signal-event in main.ts) lands on the existing UDP socket. - wzp_transport re-exports quinn::Endpoint so wzp-desktop doesn't need to depend on quinn directly. - Also wraps the android connect in tokio::time::timeout(10s) so future hangs become deterministic "connect TIMED OUT" errors in logcat instead of silent deadlock. 
Same fix applies verbatim to the desktop client — the user suspects direct call is broken there too and this was likely always the cause, just never surfaced because desktop was only tested via SFU rooms. --- crates/wzp-transport/src/lib.rs | 5 +++ desktop/src-tauri/src/engine.rs | 54 ++++++++++++++++++++++++++------- desktop/src-tauri/src/lib.rs | 20 ++++++++++-- 3 files changed, 65 insertions(+), 14 deletions(-) diff --git a/crates/wzp-transport/src/lib.rs b/crates/wzp-transport/src/lib.rs index 4034701..0e58b43 100644 --- a/crates/wzp-transport/src/lib.rs +++ b/crates/wzp-transport/src/lib.rs @@ -27,3 +27,8 @@ pub use connection::{accept, connect, create_endpoint}; pub use path_monitor::PathMonitor; pub use quic::QuinnTransport; pub use wzp_proto::{MediaTransport, PathQuality, TransportError}; + +// Re-export the quinn Endpoint type so downstream crates (wzp-desktop) can +// thread a shared endpoint between signaling and media connections without +// needing to depend on quinn directly. +pub use quinn::Endpoint; diff --git a/desktop/src-tauri/src/engine.rs b/desktop/src-tauri/src/engine.rs index 29e9d8c..9951275 100644 --- a/desktop/src-tauri/src/engine.rs +++ b/desktop/src-tauri/src/engine.rs @@ -105,12 +105,13 @@ impl CallEngine { alias: String, _os_aec: bool, quality: String, + reuse_endpoint: Option, event_cb: F, ) -> Result where F: Fn(&str, &str) + Send + Sync + 'static, { - info!(%relay, %room, %alias, %quality, "CallEngine::start (android) invoked"); + info!(%relay, %room, %alias, %quality, has_reuse = reuse_endpoint.is_some(), "CallEngine::start (android) invoked"); let _ = rustls::crypto::ring::default_provider().install_default(); let relay_addr: SocketAddr = relay.parse()?; @@ -124,14 +125,38 @@ impl CallEngine { info!(%fp, "identity loaded"); // QUIC transport + handshake. 
- let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap(); - let endpoint = wzp_transport::create_endpoint(bind_addr, None) - .map_err(|e| { error!("create_endpoint failed: {e}"); e })?; - info!("endpoint created, dialing relay"); + // + // If a `reuse_endpoint` was passed in (the direct-call path, where we + // already opened a quinn::Endpoint for the signal connection), reuse + // it: a second quinn::Endpoint on Android silently fails to complete + // the QUIC handshake against the same relay. Reusing the existing + // socket lets quinn multiplex the signal + media connections on one + // UDP port. + let endpoint = if let Some(ep) = reuse_endpoint { + info!(local_addr = ?ep.local_addr().ok(), "reusing signal endpoint for media connection"); + ep + } else { + let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap(); + let ep = wzp_transport::create_endpoint(bind_addr, None) + .map_err(|e| { error!("create_endpoint failed: {e}"); e })?; + info!(local_addr = ?ep.local_addr().ok(), "created new endpoint, dialing relay"); + ep + }; let client_config = wzp_transport::client_config(); - let conn = wzp_transport::connect(&endpoint, relay_addr, &room, client_config) - .await - .map_err(|e| { error!("connect failed: {e}"); e })?; + let conn = match tokio::time::timeout( + std::time::Duration::from_secs(10), + wzp_transport::connect(&endpoint, relay_addr, &room, client_config), + ).await { + Ok(Ok(c)) => c, + Ok(Err(e)) => { + error!("connect failed: {e}"); + return Err(e.into()); + } + Err(_) => { + error!("connect TIMED OUT after 10s — QUIC handshake never completed. 
Relay may be unreachable from this endpoint."); + return Err(anyhow::anyhow!("QUIC connect timeout (10s)")); + } + }; info!("QUIC connection established, performing handshake"); let transport = Arc::new(wzp_transport::QuinnTransport::new(conn)); @@ -378,6 +403,7 @@ impl CallEngine { alias: String, _os_aec: bool, quality: String, + reuse_endpoint: Option, event_cb: F, ) -> Result where @@ -418,9 +444,15 @@ impl CallEngine { let fingerprint = fp.to_string(); info!(%fp, "identity loaded"); - // Connect - let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap(); - let endpoint = wzp_transport::create_endpoint(bind_addr, None)?; + // Connect — reuse the signal endpoint if the direct-call path gave us + // one, otherwise create a fresh one (SFU room join path). + let endpoint = if let Some(ep) = reuse_endpoint { + info!("reusing signal endpoint for media connection"); + ep + } else { + let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap(); + wzp_transport::create_endpoint(bind_addr, None)? + }; let client_config = wzp_transport::client_config(); let conn = wzp_transport::connect(&endpoint, relay_addr, &room, client_config).await?; let transport = Arc::new(wzp_transport::QuinnTransport::new(conn)); diff --git a/desktop/src-tauri/src/lib.rs b/desktop/src-tauri/src/lib.rs index c3b8101..b1f9fe9 100644 --- a/desktop/src-tauri/src/lib.rs +++ b/desktop/src-tauri/src/lib.rs @@ -243,8 +243,17 @@ async fn connect( return Err("already connected".into()); } + // If we previously opened a quinn::Endpoint for the signaling connection + // (direct-call path), reuse it so the media connection shares the same + // UDP socket. This side-steps the Android issue where a second + // quinn::Endpoint silently hangs in the QUIC handshake. 
+ let reuse_endpoint = state.signal.lock().await.endpoint.clone(); + if reuse_endpoint.is_some() { + tracing::info!("connect: reusing existing signal endpoint for media connection"); + } + let app_clone = app.clone(); - match CallEngine::start(relay, room, alias, os_aec, quality, move |event_kind, message| { + match CallEngine::start(relay, room, alias, os_aec, quality, reuse_endpoint, move |event_kind, message| { let _ = app_clone.emit( "call-event", CallEvent { @@ -341,6 +350,11 @@ async fn get_status(state: tauri::State<'_, Arc>) -> Result>, + /// The quinn::Endpoint backing the signal connection. Reused for the + /// media connection when a direct call is accepted — Android phones + /// silently drop packets from a second quinn::Endpoint to the same + /// relay, so every call after register_signal MUST share this socket. + endpoint: Option, fingerprint: String, signal_status: String, incoming_call_id: Option, @@ -380,7 +394,7 @@ async fn register_signal( _ => return Err("registration failed".into()), } - { let mut sig = state.signal.lock().await; sig.transport = Some(transport.clone()); sig.fingerprint = fp.clone(); sig.signal_status = "registered".into(); } + { let mut sig = state.signal.lock().await; sig.transport = Some(transport.clone()); sig.endpoint = Some(endpoint.clone()); sig.fingerprint = fp.clone(); sig.signal_status = "registered".into(); } tracing::info!(%fp, "signal registered, spawning recv loop"); let signal_state = Arc::clone(&state.signal); @@ -483,7 +497,7 @@ pub fn run() { let state = Arc::new(AppState { engine: Mutex::new(None), signal: Arc::new(Mutex::new(SignalState { - transport: None, fingerprint: String::new(), signal_status: "idle".into(), + transport: None, endpoint: None, fingerprint: String::new(), signal_status: "idle".into(), incoming_call_id: None, incoming_caller_fp: None, incoming_caller_alias: None, })), });