fix(android): reuse signal endpoint for direct-call media connection
Some checks failed
Mirror to GitHub / mirror (push) Failing after 38s
Build Release Binaries / build-amd64 (push) Failing after 3m46s

Direct-call accept hangs forever at the QUIC handshake on Android. Logs
from d7b37a5 showed:
  CallEngine::start (android) invoked relay=172.16.81.172:4433 room=call-…
  resolved relay addr
  identity loaded
  endpoint created, dialing relay   ← reached
                                    ← nothing, 90s+, no error
The "connect failed" and "QUIC connection established" log lines never
fire, meaning endpoint.connect_with(…).await never makes progress.

Repro is 100%: SFU room join (one endpoint) works perfectly; direct call
(opens a SECOND quinn::Endpoint on top of the signal one) hangs in the
QUIC handshake. Creating two quinn::Endpoints on Android's AAudio-adjacent
UDP stack apparently causes the second one's datagrams to never reach the
relay (the server never sees the Initial packet). Rather than fight the
platform, we rely on quinn's ability to multiplex multiple Connections
on a single Endpoint and reuse the signal endpoint for the media
connection.

- SignalState now stores the quinn::Endpoint alongside the QuinnTransport.
  register_signal populates both at the same time.
- CallEngine::start (both android and desktop branches) takes an
  Option<wzp_transport::Endpoint>. Some → reuse (direct-call path, after
  register_signal). None → create fresh (SFU room join path).
- The connect tauri command reads state.signal.endpoint and threads it
  through to CallEngine::start, so the direct-call auto-connect (fired by
  the "setup" signal-event in main.ts) lands on the existing UDP socket.
- wzp_transport re-exports quinn::Endpoint so wzp-desktop doesn't need to
  depend on quinn directly.
- Also wraps the Android connect in a 10-second tokio::time::timeout so
  future hangs surface as deterministic "connect TIMED OUT" errors in
  logcat instead of hanging silently.

The same endpoint reuse applies verbatim to the desktop client — the user
suspects direct calls are broken there too, and this was likely always the
cause; it just never surfaced because desktop was only tested via SFU rooms.
This commit is contained in:
Siavash Sameni
2026-04-09 20:29:51 +04:00
parent d7b37a5749
commit 49f101d785
3 changed files with 65 additions and 14 deletions

View File

@@ -27,3 +27,8 @@ pub use connection::{accept, connect, create_endpoint};
pub use path_monitor::PathMonitor;
pub use quic::QuinnTransport;
pub use wzp_proto::{MediaTransport, PathQuality, TransportError};
// Re-export the quinn Endpoint type so downstream crates (wzp-desktop) can
// thread a shared endpoint between signaling and media connections without
// needing to depend on quinn directly.
pub use quinn::Endpoint;

View File

@@ -105,12 +105,13 @@ impl CallEngine {
alias: String,
_os_aec: bool,
quality: String,
reuse_endpoint: Option<wzp_transport::Endpoint>,
event_cb: F,
) -> Result<Self, anyhow::Error>
where
F: Fn(&str, &str) + Send + Sync + 'static,
{
info!(%relay, %room, %alias, %quality, "CallEngine::start (android) invoked");
info!(%relay, %room, %alias, %quality, has_reuse = reuse_endpoint.is_some(), "CallEngine::start (android) invoked");
let _ = rustls::crypto::ring::default_provider().install_default();
let relay_addr: SocketAddr = relay.parse()?;
@@ -124,14 +125,38 @@ impl CallEngine {
info!(%fp, "identity loaded");
// QUIC transport + handshake.
let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap();
let endpoint = wzp_transport::create_endpoint(bind_addr, None)
.map_err(|e| { error!("create_endpoint failed: {e}"); e })?;
info!("endpoint created, dialing relay");
//
// If a `reuse_endpoint` was passed in (the direct-call path, where we
// already opened a quinn::Endpoint for the signal connection), reuse
// it: a second quinn::Endpoint on Android silently fails to complete
// the QUIC handshake against the same relay. Reusing the existing
// socket lets quinn multiplex the signal + media connections on one
// UDP port.
let endpoint = if let Some(ep) = reuse_endpoint {
info!(local_addr = ?ep.local_addr().ok(), "reusing signal endpoint for media connection");
ep
} else {
let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap();
let ep = wzp_transport::create_endpoint(bind_addr, None)
.map_err(|e| { error!("create_endpoint failed: {e}"); e })?;
info!(local_addr = ?ep.local_addr().ok(), "created new endpoint, dialing relay");
ep
};
let client_config = wzp_transport::client_config();
let conn = wzp_transport::connect(&endpoint, relay_addr, &room, client_config)
.await
.map_err(|e| { error!("connect failed: {e}"); e })?;
let conn = match tokio::time::timeout(
std::time::Duration::from_secs(10),
wzp_transport::connect(&endpoint, relay_addr, &room, client_config),
).await {
Ok(Ok(c)) => c,
Ok(Err(e)) => {
error!("connect failed: {e}");
return Err(e.into());
}
Err(_) => {
error!("connect TIMED OUT after 10s — QUIC handshake never completed. Relay may be unreachable from this endpoint.");
return Err(anyhow::anyhow!("QUIC connect timeout (10s)"));
}
};
info!("QUIC connection established, performing handshake");
let transport = Arc::new(wzp_transport::QuinnTransport::new(conn));
@@ -378,6 +403,7 @@ impl CallEngine {
alias: String,
_os_aec: bool,
quality: String,
reuse_endpoint: Option<wzp_transport::Endpoint>,
event_cb: F,
) -> Result<Self, anyhow::Error>
where
@@ -418,9 +444,15 @@ impl CallEngine {
let fingerprint = fp.to_string();
info!(%fp, "identity loaded");
// Connect
let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap();
let endpoint = wzp_transport::create_endpoint(bind_addr, None)?;
// Connect — reuse the signal endpoint if the direct-call path gave us
// one, otherwise create a fresh one (SFU room join path).
let endpoint = if let Some(ep) = reuse_endpoint {
info!("reusing signal endpoint for media connection");
ep
} else {
let bind_addr: SocketAddr = "0.0.0.0:0".parse().unwrap();
wzp_transport::create_endpoint(bind_addr, None)?
};
let client_config = wzp_transport::client_config();
let conn = wzp_transport::connect(&endpoint, relay_addr, &room, client_config).await?;
let transport = Arc::new(wzp_transport::QuinnTransport::new(conn));

View File

@@ -243,8 +243,17 @@ async fn connect(
return Err("already connected".into());
}
// If we previously opened a quinn::Endpoint for the signaling connection
// (direct-call path), reuse it so the media connection shares the same
// UDP socket. This side-steps the Android issue where a second
// quinn::Endpoint silently hangs in the QUIC handshake.
let reuse_endpoint = state.signal.lock().await.endpoint.clone();
if reuse_endpoint.is_some() {
tracing::info!("connect: reusing existing signal endpoint for media connection");
}
let app_clone = app.clone();
match CallEngine::start(relay, room, alias, os_aec, quality, move |event_kind, message| {
match CallEngine::start(relay, room, alias, os_aec, quality, reuse_endpoint, move |event_kind, message| {
let _ = app_clone.emit(
"call-event",
CallEvent {
@@ -341,6 +350,11 @@ async fn get_status(state: tauri::State<'_, Arc<AppState>>) -> Result<CallStatus
struct SignalState {
transport: Option<Arc<wzp_transport::QuinnTransport>>,
/// The quinn::Endpoint backing the signal connection. Reused for the
/// media connection when a direct call is accepted — Android phones
/// silently drop packets from a second quinn::Endpoint to the same
/// relay, so every call after register_signal MUST share this socket.
endpoint: Option<wzp_transport::Endpoint>,
fingerprint: String,
signal_status: String,
incoming_call_id: Option<String>,
@@ -380,7 +394,7 @@ async fn register_signal(
_ => return Err("registration failed".into()),
}
{ let mut sig = state.signal.lock().await; sig.transport = Some(transport.clone()); sig.fingerprint = fp.clone(); sig.signal_status = "registered".into(); }
{ let mut sig = state.signal.lock().await; sig.transport = Some(transport.clone()); sig.endpoint = Some(endpoint.clone()); sig.fingerprint = fp.clone(); sig.signal_status = "registered".into(); }
tracing::info!(%fp, "signal registered, spawning recv loop");
let signal_state = Arc::clone(&state.signal);
@@ -483,7 +497,7 @@ pub fn run() {
let state = Arc::new(AppState {
engine: Mutex::new(None),
signal: Arc::new(Mutex::new(SignalState {
transport: None, fingerprint: String::new(), signal_status: "idle".into(),
transport: None, endpoint: None, fingerprint: String::new(), signal_status: "idle".into(),
incoming_call_id: None, incoming_caller_fp: None, incoming_caller_alias: None,
})),
});