From 1de280fe048adc7e74614e475b71d03871bb1ab5 Mon Sep 17 00:00:00 2001 From: Siavash Sameni Date: Tue, 14 Apr 2026 15:58:13 +0400 Subject: [PATCH] fix(nat): working NAT tickle + smart filter debug + timeout diags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes from real-world 5G↔Starlink testing: NAT tickle fix: - tokio::net::UdpSocket::bind() doesn't set SO_REUSEADDR, so binding to the same port as quinn silently failed. Now uses socket2::Socket with explicit SO_REUSEADDR + SO_REUSEPORT (via libc on unix). - Tickle now logs success/failure for debugging. Diagnostic fixes: - connect:dual_path_race_start shows both dial_order_raw and dial_order_smart so we can see what filtering removed - Grace-period timeout (relay wins first, direct still running) now fills "timeout:grace" diags for unrecorded candidates - Previously candidate_diags was empty when relay won the race Dependencies: - Added socket2 = "0.5" to wzp-client 593 tests pass, 0 regressions. Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 1 + crates/wzp-client/Cargo.toml | 1 + crates/wzp-client/src/dual_path.rs | 78 +++++++++++++++++++++++------- desktop/src-tauri/src/lib.rs | 13 ++--- 4 files changed, 67 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d44e4c5..316a6cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8011,6 +8011,7 @@ dependencies = [ "rustls", "serde", "serde_json", + "socket2 0.5.10", "tokio", "tracing", "tracing-subscriber", diff --git a/crates/wzp-client/Cargo.toml b/crates/wzp-client/Cargo.toml index c4fae7c..57fa23d 100644 --- a/crates/wzp-client/Cargo.toml +++ b/crates/wzp-client/Cargo.toml @@ -34,6 +34,7 @@ libc = "0.2" # MikroTik's default masquerade, don't support). if-addrs = "0.13" rand = { workspace = true } +socket2 = "0.5" # coreaudio-rs is Apple-framework-only; gate it to macOS so enabling # the `vpio` feature from a non-macOS target builds cleanly instead of diff --git a/crates/wzp-client/src/dual_path.rs b/crates/wzp-client/src/dual_path.rs index 32983c7..6f74562 100644 --- a/crates/wzp-client/src/dual_path.rs +++ b/crates/wzp-client/src/dual_path.rs @@ -285,29 +285,56 @@ pub async fn race( // gets dropped by our NAT. if !tickle_addrs.is_empty() { if let Ok(local_addr) = ep_for_fut.local_addr() { - // We can't send raw UDP on the quinn endpoint, - // so we use a fresh socket on the SAME port - // (SO_REUSEADDR). This makes the NAT see - // outbound traffic from our port to the peer, - // opening the pinhole. - let bind = SocketAddr::new( - std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED), - local_addr.port(), - ); - if let Ok(tickle_sock) = tokio::net::UdpSocket::bind(bind).await { + // Send a tickle to each peer candidate address + // to open our NAT for return traffic from that IP. + // + // We use a socket2 socket with SO_REUSEADDR + + // SO_REUSEPORT on the SAME port as the quinn + // endpoint. This is necessary because quinn + // already holds the port — a plain bind() would + // fail with EADDRINUSE. + let tickle_result: Result<(), String> = (|| { + use std::net::UdpSocket as StdUdpSocket; + let sock = socket2::Socket::new( + socket2::Domain::IPV4, + socket2::Type::DGRAM, + Some(socket2::Protocol::UDP), + ).map_err(|e| format!("socket: {e}"))?; + sock.set_reuse_address(true).map_err(|e| format!("reuseaddr: {e}"))?; + // macOS/BSD/Linux also need SO_REUSEPORT + #[cfg(any(target_os = "macos", target_os = "linux", target_os = "android"))] + { + // socket2 exposes set_reuse_port on unix + unsafe { + let optval: libc::c_int = 1; + libc::setsockopt( + std::os::unix::io::AsRawFd::as_raw_fd(&sock), + libc::SOL_SOCKET, + libc::SO_REUSEPORT, + &optval as *const _ as *const libc::c_void, + std::mem::size_of::() as libc::socklen_t, + ); + } + } + sock.set_nonblocking(true).map_err(|e| format!("nonblock: {e}"))?; + let bind_addr: SocketAddr = SocketAddr::new( + std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED), + local_addr.port(), + ); + sock.bind(&bind_addr.into()).map_err(|e| format!("bind :{}: {e}", local_addr.port()))?; + let std_sock: StdUdpSocket = sock.into(); for addr in &tickle_addrs { - // Send a minimal QUIC-like packet (version - // negotiation bait). The content doesn't - // matter — we just need the NAT to see - // outbound traffic from our port to this IP. - let tickle_bytes = [0u8; 1]; - let _ = tickle_sock.send_to(&tickle_bytes, addr).await; + let _ = std_sock.send_to(&[0u8; 1], addr); tracing::info!( %addr, local_port = local_addr.port(), "dual_path: A-role sent NAT tickle" ); } + Ok(()) + })(); + if let Err(e) = tickle_result { + tracing::warn!(error = %e, "dual_path: A-role NAT tickle failed"); } } } @@ -670,7 +697,24 @@ pub async fn race( match tokio::time::timeout(Duration::from_secs(1), direct_task).await { Ok(Ok(Ok(Ok(t)))) => { direct_result = Some(Ok(t)); } Ok(Ok(Ok(Err(e)))) => { direct_result = Some(Err(anyhow::anyhow!("{e}"))); } - _ => { direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period"))); } + _ => { + direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period"))); + // Fill timeout diags for candidates that never reported. + if let Ok(mut d) = diags_collector.lock() { + let recorded: std::collections::HashSet = + d.iter().map(|diag| diag.index).collect(); + for (idx, addr) in smart_order.iter().enumerate() { + if !recorded.contains(&idx) { + d.push(CandidateDiag { + index: idx, + addr: addr.to_string(), + result: "timeout:grace".into(), + elapsed_ms: None, + }); + } + } + } + } } } if relay_result.is_none() { diff --git a/desktop/src-tauri/src/lib.rs b/desktop/src-tauri/src/lib.rs index 591f69f..bf5f9d2 100644 --- a/desktop/src-tauri/src/lib.rs +++ b/desktop/src-tauri/src/lib.rs @@ -453,21 +453,18 @@ async fn connect( own = ?own_reflex_addr, "connect: starting dual-path race" ); + let own_reflex_parsed: Option = + own_reflex_addr.as_deref().and_then(|s| s.parse().ok()); emit_call_debug(&app, "connect:dual_path_race_start", serde_json::json!({ "role": format!("{:?}", r), "peer_reflex": peer_addr_parsed.map(|a| a.to_string()), "peer_mapped": peer_mapped_parsed.map(|a| a.to_string()), "peer_local": peer_local_parsed.iter().map(|a| a.to_string()).collect::>(), - "dial_order": candidates.dial_order().iter().map(|a| a.to_string()).collect::>(), + "dial_order_raw": candidates.dial_order().iter().map(|a| a.to_string()).collect::>(), + "dial_order_smart": candidates.smart_dial_order(own_reflex_parsed.as_ref()).iter().map(|a| a.to_string()).collect::>(), "relay_addr": relay_sockaddr.to_string(), "own_reflex_addr": own_reflex_addr, })); - // Phase 6 fix: install the oneshot BEFORE the race - // starts. The peer's MediaPathReport can arrive - // while our race is still running — if we set up - // the oneshot after the race, the recv loop has - // nowhere to send the report and it gets dropped, - // causing a 3s timeout and false relay fallback. let (path_report_tx, path_report_rx) = tokio::sync::oneshot::channel::(); { let mut sig = state.signal.lock().await; @@ -476,8 +473,6 @@ async fn connect( let room_sni = room.clone(); let call_sni = format!("call-{room}"); - let own_reflex_parsed: Option = - own_reflex_addr.as_deref().and_then(|s| s.parse().ok()); match wzp_client::dual_path::race( r, candidates,