fix(nat): working NAT tickle + smart filter debug + timeout diags
Some checks failed
Mirror to GitHub / mirror (push) Failing after 27s
Build Release Binaries / build-amd64 (push) Failing after 3m39s

Fixes from real-world 5G↔Starlink testing:

NAT tickle fix:
- tokio::net::UdpSocket::bind() doesn't set SO_REUSEADDR, so binding
  to the same port as quinn silently failed. Now uses socket2::Socket
  with explicit SO_REUSEADDR, plus SO_REUSEPORT (set via libc on
  macOS, Linux, and Android).
- Tickle now logs success/failure for debugging.

Diagnostic fixes:
- connect:dual_path_race_start shows both dial_order_raw and
  dial_order_smart so we can see what filtering removed
- Grace-period timeout (relay wins first, direct still running)
  now fills "timeout:grace" diags for unrecorded candidates
- Previously candidate_diags was empty when relay won the race

Dependencies:
- Added socket2 = "0.5" to wzp-client

593 tests pass, 0 regressions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Siavash Sameni
2026-04-14 15:58:13 +04:00
parent bc6d327ebb
commit 1de280fe04
4 changed files with 67 additions and 26 deletions

1
Cargo.lock generated
View File

@@ -8011,6 +8011,7 @@ dependencies = [
"rustls",
"serde",
"serde_json",
"socket2 0.5.10",
"tokio",
"tracing",
"tracing-subscriber",

View File

@@ -34,6 +34,7 @@ libc = "0.2"
# MikroTik's default masquerade, don't support).
if-addrs = "0.13"
rand = { workspace = true }
socket2 = "0.5"
# coreaudio-rs is Apple-framework-only; gate it to macOS so enabling
# the `vpio` feature from a non-macOS target builds cleanly instead of

View File

@@ -285,29 +285,56 @@ pub async fn race(
// gets dropped by our NAT.
if !tickle_addrs.is_empty() {
if let Ok(local_addr) = ep_for_fut.local_addr() {
// We can't send raw UDP on the quinn endpoint,
// so we use a fresh socket on the SAME port
// (SO_REUSEADDR). This makes the NAT see
// outbound traffic from our port to the peer,
// opening the pinhole.
let bind = SocketAddr::new(
// Send a tickle to each peer candidate address
// to open our NAT for return traffic from that IP.
//
// We use a socket2 socket with SO_REUSEADDR +
// SO_REUSEPORT on the SAME port as the quinn
// endpoint. This is necessary because quinn
// already holds the port — a plain bind() would
// fail with EADDRINUSE.
let tickle_result: Result<(), String> = (|| {
use std::net::UdpSocket as StdUdpSocket;
let sock = socket2::Socket::new(
socket2::Domain::IPV4,
socket2::Type::DGRAM,
Some(socket2::Protocol::UDP),
).map_err(|e| format!("socket: {e}"))?;
sock.set_reuse_address(true).map_err(|e| format!("reuseaddr: {e}"))?;
// macOS/Linux/Android also need SO_REUSEPORT
#[cfg(any(target_os = "macos", target_os = "linux", target_os = "android"))]
{
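// socket2 exposes set_reuse_port on unix, but this code sets the
// option through a raw libc::setsockopt call and does not check
// its return value.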
unsafe {
let optval: libc::c_int = 1;
libc::setsockopt(
std::os::unix::io::AsRawFd::as_raw_fd(&sock),
libc::SOL_SOCKET,
libc::SO_REUSEPORT,
&optval as *const _ as *const libc::c_void,
std::mem::size_of::<libc::c_int>() as libc::socklen_t,
);
}
}
sock.set_nonblocking(true).map_err(|e| format!("nonblock: {e}"))?;
let bind_addr: SocketAddr = SocketAddr::new(
std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
local_addr.port(),
);
if let Ok(tickle_sock) = tokio::net::UdpSocket::bind(bind).await {
sock.bind(&bind_addr.into()).map_err(|e| format!("bind :{}: {e}", local_addr.port()))?;
let std_sock: StdUdpSocket = sock.into();
for addr in &tickle_addrs {
// Send a minimal QUIC-like packet (version
// negotiation bait). The content doesn't
// matter — we just need the NAT to see
// outbound traffic from our port to this IP.
let tickle_bytes = [0u8; 1];
let _ = tickle_sock.send_to(&tickle_bytes, addr).await;
let _ = std_sock.send_to(&[0u8; 1], addr);
tracing::info!(
%addr,
local_port = local_addr.port(),
"dual_path: A-role sent NAT tickle"
);
}
Ok(())
})();
if let Err(e) = tickle_result {
tracing::warn!(error = %e, "dual_path: A-role NAT tickle failed");
}
}
}
@@ -670,7 +697,24 @@ pub async fn race(
match tokio::time::timeout(Duration::from_secs(1), direct_task).await {
Ok(Ok(Ok(Ok(t)))) => { direct_result = Some(Ok(t)); }
Ok(Ok(Ok(Err(e)))) => { direct_result = Some(Err(anyhow::anyhow!("{e}"))); }
_ => { direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period"))); }
_ => {
direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period")));
// Fill timeout diags for candidates that never reported.
if let Ok(mut d) = diags_collector.lock() {
let recorded: std::collections::HashSet<usize> =
d.iter().map(|diag| diag.index).collect();
for (idx, addr) in smart_order.iter().enumerate() {
if !recorded.contains(&idx) {
d.push(CandidateDiag {
index: idx,
addr: addr.to_string(),
result: "timeout:grace".into(),
elapsed_ms: None,
});
}
}
}
}
}
}
if relay_result.is_none() {

View File

@@ -453,21 +453,18 @@ async fn connect(
own = ?own_reflex_addr,
"connect: starting dual-path race"
);
let own_reflex_parsed: Option<std::net::SocketAddr> =
own_reflex_addr.as_deref().and_then(|s| s.parse().ok());
emit_call_debug(&app, "connect:dual_path_race_start", serde_json::json!({
"role": format!("{:?}", r),
"peer_reflex": peer_addr_parsed.map(|a| a.to_string()),
"peer_mapped": peer_mapped_parsed.map(|a| a.to_string()),
"peer_local": peer_local_parsed.iter().map(|a| a.to_string()).collect::<Vec<_>>(),
"dial_order": candidates.dial_order().iter().map(|a| a.to_string()).collect::<Vec<_>>(),
"dial_order_raw": candidates.dial_order().iter().map(|a| a.to_string()).collect::<Vec<_>>(),
"dial_order_smart": candidates.smart_dial_order(own_reflex_parsed.as_ref()).iter().map(|a| a.to_string()).collect::<Vec<_>>(),
"relay_addr": relay_sockaddr.to_string(),
"own_reflex_addr": own_reflex_addr,
}));
// Phase 6 fix: install the oneshot BEFORE the race
// starts. The peer's MediaPathReport can arrive
// while our race is still running — if we set up
// the oneshot after the race, the recv loop has
// nowhere to send the report and it gets dropped,
// causing a 3s timeout and false relay fallback.
let (path_report_tx, path_report_rx) = tokio::sync::oneshot::channel::<bool>();
{
let mut sig = state.signal.lock().await;
@@ -476,8 +473,6 @@ async fn connect(
let room_sni = room.clone();
let call_sni = format!("call-{room}");
let own_reflex_parsed: Option<std::net::SocketAddr> =
own_reflex_addr.as_deref().and_then(|s| s.parse().ok());
match wzp_client::dual_path::race(
r,
candidates,