fix(nat): working NAT tickle + smart filter debug + timeout diags
Fixes from real-world 5G↔Starlink testing:

NAT tickle fix:
- tokio::net::UdpSocket::bind() doesn't set SO_REUSEADDR, so binding to the same port as quinn silently failed. Now uses socket2::Socket with explicit SO_REUSEADDR + SO_REUSEPORT (via libc on unix).
- Tickle now logs success/failure for debugging.

Diagnostic fixes:
- connect:dual_path_race_start shows both dial_order_raw and dial_order_smart so we can see what filtering removed
- Grace-period timeout (relay wins first, direct still running) now fills "timeout:grace" diags for unrecorded candidates
- Previously candidate_diags was empty when relay won the race

Dependencies:
- Added socket2 = "0.5" to wzp-client

593 tests pass, 0 regressions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -8011,6 +8011,7 @@ dependencies = [
|
||||
"rustls",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"socket2 0.5.10",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
|
||||
@@ -34,6 +34,7 @@ libc = "0.2"
|
||||
# MikroTik's default masquerade, don't support).
|
||||
if-addrs = "0.13"
|
||||
rand = { workspace = true }
|
||||
socket2 = "0.5"
|
||||
|
||||
# coreaudio-rs is Apple-framework-only; gate it to macOS so enabling
|
||||
# the `vpio` feature from a non-macOS target builds cleanly instead of
|
||||
|
||||
@@ -285,29 +285,56 @@ pub async fn race(
|
||||
// gets dropped by our NAT.
|
||||
if !tickle_addrs.is_empty() {
|
||||
if let Ok(local_addr) = ep_for_fut.local_addr() {
|
||||
// We can't send raw UDP on the quinn endpoint,
|
||||
// so we use a fresh socket on the SAME port
|
||||
// (SO_REUSEADDR). This makes the NAT see
|
||||
// outbound traffic from our port to the peer,
|
||||
// opening the pinhole.
|
||||
let bind = SocketAddr::new(
|
||||
std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
|
||||
local_addr.port(),
|
||||
);
|
||||
if let Ok(tickle_sock) = tokio::net::UdpSocket::bind(bind).await {
|
||||
// Send a tickle to each peer candidate address
|
||||
// to open our NAT for return traffic from that IP.
|
||||
//
|
||||
// We use a socket2 socket with SO_REUSEADDR +
|
||||
// SO_REUSEPORT on the SAME port as the quinn
|
||||
// endpoint. This is necessary because quinn
|
||||
// already holds the port — a plain bind() would
|
||||
// fail with EADDRINUSE.
|
||||
let tickle_result: Result<(), String> = (|| {
|
||||
use std::net::UdpSocket as StdUdpSocket;
|
||||
let sock = socket2::Socket::new(
|
||||
socket2::Domain::IPV4,
|
||||
socket2::Type::DGRAM,
|
||||
Some(socket2::Protocol::UDP),
|
||||
).map_err(|e| format!("socket: {e}"))?;
|
||||
sock.set_reuse_address(true).map_err(|e| format!("reuseaddr: {e}"))?;
|
||||
// macOS/Linux/Android also need SO_REUSEPORT
|
||||
#[cfg(any(target_os = "macos", target_os = "linux", target_os = "android"))]
|
||||
{
|
||||
// Set SO_REUSEPORT with a raw libc::setsockopt call on the socket's fd
|
||||
unsafe {
|
||||
let optval: libc::c_int = 1;
|
||||
libc::setsockopt(
|
||||
std::os::unix::io::AsRawFd::as_raw_fd(&sock),
|
||||
libc::SOL_SOCKET,
|
||||
libc::SO_REUSEPORT,
|
||||
&optval as *const _ as *const libc::c_void,
|
||||
std::mem::size_of::<libc::c_int>() as libc::socklen_t,
|
||||
);
|
||||
}
|
||||
}
|
||||
sock.set_nonblocking(true).map_err(|e| format!("nonblock: {e}"))?;
|
||||
let bind_addr: SocketAddr = SocketAddr::new(
|
||||
std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
|
||||
local_addr.port(),
|
||||
);
|
||||
sock.bind(&bind_addr.into()).map_err(|e| format!("bind :{}: {e}", local_addr.port()))?;
|
||||
let std_sock: StdUdpSocket = sock.into();
|
||||
for addr in &tickle_addrs {
|
||||
// Send a minimal QUIC-like packet (version
|
||||
// negotiation bait). The content doesn't
|
||||
// matter — we just need the NAT to see
|
||||
// outbound traffic from our port to this IP.
|
||||
let tickle_bytes = [0u8; 1];
|
||||
let _ = tickle_sock.send_to(&tickle_bytes, addr).await;
|
||||
let _ = std_sock.send_to(&[0u8; 1], addr);
|
||||
tracing::info!(
|
||||
%addr,
|
||||
local_port = local_addr.port(),
|
||||
"dual_path: A-role sent NAT tickle"
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
})();
|
||||
if let Err(e) = tickle_result {
|
||||
tracing::warn!(error = %e, "dual_path: A-role NAT tickle failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -670,7 +697,24 @@ pub async fn race(
|
||||
match tokio::time::timeout(Duration::from_secs(1), direct_task).await {
|
||||
Ok(Ok(Ok(Ok(t)))) => { direct_result = Some(Ok(t)); }
|
||||
Ok(Ok(Ok(Err(e)))) => { direct_result = Some(Err(anyhow::anyhow!("{e}"))); }
|
||||
_ => { direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period"))); }
|
||||
_ => {
|
||||
direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period")));
|
||||
// Fill timeout diags for candidates that never reported.
|
||||
if let Ok(mut d) = diags_collector.lock() {
|
||||
let recorded: std::collections::HashSet<usize> =
|
||||
d.iter().map(|diag| diag.index).collect();
|
||||
for (idx, addr) in smart_order.iter().enumerate() {
|
||||
if !recorded.contains(&idx) {
|
||||
d.push(CandidateDiag {
|
||||
index: idx,
|
||||
addr: addr.to_string(),
|
||||
result: "timeout:grace".into(),
|
||||
elapsed_ms: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if relay_result.is_none() {
|
||||
|
||||
@@ -453,21 +453,18 @@ async fn connect(
|
||||
own = ?own_reflex_addr,
|
||||
"connect: starting dual-path race"
|
||||
);
|
||||
let own_reflex_parsed: Option<std::net::SocketAddr> =
|
||||
own_reflex_addr.as_deref().and_then(|s| s.parse().ok());
|
||||
emit_call_debug(&app, "connect:dual_path_race_start", serde_json::json!({
|
||||
"role": format!("{:?}", r),
|
||||
"peer_reflex": peer_addr_parsed.map(|a| a.to_string()),
|
||||
"peer_mapped": peer_mapped_parsed.map(|a| a.to_string()),
|
||||
"peer_local": peer_local_parsed.iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
||||
"dial_order": candidates.dial_order().iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
||||
"dial_order_raw": candidates.dial_order().iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
||||
"dial_order_smart": candidates.smart_dial_order(own_reflex_parsed.as_ref()).iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
||||
"relay_addr": relay_sockaddr.to_string(),
|
||||
"own_reflex_addr": own_reflex_addr,
|
||||
}));
|
||||
// Phase 6 fix: install the oneshot BEFORE the race
|
||||
// starts. The peer's MediaPathReport can arrive
|
||||
// while our race is still running — if we set up
|
||||
// the oneshot after the race, the recv loop has
|
||||
// nowhere to send the report and it gets dropped,
|
||||
// causing a 3s timeout and false relay fallback.
|
||||
let (path_report_tx, path_report_rx) = tokio::sync::oneshot::channel::<bool>();
|
||||
{
|
||||
let mut sig = state.signal.lock().await;
|
||||
@@ -476,8 +473,6 @@ async fn connect(
|
||||
|
||||
let room_sni = room.clone();
|
||||
let call_sni = format!("call-{room}");
|
||||
let own_reflex_parsed: Option<std::net::SocketAddr> =
|
||||
own_reflex_addr.as_deref().and_then(|s| s.parse().ok());
|
||||
match wzp_client::dual_path::race(
|
||||
r,
|
||||
candidates,
|
||||
|
||||
Reference in New Issue
Block a user