fix(nat): working NAT tickle + smart filter debug + timeout diags
Fixes from real-world 5G↔Starlink testing:

NAT tickle fix:
- tokio::net::UdpSocket::bind() doesn't set SO_REUSEADDR, so binding to the same port as quinn silently failed. Now uses socket2::Socket with explicit SO_REUSEADDR + SO_REUSEPORT (via libc on unix).
- Tickle now logs success/failure for debugging.

Diagnostic fixes:
- connect:dual_path_race_start shows both dial_order_raw and dial_order_smart so we can see what filtering removed
- Grace-period timeout (relay wins first, direct still running) now fills "timeout:grace" diags for unrecorded candidates
- Previously candidate_diags was empty when relay won the race

Dependencies:
- Added socket2 = "0.5" to wzp-client

593 tests pass, 0 regressions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -8011,6 +8011,7 @@ dependencies = [
|
|||||||
"rustls",
|
"rustls",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"socket2 0.5.10",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ libc = "0.2"
|
|||||||
# MikroTik's default masquerade, don't support).
|
# MikroTik's default masquerade, don't support).
|
||||||
if-addrs = "0.13"
|
if-addrs = "0.13"
|
||||||
rand = { workspace = true }
|
rand = { workspace = true }
|
||||||
|
socket2 = "0.5"
|
||||||
|
|
||||||
# coreaudio-rs is Apple-framework-only; gate it to macOS so enabling
|
# coreaudio-rs is Apple-framework-only; gate it to macOS so enabling
|
||||||
# the `vpio` feature from a non-macOS target builds cleanly instead of
|
# the `vpio` feature from a non-macOS target builds cleanly instead of
|
||||||
|
|||||||
@@ -285,29 +285,56 @@ pub async fn race(
|
|||||||
// gets dropped by our NAT.
|
// gets dropped by our NAT.
|
||||||
if !tickle_addrs.is_empty() {
|
if !tickle_addrs.is_empty() {
|
||||||
if let Ok(local_addr) = ep_for_fut.local_addr() {
|
if let Ok(local_addr) = ep_for_fut.local_addr() {
|
||||||
// We can't send raw UDP on the quinn endpoint,
|
// Send a tickle to each peer candidate address
|
||||||
// so we use a fresh socket on the SAME port
|
// to open our NAT for return traffic from that IP.
|
||||||
// (SO_REUSEADDR). This makes the NAT see
|
//
|
||||||
// outbound traffic from our port to the peer,
|
// We use a socket2 socket with SO_REUSEADDR +
|
||||||
// opening the pinhole.
|
// SO_REUSEPORT on the SAME port as the quinn
|
||||||
let bind = SocketAddr::new(
|
// endpoint. This is necessary because quinn
|
||||||
std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
|
// already holds the port — a plain bind() would
|
||||||
local_addr.port(),
|
// fail with EADDRINUSE.
|
||||||
);
|
let tickle_result: Result<(), String> = (|| {
|
||||||
if let Ok(tickle_sock) = tokio::net::UdpSocket::bind(bind).await {
|
use std::net::UdpSocket as StdUdpSocket;
|
||||||
|
let sock = socket2::Socket::new(
|
||||||
|
socket2::Domain::IPV4,
|
||||||
|
socket2::Type::DGRAM,
|
||||||
|
Some(socket2::Protocol::UDP),
|
||||||
|
).map_err(|e| format!("socket: {e}"))?;
|
||||||
|
sock.set_reuse_address(true).map_err(|e| format!("reuseaddr: {e}"))?;
|
||||||
|
// macOS/Linux/Android also need SO_REUSEPORT
|
||||||
|
#[cfg(any(target_os = "macos", target_os = "linux", target_os = "android"))]
|
||||||
|
{
|
||||||
|
// socket2 only exposes set_reuse_port behind its `all` feature, so set SO_REUSEPORT via libc
|
||||||
|
unsafe {
|
||||||
|
let optval: libc::c_int = 1;
|
||||||
|
libc::setsockopt(
|
||||||
|
std::os::unix::io::AsRawFd::as_raw_fd(&sock),
|
||||||
|
libc::SOL_SOCKET,
|
||||||
|
libc::SO_REUSEPORT,
|
||||||
|
&optval as *const _ as *const libc::c_void,
|
||||||
|
std::mem::size_of::<libc::c_int>() as libc::socklen_t,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sock.set_nonblocking(true).map_err(|e| format!("nonblock: {e}"))?;
|
||||||
|
let bind_addr: SocketAddr = SocketAddr::new(
|
||||||
|
std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED),
|
||||||
|
local_addr.port(),
|
||||||
|
);
|
||||||
|
sock.bind(&bind_addr.into()).map_err(|e| format!("bind :{}: {e}", local_addr.port()))?;
|
||||||
|
let std_sock: StdUdpSocket = sock.into();
|
||||||
for addr in &tickle_addrs {
|
for addr in &tickle_addrs {
|
||||||
// Send a minimal QUIC-like packet (version
|
let _ = std_sock.send_to(&[0u8; 1], addr);
|
||||||
// negotiation bait). The content doesn't
|
|
||||||
// matter — we just need the NAT to see
|
|
||||||
// outbound traffic from our port to this IP.
|
|
||||||
let tickle_bytes = [0u8; 1];
|
|
||||||
let _ = tickle_sock.send_to(&tickle_bytes, addr).await;
|
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
%addr,
|
%addr,
|
||||||
local_port = local_addr.port(),
|
local_port = local_addr.port(),
|
||||||
"dual_path: A-role sent NAT tickle"
|
"dual_path: A-role sent NAT tickle"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
|
})();
|
||||||
|
if let Err(e) = tickle_result {
|
||||||
|
tracing::warn!(error = %e, "dual_path: A-role NAT tickle failed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -670,7 +697,24 @@ pub async fn race(
|
|||||||
match tokio::time::timeout(Duration::from_secs(1), direct_task).await {
|
match tokio::time::timeout(Duration::from_secs(1), direct_task).await {
|
||||||
Ok(Ok(Ok(Ok(t)))) => { direct_result = Some(Ok(t)); }
|
Ok(Ok(Ok(Ok(t)))) => { direct_result = Some(Ok(t)); }
|
||||||
Ok(Ok(Ok(Err(e)))) => { direct_result = Some(Err(anyhow::anyhow!("{e}"))); }
|
Ok(Ok(Ok(Err(e)))) => { direct_result = Some(Err(anyhow::anyhow!("{e}"))); }
|
||||||
_ => { direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period"))); }
|
_ => {
|
||||||
|
direct_result = Some(Err(anyhow::anyhow!("direct: no result in grace period")));
|
||||||
|
// Fill timeout diags for candidates that never reported.
|
||||||
|
if let Ok(mut d) = diags_collector.lock() {
|
||||||
|
let recorded: std::collections::HashSet<usize> =
|
||||||
|
d.iter().map(|diag| diag.index).collect();
|
||||||
|
for (idx, addr) in smart_order.iter().enumerate() {
|
||||||
|
if !recorded.contains(&idx) {
|
||||||
|
d.push(CandidateDiag {
|
||||||
|
index: idx,
|
||||||
|
addr: addr.to_string(),
|
||||||
|
result: "timeout:grace".into(),
|
||||||
|
elapsed_ms: None,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if relay_result.is_none() {
|
if relay_result.is_none() {
|
||||||
|
|||||||
@@ -453,21 +453,18 @@ async fn connect(
|
|||||||
own = ?own_reflex_addr,
|
own = ?own_reflex_addr,
|
||||||
"connect: starting dual-path race"
|
"connect: starting dual-path race"
|
||||||
);
|
);
|
||||||
|
let own_reflex_parsed: Option<std::net::SocketAddr> =
|
||||||
|
own_reflex_addr.as_deref().and_then(|s| s.parse().ok());
|
||||||
emit_call_debug(&app, "connect:dual_path_race_start", serde_json::json!({
|
emit_call_debug(&app, "connect:dual_path_race_start", serde_json::json!({
|
||||||
"role": format!("{:?}", r),
|
"role": format!("{:?}", r),
|
||||||
"peer_reflex": peer_addr_parsed.map(|a| a.to_string()),
|
"peer_reflex": peer_addr_parsed.map(|a| a.to_string()),
|
||||||
"peer_mapped": peer_mapped_parsed.map(|a| a.to_string()),
|
"peer_mapped": peer_mapped_parsed.map(|a| a.to_string()),
|
||||||
"peer_local": peer_local_parsed.iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
"peer_local": peer_local_parsed.iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
||||||
"dial_order": candidates.dial_order().iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
"dial_order_raw": candidates.dial_order().iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
||||||
|
"dial_order_smart": candidates.smart_dial_order(own_reflex_parsed.as_ref()).iter().map(|a| a.to_string()).collect::<Vec<_>>(),
|
||||||
"relay_addr": relay_sockaddr.to_string(),
|
"relay_addr": relay_sockaddr.to_string(),
|
||||||
"own_reflex_addr": own_reflex_addr,
|
"own_reflex_addr": own_reflex_addr,
|
||||||
}));
|
}));
|
||||||
// Phase 6 fix: install the oneshot BEFORE the race
|
|
||||||
// starts. The peer's MediaPathReport can arrive
|
|
||||||
// while our race is still running — if we set up
|
|
||||||
// the oneshot after the race, the recv loop has
|
|
||||||
// nowhere to send the report and it gets dropped,
|
|
||||||
// causing a 3s timeout and false relay fallback.
|
|
||||||
let (path_report_tx, path_report_rx) = tokio::sync::oneshot::channel::<bool>();
|
let (path_report_tx, path_report_rx) = tokio::sync::oneshot::channel::<bool>();
|
||||||
{
|
{
|
||||||
let mut sig = state.signal.lock().await;
|
let mut sig = state.signal.lock().await;
|
||||||
@@ -476,8 +473,6 @@ async fn connect(
|
|||||||
|
|
||||||
let room_sni = room.clone();
|
let room_sni = room.clone();
|
||||||
let call_sni = format!("call-{room}");
|
let call_sni = format!("call-{room}");
|
||||||
let own_reflex_parsed: Option<std::net::SocketAddr> =
|
|
||||||
own_reflex_addr.as_deref().and_then(|s| s.parse().ok());
|
|
||||||
match wzp_client::dual_path::race(
|
match wzp_client::dual_path::race(
|
||||||
r,
|
r,
|
||||||
candidates,
|
candidates,
|
||||||
|
|||||||
Reference in New Issue
Block a user