feat: per-session metrics + inter-relay health probe (T5-S2/S5)

WZP-P2-T5-S2: Per-session Prometheus metrics
- 5 new per-session gauges/counters: buffer_depth, loss_pct, rtt_ms,
  underruns, overruns — all labeled by session_id
- update_session_quality() reads QualityReport from packet headers
- update_session_buffer() tracks jitter buffer state per session
- remove_session_metrics() cleans up labels on disconnect
- Delta-aware counter increments avoid double-counting
- 2 tests: session_quality_update, session_metrics_cleanup

WZP-P2-T5-S5: Inter-relay health probe
- New probe.rs: ProbeConfig, ProbeMetrics, SlidingWindow, ProbeRunner
- --probe <addr> flag (repeatable) spawns background probe per target
- Sends Ping/s over QUIC, receives Pong, computes RTT/loss/jitter
- SlidingWindow(60): tracks last 60 pings, loss = missed pongs,
  jitter = std deviation of RTT
- Prometheus gauges: wzp_probe_rtt_ms, loss_pct, jitter_ms, up
  with target label
- Probe connections use SNI "_probe" — relay responds with Pong loop,
  skipping auth/handshake
- Auto-reconnect with 5s backoff on disconnect
- 6 tests: metrics_register, rtt/loss/jitter calculation,
  window eviction, empty edge cases

231 tests passing across all crates.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Siavash Sameni
2026-03-28 13:09:52 +04:00
parent 39f6908478
commit 216ebf4a25
6 changed files with 650 additions and 4 deletions

View File

@@ -53,8 +53,16 @@ fn parse_args() -> RelayConfig {
.parse().expect("invalid --metrics-port number"),
);
}
"--probe" => {
i += 1;
let addr: SocketAddr = args.get(i)
.expect("--probe requires an address")
.parse()
.expect("invalid --probe address");
config.probe_targets.push(addr);
}
"--help" | "-h" => {
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>] [--metrics-port <port>]");
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>] [--metrics-port <port>] [--probe <addr>]...");
eprintln!();
eprintln!("Options:");
eprintln!(" --listen <addr> Listen address (default: 0.0.0.0:4433)");
@@ -62,6 +70,7 @@ fn parse_args() -> RelayConfig {
eprintln!(" --auth-url <url> featherChat auth endpoint (e.g., https://chat.example.com/v1/auth/validate)");
eprintln!(" When set, clients must send a bearer token as first signal message.");
eprintln!(" --metrics-port <port> Prometheus metrics HTTP port (e.g., 9090). Disabled if not set.");
eprintln!(" --probe <addr> Peer relay to probe for health monitoring (repeatable).");
eprintln!();
eprintln!("Room mode (default):");
eprintln!(" Clients join rooms by name. Packets forwarded to all others (SFU).");
@@ -183,6 +192,14 @@ async fn main() -> anyhow::Result<()> {
// Session manager — enforces max concurrent sessions
let session_mgr = Arc::new(Mutex::new(SessionManager::new(config.max_sessions)));
// Spawn inter-relay health probes
for target in &config.probe_targets {
let probe_config = wzp_relay::probe::ProbeConfig::new(*target);
let runner = wzp_relay::probe::ProbeRunner::new(probe_config, metrics.registry());
info!(target = %target, "spawning inter-relay health probe");
tokio::spawn(async move { runner.run().await });
}
if let Some(ref url) = config.auth_url {
info!(url, "auth enabled — clients must present featherChat token");
} else {
@@ -217,6 +234,37 @@ async fn main() -> anyhow::Result<()> {
let transport = Arc::new(wzp_transport::QuinnTransport::new(connection));
// Probe connections use SNI "_probe" to identify themselves.
// They skip auth + handshake and just do Ping->Pong.
if room_name == "_probe" {
info!(%addr, "probe connection detected, entering Ping/Pong responder");
loop {
match transport.recv_signal().await {
Ok(Some(wzp_proto::SignalMessage::Ping { timestamp_ms })) => {
if let Err(e) = transport.send_signal(
&wzp_proto::SignalMessage::Pong { timestamp_ms },
).await {
error!(%addr, "probe pong send error: {e}");
break;
}
}
Ok(Some(_)) => {
// Ignore non-Ping signals on probe connections
}
Ok(None) => {
info!(%addr, "probe connection closed");
break;
}
Err(e) => {
error!(%addr, "probe recv error: {e}");
break;
}
}
}
transport.close().await.ok();
return;
}
// Auth check: if --auth-url is set, expect first signal message to be a token
// Auth: if --auth-url is set, expect AuthToken as first signal
let authenticated_fp: Option<String> = if let Some(ref url) = auth_url {
@@ -345,15 +393,21 @@ async fn main() -> anyhow::Result<()> {
}
};
let session_id_str: String = session_id
.iter()
.map(|b| format!("{b:02x}"))
.collect();
room::run_participant(
room_mgr.clone(),
room_name,
participant_id,
transport.clone(),
metrics.clone(),
&session_id_str,
).await;
// Participant disconnected — clean up session
// Participant disconnected — clean up per-session metrics
metrics.remove_session_metrics(&session_id_str);
metrics.active_sessions.dec();
{
let mgr = room_mgr.lock().await;