feat: per-session metrics + inter-relay health probe (T5-S2/S5)
WZP-P2-T5-S2: Per-session Prometheus metrics
- 5 new per-session gauges/counters: buffer_depth, loss_pct, rtt_ms, underruns, overruns — all labeled by session_id
- update_session_quality() reads QualityReport from packet headers
- update_session_buffer() tracks jitter buffer state per session
- remove_session_metrics() cleans up labels on disconnect
- Delta-aware counter increments avoid double-counting
- 2 tests: session_quality_update, session_metrics_cleanup

WZP-P2-T5-S5: Inter-relay health probe
- New probe.rs: ProbeConfig, ProbeMetrics, SlidingWindow, ProbeRunner
- --probe <addr> flag (repeatable) spawns background probe per target
- Sends Ping/s over QUIC, receives Pong, computes RTT/loss/jitter
- SlidingWindow(60): tracks last 60 pings, loss = missed pongs, jitter = std deviation of RTT
- Prometheus gauges: wzp_probe_rtt_ms, loss_pct, jitter_ms, up — each with a target label
- Probe connections use SNI "_probe" — relay responds with Pong loop, skipping auth/handshake
- Auto-reconnect with 5s backoff on disconnect
- 6 tests: metrics_register, rtt/loss/jitter calculation, window eviction, empty edge cases

231 tests passing across all crates.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -53,8 +53,16 @@ fn parse_args() -> RelayConfig {
|
||||
.parse().expect("invalid --metrics-port number"),
|
||||
);
|
||||
}
|
||||
"--probe" => {
|
||||
i += 1;
|
||||
let addr: SocketAddr = args.get(i)
|
||||
.expect("--probe requires an address")
|
||||
.parse()
|
||||
.expect("invalid --probe address");
|
||||
config.probe_targets.push(addr);
|
||||
}
|
||||
"--help" | "-h" => {
|
||||
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>] [--metrics-port <port>]");
|
||||
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>] [--metrics-port <port>] [--probe <addr>]...");
|
||||
eprintln!();
|
||||
eprintln!("Options:");
|
||||
eprintln!(" --listen <addr> Listen address (default: 0.0.0.0:4433)");
|
||||
@@ -62,6 +70,7 @@ fn parse_args() -> RelayConfig {
|
||||
eprintln!(" --auth-url <url> featherChat auth endpoint (e.g., https://chat.example.com/v1/auth/validate)");
|
||||
eprintln!(" When set, clients must send a bearer token as first signal message.");
|
||||
eprintln!(" --metrics-port <port> Prometheus metrics HTTP port (e.g., 9090). Disabled if not set.");
|
||||
eprintln!(" --probe <addr> Peer relay to probe for health monitoring (repeatable).");
|
||||
eprintln!();
|
||||
eprintln!("Room mode (default):");
|
||||
eprintln!(" Clients join rooms by name. Packets forwarded to all others (SFU).");
|
||||
@@ -183,6 +192,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
// Session manager — enforces max concurrent sessions
|
||||
let session_mgr = Arc::new(Mutex::new(SessionManager::new(config.max_sessions)));
|
||||
|
||||
// Spawn inter-relay health probes
|
||||
for target in &config.probe_targets {
|
||||
let probe_config = wzp_relay::probe::ProbeConfig::new(*target);
|
||||
let runner = wzp_relay::probe::ProbeRunner::new(probe_config, metrics.registry());
|
||||
info!(target = %target, "spawning inter-relay health probe");
|
||||
tokio::spawn(async move { runner.run().await });
|
||||
}
|
||||
|
||||
if let Some(ref url) = config.auth_url {
|
||||
info!(url, "auth enabled — clients must present featherChat token");
|
||||
} else {
|
||||
@@ -217,6 +234,37 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let transport = Arc::new(wzp_transport::QuinnTransport::new(connection));
|
||||
|
||||
// Probe connections use SNI "_probe" to identify themselves.
|
||||
// They skip auth + handshake and just do Ping->Pong.
|
||||
if room_name == "_probe" {
|
||||
info!(%addr, "probe connection detected, entering Ping/Pong responder");
|
||||
loop {
|
||||
match transport.recv_signal().await {
|
||||
Ok(Some(wzp_proto::SignalMessage::Ping { timestamp_ms })) => {
|
||||
if let Err(e) = transport.send_signal(
|
||||
&wzp_proto::SignalMessage::Pong { timestamp_ms },
|
||||
).await {
|
||||
error!(%addr, "probe pong send error: {e}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(Some(_)) => {
|
||||
// Ignore non-Ping signals on probe connections
|
||||
}
|
||||
Ok(None) => {
|
||||
info!(%addr, "probe connection closed");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
error!(%addr, "probe recv error: {e}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
transport.close().await.ok();
|
||||
return;
|
||||
}
|
||||
|
||||
// Auth check: if --auth-url is set, expect first signal message to be a token
|
||||
// Auth: if --auth-url is set, expect AuthToken as first signal
|
||||
let authenticated_fp: Option<String> = if let Some(ref url) = auth_url {
|
||||
@@ -345,15 +393,21 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
let session_id_str: String = session_id
|
||||
.iter()
|
||||
.map(|b| format!("{b:02x}"))
|
||||
.collect();
|
||||
room::run_participant(
|
||||
room_mgr.clone(),
|
||||
room_name,
|
||||
participant_id,
|
||||
transport.clone(),
|
||||
metrics.clone(),
|
||||
&session_id_str,
|
||||
).await;
|
||||
|
||||
// Participant disconnected — clean up session
|
||||
// Participant disconnected — clean up per-session metrics
|
||||
metrics.remove_session_metrics(&session_id_str);
|
||||
metrics.active_sessions.dec();
|
||||
{
|
||||
let mgr = room_mgr.lock().await;
|
||||
|
||||
Reference in New Issue
Block a user