feat: Prometheus metrics on relay + web bridge, client JSONL export (T5-S1/S3/S4)

WZP-P2-T5-S1: Relay Prometheus /metrics
- RelayMetrics: active_sessions, active_rooms, packets/bytes_forwarded,
  auth_attempts (ok/fail), handshake_duration histogram
- --metrics-port flag spawns HTTP server
- Wired into auth, handshake, session, and packet forwarding paths
- 2 tests

WZP-P2-T5-S3: Web bridge Prometheus /metrics
- WebMetrics: active_connections, frames_bridged (up/down),
  auth_failures, handshake_latency histogram
- Added /metrics route to existing axum app
- Wired into WS connect/disconnect, auth, handshake, send/recv loops
- 2 tests

WZP-P2-T5-S4: Client --metrics-file JSONL
- ClientMetricsSnapshot with all telemetry fields
- MetricsWriter: writes one JSON line per second to file
- snapshot_from_stats() converts JitterStats to snapshot
- --metrics-file <path> flag
- 3 tests

223 tests passing across all crates.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Siavash Sameni
2026-03-28 12:44:57 +04:00
parent 3f813cd510
commit 39f6908478
14 changed files with 645 additions and 12 deletions

View File

@@ -22,6 +22,9 @@ pub struct RelayConfig {
/// featherChat auth validation URL (e.g., "https://chat.example.com/v1/auth/validate").
/// If set, clients must present a valid token before joining rooms.
pub auth_url: Option<String>,
/// Port for the Prometheus metrics HTTP endpoint (e.g., 9090).
/// If None, the metrics endpoint is disabled.
pub metrics_port: Option<u16>,
}
impl Default for RelayConfig {
@@ -34,6 +37,7 @@ impl Default for RelayConfig {
jitter_max_depth: 250,
log_level: "info".to_string(),
auth_url: None,
metrics_port: None,
}
}
}

View File

@@ -10,6 +10,7 @@
pub mod auth;
pub mod config;
pub mod handshake;
pub mod metrics;
pub mod pipeline;
pub mod room;
pub mod session_mgr;

View File

@@ -17,6 +17,7 @@ use tracing::{error, info};
use wzp_proto::MediaTransport;
use wzp_relay::config::RelayConfig;
use wzp_relay::metrics::RelayMetrics;
use wzp_relay::pipeline::{PipelineConfig, RelayPipeline};
use wzp_relay::room::{self, RoomManager};
use wzp_relay::session_mgr::SessionManager;
@@ -45,14 +46,22 @@ fn parse_args() -> RelayConfig {
args.get(i).expect("--auth-url requires a URL").to_string(),
);
}
"--metrics-port" => {
i += 1;
config.metrics_port = Some(
args.get(i).expect("--metrics-port requires a port number")
.parse().expect("invalid --metrics-port number"),
);
}
"--help" | "-h" => {
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>]");
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>] [--metrics-port <port>]");
eprintln!();
eprintln!("Options:");
eprintln!(" --listen <addr> Listen address (default: 0.0.0.0:4433)");
eprintln!(" --remote <addr> Remote relay for forwarding (disables room mode)");
eprintln!(" --auth-url <url> featherChat auth endpoint (e.g., https://chat.example.com/v1/auth/validate)");
eprintln!(" When set, clients must send a bearer token as first signal message.");
eprintln!(" --listen <addr> Listen address (default: 0.0.0.0:4433)");
eprintln!(" --remote <addr> Remote relay for forwarding (disables room mode)");
eprintln!(" --auth-url <url> featherChat auth endpoint (e.g., https://chat.example.com/v1/auth/validate)");
eprintln!(" When set, clients must send a bearer token as first signal message.");
eprintln!(" --metrics-port <port> Prometheus metrics HTTP port (e.g., 9090). Disabled if not set.");
eprintln!();
eprintln!("Room mode (default):");
eprintln!(" Clients join rooms by name. Packets forwarded to all others (SFU).");
@@ -141,6 +150,13 @@ async fn main() -> anyhow::Result<()> {
.install_default()
.expect("failed to install rustls crypto provider");
// Prometheus metrics
let metrics = Arc::new(RelayMetrics::new());
if let Some(port) = config.metrics_port {
let m = metrics.clone();
tokio::spawn(wzp_relay::metrics::serve_metrics(port, m));
}
// Generate ephemeral relay identity for crypto handshake
let relay_seed = wzp_crypto::Seed::generate();
let relay_fp = relay_seed.derive_identity().public_identity().fingerprint;
@@ -186,6 +202,7 @@ async fn main() -> anyhow::Result<()> {
let session_mgr = session_mgr.clone();
let auth_url = config.auth_url.clone();
let relay_seed_bytes = relay_seed.0;
let metrics = metrics.clone();
tokio::spawn(async move {
let addr = connection.remote_address();
@@ -208,6 +225,7 @@ async fn main() -> anyhow::Result<()> {
Ok(Some(wzp_proto::SignalMessage::AuthToken { token })) => {
match wzp_relay::auth::validate_token(url, &token).await {
Ok(client) => {
metrics.auth_attempts.with_label_values(&["ok"]).inc();
info!(
%addr,
fingerprint = %client.fingerprint,
@@ -217,6 +235,7 @@ async fn main() -> anyhow::Result<()> {
Some(client.fingerprint)
}
Err(e) => {
metrics.auth_attempts.with_label_values(&["fail"]).inc();
error!(%addr, "auth failed: {e}");
transport.close().await.ok();
return;
@@ -243,12 +262,15 @@ async fn main() -> anyhow::Result<()> {
};
// Crypto handshake: verify client identity + negotiate quality profile
let handshake_start = std::time::Instant::now();
let (_crypto_session, _chosen_profile) = match wzp_relay::handshake::accept_handshake(
&*transport,
&relay_seed_bytes,
).await {
Ok(result) => {
info!(%addr, "crypto handshake complete");
let elapsed = handshake_start.elapsed().as_secs_f64();
metrics.handshake_duration.observe(elapsed);
info!(%addr, elapsed_ms = %(elapsed * 1000.0), "crypto handshake complete");
result
}
Err(e) => {
@@ -302,13 +324,19 @@ async fn main() -> anyhow::Result<()> {
}
};
metrics.active_sessions.inc();
let participant_id = {
let mut mgr = room_mgr.lock().await;
match mgr.join(&room_name, addr, transport.clone(), authenticated_fp.as_deref()) {
Ok(id) => id,
Ok(id) => {
metrics.active_rooms.set(mgr.list().len() as i64);
id
}
Err(e) => {
error!(%addr, room = %room_name, "room join denied: {e}");
// Clean up the session we just created
metrics.active_sessions.dec();
let mut smgr = session_mgr.lock().await;
smgr.remove_session(session_id);
transport.close().await.ok();
@@ -322,9 +350,15 @@ async fn main() -> anyhow::Result<()> {
room_name,
participant_id,
transport.clone(),
metrics.clone(),
).await;
// Participant disconnected — clean up session
metrics.active_sessions.dec();
{
let mgr = room_mgr.lock().await;
metrics.active_rooms.set(mgr.list().len() as i64);
}
{
let mut smgr = session_mgr.lock().await;
smgr.remove_session(session_id);

View File

@@ -0,0 +1,147 @@
//! Prometheus metrics for the WZP relay daemon.
use prometheus::{
Encoder, Histogram, HistogramOpts, IntCounter, IntCounterVec, IntGauge, Opts, Registry,
TextEncoder,
};
use std::sync::Arc;
/// All relay-level Prometheus metrics.
///
/// The public fields are the individual metric handles that the relay code
/// updates directly; the private registry is what `metrics_handler` gathers
/// from when rendering the text exposition.
#[derive(Clone)]
pub struct RelayMetrics {
/// Gauge exported as `wzp_relay_active_sessions` ("Current active sessions").
pub active_sessions: IntGauge,
/// Gauge exported as `wzp_relay_active_rooms` ("Current active rooms").
pub active_rooms: IntGauge,
/// Counter exported as `wzp_relay_packets_forwarded_total` ("Total packets forwarded").
pub packets_forwarded: IntCounter,
/// Counter exported as `wzp_relay_bytes_forwarded_total` ("Total bytes forwarded").
pub bytes_forwarded: IntCounter,
/// Counter family exported as `wzp_relay_auth_attempts_total`, partitioned by
/// the single label `result`.
pub auth_attempts: IntCounterVec,
/// Histogram exported as `wzp_relay_handshake_duration_seconds`
/// ("Crypto handshake time").
pub handshake_duration: Histogram,
/// Registry that every metric above is registered with.
registry: Registry,
}
impl RelayMetrics {
    /// Create and register all relay metrics with a new registry.
    ///
    /// The registry is private to the returned value, so several
    /// `RelayMetrics` instances can coexist without name collisions.
    ///
    /// # Panics
    ///
    /// Panics if any metric cannot be constructed or registered.
    #[must_use]
    pub fn new() -> Self {
        let registry = Registry::new();

        let active_sessions = IntGauge::with_opts(
            Opts::new("wzp_relay_active_sessions", "Current active sessions"),
        )
        .expect("metric");
        let active_rooms = IntGauge::with_opts(
            Opts::new("wzp_relay_active_rooms", "Current active rooms"),
        )
        .expect("metric");
        let packets_forwarded = IntCounter::with_opts(
            Opts::new("wzp_relay_packets_forwarded_total", "Total packets forwarded"),
        )
        .expect("metric");
        let bytes_forwarded = IntCounter::with_opts(
            Opts::new("wzp_relay_bytes_forwarded_total", "Total bytes forwarded"),
        )
        .expect("metric");
        let auth_attempts = IntCounterVec::new(
            Opts::new("wzp_relay_auth_attempts_total", "Auth validation attempts"),
            &["result"],
        )
        .expect("metric");
        let handshake_duration = Histogram::with_opts(
            HistogramOpts::new(
                "wzp_relay_handshake_duration_seconds",
                "Crypto handshake time",
            )
            // Bucket upper bounds in seconds: 1 ms up to 2.5 s.
            .buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5]),
        )
        .expect("metric");

        // The registry receives clones; the originals are kept in the struct
        // fields so callers can update them directly.
        registry.register(Box::new(active_sessions.clone())).expect("register");
        registry.register(Box::new(active_rooms.clone())).expect("register");
        registry.register(Box::new(packets_forwarded.clone())).expect("register");
        registry.register(Box::new(bytes_forwarded.clone())).expect("register");
        registry.register(Box::new(auth_attempts.clone())).expect("register");
        registry.register(Box::new(handshake_duration.clone())).expect("register");

        Self {
            active_sessions,
            active_rooms,
            packets_forwarded,
            bytes_forwarded,
            auth_attempts,
            handshake_duration,
            registry,
        }
    }

    /// Gather all metrics and encode them as Prometheus text format.
    ///
    /// # Panics
    ///
    /// Panics if the text encoder fails or produces non-UTF-8 output.
    pub fn metrics_handler(&self) -> String {
        let encoder = TextEncoder::new();
        let metric_families = self.registry.gather();
        let mut buffer = Vec::new();
        encoder.encode(&metric_families, &mut buffer).expect("encode");
        String::from_utf8(buffer).expect("utf8")
    }
}

/// `Default` is equivalent to [`RelayMetrics::new`].
impl Default for RelayMetrics {
    fn default() -> Self {
        Self::new()
    }
}
/// Start an HTTP server serving GET /metrics on the given port.
pub async fn serve_metrics(port: u16, metrics: Arc<RelayMetrics>) {
use axum::{routing::get, Router};
let app = Router::new().route(
"/metrics",
get(move || {
let m = metrics.clone();
async move { m.metrics_handler() }
}),
);
let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port));
let listener = tokio::net::TcpListener::bind(addr)
.await
.expect("failed to bind metrics port");
tracing::info!(%addr, "metrics endpoint serving");
axum::serve(listener, app)
.await
.expect("metrics server error");
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every registered metric name appears in the rendered exposition.
    #[test]
    fn metrics_register() {
        let metrics = RelayMetrics::new();
        // Touch the CounterVec labels so they appear in output
        for result in ["ok", "fail"] {
            metrics.auth_attempts.with_label_values(&[result]);
        }

        let rendered = metrics.metrics_handler();

        // Should contain all registered metric names (as HELP or TYPE lines)
        let expected_names = [
            "wzp_relay_active_sessions",
            "wzp_relay_active_rooms",
            "wzp_relay_packets_forwarded_total",
            "wzp_relay_bytes_forwarded_total",
            "wzp_relay_auth_attempts_total",
            "wzp_relay_handshake_duration_seconds",
        ];
        for name in expected_names {
            assert!(rendered.contains(name));
        }
    }

    /// Values written through the handles are reflected in the rendered output.
    #[test]
    fn metrics_increment() {
        let metrics = RelayMetrics::new();

        metrics.active_sessions.set(5);
        metrics.active_rooms.set(2);
        metrics.packets_forwarded.inc_by(100);
        metrics.bytes_forwarded.inc_by(48000);
        metrics.auth_attempts.with_label_values(&["ok"]).inc();
        metrics.auth_attempts.with_label_values(&["fail"]).inc_by(3);
        metrics.handshake_duration.observe(0.042);

        let rendered = metrics.metrics_handler();

        let expected_samples = [
            "wzp_relay_active_sessions 5",
            "wzp_relay_active_rooms 2",
            "wzp_relay_packets_forwarded_total 100",
            "wzp_relay_bytes_forwarded_total 48000",
            "wzp_relay_auth_attempts_total{result=\"ok\"} 1",
            "wzp_relay_auth_attempts_total{result=\"fail\"} 3",
            "wzp_relay_handshake_duration_seconds_count 1",
        ];
        for sample in expected_samples {
            assert!(rendered.contains(sample));
        }
    }
}

View File

@@ -12,6 +12,8 @@ use tracing::{error, info, warn};
use wzp_proto::MediaTransport;
use crate::metrics::RelayMetrics;
/// Unique participant ID within a room.
pub type ParticipantId = u64;
@@ -176,6 +178,7 @@ pub async fn run_participant(
room_name: String,
participant_id: ParticipantId,
transport: Arc<wzp_transport::QuinnTransport>,
metrics: Arc<RelayMetrics>,
) {
let addr = transport.connection().remote_address();
let mut packets_forwarded = 0u64;
@@ -200,6 +203,7 @@ pub async fn run_participant(
};
// Forward to all others
let pkt_bytes = pkt.payload.len() as u64;
for other in &others {
// Best-effort: if one send fails, continue to others
if let Err(e) = other.send_media(&pkt).await {
@@ -208,6 +212,9 @@ pub async fn run_participant(
}
}
let fan_out = others.len() as u64;
metrics.packets_forwarded.inc_by(fan_out);
metrics.bytes_forwarded.inc_by(pkt_bytes * fan_out);
packets_forwarded += 1;
if packets_forwarded % 500 == 0 {
let room_size = {