feat: Prometheus metrics on relay + web bridge, client JSONL export (T5-S1/S3/S4)
WZP-P2-T5-S1: Relay Prometheus /metrics - RelayMetrics: active_sessions, active_rooms, packets/bytes_forwarded, auth_attempts (ok/fail), handshake_duration histogram - --metrics-port flag spawns HTTP server - Wired into auth, handshake, session, and packet forwarding paths - 2 tests WZP-P2-T5-S3: Web bridge Prometheus /metrics - WebMetrics: active_connections, frames_bridged (up/down), auth_failures, handshake_latency histogram - Added /metrics route to existing axum app - Wired into WS connect/disconnect, auth, handshake, send/recv loops - 2 tests WZP-P2-T5-S4: Client --metrics-file JSONL - ClientMetricsSnapshot with all telemetry fields - MetricsWriter: writes one JSON line per second to file - snapshot_from_stats() converts JitterStats to snapshot - --metrics-file <path> flag - 3 tests 223 tests passing across all crates. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,9 @@ pub struct RelayConfig {
|
||||
/// featherChat auth validation URL (e.g., "https://chat.example.com/v1/auth/validate").
|
||||
/// If set, clients must present a valid token before joining rooms.
|
||||
pub auth_url: Option<String>,
|
||||
/// Port for the Prometheus metrics HTTP endpoint (e.g., 9090).
|
||||
/// If None, the metrics endpoint is disabled.
|
||||
pub metrics_port: Option<u16>,
|
||||
}
|
||||
|
||||
impl Default for RelayConfig {
|
||||
@@ -34,6 +37,7 @@ impl Default for RelayConfig {
|
||||
jitter_max_depth: 250,
|
||||
log_level: "info".to_string(),
|
||||
auth_url: None,
|
||||
metrics_port: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
pub mod auth;
|
||||
pub mod config;
|
||||
pub mod handshake;
|
||||
pub mod metrics;
|
||||
pub mod pipeline;
|
||||
pub mod room;
|
||||
pub mod session_mgr;
|
||||
|
||||
@@ -17,6 +17,7 @@ use tracing::{error, info};
|
||||
|
||||
use wzp_proto::MediaTransport;
|
||||
use wzp_relay::config::RelayConfig;
|
||||
use wzp_relay::metrics::RelayMetrics;
|
||||
use wzp_relay::pipeline::{PipelineConfig, RelayPipeline};
|
||||
use wzp_relay::room::{self, RoomManager};
|
||||
use wzp_relay::session_mgr::SessionManager;
|
||||
@@ -45,14 +46,22 @@ fn parse_args() -> RelayConfig {
|
||||
args.get(i).expect("--auth-url requires a URL").to_string(),
|
||||
);
|
||||
}
|
||||
"--metrics-port" => {
|
||||
i += 1;
|
||||
config.metrics_port = Some(
|
||||
args.get(i).expect("--metrics-port requires a port number")
|
||||
.parse().expect("invalid --metrics-port number"),
|
||||
);
|
||||
}
|
||||
"--help" | "-h" => {
|
||||
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>]");
|
||||
eprintln!("Usage: wzp-relay [--listen <addr>] [--remote <addr>] [--auth-url <url>] [--metrics-port <port>]");
|
||||
eprintln!();
|
||||
eprintln!("Options:");
|
||||
eprintln!(" --listen <addr> Listen address (default: 0.0.0.0:4433)");
|
||||
eprintln!(" --remote <addr> Remote relay for forwarding (disables room mode)");
|
||||
eprintln!(" --auth-url <url> featherChat auth endpoint (e.g., https://chat.example.com/v1/auth/validate)");
|
||||
eprintln!(" When set, clients must send a bearer token as first signal message.");
|
||||
eprintln!(" --listen <addr> Listen address (default: 0.0.0.0:4433)");
|
||||
eprintln!(" --remote <addr> Remote relay for forwarding (disables room mode)");
|
||||
eprintln!(" --auth-url <url> featherChat auth endpoint (e.g., https://chat.example.com/v1/auth/validate)");
|
||||
eprintln!(" When set, clients must send a bearer token as first signal message.");
|
||||
eprintln!(" --metrics-port <port> Prometheus metrics HTTP port (e.g., 9090). Disabled if not set.");
|
||||
eprintln!();
|
||||
eprintln!("Room mode (default):");
|
||||
eprintln!(" Clients join rooms by name. Packets forwarded to all others (SFU).");
|
||||
@@ -141,6 +150,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
.install_default()
|
||||
.expect("failed to install rustls crypto provider");
|
||||
|
||||
// Prometheus metrics
|
||||
let metrics = Arc::new(RelayMetrics::new());
|
||||
if let Some(port) = config.metrics_port {
|
||||
let m = metrics.clone();
|
||||
tokio::spawn(wzp_relay::metrics::serve_metrics(port, m));
|
||||
}
|
||||
|
||||
// Generate ephemeral relay identity for crypto handshake
|
||||
let relay_seed = wzp_crypto::Seed::generate();
|
||||
let relay_fp = relay_seed.derive_identity().public_identity().fingerprint;
|
||||
@@ -186,6 +202,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let session_mgr = session_mgr.clone();
|
||||
let auth_url = config.auth_url.clone();
|
||||
let relay_seed_bytes = relay_seed.0;
|
||||
let metrics = metrics.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let addr = connection.remote_address();
|
||||
@@ -208,6 +225,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
Ok(Some(wzp_proto::SignalMessage::AuthToken { token })) => {
|
||||
match wzp_relay::auth::validate_token(url, &token).await {
|
||||
Ok(client) => {
|
||||
metrics.auth_attempts.with_label_values(&["ok"]).inc();
|
||||
info!(
|
||||
%addr,
|
||||
fingerprint = %client.fingerprint,
|
||||
@@ -217,6 +235,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
Some(client.fingerprint)
|
||||
}
|
||||
Err(e) => {
|
||||
metrics.auth_attempts.with_label_values(&["fail"]).inc();
|
||||
error!(%addr, "auth failed: {e}");
|
||||
transport.close().await.ok();
|
||||
return;
|
||||
@@ -243,12 +262,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
};
|
||||
|
||||
// Crypto handshake: verify client identity + negotiate quality profile
|
||||
let handshake_start = std::time::Instant::now();
|
||||
let (_crypto_session, _chosen_profile) = match wzp_relay::handshake::accept_handshake(
|
||||
&*transport,
|
||||
&relay_seed_bytes,
|
||||
).await {
|
||||
Ok(result) => {
|
||||
info!(%addr, "crypto handshake complete");
|
||||
let elapsed = handshake_start.elapsed().as_secs_f64();
|
||||
metrics.handshake_duration.observe(elapsed);
|
||||
info!(%addr, elapsed_ms = %(elapsed * 1000.0), "crypto handshake complete");
|
||||
result
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -302,13 +324,19 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
metrics.active_sessions.inc();
|
||||
|
||||
let participant_id = {
|
||||
let mut mgr = room_mgr.lock().await;
|
||||
match mgr.join(&room_name, addr, transport.clone(), authenticated_fp.as_deref()) {
|
||||
Ok(id) => id,
|
||||
Ok(id) => {
|
||||
metrics.active_rooms.set(mgr.list().len() as i64);
|
||||
id
|
||||
}
|
||||
Err(e) => {
|
||||
error!(%addr, room = %room_name, "room join denied: {e}");
|
||||
// Clean up the session we just created
|
||||
metrics.active_sessions.dec();
|
||||
let mut smgr = session_mgr.lock().await;
|
||||
smgr.remove_session(session_id);
|
||||
transport.close().await.ok();
|
||||
@@ -322,9 +350,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
room_name,
|
||||
participant_id,
|
||||
transport.clone(),
|
||||
metrics.clone(),
|
||||
).await;
|
||||
|
||||
// Participant disconnected — clean up session
|
||||
metrics.active_sessions.dec();
|
||||
{
|
||||
let mgr = room_mgr.lock().await;
|
||||
metrics.active_rooms.set(mgr.list().len() as i64);
|
||||
}
|
||||
{
|
||||
let mut smgr = session_mgr.lock().await;
|
||||
smgr.remove_session(session_id);
|
||||
|
||||
147
crates/wzp-relay/src/metrics.rs
Normal file
147
crates/wzp-relay/src/metrics.rs
Normal file
@@ -0,0 +1,147 @@
|
||||
//! Prometheus metrics for the WZP relay daemon.
|
||||
|
||||
use prometheus::{
|
||||
Encoder, Histogram, HistogramOpts, IntCounter, IntCounterVec, IntGauge, Opts, Registry,
|
||||
TextEncoder,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// All relay-level Prometheus metrics.
|
||||
#[derive(Clone)]
|
||||
pub struct RelayMetrics {
|
||||
pub active_sessions: IntGauge,
|
||||
pub active_rooms: IntGauge,
|
||||
pub packets_forwarded: IntCounter,
|
||||
pub bytes_forwarded: IntCounter,
|
||||
pub auth_attempts: IntCounterVec,
|
||||
pub handshake_duration: Histogram,
|
||||
registry: Registry,
|
||||
}
|
||||
|
||||
impl RelayMetrics {
|
||||
/// Create and register all relay metrics with a new registry.
|
||||
pub fn new() -> Self {
|
||||
let registry = Registry::new();
|
||||
|
||||
let active_sessions = IntGauge::with_opts(
|
||||
Opts::new("wzp_relay_active_sessions", "Current active sessions"),
|
||||
)
|
||||
.expect("metric");
|
||||
let active_rooms = IntGauge::with_opts(
|
||||
Opts::new("wzp_relay_active_rooms", "Current active rooms"),
|
||||
)
|
||||
.expect("metric");
|
||||
let packets_forwarded = IntCounter::with_opts(
|
||||
Opts::new("wzp_relay_packets_forwarded_total", "Total packets forwarded"),
|
||||
)
|
||||
.expect("metric");
|
||||
let bytes_forwarded = IntCounter::with_opts(
|
||||
Opts::new("wzp_relay_bytes_forwarded_total", "Total bytes forwarded"),
|
||||
)
|
||||
.expect("metric");
|
||||
let auth_attempts = IntCounterVec::new(
|
||||
Opts::new("wzp_relay_auth_attempts_total", "Auth validation attempts"),
|
||||
&["result"],
|
||||
)
|
||||
.expect("metric");
|
||||
let handshake_duration = Histogram::with_opts(
|
||||
HistogramOpts::new(
|
||||
"wzp_relay_handshake_duration_seconds",
|
||||
"Crypto handshake time",
|
||||
)
|
||||
.buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5]),
|
||||
)
|
||||
.expect("metric");
|
||||
|
||||
registry.register(Box::new(active_sessions.clone())).expect("register");
|
||||
registry.register(Box::new(active_rooms.clone())).expect("register");
|
||||
registry.register(Box::new(packets_forwarded.clone())).expect("register");
|
||||
registry.register(Box::new(bytes_forwarded.clone())).expect("register");
|
||||
registry.register(Box::new(auth_attempts.clone())).expect("register");
|
||||
registry.register(Box::new(handshake_duration.clone())).expect("register");
|
||||
|
||||
Self {
|
||||
active_sessions,
|
||||
active_rooms,
|
||||
packets_forwarded,
|
||||
bytes_forwarded,
|
||||
auth_attempts,
|
||||
handshake_duration,
|
||||
registry,
|
||||
}
|
||||
}
|
||||
|
||||
/// Gather all metrics and encode them as Prometheus text format.
|
||||
pub fn metrics_handler(&self) -> String {
|
||||
let encoder = TextEncoder::new();
|
||||
let metric_families = self.registry.gather();
|
||||
let mut buffer = Vec::new();
|
||||
encoder.encode(&metric_families, &mut buffer).expect("encode");
|
||||
String::from_utf8(buffer).expect("utf8")
|
||||
}
|
||||
}
|
||||
|
||||
/// Start an HTTP server serving GET /metrics on the given port.
|
||||
pub async fn serve_metrics(port: u16, metrics: Arc<RelayMetrics>) {
|
||||
use axum::{routing::get, Router};
|
||||
|
||||
let app = Router::new().route(
|
||||
"/metrics",
|
||||
get(move || {
|
||||
let m = metrics.clone();
|
||||
async move { m.metrics_handler() }
|
||||
}),
|
||||
);
|
||||
|
||||
let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port));
|
||||
let listener = tokio::net::TcpListener::bind(addr)
|
||||
.await
|
||||
.expect("failed to bind metrics port");
|
||||
tracing::info!(%addr, "metrics endpoint serving");
|
||||
axum::serve(listener, app)
|
||||
.await
|
||||
.expect("metrics server error");
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Every registered metric name appears in the rendered output.
    #[test]
    fn metrics_register() {
        let metrics = RelayMetrics::new();
        // Touch the CounterVec labels so they appear in output
        for label in ["ok", "fail"].iter() {
            metrics.auth_attempts.with_label_values(&[*label]);
        }

        let rendered = metrics.metrics_handler();

        // Should contain all registered metric names (as HELP or TYPE lines)
        let expected_names = [
            "wzp_relay_active_sessions",
            "wzp_relay_active_rooms",
            "wzp_relay_packets_forwarded_total",
            "wzp_relay_bytes_forwarded_total",
            "wzp_relay_auth_attempts_total",
            "wzp_relay_handshake_duration_seconds",
        ];
        for name in expected_names.iter() {
            assert!(rendered.contains(*name));
        }
    }

    /// Values written through the public handles show up in the rendered output.
    #[test]
    fn metrics_increment() {
        let metrics = RelayMetrics::new();

        metrics.active_sessions.set(5);
        metrics.active_rooms.set(2);
        metrics.packets_forwarded.inc_by(100);
        metrics.bytes_forwarded.inc_by(48000);
        metrics.auth_attempts.with_label_values(&["ok"]).inc();
        metrics.auth_attempts.with_label_values(&["fail"]).inc_by(3);
        metrics.handshake_duration.observe(0.042);

        let rendered = metrics.metrics_handler();

        let expected_samples = [
            "wzp_relay_active_sessions 5",
            "wzp_relay_active_rooms 2",
            "wzp_relay_packets_forwarded_total 100",
            "wzp_relay_bytes_forwarded_total 48000",
            "wzp_relay_auth_attempts_total{result=\"ok\"} 1",
            "wzp_relay_auth_attempts_total{result=\"fail\"} 3",
            "wzp_relay_handshake_duration_seconds_count 1",
        ];
        for sample in expected_samples.iter() {
            assert!(rendered.contains(*sample));
        }
    }
}
|
||||
@@ -12,6 +12,8 @@ use tracing::{error, info, warn};
|
||||
|
||||
use wzp_proto::MediaTransport;
|
||||
|
||||
use crate::metrics::RelayMetrics;
|
||||
|
||||
/// Unique participant ID within a room.
|
||||
pub type ParticipantId = u64;
|
||||
|
||||
@@ -176,6 +178,7 @@ pub async fn run_participant(
|
||||
room_name: String,
|
||||
participant_id: ParticipantId,
|
||||
transport: Arc<wzp_transport::QuinnTransport>,
|
||||
metrics: Arc<RelayMetrics>,
|
||||
) {
|
||||
let addr = transport.connection().remote_address();
|
||||
let mut packets_forwarded = 0u64;
|
||||
@@ -200,6 +203,7 @@ pub async fn run_participant(
|
||||
};
|
||||
|
||||
// Forward to all others
|
||||
let pkt_bytes = pkt.payload.len() as u64;
|
||||
for other in &others {
|
||||
// Best-effort: if one send fails, continue to others
|
||||
if let Err(e) = other.send_media(&pkt).await {
|
||||
@@ -208,6 +212,9 @@ pub async fn run_participant(
|
||||
}
|
||||
}
|
||||
|
||||
let fan_out = others.len() as u64;
|
||||
metrics.packets_forwarded.inc_by(fan_out);
|
||||
metrics.bytes_forwarded.inc_by(pkt_bytes * fan_out);
|
||||
packets_forwarded += 1;
|
||||
if packets_forwarded % 500 == 0 {
|
||||
let room_size = {
|
||||
|
||||
Reference in New Issue
Block a user