T2.3-T2.6: BWE guard, relay conformance Tier A/B/C, Prometheus metrics

This commit is contained in:
Siavash Sameni
2026-05-11 20:50:01 +04:00
parent 3de56cf1f9
commit 54c1a35186
16 changed files with 977 additions and 38 deletions

View File

@@ -107,6 +107,7 @@ pub fn signal_to_call_type(signal: &SignalMessage) -> CallSignalType {
SignalMessage::TransferAck => CallSignalType::Offer, // reuse
SignalMessage::PresenceUpdate { .. } => CallSignalType::Offer, // reuse
SignalMessage::RouteQuery { .. } => CallSignalType::Offer, // reuse
SignalMessage::TransportFeedback { .. } => CallSignalType::Offer, // reuse (BWE)
SignalMessage::RouteResponse { .. } => CallSignalType::Offer, // reuse
SignalMessage::SessionForward { .. } => CallSignalType::Offer, // reuse
SignalMessage::SessionForwardAck { .. } => CallSignalType::Offer, // reuse

View File

@@ -1,8 +1,10 @@
//! See also: [`crate::dred_tuner`] for continuous DRED tuning within a tier.
use std::collections::VecDeque;
use std::sync::Arc;
use std::time::{Duration, Instant};
use crate::BandwidthEstimator;
use crate::QualityProfile;
use crate::packet::QualityReport;
use crate::traits::QualityController;
@@ -134,6 +136,8 @@ pub struct AdaptiveQualityController {
probe: Option<ProbeState>,
/// Time spent stable at the current tier (for probe trigger).
stable_since: Option<Instant>,
/// Optional bandwidth estimator for BWE-guarded upgrades.
bwe: Option<Arc<BandwidthEstimator>>,
}
/// Threshold for downgrading (fast reaction to degradation).
@@ -187,6 +191,7 @@ impl AdaptiveQualityController {
fec_boost_amount: DEFAULT_FEC_BOOST,
probe: None,
stable_since: None,
bwe: None,
}
}
@@ -254,6 +259,17 @@ impl AdaptiveQualityController {
self.stable_since = None;
}
/// Attach a bandwidth estimator for BWE-guarded tier transitions.
pub fn set_bandwidth_estimator(&mut self, bwe: Arc<BandwidthEstimator>) {
self.bwe = Some(bwe);
}
/// Return the bitrate ceiling (in bps) for a given tier, including FEC overhead.
fn tier_ceiling_bps(tier: Tier) -> u64 {
let kbps = tier.profile().total_bitrate_kbps();
(kbps * 1000.0) as u64
}
/// Get the effective downgrade threshold based on network context.
fn downgrade_threshold(&self) -> u32 {
match self.network_context {
@@ -296,6 +312,15 @@ impl AdaptiveQualityController {
if self.consecutive_up >= threshold {
// Only upgrade one step at a time
if let Some(next_tier) = self.upgrade_one_step() {
// BWE guard: require 130% headroom over target tier bitrate
if let Some(ref bwe) = self.bwe {
let required = (Self::tier_ceiling_bps(next_tier) * 130) / 100;
if bwe.target_send_bps() < required {
// Insufficient bandwidth — reset counter to prevent flapping
self.consecutive_up = 0;
return None;
}
}
self.current_tier = next_tier;
self.current_profile = next_tier.profile();
self.consecutive_up = 0;
@@ -529,6 +554,53 @@ mod tests {
}
}
#[test]
fn bwe_guard_blocks_upgrade_when_bandwidth_insufficient() {
let mut ctrl = AdaptiveQualityController::new();
// Force to catastrophic
let bad = make_report(50.0, 300);
for _ in 0..3 {
ctrl.observe(&bad);
}
assert_eq!(ctrl.tier(), Tier::Catastrophic);
// Attach a BWE with very low headroom.
// Degraded tier needs 6kbps * 1.5 FEC = 9kbps → 130% = 11.7kbps.
// Set target_send_bps ≈ 9_000 (below 11_700 threshold).
let bwe = Arc::new(BandwidthEstimator::new(1000.0, 1.0, 100_000.0));
bwe.update_from_path(1_000_000, 0, 10); // high cwnd
bwe.update_from_peer(10_000); // low remb → target = 9_000
ctrl.set_bandwidth_estimator(bwe.clone());
let good = make_report(0.5, 20);
for _ in 0..5 {
assert!(
ctrl.observe(&good).is_none(),
"upgrade should be blocked by low BWE"
);
}
assert_eq!(
ctrl.tier(),
Tier::Catastrophic,
"should remain at Catastrophic"
);
// Raise BWE well above the 130% threshold
bwe.update_from_peer(100_000); // target ≈ 90_000 bps
// Counter was reset, need another 5 good reports
for _ in 0..4 {
assert!(ctrl.observe(&good).is_none());
}
let result = ctrl.observe(&good);
assert!(
result.is_some(),
"upgrade should proceed with sufficient BWE"
);
assert_eq!(ctrl.tier(), Tier::Degraded);
}
#[test]
fn tier_classification() {
// Studio tiers

View File

@@ -0,0 +1,315 @@
//! Relay conformance metering — Tier A/B/C enforcement.
//!
//! Each participant gets a [`ConformanceMeter`] that tracks per-second
//! traffic against the declared codec's nominal bitrate ceiling.
//! Violations are logged and counted but do **not** drop packets
//! (observe-only mode).
use std::collections::VecDeque;
use std::time::{Duration, Instant};
use wzp_proto::{CodecId, MediaHeader};
/// Rolling window size for timestamp-drift detection (Tier C).
const DRIFT_WINDOW_SIZE: usize = 200;
/// Kinds of conformance violation detected by the relay.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Violation {
/// Cumulative bitrate in the current 1 s window exceeds the Tier A ceiling.
BitrateExceeded,
/// Packet rate exceeds the per-codec safety limit (Tier B).
PacketRateExceeded,
/// Timestamp jumped backwards or forwards suspiciously (Tier C).
TimestampDrift,
}
/// Per-participant traffic conformance meter.
pub struct ConformanceMeter {
window_start: Instant,
bytes_in_window: u64,
packets_in_window: u64,
/// Rolling (seq, timestamp) pairs for drift detection.
drift_window: VecDeque<(u32, u32)>,
}
impl ConformanceMeter {
pub fn new() -> Self {
Self {
window_start: Instant::now(),
bytes_in_window: 0,
packets_in_window: 0,
drift_window: VecDeque::with_capacity(DRIFT_WINDOW_SIZE),
}
}
/// Inspect an incoming media packet and accumulate it against the
/// current 1-second window. Returns [`Err(Violation)`] when a limit
/// is crossed.
pub fn observe(
&mut self,
header: &MediaHeader,
payload_len: usize,
now: Instant,
) -> Result<(), Violation> {
// Roll the window forward if a second has elapsed.
if now.duration_since(self.window_start) >= Duration::from_secs(1) {
self.window_start = now;
self.bytes_in_window = 0;
self.packets_in_window = 0;
}
let packet_size = (MediaHeader::WIRE_SIZE + payload_len) as u64;
self.bytes_in_window += packet_size;
self.packets_in_window += 1;
// Tier A — bitrate ceiling.
let ceiling = ceiling_bps(header.codec_id);
let max_bytes_per_sec = ceiling / 8;
if self.bytes_in_window > max_bytes_per_sec {
return Err(Violation::BitrateExceeded);
}
// Tier B — packet-rate ceiling.
let max_pps = max_pps(header.codec_id);
let pps_threshold = (max_pps as f32 * 1.5) as u64;
if self.packets_in_window > pps_threshold {
return Err(Violation::PacketRateExceeded);
}
// Tier C — timestamp drift.
self.drift_window.push_back((header.seq, header.timestamp));
if self.drift_window.len() > DRIFT_WINDOW_SIZE {
self.drift_window.pop_front();
}
if self.drift_window.len() >= 2 {
let (first_seq, first_ts) = self.drift_window.front().copied().unwrap();
let (last_seq, last_ts) = self.drift_window.back().copied().unwrap();
let ds = last_seq.wrapping_sub(first_seq) as f64;
let dt = last_ts.wrapping_sub(first_ts) as f64;
if ds > 0.0 {
let avg_ms_per_packet = dt / ds;
let frame_ms = header.codec_id.frame_duration_ms() as f64;
let min_ratio = frame_ms * 0.5;
let max_ratio = frame_ms * 2.0;
if avg_ms_per_packet < min_ratio || avg_ms_per_packet > max_ratio {
return Err(Violation::TimestampDrift);
}
}
}
Ok(())
}
}
impl Default for ConformanceMeter {
fn default() -> Self {
Self::new()
}
}
/// Compute the Tier A bitrate ceiling for a given codec.
///
/// Formula:
/// nominal_bitrate * 3 (FEC 2.0 overhead) * 115 / 100 (15% safety margin)
/// with a floor of 2 kbps.
pub fn ceiling_bps(codec: CodecId) -> u64 {
let nominal = codec.bitrate_bps() as u64;
(nominal * 3 * 115 / 100).max(2_000)
}
/// Compute the Tier B packet-rate ceiling for a given codec.
///
/// Formula:
/// 1000 / frame_duration_ms * 3 (FEC overhead factor)
pub fn max_pps(codec: CodecId) -> u32 {
let fd = codec.frame_duration_ms() as u32;
if fd == 0 {
return 0;
}
(1000 / fd) * 3
}
#[cfg(test)]
mod tests {
use super::*;
use wzp_proto::MediaType;
fn make_header(codec_id: CodecId) -> MediaHeader {
MediaHeader {
version: 2,
flags: 0,
media_type: MediaType::Audio,
codec_id,
seq: 0,
timestamp: 0,
fec_block: 0,
stream_id: 0,
fec_ratio: 0,
}
}
fn make_header_with_seq_ts(codec_id: CodecId, seq: u32, timestamp: u32) -> MediaHeader {
MediaHeader {
version: 2,
flags: 0,
media_type: MediaType::Audio,
codec_id,
seq,
timestamp,
fec_block: 0,
stream_id: 0,
fec_ratio: 0,
}
}
#[test]
fn bitrate_exceeded_for_opus24k() {
let mut meter = ConformanceMeter::new();
let header = make_header(CodecId::Opus24k);
// Ceiling for Opus24k = 24_000 * 3 * 115 / 100 = 82_800 bps
// = 10_350 bytes/sec. 1 MB/s = 125_000 bytes/packet will blow past
// that in a single packet.
let now = Instant::now();
let result = meter.observe(&header, 1_000_000, now);
assert_eq!(result, Err(Violation::BitrateExceeded));
}
#[test]
fn small_packets_stay_within_ceiling() {
let mut meter = ConformanceMeter::new();
let header = make_header(CodecId::Opus24k);
// Ceiling = 82_800 bps = 10_350 bytes/sec.
// Each packet = 16-byte header + 80 bytes = 96 bytes.
// 100 packets = 9_600 bytes < 10_350.
let now = Instant::now();
for _ in 0..100 {
assert!(meter.observe(&header, 80, now).is_ok());
}
}
#[test]
fn window_resets_after_one_second() {
let mut meter = ConformanceMeter::new();
let header = make_header(CodecId::Opus24k);
// Fill the window to just under the limit.
let t0 = Instant::now();
for _ in 0..10 {
assert!(meter.observe(&header, 1000, t0).is_ok());
}
// 10 * (header wire size + 1000) ≈ 10 * 1034 = 10_340 bytes < 10_350
// Same packets 1.1 seconds later should be fine because the window
// rolls over.
let t1 = t0 + Duration::from_millis(1_100);
for _ in 0..10 {
assert!(meter.observe(&header, 1000, t1).is_ok());
}
}
#[test]
fn ceiling_bps_floor() {
// ComfortNoise has 0 nominal bitrate, so the floor kicks in.
assert_eq!(ceiling_bps(CodecId::ComfortNoise), 2_000);
}
// ------------------------------------------------------------------
// Tier B — packet rate
// ------------------------------------------------------------------
#[test]
fn packet_rate_exceeded() {
let mut meter = ConformanceMeter::new();
// Opus24k: max_pps = 1000/20 * 3 = 150. Threshold = 150 * 1.5 = 225.
let header = make_header(CodecId::Opus24k);
let now = Instant::now();
for _ in 0..225 {
assert!(meter.observe(&header, 10, now).is_ok());
}
// 226th packet should trip the limit.
assert_eq!(
meter.observe(&header, 10, now),
Err(Violation::PacketRateExceeded)
);
}
#[test]
fn packet_rate_within_limit() {
let mut meter = ConformanceMeter::new();
// Opus6k: max_pps = 1000/40 * 3 = 75. Threshold = 75 * 1.5 = 112.
// Use 0-byte payload so bitrate ceiling (2_587 bytes/sec) is not the
// limiting factor. 112 packets × 16 bytes = 1_792 bytes < 2_587.
let header = make_header(CodecId::Opus6k);
let now = Instant::now();
for _ in 0..112 {
assert!(meter.observe(&header, 0, now).is_ok());
}
}
// ------------------------------------------------------------------
// Tier C — timestamp drift
// ------------------------------------------------------------------
#[test]
fn timestamp_drift_detected_when_too_fast() {
let mut meter = ConformanceMeter::new();
// Opus24k frame_duration = 20 ms.
// Acceptable range: [10, 40] ms per packet.
// Send packets with timestamp advancing by 5 ms each (too fast).
let now = Instant::now();
let mut drift_seen = false;
for i in 0..200 {
let header = make_header_with_seq_ts(CodecId::Opus24k, i, i * 5);
match meter.observe(&header, 10, now) {
Ok(()) => {}
Err(Violation::TimestampDrift) => drift_seen = true,
Err(other) => panic!("unexpected violation: {other:?}"),
}
}
assert!(drift_seen, "expected TimestampDrift to be detected");
}
#[test]
fn timestamp_drift_detected_when_too_slow() {
let mut meter = ConformanceMeter::new();
// Opus24k frame_duration = 20 ms.
// Acceptable range: [10, 40] ms per packet.
// Send packets with timestamp advancing by 50 ms each (too slow).
let now = Instant::now();
let mut drift_seen = false;
for i in 0..200 {
let header = make_header_with_seq_ts(CodecId::Opus24k, i, i * 50);
match meter.observe(&header, 10, now) {
Ok(()) => {}
Err(Violation::TimestampDrift) => drift_seen = true,
Err(other) => panic!("unexpected violation: {other:?}"),
}
}
assert!(drift_seen, "expected TimestampDrift to be detected");
}
#[test]
fn timestamp_normal_no_drift() {
let mut meter = ConformanceMeter::new();
// Opus24k frame_duration = 20 ms.
// Send 200 packets with timestamp advancing by exactly 20 ms each.
let now = Instant::now();
for i in 0..200 {
let header = make_header_with_seq_ts(CodecId::Opus24k, i, i * 20);
assert!(meter.observe(&header, 10, now).is_ok());
}
}
#[test]
fn timestamp_drift_not_checked_before_two_packets() {
let mut meter = ConformanceMeter::new();
let now = Instant::now();
// Single packet with wild timestamp — should not trigger drift.
let header = make_header_with_seq_ts(CodecId::Opus24k, 0, 999_999);
assert!(meter.observe(&header, 10, now).is_ok());
}
}

View File

@@ -10,6 +10,7 @@
pub mod auth;
pub mod call_registry;
pub mod config;
pub mod conformance;
pub mod event_log;
pub mod federation;
pub mod handshake;

View File

@@ -1,12 +1,15 @@
//! Prometheus metrics for the WZP relay daemon.
use prometheus::{
Encoder, GaugeVec, Histogram, HistogramOpts, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
Opts, Registry, TextEncoder,
Encoder, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge,
IntGaugeVec, Opts, Registry, TextEncoder,
};
use std::sync::Arc;
use wzp_proto::MediaHeader;
use wzp_proto::packet::QualityReport;
use crate::conformance::Violation;
/// All relay-level Prometheus metrics.
#[derive(Clone)]
pub struct RelayMetrics {
@@ -32,6 +35,9 @@ pub struct RelayMetrics {
// Phase 4: loss-recovery breakdown per session.
pub session_dred_reconstructions: IntCounterVec,
pub session_classical_plc: IntCounterVec,
pub conformance_violations: IntCounterVec,
pub conformance_bytes: HistogramVec,
pub conformance_iat_ms: HistogramVec,
registry: Registry,
}
@@ -168,6 +174,37 @@ impl RelayMetrics {
&["session_id"],
)
.expect("metric");
let conformance_violations = IntCounterVec::new(
Opts::new(
"wzp_relay_conformance_violations_total",
"Conformance violations by tier, codec, media type and verdict",
),
&["tier", "codec_id", "media_type", "verdict"],
)
.expect("metric");
let conformance_bytes = HistogramVec::new(
HistogramOpts::new(
"wzp_relay_conformance_bytes_per_session",
"Packet size distribution observed by the conformance meter",
)
.buckets(vec![
16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0, 4096.0, 8192.0, 16384.0,
32768.0, 65536.0,
]),
&["media_type"],
)
.expect("metric");
let conformance_iat_ms = HistogramVec::new(
HistogramOpts::new(
"wzp_relay_conformance_iat_ms",
"Inter-arrival time distribution in milliseconds",
)
.buckets(vec![
1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 60.0, 80.0, 100.0, 150.0, 200.0, 300.0, 500.0,
]),
&["media_type"],
)
.expect("metric");
registry
.register(Box::new(active_sessions.clone()))
@@ -226,6 +263,15 @@ impl RelayMetrics {
registry
.register(Box::new(session_classical_plc.clone()))
.expect("register");
registry
.register(Box::new(conformance_violations.clone()))
.expect("register");
registry
.register(Box::new(conformance_bytes.clone()))
.expect("register");
registry
.register(Box::new(conformance_iat_ms.clone()))
.expect("register");
Self {
active_sessions,
@@ -247,6 +293,9 @@ impl RelayMetrics {
session_overruns,
session_dred_reconstructions,
session_classical_plc,
conformance_violations,
conformance_bytes,
conformance_iat_ms,
registry,
}
}
@@ -328,6 +377,43 @@ impl RelayMetrics {
}
}
/// Record conformance-related metrics for a single received packet.
///
/// * `header` — the media header (provides codec_id and media_type).
/// * `payload_len` — payload length in bytes.
/// * `iat_ms` — inter-arrival time since the previous packet.
/// * `violation` — `Some(Violation)` if the packet triggered a conformance
/// limit; `None` for clean packets.
pub fn record_conformance(
&self,
header: &MediaHeader,
payload_len: usize,
iat_ms: u64,
violation: Option<Violation>,
) {
let media_type = format!("{:?}", header.media_type);
let bytes = (MediaHeader::WIRE_SIZE + payload_len) as f64;
self.conformance_bytes
.with_label_values(&[&media_type])
.observe(bytes);
self.conformance_iat_ms
.with_label_values(&[&media_type])
.observe(iat_ms as f64);
if let Some(v) = violation {
let tier = match v {
Violation::BitrateExceeded => "A",
Violation::PacketRateExceeded => "B",
Violation::TimestampDrift => "C",
};
let codec_id = format!("{:?}", header.codec_id);
let verdict = format!("{:?}", v);
self.conformance_violations
.with_label_values(&[tier, &codec_id, &media_type, &verdict])
.inc();
}
}
/// Remove all per-session label values for a disconnected session.
pub fn remove_session_metrics(&self, session_id: &str) {
let _ = self.session_buffer_depth.remove_label_values(&[session_id]);

View File

@@ -17,6 +17,7 @@ use wzp_proto::packet::TrunkFrame;
use wzp_proto::quality::{AdaptiveQualityController, Tier};
use wzp_proto::traits::QualityController;
use crate::conformance::ConformanceMeter;
use crate::metrics::RelayMetrics;
use crate::trunk::TrunkBatcher;
@@ -780,6 +781,7 @@ async fn run_participant_plain(
let mut max_forward_ms = 0u64;
let mut send_errors = 0u64;
let mut last_log_instant = std::time::Instant::now();
let mut conformance = ConformanceMeter::new();
let mut tap_stats = if debug_tap.as_ref().map_or(false, |t| t.matches(&room_name)) {
Some(TapStats::new())
@@ -829,6 +831,22 @@ async fn run_participant_plain(
);
}
// Conformance check (Tier A/B/C — observe-only)
let violation = conformance
.observe(&pkt.header, pkt.payload.len(), std::time::Instant::now())
.err();
metrics.record_conformance(&pkt.header, pkt.payload.len(), recv_gap_ms, violation);
if let Some(v) = violation {
warn!(
room = %room_name,
participant = participant_id,
codec = ?pkt.header.codec_id,
seq = pkt.header.seq,
violation = ?v,
"conformance violation"
);
}
// Update per-session quality metrics if a quality report is present
if let Some(ref report) = pkt.quality_report {
metrics.update_session_quality(session_id, report);
@@ -998,6 +1016,7 @@ async fn run_participant_trunked(
let mut max_forward_ms = 0u64;
let mut send_errors = 0u64;
let mut last_log_instant = std::time::Instant::now();
let mut conformance = ConformanceMeter::new();
info!(
room = %room_name,
@@ -1051,6 +1070,22 @@ async fn run_participant_trunked(
);
}
// Conformance check (Tier A/B/C — observe-only)
let violation = conformance
.observe(&pkt.header, pkt.payload.len(), std::time::Instant::now())
.err();
metrics.record_conformance(&pkt.header, pkt.payload.len(), recv_gap_ms, violation);
if let Some(v) = violation {
warn!(
room = %room_name,
participant = participant_id,
codec = ?pkt.header.codec_id,
seq = pkt.header.seq,
violation = ?v,
"conformance violation (trunked)"
);
}
if let Some(ref report) = pkt.quality_report {
metrics.update_session_quality(session_id, report);
}