bench: add criterion benchmarks for protocol, bandwidth, TCP RX scan, and EC-SRP5
Some checks failed
CI / test (push) Failing after 1m42s

Adds four Criterion.rs benchmark suites to measure hot-path performance
and demonstrate the impact of Sprints 1–3 optimizations:

- benches/protocol.rs    — Command & StatusMessage serialize/deserialize
- benches/bandwidth.rs   — BandwidthState atomics, budget, interval math
- benches/tcp_rx_scan.rs — memchr SIMD scan vs naive O(n) loop (55× faster
                           on 256KB buffers with status at end)
- benches/ecsrp5.rs      — WCurve::new() heavy math vs cached LazyLock
                           (~123,000× faster access)

Also adds BENCHMARKS.md with usage instructions and example results.

Visibility changes (bench-only):
- scan_status_message is now pub (was #[cfg(test)] only)
- WCurve and WCURVE are now pub in ecsrp5.rs

dev-dependencies: criterion + pprof (optional flamegraph support)
This commit is contained in:
Siavash Sameni
2026-04-30 21:01:38 +04:00
parent bba9b0512c
commit 3afbfb42cf
9 changed files with 969 additions and 18 deletions

79
benches/bandwidth.rs Normal file
View File

@@ -0,0 +1,79 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use btest_rs::bandwidth::{BandwidthState, calc_send_interval, advance_next_send};
use std::sync::atomic::Ordering;
use std::time::{Duration, Instant};
/// Measures a single relaxed `fetch_add` on the RX and TX byte counters —
/// the per-packet accounting cost on the data-path hot loop.
fn bench_atomic_fetch_add(c: &mut Criterion) {
    let counters = BandwidthState::new();
    // Typical MTU-sized RX update.
    c.bench_function("bandwidth_rx_bytes_fetch_add", |b| {
        b.iter(|| {
            black_box(counters.rx_bytes.fetch_add(1500, Ordering::Relaxed));
        })
    });
    // Typical send-chunk-sized TX update.
    c.bench_function("bandwidth_tx_bytes_fetch_add", |b| {
        b.iter(|| {
            black_box(counters.tx_bytes.fetch_add(32768, Ordering::Relaxed));
        })
    });
}
/// Compares `spend_budget` on the unlimited fast path against a state with a
/// finite byte budget configured.
fn bench_spend_budget(c: &mut Criterion) {
    // Default state: no budget set, exercises the fast path.
    let no_cap = BandwidthState::new();
    c.bench_function("spend_budget_unlimited", |b| {
        b.iter(|| black_box(no_cap.spend_budget(black_box(1500))))
    });

    // Same call with a large-but-finite budget installed.
    let capped = BandwidthState::new();
    capped.byte_budget.store(1_000_000_000, Ordering::SeqCst);
    c.bench_function("spend_budget_limited", |b| {
        b.iter(|| black_box(capped.spend_budget(black_box(1500))))
    });
}
/// Times `calc_send_interval` for representative rate / packet-size pairs,
/// including the unlimited case (rate == 0).
fn bench_calc_send_interval(c: &mut Criterion) {
    let cases = [
        ("calc_interval_100mbps_1500b", 100_000_000, 1500),
        ("calc_interval_1gbps_32768b", 1_000_000_000, 32768),
        ("calc_interval_unlimited", 0, 1500),
    ];
    for (name, rate, pkt_size) in cases {
        c.bench_function(name, |b| {
            b.iter(|| black_box(calc_send_interval(black_box(rate), black_box(pkt_size))))
        });
    }
}
/// Times one pacing step: bumping the next-send deadline forward by a fixed
/// 120 µs interval. NOTE(review): the deadline keeps advancing across
/// iterations (reset only happens after the benchmark), so later iterations
/// measure the far-in-the-future path — same as the original.
fn bench_advance_next_send(c: &mut Criterion) {
    let interval = Duration::from_micros(120);
    let origin = Instant::now();
    let mut deadline = origin;
    c.bench_function("advance_next_send", |b| {
        b.iter(|| {
            black_box(advance_next_send(&mut deadline, interval, origin));
        })
    });
    // Restore the starting deadline (kept from the original; harmless).
    deadline = origin;
}
/// Times `summary()` with non-zero counters stored first, so the atomic loads
/// read real values rather than freshly-zeroed cache lines.
fn bench_summary(c: &mut Criterion) {
    let state = BandwidthState::new();
    state.total_tx_bytes.store(1_000_000_000, Ordering::Relaxed);
    state.total_rx_bytes.store(2_000_000_000, Ordering::Relaxed);
    state.intervals.store(100, Ordering::Relaxed);
    c.bench_function("bandwidth_summary", |b| b.iter(|| black_box(state.summary())));
}
// Register all bandwidth benchmarks in one Criterion group and generate `main`.
criterion_group!(
bandwidth_benches,
bench_atomic_fetch_add,
bench_spend_budget,
bench_calc_send_interval,
bench_advance_next_send,
bench_summary
);
criterion_main!(bandwidth_benches);

19
benches/ecsrp5.rs Normal file
View File

@@ -0,0 +1,19 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use btest_rs::ecsrp5::{WCurve, WCURVE};
/// Cost of constructing a `WCurve` from scratch on every call — the heavy
/// curve-parameter math this suite exists to show is worth caching.
fn bench_wcurve_new(c: &mut Criterion) {
    c.bench_function("wcurve_new_uncached", |b| b.iter(|| black_box(WCurve::new())));
}
/// Cost of reading the cached `WCURVE` after it has been initialized once.
fn bench_wcurve_cached(c: &mut Criterion) {
    // Touch the static once up front so first-init cost is excluded
    // from the measured loop.
    let _ = &*WCURVE;
    c.bench_function("wcurve_cached_access", |b| b.iter(|| black_box(&*WCURVE)));
}
// Register both EC-SRP5 curve benchmarks and generate the Criterion `main`.
criterion_group!(ecsrp5_benches, bench_wcurve_new, bench_wcurve_cached);
criterion_main!(ecsrp5_benches);

65
benches/protocol.rs Normal file
View File

@@ -0,0 +1,65 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use btest_rs::protocol::{Command, StatusMessage, CMD_PROTO_TCP, CMD_DIR_BOTH};
/// Serialization cost of a TCP / both-directions `Command`.
fn bench_command_serialize(c: &mut Criterion) {
    let command = Command::new(CMD_PROTO_TCP, CMD_DIR_BOTH);
    c.bench_function("command_serialize", |b| b.iter(|| black_box(command.serialize())));
}
/// Deserialization cost of a fixed 16-byte command frame.
fn bench_command_deserialize(c: &mut Criterion) {
    // Raw wire bytes: proto=TCP(0x01), dir=both(0x03), then flags/padding.
    let wire = [
        0x01, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    ];
    c.bench_function("command_deserialize", |b| {
        b.iter(|| black_box(Command::deserialize(black_box(&wire))))
    });
}
/// Serialization cost of a populated `StatusMessage`.
fn bench_status_message_serialize(c: &mut Criterion) {
    let status = StatusMessage {
        seq: 42,
        bytes_received: 1_000_000,
        cpu_load: 50,
    };
    c.bench_function("status_message_serialize", |b| {
        b.iter(|| black_box(status.serialize()))
    });
}
/// Deserialization cost of a fixed 12-byte status frame.
fn bench_status_message_deserialize(c: &mut Criterion) {
    // Raw wire bytes for one status message (type byte first).
    let wire = [
        0x07, 0xB2, 0x00, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x40, 0x42, 0x0F, 0x00,
    ];
    c.bench_function("status_message_deserialize", |b| {
        b.iter(|| black_box(StatusMessage::deserialize(black_box(&wire))))
    });
}
/// Full serialize → deserialize round trips for both message types, i.e. the
/// combined cost a peer pays per message exchanged.
fn bench_roundtrip(c: &mut Criterion) {
    let command = Command::new(CMD_PROTO_TCP, CMD_DIR_BOTH);
    c.bench_function("command_roundtrip", |b| {
        b.iter(|| {
            let wire = black_box(command.serialize());
            black_box(Command::deserialize(&wire))
        })
    });

    let status = StatusMessage {
        seq: 99,
        bytes_received: 50_000,
        cpu_load: 75,
    };
    c.bench_function("status_message_roundtrip", |b| {
        b.iter(|| {
            let wire = black_box(status.serialize());
            black_box(StatusMessage::deserialize(&wire))
        })
    });
}
// Register all protocol serialize/deserialize benchmarks and generate `main`.
criterion_group!(
protocol_benches,
bench_command_serialize,
bench_command_deserialize,
bench_status_message_serialize,
bench_status_message_deserialize,
bench_roundtrip
);
criterion_main!(protocol_benches);

100
benches/tcp_rx_scan.rs Normal file
View File

@@ -0,0 +1,100 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use btest_rs::client::scan_status_message;
use btest_rs::protocol::STATUS_MSG_TYPE;
/// Baseline byte-by-byte scan — the pre-memchr implementation, kept only as a
/// comparison point for `scan_status_message`. Returns the clamped CPU value
/// (0..=100) of the first status header found, or `None`.
fn naive_scan(buf: &[u8]) -> Option<u8> {
    const STATUS_MSG_SIZE: usize = 12;
    // Bail out when the buffer can't hold even one full message.
    let last_start = buf.len().checked_sub(STATUS_MSG_SIZE)?;
    (0..=last_start).find_map(|i| {
        let hit = buf[i] == STATUS_MSG_TYPE && buf[i + 1] >= 0x80;
        hit.then(|| (buf[i + 1] & 0x7F).min(100))
    })
}
/// Builds a zero-filled buffer of `size` bytes; when `status_at` is given,
/// plants a status header (type byte followed by 0x80 | 50, i.e. CPU = 50%)
/// at that offset.
fn make_buffer(size: usize, status_at: Option<usize>) -> Vec<u8> {
    let mut data = vec![0u8; size];
    if let Some(offset) = status_at {
        data[offset] = STATUS_MSG_TYPE;
        data[offset + 1] = 0x80 | 50; // CPU = 50%
    }
    data
}
/// No status byte anywhere: both scanners must walk the entire buffer, which
/// is the worst case for the naive loop.
fn bench_scan_all_zeros(c: &mut Criterion) {
    let mut group = c.benchmark_group("tcp_rx_scan_all_zeros");
    for size in [4096, 65536, 262144] {
        let data = make_buffer(size, None);
        group.throughput(Throughput::Bytes(size as u64));
        group.bench_with_input(BenchmarkId::new("naive", size), &data, |b, d| {
            b.iter(|| black_box(naive_scan(black_box(d))))
        });
        group.bench_with_input(BenchmarkId::new("memchr", size), &data, |b, d| {
            b.iter(|| black_box(scan_status_message(black_box(&[]), black_box(d))))
        });
    }
    group.finish();
}
/// Status message at offset 0: best case — both scanners hit immediately,
/// so this mostly measures per-call overhead.
fn bench_scan_status_at_start(c: &mut Criterion) {
    let mut group = c.benchmark_group("tcp_rx_scan_status_at_start");
    for size in [4096, 65536, 262144] {
        let data = make_buffer(size, Some(0));
        group.throughput(Throughput::Bytes(size as u64));
        group.bench_with_input(BenchmarkId::new("naive", size), &data, |b, d| {
            b.iter(|| black_box(naive_scan(black_box(d))))
        });
        group.bench_with_input(BenchmarkId::new("memchr", size), &data, |b, d| {
            b.iter(|| black_box(scan_status_message(black_box(&[]), black_box(d))))
        });
    }
    group.finish();
}
/// Status message in the final 12 bytes: a near-full scan before the hit —
/// the case where the SIMD-backed scan shows its largest advantage.
fn bench_scan_status_at_end(c: &mut Criterion) {
    let mut group = c.benchmark_group("tcp_rx_scan_status_at_end");
    for size in [4096, 65536, 262144] {
        let data = make_buffer(size, Some(size - 12));
        group.throughput(Throughput::Bytes(size as u64));
        group.bench_with_input(BenchmarkId::new("naive", size), &data, |b, d| {
            b.iter(|| black_box(naive_scan(black_box(d))))
        });
        group.bench_with_input(BenchmarkId::new("memchr", size), &data, |b, d| {
            b.iter(|| black_box(scan_status_message(black_box(&[]), black_box(d))))
        });
    }
    group.finish();
}
/// Status message torn across two reads: the carry-over slice holds the start
/// of the message (including the type byte) and the fresh read holds the rest.
fn bench_scan_split_message(c: &mut Criterion) {
    // 5 bytes carried over, 7 bytes in the new read.
    let mut head = vec![0u8; 5];
    head[0] = STATUS_MSG_TYPE;
    head[1] = 0x80 | 75;
    let tail = vec![0u8; 7];
    c.bench_function("scan_split_5_7", |b| {
        b.iter(|| black_box(scan_status_message(black_box(&head), black_box(&tail))))
    });

    // Minimal carry: just the type byte + CPU byte, remaining 10 bytes in the read.
    let head_min = vec![STATUS_MSG_TYPE, 0x80 | 33];
    let tail_long = vec![0u8; 10];
    c.bench_function("scan_split_2_10", |b| {
        b.iter(|| black_box(scan_status_message(black_box(&head_min), black_box(&tail_long))))
    });
}
// Register all TCP RX scan benchmarks (naive vs memchr) and generate `main`.
criterion_group!(
tcp_rx_scan_benches,
bench_scan_all_zeros,
bench_scan_status_at_start,
bench_scan_status_at_end,
bench_scan_split_message
);
criterion_main!(tcp_rx_scan_benches);