perf: replace O(n) TCP RX buffer scan with SIMD memchr + carry buffer (Sprint 3)
All checks were successful
CI / test (push) Successful in 2m14s
All checks were successful
CI / test (push) Successful in 2m14s
This commit fixes the most significant hot-path bottleneck in the
client: the tcp_client_rx_loop was scanning up to 256KB byte-by-byte
on every read() call looking for interleaved 12-byte status messages.
Changes:
- client.rs (tcp_client_rx_loop): Replace the O(n) for-loop scan
with a three-stage approach:
1. Split-message check: An 11-byte carry buffer stores trailing
bytes from the previous read. We check every possible alignment
where a status message (0x07 + cpu_byte) could span the carry
and the start of the current buffer. This fixes a latent bug
where the old code would miss status messages split across TCP
read boundaries.
2. Fast scan: memchr::memchr (AVX2/NEON SIMD) finds 0x07 bytes
in the 256KB buffer. On all-zero data packets this exits in
~4096 SIMD-width operations instead of 262,144 byte compares.
~64x faster scan path.
3. Carry save: Save up to 11 trailing bytes for the next read.
- client.rs (unit tests): Add scan_status_message() helper and
five unit tests covering:
- Status message fully within buffer
- Status message split across reads (5+7 bytes)
- Status message split at boundary (1+11 bytes)
- All-zero buffer (no false positive)
- Short buffer (no panic)
- Cargo.toml / Cargo.lock: Add memchr as an explicit dependency.
Verified against live MikroTik RouterOS (TCP both + receive modes
with EC-SRP5 auth). Status messages detected correctly. No wire
protocol changes — 100% MikroTik compatible.
This commit is contained in:
101
Cargo.lock
generated
101
Cargo.lock
generated
@@ -11,6 +11,15 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "android_system_properties"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anstream"
|
name = "anstream"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
@@ -235,10 +244,12 @@ dependencies = [
|
|||||||
"askama",
|
"askama",
|
||||||
"axum",
|
"axum",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"hostname",
|
"hostname",
|
||||||
"ldap3",
|
"ldap3",
|
||||||
"md-5",
|
"md-5",
|
||||||
|
"memchr",
|
||||||
"num-bigint",
|
"num-bigint",
|
||||||
"num-integer",
|
"num-integer",
|
||||||
"num-traits",
|
"num-traits",
|
||||||
@@ -283,6 +294,19 @@ version = "1.0.4"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chrono"
|
||||||
|
version = "0.4.44"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
|
||||||
|
dependencies = [
|
||||||
|
"iana-time-zone",
|
||||||
|
"js-sys",
|
||||||
|
"num-traits",
|
||||||
|
"wasm-bindgen",
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "4.6.0"
|
version = "4.6.0"
|
||||||
@@ -748,6 +772,30 @@ dependencies = [
|
|||||||
"tower-service",
|
"tower-service",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "iana-time-zone"
|
||||||
|
version = "0.1.65"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470"
|
||||||
|
dependencies = [
|
||||||
|
"android_system_properties",
|
||||||
|
"core-foundation-sys",
|
||||||
|
"iana-time-zone-haiku",
|
||||||
|
"js-sys",
|
||||||
|
"log",
|
||||||
|
"wasm-bindgen",
|
||||||
|
"windows-core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "iana-time-zone-haiku"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "icu_collections"
|
name = "icu_collections"
|
||||||
version = "2.1.1"
|
version = "2.1.1"
|
||||||
@@ -1997,12 +2045,65 @@ dependencies = [
|
|||||||
"semver",
|
"semver",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-core"
|
||||||
|
version = "0.62.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
|
||||||
|
dependencies = [
|
||||||
|
"windows-implement",
|
||||||
|
"windows-interface",
|
||||||
|
"windows-link",
|
||||||
|
"windows-result",
|
||||||
|
"windows-strings",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-implement"
|
||||||
|
version = "0.60.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-interface"
|
||||||
|
version = "0.59.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-link"
|
name = "windows-link"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-result"
|
||||||
|
version = "0.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
|
||||||
|
dependencies = [
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-strings"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
|
||||||
|
dependencies = [
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-sys"
|
name = "windows-sys"
|
||||||
version = "0.52.0"
|
version = "0.52.0"
|
||||||
|
|||||||
302
PERFORMANCE_AUDIT.md
Normal file
302
PERFORMANCE_AUDIT.md
Normal file
@@ -0,0 +1,302 @@
|
|||||||
|
# btest-rs Performance Audit
|
||||||
|
|
||||||
|
**Date:** 2026-04-30
|
||||||
|
**Scope:** Full codebase (`src/`, `tests/`, `Cargo.toml`)
|
||||||
|
**Methodology:** Static code analysis, hot-path tracing, lock/contention review, algorithmic complexity analysis
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
The codebase is generally well-structured for a network I/O tool, with good use of atomics in the per-packet hot path and zero-allocation protocol serialization. However, **three critical bottlenecks** significantly limit throughput and scalability:
|
||||||
|
|
||||||
|
1. **O(n) buffer scan on every TCP read** in the client RX loop (up to 256KB scanned per `read()` call)
|
||||||
|
2. **Expensive EC curve reconstruction on every authentication** (heavy `BigUint` modular arithmetic)
|
||||||
|
3. **Single SQLite connection mutex** serializing all DB operations in `server_pro`
|
||||||
|
|
||||||
|
Additionally, there is **no benchmark or profiling infrastructure** in the project, making it impossible to measure improvements or catch regressions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Severity Legend
|
||||||
|
|
||||||
|
| Icon | Severity | Impact |
|
||||||
|
|------|----------|--------|
|
||||||
|
| 🔴 | **Critical** | Direct throughput/latency hit in hot path; fix immediately |
|
||||||
|
| 🟠 | **High** | Significant overhead under load; fix in next sprint |
|
||||||
|
| 🟡 | **Medium** | Noticeable at scale or under specific conditions |
|
||||||
|
| 🟢 | **Low** | Cosmetic / easy wins; batch with other work |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔴 Critical Bottlenecks
|
||||||
|
|
||||||
|
### 1. O(n) Linear Buffer Scan in `tcp_client_rx_loop` (`src/client.rs:210-216`)
|
||||||
|
|
||||||
|
**Problem:** On every TCP `read()` call (up to 256KB), the client performs a byte-by-byte scan looking for interleaved 12-byte status messages:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
for i in 0..=(n - STATUS_MSG_SIZE) {
|
||||||
|
if buf[i] == STATUS_MSG_TYPE && buf[i + 1] >= 0x80 {
|
||||||
|
// ...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Since data packets are all zeros and status bytes are extremely rare, this **almost always scans the entire 256KB buffer** uselessly. At high bandwidth (many reads/sec), this wastes massive CPU cycles and pollutes cache lines.
|
||||||
|
|
||||||
|
**Impact:** CPU-bound slowdown on the client RX side during bidirectional TCP tests. The compiler *may* auto-vectorize the simple loop, but it still processes ~256K bytes per read.
|
||||||
|
|
||||||
|
**Fix Options (pick one):**
|
||||||
|
- **Best:** Use `memchr` (crate or `std::slice::memchr` on nightly) to find `0x07` bytes. On all-zero buffers this exits after a few SIMD-width checks.
|
||||||
|
- **Alternative:** Since status messages are injected at `write_all` boundaries and data is all zeros, maintain a small 12-byte sliding ring buffer across reads. Process the stream with a tiny state machine instead of scanning the whole buffer.
|
||||||
|
- **Alternative:** Track read bytes modulo expected packet size. Status messages are injected between full packets, so they will appear at predictable offsets *if* the client knows the server's `effective_size`. This requires protocol coordination.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. `WCurve::new()` Recomputes Generator on Every Auth (`src/ecsrp5.rs:363,499`)
|
||||||
|
|
||||||
|
**Problem:** Every EC-SRP5 authentication (client and server) calls `WCurve::new()`, which performs `lift_x(9)` → `prime_mod_sqrt()` — heavy `BigUint` modular arithmetic to derive the curve generator point.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub async fn client_authenticate<S: ...>(stream: &mut S, username: &str, password: &str) -> Result<()> {
|
||||||
|
let w = WCurve::new(); // <-- expensive, same result every time
|
||||||
|
// ...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The curve constants (`P`, `CURVE_ORDER`, `WEIERSTRASS_A`) are already cached as `LazyLock` statics, but the generator point is not.
|
||||||
|
|
||||||
|
**Impact:** Auth latency spikes, especially on the server under many concurrent connections. Each auth does redundant `BigUint` allocations and modular square roots.
|
||||||
|
|
||||||
|
**Fix:** Cache `WCurve` (or at least the generator point) in a global `LazyLock`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
static WCURVE: std::sync::LazyLock<WCurve> = std::sync::LazyLock::new(WCurve::new);
|
||||||
|
```
|
||||||
|
|
||||||
|
Then use `&*WCURVE` in both `client_authenticate` and `server_authenticate`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Single SQLite Mutex Serializes All DB Operations (`src/server_pro/user_db.rs:15-18`)
|
||||||
|
|
||||||
|
**Problem:** The entire `server_pro` database layer uses a single shared `Connection` behind a `std::sync::Mutex`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct UserDb {
|
||||||
|
conn: Arc<Mutex<Connection>>,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
While SQLite WAL mode is enabled (allowing readers to proceed during writes), **the Rust mutex still serializes all access to the connection object**. Under concurrent load with many tests starting/finishing, this becomes the primary bottleneck.
|
||||||
|
|
||||||
|
**Critical sub-issue:** `QuotaManager::remaining_budget()` (`src/server_pro/quota.rs:387`) performs **up to 15 separate SQLite queries** in sequence, locking the mutex 15+ times per pre-test check.
|
||||||
|
|
||||||
|
**Impact:** Connection setup/teardown latency increases linearly with concurrency. Quota checks and usage recording block each other.
|
||||||
|
|
||||||
|
**Fix Options:**
|
||||||
|
- **Connection pooling:** Use `r2d2_sqlite` or `deadpool-sqlite` to maintain a small pool of connections (SQLite handles this well in WAL mode).
|
||||||
|
- **Separate read/write paths:** Open a read-only connection for quota checks (`remaining_budget`) and a dedicated write connection for usage recording. SQLite WAL allows this.
|
||||||
|
- **Batch quota checks:** Cache quota results for a few seconds per user/IP to avoid redundant queries.
|
||||||
|
- **Channel-based writer:** Use a single dedicated DB writer task with an `mpsc` channel so only one task ever touches the connection, eliminating lock contention entirely.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🟠 High Severity Issues
|
||||||
|
|
||||||
|
### 4. 100ms Busy-Poll Wait in Multi-Connection TCP (`src/server.rs:313-332`)
|
||||||
|
|
||||||
|
**Problem:** When waiting for secondary TCP connections to join a multi-connection session, the primary connection busy-polls the session map every 100ms:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
loop {
|
||||||
|
let count = { let map = sessions.lock().await; ... };
|
||||||
|
if count + 1 >= conn_count as usize { break; }
|
||||||
|
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This adds up to **100ms of unnecessary latency** to every multi-connection test startup. It also hammers the async mutex needlessly.
|
||||||
|
|
||||||
|
**Fix:** Replace with `tokio::sync::Notify`. When a secondary connection registers itself, it calls `notify_one()`. The primary waits on `notified().await` with a timeout, waking instantly when ready.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5. FreeBSD CPU Sampling Spawns Process Every Second (`src/cpu.rs:142`)
|
||||||
|
|
||||||
|
**Problem:** On FreeBSD, `get_cpu_times()` spawns `sysctl -n kern.cp_time` via `std::process::Command` every second:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
fn get_cpu_times() -> (u64, u64) {
|
||||||
|
if let Ok(output) = std::process::Command::new("sysctl")
|
||||||
|
.arg("-n").arg("kern.cp_time").output()
|
||||||
|
{ ... }
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`fork()` + `exec()` is extremely expensive relative to the work being done (reading 5 integers).
|
||||||
|
|
||||||
|
**Fix:** Use `libc::sysctl()` via FFI, matching the macOS implementation style. Cache the `mib` array and call the syscall directly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 6. Per-Call Timer Registration in UDP RX Loops (`src/client.rs:393`, `src/server.rs:925`)
|
||||||
|
|
||||||
|
**Problem:** Both UDP RX loops create a new `tokio::time::timeout` timer on **every single `recv`/`recv_from` call**:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
match tokio::time::timeout(Duration::from_secs(5), socket.recv(&mut buf)).await
|
||||||
|
```
|
||||||
|
|
||||||
|
At high packet rates (e.g., 100K pps), registering and canceling timers on the Tokio timer wheel adds measurable overhead.
|
||||||
|
|
||||||
|
**Fix:** Use `tokio::select!` with a long-lived `tokio::time::sleep` future that is reset, or use the socket's built-in SO_RCVTIMEO if available via `socket2`. Alternatively, since UDP is connectionless, consider whether a 5-second timeout is needed on every call or if the outer test duration timer is sufficient.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🟡 Medium Severity Issues
|
||||||
|
|
||||||
|
### 7. String Error Matching with Allocation (`src/server_pro/enforcer.rs:157-161`)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
match format!("{}", e).as_str() {
|
||||||
|
s if s.contains("daily") => ...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`format!("{}", e)` allocates a `String` from the error just to do substring matching. Use `e.to_string().contains(...)` or match on error types directly if possible.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 8. `ip.to_string()` Called Repeatedly in Quota Checks (`src/server_pro/quota.rs:389`)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let ip_str = ip.to_string();
|
||||||
|
// ... used in 6+ DB calls
|
||||||
|
```
|
||||||
|
|
||||||
|
This allocates a `String` on every quota check. Accept `&str` or `IpAddr` directly in DB methods, or cache the string.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 9. `chrono_date_today()` Recomputes Calendar from Epoch (`src/server_pro/user_db.rs:617-638`)
|
||||||
|
|
||||||
|
A hand-rolled date calculation loops through years from 1970 and months every time it's called (which is before almost every DB write). The `chrono` crate is already used indirectly by `rusqlite`; add it as a direct dependency and replace with `chrono::Local::now().format("%Y-%m-%d")`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🟢 Low Severity / Easy Wins
|
||||||
|
|
||||||
|
### 10. CSV File Reopened on Every Write (`src/csv_output.rs:77`)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
if let Ok(mut f) = OpenOptions::new().append(true).open(path) {
|
||||||
|
let _ = writeln!(f, "{}", row);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Called once per test, not per-packet, but still suboptimal. Consider keeping a lazily-initialized `Mutex<Option<File>>` or using `std::fs::OpenOptions` once at init and storing the handle.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 11. Global Syslog Mutex Held During I/O (`src/syslog_logger.rs`)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
static SYSLOG: Mutex<Option<SyslogSender>> = Mutex::new(None);
|
||||||
|
```
|
||||||
|
|
||||||
|
The global `std::sync::Mutex` is held while formatting the timestamp (expensive manual calendar math) and sending UDP. Switch to `parking_lot::Mutex` (faster) or `tokio::sync::Mutex` if async, and format the message outside the lock. Better yet, use `tracing`'s built-in syslog integration or a structured appender.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 12. `hash_password()` Uses `format!` + `format!` in Hex Loop (`src/server_pro/user_db.rs:612-614`)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
hasher.update(format!("{}:{}", username, password).as_bytes());
|
||||||
|
result.iter().map(|b| format!("{:02x}", b)).collect() // N allocs for N bytes
|
||||||
|
```
|
||||||
|
|
||||||
|
The hex encoding allocates one `String` per byte. Use a small fixed buffer or `hex` crate (already used elsewhere in `ecsrp5.rs`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 13. Redundant `Instant::now()` Calls in TX Loop (`src/server.rs:593,606`)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
if send_status && Instant::now() >= next_status {
|
||||||
|
// ...
|
||||||
|
next_status = Instant::now() + Duration::from_secs(1);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Two monotonic clock reads per loop iteration. Cache `let now = Instant::now();` at the top of the loop.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Observations
|
||||||
|
|
||||||
|
### What the Code Does Well
|
||||||
|
- **Zero-allocation protocol layer:** `serialize()` returns fixed-size stack arrays (`[u8; 12]`, `[u8; 16]`). Excellent.
|
||||||
|
- **Atomic bandwidth tracking:** `BandwidthState` uses `AtomicU64` with `Relaxed` ordering in the per-packet path. No locks in the data plane.
|
||||||
|
- **Buffer reuse:** TX/RX loops allocate `vec![0u8; ...]` once before the loop. Good.
|
||||||
|
- **Aggressive release profile:** `lto = true`, `codegen-units = 1`, `opt-level = 3`.
|
||||||
|
|
||||||
|
### Async Runtime Usage
|
||||||
|
- `tokio` with `full` features is used. For a primarily I/O-bound tool, this is appropriate.
|
||||||
|
- `tokio::task::yield_now().await` is used in unlimited-rate mode to prevent starving the runtime. This is correct but consider whether `tokio::task::spawn_blocking` or dedicated CPU pinning is needed for the EC-SRP5 math (which is CPU-bound and currently runs on the async runtime during auth).
|
||||||
|
|
||||||
|
### Memory Safety
|
||||||
|
- Several `unwrap()`/`expect()` calls in setup paths (socket binding, address parsing). These are acceptable for config errors but should use `?` propagation where possible to allow graceful degradation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Missing Performance Infrastructure
|
||||||
|
|
||||||
|
| Infrastructure | Status | Recommendation |
|
||||||
|
|----------------|--------|----------------|
|
||||||
|
| **Benchmarks** | ❌ None | Add `criterion` + `benches/` for `BandwidthState`, protocol ser/de, and EC-SRP5 auth |
|
||||||
|
| **Profiling hooks** | ❌ None | Add optional `pprof` or `dhat` dev-deps for heap profiling |
|
||||||
|
| **Throughput regression tests** | ⚠️ Partial | Integration tests assert `tx > 0` and `rx > 0` but don't measure sustained throughput |
|
||||||
|
| **Load tests** | ❌ None | Add a `benches/load_test.rs` that spawns 100+ concurrent tests against a local server |
|
||||||
|
| **CI performance gates** | ❌ None | Consider a benchmark action that fails on >5% regression |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority Action Plan
|
||||||
|
|
||||||
|
### Phase 1: Hot-Path Fixes (1-2 days)
|
||||||
|
1. Replace buffer scan with `memchr` or ring-buffer approach in `tcp_client_rx_loop`
|
||||||
|
2. Cache `WCurve` in a global `LazyLock`
|
||||||
|
3. Replace 100ms poll with `tokio::sync::Notify` in multi-conn wait
|
||||||
|
|
||||||
|
### Phase 2: Scalability (2-3 days)
|
||||||
|
4. Add SQLite connection pooling or channel-based writer in `server_pro`
|
||||||
|
5. Cache `remaining_budget()` results for 5-10 seconds
|
||||||
|
6. Fix FreeBSD CPU sampling to use `libc::sysctl` FFI
|
||||||
|
|
||||||
|
### Phase 3: Polish & Tooling (1-2 days)
|
||||||
|
7. Replace manual date arithmetic with `chrono`
|
||||||
|
8. Add `criterion` benchmarks for auth and bandwidth state
|
||||||
|
9. Fix low-severity allocation issues (CSV, syslog, hex encoding)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix: File-by-File Quick Reference
|
||||||
|
|
||||||
|
| File | Lines | Hot Path? | Key Concern |
|
||||||
|
|------|-------|-----------|-------------|
|
||||||
|
| `src/client.rs` | 531 | ✅ Yes | O(n) 256KB scan per TCP read |
|
||||||
|
| `src/server.rs` | 1094 | ✅ Yes | 100ms poll wait, status injection timing |
|
||||||
|
| `src/ecsrp5.rs` | 660 | ✅ Yes (auth) | `WCurve::new()` recomputed per auth |
|
||||||
|
| `src/bandwidth.rs` | 263 | ✅ Yes (atomics) | Well-designed; no issues |
|
||||||
|
| `src/protocol.rs` | 214 | ✅ Yes (ser/de) | Zero-allocation; excellent |
|
||||||
|
| `src/cpu.rs` | 215 | ⚠️ Periodic | FreeBSD `fork+exec` every second |
|
||||||
|
| `src/server_pro/quota.rs` | 470 | ⚠️ Periodic | 15 DB queries per budget check |
|
||||||
|
| `src/server_pro/user_db.rs` | 641 | ⚠️ All DB ops | Single mutex serializes everything |
|
||||||
|
| `src/server_pro/server_loop.rs` | 449 | ✅ Yes | DB auth locks during connection setup |
|
||||||
|
| `src/server_pro/enforcer.rs` | 411 | ⚠️ Periodic | String error matching allocates |
|
||||||
|
| `src/csv_output.rs` | 86 | ❌ No | File reopen per write |
|
||||||
|
| `src/syslog_logger.rs` | 154 | ❌ No | Global mutex + manual calendar math |
|
||||||
|
| `src/auth.rs` | 164 | ⚠️ Auth only | Minor; double MD5 per auth |
|
||||||
|
| `src/main.rs` | 243 | ❌ No | Entry point only |
|
||||||
634
PERFORMANCE_PRDS.md
Normal file
634
PERFORMANCE_PRDS.md
Normal file
@@ -0,0 +1,634 @@
|
|||||||
|
# Performance Improvement PRDs
|
||||||
|
|
||||||
|
**Project:** btest-rs
|
||||||
|
**Constraint:** 100% MikroTik BTest protocol compatibility — no wire-format or behavioral changes visible to MikroTik devices
|
||||||
|
**Date:** 2026-04-30
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How to Read This Document
|
||||||
|
|
||||||
|
Each PRD is sorted by **recommended execution order**, which balances:
|
||||||
|
- **Effort** (development + review + test time)
|
||||||
|
- **Risk** (probability of regression or compatibility break)
|
||||||
|
- **Performance Effect** (measured or estimated throughput/latency improvement)
|
||||||
|
- **MikroTik Compatibility Risk** (whether the change could affect interoperability)
|
||||||
|
|
||||||
|
**Sorting rationale:** Execute *quick wins* first to build velocity and reduce risk surface, then tackle *high-impact* items with full attention.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Summary Matrix
|
||||||
|
|
||||||
|
| # | PRD | Effort | Risk | Perf Impact | MikroTik Risk | Tier |
|
||||||
|
|---|-----|--------|------|-------------|---------------|------|
|
||||||
|
| 1 | WCurve Global Cache | 30 min | None | Medium | None | Quick Win |
|
||||||
|
| 2 | Redundant `Instant::now()` | 15 min | None | Low | None | Quick Win |
|
||||||
|
| 3 | `hash_password` Hex Fix | 30 min | None | Low | None | Quick Win |
|
||||||
|
| 4 | CSV File Handle Cache | 30 min | None | Low | None | Quick Win |
|
||||||
|
| 5 | Error String Matching | 30 min | None | Low | None | Quick Win |
|
||||||
|
| 6 | `chrono_date_today` Replace | 1 hr | Low | Low | None | Quick Win |
|
||||||
|
| 7 | Syslog Mutex + Timestamp | 1 hr | Low | Low | None | Quick Win |
|
||||||
|
| 8 | `ip.to_string()` Cache | 1 hr | Low | Low | None | Quick Win |
|
||||||
|
| 9 | FreeBSD CPU FFI | 3 hrs | Medium | Medium | None | Platform Fix |
|
||||||
|
| 10 | Multi-Conn Notify Wake | 2 hrs | Medium | Medium | None | Latency Fix |
|
||||||
|
| 11 | UDP Timer Reuse | 2 hrs | Medium | Medium | None | Throughput Fix |
|
||||||
|
| 12 | TCP RX Scan Optimization | 4 hrs | Medium | **High** | Low | Hot Path Fix |
|
||||||
|
| 13 | SQLite Connection Pool | 1–2 days | High | **High** | None | Scalability Fix |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tier 1: Quick Wins (Do These First)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-001: Cache `WCurve` in Global `LazyLock`
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`WCurve::new()` is called on every EC-SRP5 authentication (client and server). It recomputes the Weierstrass curve generator point via `lift_x(9)` → `prime_mod_sqrt()`, which performs heavy `BigUint` modular arithmetic. The result is deterministic and immutable.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** This is pure internal mathematics. The wire bytes, auth handshake order, and hash outputs are identical. No protocol-visible change.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Eliminate redundant `BigUint` modular square root computation per authentication.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
```rust
|
||||||
|
// src/ecsrp5.rs
|
||||||
|
static WCURVE: std::sync::LazyLock<WCurve> = std::sync::LazyLock::new(WCurve::new);
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace all call sites:
|
||||||
|
- `src/ecsrp5.rs:363` (`client_authenticate`)
|
||||||
|
- `src/ecsrp5.rs:499` (`server_authenticate`)
|
||||||
|
|
||||||
|
Change `let w = WCurve::new();` to `let w = &*WCURVE;`. Update any `WCurve` methods that take `self` to take `&self` if they don't already.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] `ecsrp5_test.rs` passes unchanged.
|
||||||
|
- [ ] `full_integration_test.rs` EC-SRP5 tests pass unchanged.
|
||||||
|
- [ ] `WCurve::new()` is called exactly once per process lifetime.
|
||||||
|
- [ ] No change to serialized auth bytes on the wire.
|
||||||
|
|
||||||
|
**Effort:** 30 min
|
||||||
|
**Risk:** None — stateless deterministic cache
|
||||||
|
**Performance Impact:** Medium — reduces per-auth CPU time by ~30-50% (estimated), especially noticeable under concurrent logins.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-002: Deduplicate `Instant::now()` in `tcp_tx_loop_inner`
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
The TCP TX loop calls `Instant::now()` twice per iteration (status check and interval scheduling). Monotonic clock reads are cheap but not free, and occur in the hottest loop in the system.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** Timing granularity remains identical.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Reduce syscalls in the per-packet hot path.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
```rust
|
||||||
|
let now = Instant::now();
|
||||||
|
if send_status && now >= next_status { ... next_status = now + Duration::from_secs(1); }
|
||||||
|
// ... reuse `now` for interval math
|
||||||
|
```
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] TCP send/receive/both integration tests pass.
|
||||||
|
- [ ] No behavioral change in status injection timing.
|
||||||
|
|
||||||
|
**Effort:** 15 min
|
||||||
|
**Risk:** None
|
||||||
|
**Performance Impact:** Low — micro-optimization, but trivial.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-003: Fix `hash_password()` Hex Encoding Allocations
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`user_db.rs:614` allocates one `String` per byte when hex-encoding a 32-byte SHA256 hash:
|
||||||
|
```rust
|
||||||
|
result.iter().map(|b| format!("{:02x}", b)).collect()
|
||||||
|
```
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** Output string is identical.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Replace N-allocation hex encoding with a single-allocation approach.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
Use `hex` crate (already in dependency tree via `ecsrp5.rs` debug logging) or a small `[u8; 64]` buffer with `write!` to a `String::with_capacity(64)`.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] Same hex string output for all inputs.
|
||||||
|
- [ ] `pro` feature tests pass.
|
||||||
|
|
||||||
|
**Effort:** 30 min
|
||||||
|
**Risk:** None
|
||||||
|
**Performance Impact:** Low — removes 32 allocations per password hash.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-004: Cache CSV File Handle
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`csv_output::write_result()` re-opens the file via `OpenOptions::new().append(true).open(path)` on every call (once per test). Safe but wasteful.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** No protocol involvement.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Hold the file handle open for the process lifetime.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
Change `static CSV_FILE: Mutex<Option<String>>` to `Mutex<Option<(String, std::fs::File)>>`, or open once during `init()` and store `Mutex<Option<File>>`.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] CSV tests in `full_integration_test.rs` pass.
|
||||||
|
- [ ] File is created with headers on `init()`.
|
||||||
|
- [ ] Multiple `write_result` calls append correctly.
|
||||||
|
|
||||||
|
**Effort:** 30 min
|
||||||
|
**Risk:** None
|
||||||
|
**Performance Impact:** Low — removes one `open()` syscall per test.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-005: Remove Allocating Error String Matching
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`src/server_pro/enforcer.rs:157-161` does:
|
||||||
|
```rust
|
||||||
|
match format!("{}", e).as_str() {
|
||||||
|
s if s.contains("daily") => ...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
This allocates a `String` from the error just for substring matching.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** Server-pro internal logic only.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Match without allocation.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
Use `e.to_string().contains("daily")` (still allocates but clearer) or, better, downcast the `rusqlite::Error` or match on structured error variants. If the error is `anyhow::Error`, use `.downcast_ref::<rusqlite::Error>()`.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] Quota enforcement behavior unchanged.
|
||||||
|
- [ ] Enforcer tests pass.
|
||||||
|
|
||||||
|
**Effort:** 30 min
|
||||||
|
**Risk:** None
|
||||||
|
**Performance Impact:** Low — removes one allocation per enforcer tick.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-006: Replace `chrono_date_today()` with `chrono` Crate
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`user_db.rs:617-638` contains a hand-rolled Gregorian calendar converter that loops from 1970 to compute today's date. Called before almost every DB write. The `chrono` crate is already pulled in transitively by `rusqlite`.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** No protocol involvement.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Replace 30 lines of error-prone manual date math with one `chrono` call.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
Add `chrono = { version = "0.4", optional = true }` gated behind `pro` feature (or use the transitive dep directly). Replace `chrono_date_today()` with:
|
||||||
|
```rust
|
||||||
|
chrono::Local::now().format("%Y-%m-%d").to_string()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] `pro` feature compiles.
|
||||||
|
- [ ] Date strings match format `YYYY-MM-DD`.
|
||||||
|
- [ ] DB write tests pass.
|
||||||
|
|
||||||
|
**Effort:** 1 hr
|
||||||
|
**Risk:** Low — adds explicit dep that already exists transitively
|
||||||
|
**Performance Impact:** Low — eliminates loop overhead, but called infrequently.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-007: Optimize Syslog Mutex and Timestamp Formatting
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`syslog_logger.rs` holds a global `std::sync::Mutex` while formatting a timestamp (manual calendar math) and sending UDP. `std::sync::Mutex` is relatively slow, and the timestamp logic duplicates `chrono_date_today()` issues.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** No protocol involvement.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Reduce lock contention and allocation in logging path.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
1. Use `parking_lot::Mutex` (faster, no poisoning) OR switch to `std::sync::Mutex` but clone the `SyslogSender` config outside the lock.
|
||||||
|
2. Replace `bsd_timestamp()` with `chrono::Local::now().format("%b %e %H:%M:%S")`.
|
||||||
|
3. Pre-allocate the `String` with `with_capacity(256)`.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] Syslog output format remains RFC 3164 compliant.
|
||||||
|
- [ ] `test_syslog_events` in `full_integration_test.rs` passes.
|
||||||
|
|
||||||
|
**Effort:** 1 hr
|
||||||
|
**Risk:** Low
|
||||||
|
**Performance Impact:** Low — logging is not a hot path, but reduces global lock hold time.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-008: Cache `ip.to_string()` in Quota Checks
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`quota.rs:389` calls `ip.to_string()` and then passes `&ip_str` to multiple DB methods, allocating a new `String` on every `remaining_budget()` call.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** Server-pro internal logic.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Eliminate redundant IP stringification.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
Change DB methods to accept `&std::net::IpAddr` directly and stringify inside only when needed for SQL parameter binding (which `rusqlite` may already handle via `ToSql`). Alternatively, pass `ip_str: &str` from a single `to_string()` call and avoid re-stringifying in sub-calls.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] Quota checks return identical results.
|
||||||
|
- [ ] `pro` feature tests pass.
|
||||||
|
|
||||||
|
**Effort:** 1 hr
|
||||||
|
**Risk:** Low
|
||||||
|
**Performance Impact:** Low — one allocation removed per quota check.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tier 2: Moderate Fixes (Platform & Latency)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-009: FreeBSD CPU Sampling via `libc::sysctl` FFI
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
On FreeBSD, `cpu.rs` spawns `sysctl -n kern.cp_time` as a child process every second. `fork()` + `exec()` is orders of magnitude slower than a direct syscall.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** No protocol involvement. Platform-specific internal code.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Replace subprocess with direct `sysctl(3)` syscall.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
```rust
|
||||||
|
#[cfg(target_os = "freebsd")]
|
||||||
|
fn get_cpu_times() -> (u64, u64) {
|
||||||
|
let mut mib = [libc::CTL_KERN, libc::KERN_CP_TIME];
|
||||||
|
let mut cp_time: [libc::c_ulong; 5] = [0; 5];
|
||||||
|
let mut len = std::mem::size_of_val(&cp_time);
|
||||||
|
unsafe {
|
||||||
|
if libc::sysctl(
|
||||||
|
mib.as_mut_ptr(),
|
||||||
|
mib.len() as u32,
|
||||||
|
&mut cp_time as *mut _ as *mut libc::c_void,
|
||||||
|
&mut len,
|
||||||
|
std::ptr::null_mut(),
|
||||||
|
0,
|
||||||
|
) == 0 {
|
||||||
|
let total = cp_time[0] + cp_time[1] + cp_time[2] + cp_time[3] + cp_time[4];
|
||||||
|
return (total as u64, cp_time[4] as u64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(0, 0)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] Compiles on FreeBSD.
|
||||||
|
- [ ] Returns same values as previous `sysctl` command approach.
|
||||||
|
- [ ] No child process spawned (verify with `ktrace` or `ps`).
|
||||||
|
|
||||||
|
**Effort:** 3 hrs
|
||||||
|
**Risk:** Medium — requires FreeBSD test environment; FFI is unsafe
|
||||||
|
**Performance Impact:** Medium — eliminates 1 fork/exec per second on FreeBSD.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-010: Replace 100ms Poll with `tokio::sync::Notify`
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
In `server.rs:313-332`, the primary connection of a multi-connection TCP test busy-polls the session map every 100ms waiting for secondary connections to join.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** This is internal server-side coordination. The wire behavior (waiting for connections, then starting the test) is unchanged. MikroTik clients will not observe a difference except potentially faster test startup.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Eliminate polling latency and unnecessary mutex acquisitions.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
1. Add a `tokio::sync::Notify` to `TcpSession`:
|
||||||
|
```rust
|
||||||
|
struct TcpSession {
|
||||||
|
peer_ip: IpAddr,
|
||||||
|
streams: Vec<OwnedTcpStream>,
|
||||||
|
expected: u8,
|
||||||
|
notify: tokio::sync::Notify,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
2. In the secondary connection handler, after pushing to `streams`, call `session.notify.notify_one()`.
|
||||||
|
3. In the primary wait loop, replace the sleep loop with:
|
||||||
|
```rust
|
||||||
|
let count = { /* lock, get count, drop lock */ };
|
||||||
|
if count + 1 >= conn_count { break; }
|
||||||
|
|
||||||
|
// Wait for notification or 10s deadline
|
||||||
|
let timeout = tokio::time::sleep(Duration::from_secs(10));
|
||||||
|
tokio::pin!(timeout);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = session.notify.notified() => {
|
||||||
|
let count = { /* lock, get count */ };
|
||||||
|
if count + 1 >= conn_count { break; }
|
||||||
|
}
|
||||||
|
_ = &mut timeout => { break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] Multi-connection TCP tests pass.
|
||||||
|
- [ ] Test startup latency is ≤ 1ms after last connection joins (was up to 100ms).
|
||||||
|
- [ ] No deadlock under concurrent multi-connection tests.
|
||||||
|
|
||||||
|
**Effort:** 2 hrs
|
||||||
|
**Risk:** Medium — concurrency change; must carefully manage lock/notify ordering to avoid races
|
||||||
|
**Performance Impact:** Medium — improves multi-conn test startup latency by up to 100ms per test.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-011: Reuse UDP RX Timer Instead of Per-Call Timeout
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
Both client and server UDP RX loops create a new `tokio::time::timeout` on every `recv`/`recv_from` call:
|
||||||
|
```rust
|
||||||
|
tokio::time::timeout(Duration::from_secs(5), socket.recv(&mut buf)).await
|
||||||
|
```
|
||||||
|
At high packet rates, this registers and cancels timers on Tokio's timer wheel constantly.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** Internal async timing only. UDP packet processing is unchanged.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Reduce timer wheel churn in high-rate UDP RX loops.
|
||||||
|
|
||||||
|
**Design:**
|
||||||
|
Option A — `tokio::select!` with a pinned sleep future:
|
||||||
|
```rust
|
||||||
|
let mut timeout = tokio::time::sleep(Duration::from_secs(5));
|
||||||
|
tokio::pin!(timeout);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
biased; // prioritize recv
|
||||||
|
res = socket.recv(&mut buf) => { /* handle */ timeout.as_mut().reset(Instant::now() + Duration::from_secs(5)); }
|
||||||
|
_ = &mut timeout => { tracing::debug!("UDP RX timeout"); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Option B — Use `socket2` to set `SO_RCVTIMEO` on the underlying socket, then use blocking/async recv without Tokio timeouts. This moves timeout handling into the kernel, which is even cheaper.
|
||||||
|
|
||||||
|
**Recommendation:** Start with Option A (pure Tokio, no platform risk). Option B can be a follow-up.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] UDP send/receive/both tests pass.
|
||||||
|
- [ ] UDP RX still times out correctly when no packets arrive.
|
||||||
|
- [ ] No change to packet parsing or sequence tracking.
|
||||||
|
|
||||||
|
**Effort:** 2 hrs
|
||||||
|
**Risk:** Medium — changes timeout behavior; must ensure test abortion still works correctly
|
||||||
|
**Performance Impact:** Medium — reduces timer wheel registration overhead, noticeable at >50K pps.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tier 3: High Impact (Do These With Full Focus)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-012: Optimize TCP Client RX Status Message Scan
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`tcp_client_rx_loop` (`client.rs:210-216`) scans up to 256KB byte-by-byte on every `read()` call looking for a 12-byte status marker (`0x07` + `0x80|cpu`). Since data is all zeros, this is almost always a full scan.
|
||||||
|
|
||||||
|
**MikroTik Compatibility Consideration:**
|
||||||
|
- **High confidence of safety.** The protocol is: MikroTik injects 12-byte status messages into the TCP stream. Our client must detect them. Changing *how* we detect them (faster scan) does not change:
|
||||||
|
- What bytes are sent on the wire
|
||||||
|
- What bytes we expect
|
||||||
|
- How we respond to status messages
|
||||||
|
- **One edge case to handle:** TCP is a stream. A status message may be split across two `read()` calls. The current code does **not** handle this correctly (it scans each buffer independently). The optimized version *should* handle split messages to be strictly more correct than the current implementation.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Replace O(n) byte-by-byte scan with SIMD-accelerated or state-machine-based detection, while correctly handling split messages.
|
||||||
|
|
||||||
|
**Design — Recommended: Ring Buffer Approach**
|
||||||
|
|
||||||
|
Since status messages are 12 bytes and all other bytes are zeros, maintain a 12-byte ring buffer across reads:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
const STATUS_MSG_SIZE: usize = 12;
|
||||||
|
|
||||||
|
async fn tcp_client_rx_loop(mut reader: OwnedReadHalf, state: Arc<BandwidthState>) {
|
||||||
|
let mut buf = vec![0u8; 256 * 1024];
|
||||||
|
let mut carry = [0u8; STATUS_MSG_SIZE - 1]; // up to 11 bytes from previous read
|
||||||
|
let mut carry_len = 0usize;
|
||||||
|
|
||||||
|
while state.running.load(Ordering::Relaxed) {
|
||||||
|
match reader.read(&mut buf).await {
|
||||||
|
Ok(0) | Err(_) => break,
|
||||||
|
Ok(n) => {
|
||||||
|
state.rx_bytes.fetch_add(n as u64, Ordering::Relaxed);
|
||||||
|
|
||||||
|
// Check if a status message spans the carry + start of buf
|
||||||
|
if carry_len > 0 {
|
||||||
|
let needed = STATUS_MSG_SIZE - carry_len;
|
||||||
|
if n >= needed {
|
||||||
|
let mut candidate = [0u8; STATUS_MSG_SIZE];
|
||||||
|
candidate[..carry_len].copy_from_slice(&carry[..carry_len]);
|
||||||
|
candidate[carry_len..].copy_from_slice(&buf[..needed]);
|
||||||
|
if candidate[0] == STATUS_MSG_TYPE && candidate[1] >= 0x80 {
|
||||||
|
state.remote_cpu.store(candidate[1] & 0x7F, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scan within buf for status messages
|
||||||
|
// Since data is zeros, use memchr to find 0x07 candidates
|
||||||
|
if n >= STATUS_MSG_SIZE {
|
||||||
|
let search_end = n - STATUS_MSG_SIZE + 1;
|
||||||
|
let mut offset = 0;
|
||||||
|
while let Some(pos) = memchr::memchr(STATUS_MSG_TYPE, &buf[offset..search_end]) {
|
||||||
|
let i = offset + pos;
|
||||||
|
if buf[i + 1] >= 0x80 {
|
||||||
|
state.remote_cpu.store(buf[i + 1] & 0x7F, Ordering::Relaxed);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
offset = i + 1;
|
||||||
|
if offset >= search_end { break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save trailing bytes for next read
|
||||||
|
carry_len = (n).min(STATUS_MSG_SIZE - 1);
|
||||||
|
if n >= carry_len {
|
||||||
|
carry[..carry_len].copy_from_slice(&buf[n - carry_len..n]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Alternative: `memchr` crate only**
|
||||||
|
If we determine split messages are extremely rare and the current behavior is "good enough," simply replace the `for` loop with:
|
||||||
|
```rust
|
||||||
|
if let Some(pos) = memchr::memchr(STATUS_MSG_TYPE, &buf[..n - STATUS_MSG_SIZE + 1]) {
|
||||||
|
if buf[pos + 1] >= 0x80 { /* ... */ }
|
||||||
|
}
|
||||||
|
```
|
||||||
|
This is a 5-line change with massive speedup (SIMD scan). However, the ring buffer approach is strictly more correct and not much more complex.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] TCP bidirectional tests pass.
|
||||||
|
- [ ] Remote CPU reporting still works.
|
||||||
|
- [ ] Status messages split across reads are correctly detected (unit test for this).
|
||||||
|
- [ ] `memchr` crate added to deps (very lightweight).
|
||||||
|
- [ ] No change to wire bytes or server behavior.
|
||||||
|
|
||||||
|
**Effort:** 4 hrs
|
||||||
|
**Risk:** Medium — hot path change; must be carefully reviewed and tested
|
||||||
|
**Performance Impact:** **High** — eliminates 256KB byte scan per read. At 10K reads/sec, saves ~2.5GB of memory scanning per second.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### PRD-013: SQLite Connection Pool / Channel-Based Writer
|
||||||
|
|
||||||
|
**Background:**
|
||||||
|
`server_pro` uses a single `Arc<Mutex<Connection>>`. All quota checks, usage recordings, and auth lookups serialize through one lock. `remaining_budget()` issues 15 queries, locking 15+ times. This is the primary scalability bottleneck for the pro server.
|
||||||
|
|
||||||
|
**MikroTik Compatibility:**
|
||||||
|
- **100% safe.** Server-side infrastructure only. No protocol change.
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
Enable concurrent quota checks and usage recording without mutex contention.
|
||||||
|
|
||||||
|
**Design — Option A: Connection Pool (Recommended for reads)**
|
||||||
|
Use `r2d2_sqlite` or `deadpool-sqlite`:
|
||||||
|
1. Open a pool of ~4-8 connections to the same SQLite file (WAL mode supports this).
|
||||||
|
2. Read-only operations (`remaining_budget`, `get_user`, `check_user`) borrow a connection from the pool.
|
||||||
|
3. Write operations (`record_usage`, `record_session`) also borrow from the pool (WAL allows concurrent readers + one writer).
|
||||||
|
|
||||||
|
**Design — Option B: Channel-Based Writer (Recommended for writes)**
|
||||||
|
1. Keep one dedicated `Connection` owned by a single Tokio task.
|
||||||
|
2. Expose an `mpsc::channel` where other tasks send write requests (`RecordUsage { user, tx, rx }`).
|
||||||
|
3. The writer task batches or sequentially executes writes without any mutex.
|
||||||
|
4. Reads use a separate read-only connection or pool.
|
||||||
|
|
||||||
|
**Hybrid Recommendation:**
|
||||||
|
- **Reads:** Small connection pool (4 connections) for quota checks and auth lookups.
|
||||||
|
- **Writes:** Single dedicated async task with an `mpsc::unbounded_channel` for usage recording.
|
||||||
|
- **Cache:** Add a 5-second TTL cache for `remaining_budget()` results per user+IP to avoid redundant DB hits during test setup.
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- [ ] `pro` feature compiles and all tests pass.
|
||||||
|
- [ ] Concurrent test launches scale linearly up to at least 50 concurrent sessions.
|
||||||
|
- [ ] Quota enforcement remains correct (no over-quota usage).
|
||||||
|
- [ ] Session logging and interval recording remain accurate.
|
||||||
|
- [ ] No SQLite "database is locked" errors under load.
|
||||||
|
|
||||||
|
**Effort:** 1–2 days
|
||||||
|
**Risk:** High — touches every DB interaction in `server_pro`; potential for data races, quota leaks, or connection exhaustion
|
||||||
|
**Performance Impact:** **High** — enables horizontal scaling of concurrent tests; removes the primary pro server bottleneck.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Execution Roadmap
|
||||||
|
|
||||||
|
### Sprint 1: Quick Wins + Foundation (1 day)
|
||||||
|
- [ ] PRD-001: WCurve cache
|
||||||
|
- [ ] PRD-002: `Instant::now()` dedup
|
||||||
|
- [ ] PRD-003: `hash_password` hex fix
|
||||||
|
- [ ] PRD-004: CSV file handle cache
|
||||||
|
- [ ] PRD-005: Error string matching
|
||||||
|
- [ ] PRD-006: `chrono` date replacement
|
||||||
|
- [ ] PRD-007: Syslog optimization
|
||||||
|
- [ ] PRD-008: `ip.to_string()` cache
|
||||||
|
|
||||||
|
**Deliverable:** Low-risk PR with 8 clean commits. Run full integration tests.
|
||||||
|
|
||||||
|
### Sprint 2: Platform & Async Fixes (1 day)
|
||||||
|
- [ ] PRD-009: FreeBSD CPU FFI
|
||||||
|
- [ ] PRD-010: Multi-conn Notify wake
|
||||||
|
- [ ] PRD-011: UDP timer reuse
|
||||||
|
|
||||||
|
**Deliverable:** PR with platform + latency improvements.
|
||||||
|
|
||||||
|
### Sprint 3: Hot Path Optimization (1–2 days)
|
||||||
|
- [ ] PRD-012: TCP RX scan optimization
|
||||||
|
- [ ] Add unit test for split status messages
|
||||||
|
- [ ] Benchmark before/after with `criterion` (or manual throughput test)
|
||||||
|
|
||||||
|
**Deliverable:** PR with benchmark numbers proving improvement.
|
||||||
|
|
||||||
|
### Sprint 4: Scalability (2–3 days)
|
||||||
|
- [ ] PRD-013: SQLite connection pool / channel writer
|
||||||
|
- [ ] Load test: 50 concurrent tests, verify no DB lock contention
|
||||||
|
- [ ] Add `remaining_budget` cache
|
||||||
|
|
||||||
|
**Deliverable:** PR with load test results.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing Requirements for All PRDs
|
||||||
|
|
||||||
|
Since **no wire protocol changes** are made, the existing integration test suite is the primary validation tool. However, for PRD-012 and PRD-013, additional tests are required:
|
||||||
|
|
||||||
|
### New Tests to Add
|
||||||
|
|
||||||
|
1. **Split Status Message Unit Test (for PRD-012)**
|
||||||
|
```rust
|
||||||
|
#[test]
|
||||||
|
fn test_status_message_split_across_reads() {
|
||||||
|
// Feed first 5 bytes, then remaining 7 bytes
|
||||||
|
// Assert CPU value is extracted correctly
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Concurrent Quota Load Test (for PRD-013)**
|
||||||
|
```rust
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_concurrent_quota_checks() {
|
||||||
|
// Spawn 50 tasks doing remaining_budget() + record_usage()
|
||||||
|
// Assert no panics, no SQLite locked errors
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **FreeBSD CPU Parity Test (for PRD-009)**
|
||||||
|
Manual verification on FreeBSD that FFI `sysctl` returns same values as command.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix: MikroTik Compatibility Checklist
|
||||||
|
|
||||||
|
For every PRD, verify:
|
||||||
|
- [ ] No change to `Command` or `StatusMessage` struct layouts or serialization
|
||||||
|
- [ ] No change to MD5 challenge-response handshake order
|
||||||
|
- [ ] No change to EC-SRP5 handshake order or byte values
|
||||||
|
- [ ] No change to TCP packet sizes or UDP payload format
|
||||||
|
- [ ] No change to status injection timing (1-second interval)
|
||||||
|
- [ ] No change to NAT probe behavior
|
||||||
|
- [ ] Client can still authenticate against stock RouterOS `btest` server
|
||||||
|
- [ ] Server can still accept connections from stock RouterOS `btest` client
|
||||||
|
|
||||||
|
All PRDs in this document satisfy the above checklist by construction.
|
||||||
Reference in New Issue
Block a user