T4.2.1: Real VideoToolbox VTCompressionSession / VTDecompressionSession wiring (macOS)

This commit is contained in:
Siavash Sameni
2026-05-12 09:51:34 +04:00
parent 81042ac190
commit 410c2a4335
9 changed files with 679 additions and 53 deletions

View File

@@ -28,6 +28,7 @@ libc = "0.2"
jni = { version = "0.21", default-features = false }
rand = { workspace = true }
rustls = { version = "0.23", default-features = false, features = ["ring"] }
[target.'cfg(target_os = "android")'.dependencies]
tracing-android = "0.2"
[build-dependencies]

View File

@@ -49,25 +49,33 @@ static INIT_LOGGING: Once = Once::new();
/// Safe to call multiple times — only the first call takes effect.
fn init_logging() {
INIT_LOGGING.call_once(|| {
// Wrap in catch_unwind — sharded_slab allocation inside
// tracing_subscriber::registry() can crash on some Android
// devices if scudo malloc fails during early initialization.
let _ = std::panic::catch_unwind(|| {
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::EnvFilter;
if let Ok(layer) = tracing_android::layer("wzp_android") {
// Filter: INFO for our crates, WARN for everything else.
// The jni crate emits VERBOSE logs for every method lookup
// (~10 lines per JNI call, 100+ calls/sec) which floods logcat
// and causes the system to kill the app.
let filter = EnvFilter::new("warn,wzp_android=info,wzp_proto=info,wzp_transport=info,wzp_codec=info,wzp_fec=info,wzp_crypto=info");
let _ = tracing_subscriber::registry()
.with(layer)
.with(filter)
.try_init();
}
});
#[cfg(target_os = "android")]
{
// Wrap in catch_unwind — sharded_slab allocation inside
// tracing_subscriber::registry() can crash on some Android
// devices if scudo malloc fails during early initialization.
let _ = std::panic::catch_unwind(|| {
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::EnvFilter;
if let Ok(layer) = tracing_android::layer("wzp_android") {
// Filter: INFO for our crates, WARN for everything else.
// The jni crate emits VERBOSE logs for every method lookup
// (~10 lines per JNI call, 100+ calls/sec) which floods logcat
// and causes the system to kill the app.
let filter = EnvFilter::new("warn,wzp_android=info,wzp_proto=info,wzp_transport=info,wzp_codec=info,wzp_fec=info,wzp_crypto=info");
let _ = tracing_subscriber::registry()
.with(layer)
.with(filter)
.try_init();
}
});
}
#[cfg(not(target_os = "android"))]
{
// On non-Android targets tracing-android is unavailable.
let _ = tracing_subscriber::fmt::try_init();
}
});
}

View File

@@ -9,5 +9,8 @@ rust-version.workspace = true
bytes = { workspace = true }
tracing = { workspace = true }
[target.'cfg(target_os = "macos")'.dependencies]
shiguredo_video_toolbox = "2026.1"
[dev-dependencies]
rand = "0.8"

View File

@@ -3,15 +3,31 @@
use crate::decoder::VideoDecoder;
use crate::encoder::{VideoEncoder, VideoError, VideoFrame};
#[cfg(target_os = "macos")]
mod imp {
pub use shiguredo_video_toolbox::{
CodecConfig, DecodedFrame, Decoder, DecoderCodec, DecoderConfig, EncodeOptions, Encoder,
EncoderConfig, FrameData, H264EncoderConfig, H264EntropyMode, H264Profile, PixelFormat,
};
}
#[cfg(target_os = "macos")]
use imp::*;
/// macOS VideoToolbox H.264 encoder.
///
/// Wraps `VTCompressionSession`. Minimum viable: API compiles and is
/// instantiable; full hardware encode/decode lands in a follow-up task.
/// Wraps `VTCompressionSession`. On non-macOS targets this is a compile-safe
/// placeholder that returns [`VideoError::NotInitialized`].
pub struct VideoToolboxEncoder {
_width: u32,
_height: u32,
_bitrate_bps: u32,
#[cfg(target_os = "macos")]
inner: Encoder,
force_keyframe: bool,
#[cfg(not(target_os = "macos"))]
_width: u32,
#[cfg(not(target_os = "macos"))]
_height: u32,
#[cfg(not(target_os = "macos"))]
_bitrate_bps: u32,
}
impl VideoToolboxEncoder {
@@ -20,21 +36,119 @@ impl VideoToolboxEncoder {
/// * `width` / `height` — frame dimensions in pixels.
/// * `bitrate_bps` — target bitrate in bits per second.
pub fn new(width: u32, height: u32, bitrate_bps: u32) -> Result<Self, VideoError> {
Ok(Self {
_width: width,
_height: height,
_bitrate_bps: bitrate_bps,
force_keyframe: false,
})
#[cfg(target_os = "macos")]
{
let config = EncoderConfig {
width,
height,
codec: CodecConfig::H264(H264EncoderConfig {
profile: H264Profile::Baseline,
entropy_mode: H264EntropyMode::Cavlc,
}),
pixel_format: PixelFormat::I420,
average_bitrate: Some(bitrate_bps as u64),
fps_numerator: 30,
fps_denominator: 1,
prioritize_encoding_speed_over_quality: true,
real_time: true,
maximize_power_efficiency: false,
allow_frame_reordering: false,
allow_temporal_compression: false,
max_key_frame_interval: std::num::NonZeroU32::new(30),
max_key_frame_interval_duration: None,
max_frame_delay_count: std::num::NonZeroU32::new(1),
};
let inner = Encoder::new(config).map_err(|e| {
VideoError::PlatformError(format!("VTCompressionSessionCreate failed: {e}"))
})?;
Ok(Self {
inner,
force_keyframe: false,
})
}
#[cfg(not(target_os = "macos"))]
{
let _ = (width, height, bitrate_bps);
Ok(Self {
_width: width,
_height: height,
_bitrate_bps: bitrate_bps,
force_keyframe: false,
})
}
}
}
impl VideoEncoder for VideoToolboxEncoder {
fn encode(&mut self, _frame: &VideoFrame) -> Result<Vec<u8>, VideoError> {
// TODO(T4.2-MVP): Wire VTCompressionSession.
// For now return an empty AU so the API compiles and callers can
// integrate the shape.
Ok(Vec::new())
/// Encode one I420 frame and return whatever Annex-B output is ready.
///
/// The returned buffer may be empty when the encoder produced no output for
/// this call. On non-macOS targets this always fails with
/// [`VideoError::NotInitialized`].
fn encode(&mut self, frame: &VideoFrame) -> Result<Vec<u8>, VideoError> {
    #[cfg(target_os = "macos")]
    {
        const START_CODE: [u8; 4] = [0x00, 0x00, 0x00, 0x01];

        // Planar I420 layout: a full-size Y plane followed by U and V planes
        // of a quarter of that size each.
        let luma_len = frame.width as usize * frame.height as usize;
        let chroma_len = luma_len / 4;
        let expected = luma_len + chroma_len * 2;
        if frame.data.len() < expected {
            return Err(VideoError::InvalidInput(format!(
                "I420 frame too small: {} bytes, expected {expected}",
                frame.data.len()
            )));
        }

        // Any bytes past `expected` are ignored.
        let (y, chroma) = frame.data[..expected].split_at(luma_len);
        let (u, v) = chroma.split_at(chroma_len);
        let planes = FrameData::I420 { y, u, v };
        let encode_options = EncodeOptions {
            force_key_frame: self.force_keyframe,
        };
        self.inner
            .encode(&planes, &encode_options)
            .map_err(|e| VideoError::PlatformError(format!("encode failed: {e}")))?;

        // Drain every frame the encoder has ready and concatenate them.
        let mut access_unit = Vec::new();
        let mut saw_keyframe = false;
        loop {
            let pending = self
                .inner
                .next_frame()
                .map_err(|e| VideoError::PlatformError(format!("next_frame failed: {e}")))?;
            let encoded = match pending {
                Some(encoded) => encoded,
                None => break,
            };
            saw_keyframe |= encoded.keyframe;

            // Parameter sets arrive separately from the slice data, so they
            // are written in-band ahead of it.
            for sps in &encoded.sps_list {
                access_unit.extend_from_slice(&START_CODE);
                access_unit.extend_from_slice(sps);
            }
            for pps in &encoded.pps_list {
                access_unit.extend_from_slice(&START_CODE);
                access_unit.extend_from_slice(pps);
            }

            // Slice data is length-prefixed (AVCC); rewrite it as Annex-B.
            access_unit.extend_from_slice(&avcc_to_annexb(&encoded.data));
        }

        // A pending keyframe request stays armed until a keyframe has
        // actually come out of the encoder.
        if saw_keyframe {
            self.force_keyframe = false;
        }
        Ok(access_unit)
    }
    #[cfg(not(target_os = "macos"))]
    {
        let _ = frame;
        Err(VideoError::NotInitialized)
    }
}
fn request_keyframe(&mut self) {
@@ -51,29 +165,222 @@ impl VideoEncoder for VideoToolboxEncoder {
}
}
/// Convert an AVCC blob (4-byte big-endian length prefixes) to Annex-B
/// (4-byte start codes `0x00 0x00 0x00 0x01`).
///
/// Conversion stops at the first NAL unit whose declared length runs past the
/// end of `data`; everything converted up to that point is returned. Trailing
/// bytes too short to hold a length prefix are ignored.
fn avcc_to_annexb(data: &[u8]) -> Vec<u8> {
    const START_CODE: [u8; 4] = [0x00, 0x00, 0x00, 0x01];
    const LEN_PREFIX: usize = 4;

    let mut out = Vec::with_capacity(data.len() + data.len() / 4);
    let mut rest = data;
    while rest.len() >= LEN_PREFIX {
        let (prefix, tail) = rest.split_at(LEN_PREFIX);
        let nal_len = u32::from_be_bytes([prefix[0], prefix[1], prefix[2], prefix[3]]) as usize;
        // Compare against the bytes that remain rather than computing
        // `offset + nal_len`, which can overflow `usize` on 32-bit targets
        // when the length prefix is corrupt.
        if nal_len > tail.len() {
            break;
        }
        let (nal, next) = tail.split_at(nal_len);
        out.extend_from_slice(&START_CODE);
        out.extend_from_slice(nal);
        rest = next;
    }
    out
}
/// Scan an Annex-B access unit for H.264 parameter sets.
///
/// Returns owned copies of the first SPS (NAL type 7) and the first PPS
/// (NAL type 8) found, in that order. Either element is `None` when the
/// access unit carries no such NAL unit.
fn extract_sps_pps(annex_b: &[u8]) -> (Option<Vec<u8>>, Option<Vec<u8>>) {
    const NAL_TYPE_SPS: u8 = 7;
    const NAL_TYPE_PPS: u8 = 8;

    let mut found: (Option<Vec<u8>>, Option<Vec<u8>>) = (None, None);
    for unit in split_annex_b(annex_b) {
        // The low five bits of the first byte are the NAL unit type; an
        // empty unit has no header byte and falls through to the default arm.
        match unit.first().map(|header| header & 0x1F) {
            Some(NAL_TYPE_SPS) if found.0.is_none() => found.0 = Some(unit.to_vec()),
            Some(NAL_TYPE_PPS) if found.1.is_none() => found.1 = Some(unit.to_vec()),
            _ => {}
        }
    }
    found
}
/// Split an Annex-B byte stream into individual NAL units (without start codes).
///
/// Both 3-byte (`00 00 01`) and 4-byte (`00 00 00 01`) start codes are
/// recognised. Bytes before the first start code are skipped. Empty units,
/// produced by back-to-back start codes or a start code at the very end of
/// `data`, are dropped: a NAL unit always has at least a header byte, and a
/// zero-length slice would otherwise become a zero-length entry once the
/// stream is re-framed with length prefixes.
fn split_annex_b(data: &[u8]) -> Vec<&[u8]> {
    /// Length of the start code at the front of `bytes`, if there is one.
    fn start_code_len(bytes: &[u8]) -> Option<usize> {
        if bytes.starts_with(&[0x00, 0x00, 0x01]) {
            Some(3)
        } else if bytes.starts_with(&[0x00, 0x00, 0x00, 0x01]) {
            Some(4)
        } else {
            None
        }
    }

    let mut nals = Vec::new();
    let mut i = 0;
    while i < data.len() {
        // Advance to, and then past, the next start code.
        match start_code_len(&data[i..]) {
            Some(len) => i += len,
            None => {
                i += 1;
                continue;
            }
        }
        let start = i;
        // The unit runs until the following start code or the end of input.
        while i < data.len() && start_code_len(&data[i..]).is_none() {
            i += 1;
        }
        if i > start {
            nals.push(&data[start..i]);
        }
    }
    nals
}
/// Re-frame an Annex-B byte stream as AVCC.
///
/// Every NAL unit returned by `split_annex_b` is written out preceded by its
/// length as a 4-byte big-endian integer; the start codes themselves are not
/// copied.
fn annexb_to_avcc(annex_b: &[u8]) -> Vec<u8> {
    split_annex_b(annex_b).into_iter().fold(
        Vec::with_capacity(annex_b.len()),
        |mut avcc, unit| {
            let length_prefix = (unit.len() as u32).to_be_bytes();
            avcc.extend_from_slice(&length_prefix);
            avcc.extend_from_slice(unit);
            avcc
        },
    )
}
/// macOS VideoToolbox H.264 decoder.
///
/// Wraps `VTDecompressionSession`. Minimum viable: API compiles and is
/// instantiable.
/// Wraps `VTDecompressionSession`. On non-macOS targets this is a compile-safe
/// placeholder that returns [`VideoError::NotInitialized`].
pub struct VideoToolboxDecoder {
/// Underlying decoder from `shiguredo_video_toolbox`. Starts out as `None`
/// and is filled in by `ensure_decoder` once an SPS/PPS pair is available.
#[cfg(target_os = "macos")]
inner: Option<Decoder>,
/// Width given to `new`; copied into every `VideoFrame` that `decode` returns.
#[cfg(target_os = "macos")]
width: u32,
/// Height given to `new`; copied into every `VideoFrame` that `decode` returns.
#[cfg(target_os = "macos")]
height: u32,
/// Width given to `new`. Only stored on non-macOS targets, where `decode`
/// always returns `VideoError::NotInitialized`.
#[cfg(not(target_os = "macos"))]
_width: u32,
/// Height given to `new`. Only stored on non-macOS targets, where `decode`
/// always returns `VideoError::NotInitialized`.
#[cfg(not(target_os = "macos"))]
_height: u32,
}
impl VideoToolboxDecoder {
/// Create a new decoder.
///
/// The actual `VTDecompressionSession` is created lazily when the first
/// SPS/PPS parameter sets arrive in-band.
pub fn new(width: u32, height: u32) -> Result<Self, VideoError> {
Ok(Self {
_width: width,
_height: height,
})
#[cfg(target_os = "macos")]
{
Ok(Self {
inner: None,
width,
height,
})
}
#[cfg(not(target_os = "macos"))]
{
let _ = (width, height);
Ok(Self {
_width: width,
_height: height,
})
}
}
/// Make sure `self.inner` holds a decoder configured for `sps` / `pps`.
///
/// An existing decoder is first asked to adopt the parameter sets through
/// `update_format`. A fresh decoder is created when there is none yet or when
/// that update reports an error. If creation fails, the error is returned and
/// `self.inner` is left as it was.
#[cfg(target_os = "macos")]
fn ensure_decoder(&mut self, sps: &[u8], pps: &[u8]) -> Result<(), VideoError> {
    let reusable = match self.inner.as_mut() {
        Some(session) => session
            .update_format(DecoderCodec::H264 {
                sps,
                pps,
                nalu_len_bytes: 4,
            })
            .is_ok(),
        None => false,
    };
    if reusable {
        return Ok(());
    }

    let session = Decoder::new(DecoderConfig {
        codec: DecoderCodec::H264 {
            sps,
            pps,
            nalu_len_bytes: 4,
        },
        pixel_format: PixelFormat::I420,
    })
    .map_err(|e| VideoError::PlatformError(format!("decoder create: {e}")))?;
    self.inner = Some(session);
    Ok(())
}
}
impl VideoDecoder for VideoToolboxDecoder {
fn decode(&mut self, _access_unit: &[u8]) -> Result<Option<VideoFrame>, VideoError> {
// TODO(T4.2-MVP): Wire VTDecompressionSession.
Ok(None)
/// Decode one Annex-B access unit into an I420 frame.
///
/// Returns `Ok(None)` for empty input, for input containing no NAL units, and
/// when the decoder yields no frame. Fails with
/// [`VideoError::NotInitialized`] if no decoder exists yet and this access
/// unit does not carry both an SPS and a PPS, and always on non-macOS targets.
fn decode(&mut self, access_unit: &[u8]) -> Result<Option<VideoFrame>, VideoError> {
    #[cfg(target_os = "macos")]
    {
        if access_unit.is_empty() {
            return Ok(None);
        }

        // Create or refresh the decoder whenever this access unit carries
        // both in-band parameter sets.
        if let (Some(sps), Some(pps)) = extract_sps_pps(access_unit) {
            self.ensure_decoder(&sps, &pps)?;
        }
        let session = match self.inner.as_mut() {
            Some(session) => session,
            None => return Err(VideoError::NotInitialized),
        };

        // The decoder is configured for 4-byte length prefixes, so the
        // start-code framing has to be rewritten first.
        let avcc = annexb_to_avcc(access_unit);
        if avcc.is_empty() {
            return Ok(None);
        }

        let output = session
            .decode(&avcc)
            .map_err(|e| VideoError::PlatformError(format!("decode failed: {e}")))?;
        let frame = match output {
            None => return Ok(None),
            Some(DecodedFrame::Nv12(_)) => {
                return Err(VideoError::PlatformError(
                    "unexpected NV12 output from decoder".to_string(),
                ))
            }
            Some(DecodedFrame::I420(frame)) => frame,
        };

        // Pack the three planes back to back: Y, then U, then V.
        let (y, u, v) = (frame.y_plane(), frame.u_plane(), frame.v_plane());
        let mut data = Vec::with_capacity(y.len() + u.len() + v.len());
        data.extend_from_slice(y);
        data.extend_from_slice(u);
        data.extend_from_slice(v);

        // Dimensions are the ones given to `new`; the timestamp is not
        // carried through and is always reported as zero.
        Ok(Some(VideoFrame {
            width: self.width,
            height: self.height,
            data,
            timestamp_ms: 0,
        }))
    }
    #[cfg(not(target_os = "macos"))]
    {
        let _ = access_unit;
        Err(VideoError::NotInitialized)
    }
}
}
@@ -107,4 +414,39 @@ mod tests {
enc.request_keyframe();
assert!(enc.force_keyframe);
}
/// AVCC → Annex-B → AVCC must reproduce the original bytes.
#[test]
fn avcc_to_annexb_roundtrip() {
    let sps = [0x67u8, 0x42, 0xC0, 0x1E];
    let pps = [0x68u8, 0xCE, 0x3C, 0x80];

    // Two NAL units, each preceded by its 4-byte big-endian length.
    let sps_len = (sps.len() as u32).to_be_bytes();
    let pps_len = (pps.len() as u32).to_be_bytes();
    let avcc = [&sps_len[..], &sps[..], &pps_len[..], &pps[..]].concat();

    // The same two NAL units, each preceded by a 4-byte start code.
    let start_code = [0x00u8, 0x00, 0x00, 0x01];
    let expected = [&start_code[..], &sps[..], &start_code[..], &pps[..]].concat();

    let annex_b = avcc_to_annexb(&avcc);
    assert_eq!(annex_b, expected);

    // And back.
    assert_eq!(annexb_to_avcc(&annex_b), avcc);
}
/// SPS and PPS are picked out of an access unit that also carries an IDR slice.
#[test]
fn extract_sps_pps_finds_params() {
    let sps = vec![0x67u8, 0x42, 0xC0, 0x1E];
    let pps = vec![0x68u8, 0xCE, 0x3C, 0x80];
    let idr = [0x65u8, 0x01, 0x02];

    // Assemble the access unit: every NAL unit gets a 4-byte start code.
    let mut au = Vec::new();
    for nal in [&sps[..], &pps[..], &idr[..]] {
        au.extend_from_slice(&[0x00, 0x00, 0x00, 0x01]);
        au.extend_from_slice(nal);
    }

    let (found_sps, found_pps) = extract_sps_pps(&au);
    assert_eq!(found_sps, Some(sps));
    assert_eq!(found_pps, Some(pps));
}
}

View File

@@ -0,0 +1,143 @@
//! Round-trip integration test: synthetic I420 frame → VideoToolbox encode →
//! depacketize → VideoToolbox decode → frame.
//!
//! This test requires macOS (VideoToolbox is not available elsewhere).
#![cfg(target_os = "macos")]
use std::sync::Mutex;
use wzp_video::{VideoDecoder, VideoEncoder, VideoFrame};
/// VideoToolbox uses global encoder registry state that can race when multiple
/// sessions are created concurrently. Serialize integration tests.
static VT_LOCK: Mutex<()> = Mutex::new(());
/// True if the Annex-B access unit contains at least one IDR slice (NAL type 5).
///
/// Looks at the byte that follows every 3- or 4-byte start code and checks
/// its low five bits.
fn au_contains_idr(au: &[u8]) -> bool {
    let mut pos = 0;
    while pos < au.len() {
        let rest = &au[pos..];
        let start_code_len = if rest.starts_with(&[0x00, 0x00, 0x01]) {
            3
        } else if rest.starts_with(&[0x00, 0x00, 0x00, 0x01]) {
            4
        } else {
            0
        };
        if start_code_len == 0 {
            pos += 1;
            continue;
        }
        pos += start_code_len;
        // `pos` now sits on the NAL header byte, if the stream has one.
        if au.get(pos).map_or(false, |header| header & 0x1F == 5) {
            return true;
        }
    }
    false
}
/// Build a synthetic I420 frame of the given size.
///
/// The Y plane holds a horizontal ramp that repeats on every row; the U and
/// V planes are filled with 128. `timestamp_ms` is set to zero.
fn synthetic_i420_frame(width: u32, height: u32) -> VideoFrame {
    let y_size = (width * height) as usize;
    let uv_size = y_size / 4;

    // Start from 128 everywhere: the chroma planes keep that value and the
    // luma plane is overwritten in full just below.
    let mut data = vec![128u8; y_size + uv_size * 2];
    for (index, luma) in data[..y_size].iter_mut().enumerate() {
        // Column of this sample within its row.
        let x = index as u32 % width;
        *luma = ((x * 255) / width) as u8;
    }

    VideoFrame {
        width,
        height,
        data,
        timestamp_ms: 0,
    }
}
/// Feeds 30 synthetic frames through the encoder and straight into the
/// decoder, and checks that at least one IDR access unit was produced and at
/// least one frame of the right size came back out.
#[test]
fn encode_decode_roundtrip() {
    // The lock only serializes the tests and guards no data, so a guard
    // poisoned by another test's panic is still usable. Recovering it keeps
    // one failing test from failing this one with an unrelated `PoisonError`.
    let _guard = VT_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());

    let width = 640;
    let height = 360;
    let mut encoder = wzp_video::VideoToolboxEncoder::new(width, height, 2_000_000).unwrap();
    let mut decoder = wzp_video::VideoToolboxDecoder::new(width, height).unwrap();

    let mut keyframe_seen = false;
    let mut decoded_any = false;
    for i in 0..30 {
        let mut frame = synthetic_i420_frame(width, height);
        frame.timestamp_ms = i as u64 * 33;
        if i == 0 {
            encoder.request_keyframe();
        }

        let au = encoder.encode(&frame).unwrap();
        if au.is_empty() {
            // VideoToolbox may buffer frames; not every encode() yields output.
            continue;
        }
        if au_contains_idr(&au) {
            keyframe_seen = true;
        }

        // Decode the access unit.
        if let Some(decoded_frame) = decoder.decode(&au).unwrap() {
            assert_eq!(decoded_frame.width, width);
            assert_eq!(decoded_frame.height, height);
            // I420 size check: Y + U + V = 1.5 * width * height
            let expected_size = (width * height * 3 / 2) as usize;
            assert!(
                decoded_frame.data.len() >= expected_size,
                "decoded frame data too small: {} < {expected_size}",
                decoded_frame.data.len()
            );
            decoded_any = true;
        }
    }

    assert!(
        keyframe_seen,
        "at least one keyframe should have been produced"
    );
    assert!(decoded_any, "at least one frame should have been decoded");
}
/// After a keyframe request on the first frame, an IDR access unit must show
/// up within the first five encode calls.
#[test]
fn keyframe_in_first_five_frames() {
    // The lock only serializes the tests and guards no data, so a guard
    // poisoned by another test's panic is still usable. Recovering it keeps
    // one failing test from failing this one with an unrelated `PoisonError`.
    let _guard = VT_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());

    let width = 640;
    let height = 360;
    let mut encoder = wzp_video::VideoToolboxEncoder::new(width, height, 2_000_000).unwrap();

    let mut keyframe_seen = false;
    for i in 0..5 {
        let mut frame = synthetic_i420_frame(width, height);
        frame.timestamp_ms = i as u64 * 33;
        if i == 0 {
            encoder.request_keyframe();
        }

        let au = encoder.encode(&frame).unwrap();
        if !au.is_empty() && au_contains_idr(&au) {
            keyframe_seen = true;
            break;
        }
    }

    assert!(
        keyframe_seen,
        "at least one keyframe should appear in the first 5 frames"
    );
}