fix: sample-accurate playback buffer eliminates robotic audio

The previous version copied each 960-sample frame into a 1024-sample
callback buffer, leaving 64 samples of silence per callback
(choppy/robotic sound).

Playback now accumulates float samples in a continuous buffer; the output
callback pulls exactly 1024 samples at a time, regardless of input frame size.
The buffer is capped at 200 ms (the oldest samples are dropped) to prevent latency drift.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Siavash Sameni
2026-03-27 19:29:52 +04:00
parent 4de72e2d98
commit 1c91c4a1b5

View File

@@ -173,31 +173,33 @@ function startAudioCapture() {
scriptNode.connect(audioCtx.destination); scriptNode.connect(audioCtx.destination);
} }
// Ring buffer playback using AudioWorklet-style approach // Pull-based playback with sample-accurate ring buffer
let playbackBuffer = []; let playSamples = new Float32Array(0); // accumulated float samples
let playbackNode = null; let playbackNode = null;
const MAX_BUFFERED_SAMPLES = SAMPLE_RATE / 5; // 200ms max (~9600 samples)
function initPlayback() { function initPlayback() {
if (playbackNode) return; if (playbackNode) return;
// Use a ScriptProcessorNode as a pull-based audio sink.
// It asks for audio every ~21ms (1024 samples at 48kHz).
// We feed it from our ring buffer of received frames.
playbackNode = audioCtx.createScriptProcessor(1024, 1, 1); playbackNode = audioCtx.createScriptProcessor(1024, 1, 1);
playbackNode.onaudioprocess = (e) => { playbackNode.onaudioprocess = (e) => {
const output = e.outputBuffer.getChannelData(0); const output = e.outputBuffer.getChannelData(0);
// Pull from buffer — drop old frames if we're behind const need = output.length; // 1024
while (playbackBuffer.length > 10) {
playbackBuffer.shift(); // drop oldest, keeps latency bounded // Drop excess to cap latency
if (playSamples.length > MAX_BUFFERED_SAMPLES) {
playSamples = playSamples.slice(playSamples.length - MAX_BUFFERED_SAMPLES);
} }
if (playbackBuffer.length > 0) {
const frame = playbackBuffer.shift(); if (playSamples.length >= need) {
// frame is 960 samples, output is 1024 — copy what we can output.set(playSamples.subarray(0, need));
const len = Math.min(frame.length, output.length); playSamples = playSamples.slice(need);
for (let i = 0; i < len; i++) output[i] = frame[i]; } else if (playSamples.length > 0) {
for (let i = len; i < output.length; i++) output[i] = 0; // Partial — play what we have, pad with silence
output.set(playSamples.subarray(0, playSamples.length));
for (let i = playSamples.length; i < need; i++) output[i] = 0;
playSamples = new Float32Array(0);
} else { } else {
// Underrun — silence for (let i = 0; i < need; i++) output[i] = 0;
for (let i = 0; i < output.length; i++) output[i] = 0;
} }
}; };
playbackNode.connect(audioCtx.destination); playbackNode.connect(audioCtx.destination);
@@ -211,7 +213,12 @@ function playAudio(pcmInt16) {
for (let i = 0; i < pcmInt16.length; i++) { for (let i = 0; i < pcmInt16.length; i++) {
floatData[i] = pcmInt16[i] / 32768.0; floatData[i] = pcmInt16[i] / 32768.0;
} }
playbackBuffer.push(floatData);
// Append to sample buffer
const combined = new Float32Array(playSamples.length + floatData.length);
combined.set(playSamples);
combined.set(floatData, playSamples.length);
playSamples = combined;
} }
function startStatsUpdate() { function startStatsUpdate() {