From 8ec411d6aacb7b4e2110436dbddb7599d0569e0b Mon Sep 17 00:00:00 2001 From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com> Date: Thu, 23 Oct 2025 16:46:24 +0530 Subject: [PATCH] feat(tts): server chunked playback queue on server path Refactor server-backed TTS path to perform sentence chunking and queued playback via a dedicated _startServerChunkedPlayback method instead of generating a single monolithic audio blob. This change simplifies the server flow, avoids constructing an entire audio buffer in memory, and enables smoother playback and error recovery. On errors, the code still falls back to device TTS. --- .../chat/services/text_to_speech_service.dart | 175 ++++++++++++++++-- 1 file changed, 157 insertions(+), 18 deletions(-) diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart index 65aaa86..5d344e6 100644 --- a/lib/features/chat/services/text_to_speech_service.dart +++ b/lib/features/chat/services/text_to_speech_service.dart @@ -19,6 +19,11 @@ class TextToSpeechService { bool _initialized = false; bool _available = false; bool _voiceConfigured = false; + int _session = 0; // increments to cancel in-flight work + final List _buffered = []; // server chunks + int _expectedChunks = 0; + int _currentIndex = -1; + bool _waitingNext = false; VoidCallback? _onStart; VoidCallback? _onComplete; @@ -32,7 +37,7 @@ class TextToSpeechService { TextToSpeechService({ApiService? 
api}) : _api = api { // Wire minimal player events to callbacks - _player.onPlayerComplete.listen((_) => _handleComplete()); + _player.onPlayerComplete.listen((_) => _onAudioComplete()); _player.onPlayerStateChanged.listen((s) { if (s == PlayerState.playing) _handleStart(); }); @@ -120,26 +125,11 @@ class TextToSpeechService { } if (_engine == TtsEngine.server && _api != null) { - // Server-backed TTS path + // Server-backed TTS with sentence chunking & queued playback try { - final effectiveVoice = - (_preferredVoice == null || _preferredVoice!.trim().isEmpty) - ? 'alloy' - : _preferredVoice!; - - final bytes = await _api.generateSpeech( - text: text, - voice: effectiveVoice, - ); - if (bytes.isEmpty) { - throw Exception('Empty audio response'); - } - await _player.stop(); - final data = Uint8List.fromList(bytes); - await _player.play(BytesSource(data)); + await _startServerChunkedPlayback(text); } catch (e) { _onError?.call(e.toString()); - // Fallback to device TTS on failure await _speakOnDevice(text); } return; @@ -182,6 +172,12 @@ class TextToSpeechService { } try { + // Cancel any in-flight server work + _session++; + _buffered.clear(); + _expectedChunks = 0; + _currentIndex = -1; + _waitingNext = false; if (_engine == TtsEngine.server) { await _player.stop(); } else { @@ -341,6 +337,149 @@ class TextToSpeechService { } } + // ===== Server chunked playback ===== + + Future _startServerChunkedPlayback(String text) async { + final effectiveVoice = + (_preferredVoice == null || _preferredVoice!.trim().isEmpty) + ? 
'alloy' + : _preferredVoice!; + + // Reset queue and create a new session + _session++; + final session = _session; + _buffered.clear(); + _expectedChunks = 0; + _currentIndex = -1; + _waitingNext = false; + + final chunks = _splitForTts(text); + if (chunks.isEmpty) return; + _expectedChunks = chunks.length; + + // Fetch first chunk to start playback quickly + final firstBytes = await _fetchServerAudio( + chunks.first, + effectiveVoice, + session, + ); + if (session != _session) return; // canceled + if (firstBytes.isEmpty) throw Exception('Empty audio response'); + + await _player.stop(); + _buffered.add(Uint8List.fromList(firstBytes)); + _currentIndex = 0; + await _player.play(BytesSource(_buffered.first)); + + // Prefetch the rest in background + unawaited( + _prefetchRemainingChunks( + chunks.skip(1).toList(), + effectiveVoice, + session, + ), + ); + } + + Future _prefetchRemainingChunks( + List remaining, + String voice, + int session, + ) async { + for (final chunk in remaining) { + if (session != _session) return; // canceled + try { + final audio = await _fetchServerAudio(chunk, voice, session); + if (session != _session) return; + if (audio.isNotEmpty) { + _buffered.add(Uint8List.fromList(audio)); + // If the player finished the previous chunk and is waiting, start now + if (_waitingNext && (_currentIndex + 1) < _buffered.length) { + _waitingNext = false; + await _playNextIfBuffered(session); + } + } + } catch (e) { + _onError?.call(e.toString()); + // continue with other chunks + } + } + } + + Future> _fetchServerAudio( + String text, + String voice, + int session, + ) async { + return await _api!.generateSpeech(text: text, voice: voice); + } + + Future _onAudioComplete() async { + final session = _session; + // If there are more expected chunks + if ((_currentIndex + 1) < _expectedChunks) { + // If next chunk is already buffered, play it + if ((_currentIndex + 1) < _buffered.length) { + await _playNextIfBuffered(session); + } else { + // Wait for 
prefetch to provide it + _waitingNext = true; + } + return; + } + // No more chunks – this is the real completion + _handleComplete(); + } + + Future _playNextIfBuffered(int session) async { + if (session != _session) return; + final nextIndex = _currentIndex + 1; + if (nextIndex < 0 || nextIndex >= _buffered.length) return; + _currentIndex = nextIndex; + final bytes = _buffered[nextIndex]; + await _player.play(BytesSource(bytes)); + } + + List _splitForTts(String text) { + // Normalize whitespace + final normalized = text.replaceAll(RegExp(r"\s+"), ' ').trim(); + if (normalized.isEmpty) return const []; + + // Split on sentence-ending punctuation while keeping the delimiter + final parts = []; + final sentenceRegex = RegExp(r"(.+?[\.!?]+)(\s+|\$)"); + int index = 0; + for (final match in sentenceRegex.allMatches('$normalized ')) { + final s = match.group(1) ?? ''; + if (s.trim().isNotEmpty) parts.add(s.trim()); + index = match.end; + } + if (index < normalized.length) { + final tail = normalized.substring(index).trim(); + if (tail.isNotEmpty) parts.add(tail); + } + + // Fallback to length-based splits for very long segments + const maxLen = 300; + final chunks = []; + for (final p in parts.isEmpty ? [normalized] : parts) { + if (p.length <= maxLen) { + chunks.add(p); + } else { + // Try splitting on commas/spaces + var remaining = p; + while (remaining.length > maxLen) { + int cut = remaining.lastIndexOf(RegExp(r",\s|\s"), maxLen); + cut = cut <= 0 ? maxLen : cut; + chunks.add(remaining.substring(0, cut).trim()); + remaining = remaining.substring(cut).trim(); + } + if (remaining.isNotEmpty) chunks.add(remaining); + } + } + return chunks; + } + Future _configurePreferredVoice() async { if (_voiceConfigured) { return;