From 08d5de8a684703f5f9de5b03160a9c6206bc0dbe Mon Sep 17 00:00:00 2001 From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:28:59 +0530 Subject: [PATCH] refactor(tts): Simplify text-to-speech service by migrating to TtsManager --- .../providers/text_to_speech_provider.dart | 21 +- .../chat/services/text_to_speech_service.dart | 1179 ++--------------- lib/features/chat/services/tts_manager.dart | 916 +++++++++++++ 3 files changed, 1055 insertions(+), 1061 deletions(-) create mode 100644 lib/features/chat/services/tts_manager.dart diff --git a/lib/features/chat/providers/text_to_speech_provider.dart b/lib/features/chat/providers/text_to_speech_provider.dart index 7992596..759a19f 100644 --- a/lib/features/chat/providers/text_to_speech_provider.dart +++ b/lib/features/chat/providers/text_to_speech_provider.dart @@ -391,23 +391,12 @@ class TextToSpeechController extends Notifier { void _handleDeviceWordProgress(int start, int end) { if (!ref.mounted) return; - // Map global offsets to sentence index - final offsets = state.sentenceOffsets; - if (offsets.isEmpty) return; - int idx = 0; - for (var i = 0; i < offsets.length; i++) { - final sStart = offsets[i]; - final sEnd = i + 1 < offsets.length ? offsets[i + 1] : 1 << 30; - if (start >= sStart && start < sEnd) { - idx = i; - break; - } - } - final sentenceStart = offsets[idx]; + // Word progress offsets are relative to the current chunk/sentence being + // spoken, NOT the full original text. TtsChunkStarted already sets the + // correct activeSentenceIndex, so we only update word highlighting here. 
state = state.copyWith( - activeSentenceIndex: idx, - wordStartInSentence: (start - sentenceStart).clamp(0, 1 << 20), - wordEndInSentence: (end - sentenceStart).clamp(0, 1 << 20), + wordStartInSentence: start.clamp(0, 1 << 20), + wordEndInSentence: end.clamp(0, 1 << 20), ); } } diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart index 831befc..ee7c036 100644 --- a/lib/features/chat/services/text_to_speech_service.dart +++ b/lib/features/chat/services/text_to_speech_service.dart @@ -1,44 +1,31 @@ import 'dart:async'; -import 'dart:io' show Platform; -import 'package:audioplayers/audioplayers.dart'; import 'package:flutter/foundation.dart'; -import 'package:flutter/widgets.dart'; -import 'package:flutter_tts/flutter_tts.dart'; import '../../../core/services/api_service.dart'; import '../../../core/services/settings_service.dart'; +import 'tts_manager.dart'; -typedef _SpeechChunk = ({Uint8List bytes, String mimeType}); +export 'tts_manager.dart' show TtsEvent, TtsPlaybackSession; -class SpeechAudioChunk { - const SpeechAudioChunk({required this.bytes, required this.mimeType}); - - final Uint8List bytes; - final String mimeType; -} - -/// Lightweight wrapper around FlutterTts to centralize configuration +/// Wrapper around [TtsManager] that provides a callback-based API. +/// +/// This service is used by the [TextToSpeechController] and [VoiceCallService] +/// to interact with TTS. It translates [TtsEvent]s from the manager into +/// callbacks for backward compatibility. class TextToSpeechService { - final FlutterTts _tts = FlutterTts(); - final AudioPlayer _player = AudioPlayer(); - final ApiService? _api; - TtsEngine _engine = TtsEngine.device; - String? _preferredVoice; - String? 
_serverPreferredVoice; - double _speechRate = 0.5; - bool _initialized = false; - bool _available = false; - bool _voiceConfigured = false; - int _session = 0; // increments to cancel in-flight work - final List<_SpeechChunk> _buffered = <_SpeechChunk>[]; // server chunks - int _expectedChunks = 0; - int _currentIndex = -1; - bool _waitingNext = false; - bool _deviceEngineAvailable = false; - String? _serverDefaultVoice; - Future? _serverDefaultVoiceFuture; + TextToSpeechService({ApiService? api}) { + // Set the API service on the manager + TtsManager.instance.setApiService(api); + // Listen to TTS events and route to callbacks + _eventSubscription = TtsManager.instance.events.listen(_handleEvent); + } + + StreamSubscription? _eventSubscription; + bool _initialized = false; + + // Callbacks VoidCallback? _onStart; VoidCallback? _onComplete; VoidCallback? _onCancel; @@ -48,111 +35,29 @@ class TextToSpeechService { void Function(int sentenceIndex)? _onSentenceIndex; void Function(int start, int end)? _onDeviceWordProgress; + /// Whether the service has been initialized. bool get isInitialized => _initialized; - bool get isAvailable => _available; - bool get deviceEngineAvailable => _deviceEngineAvailable; - bool get serverEngineAvailable => _api != null; - bool get prefersServerEngine => _shouldUseServer(); - TextToSpeechService({ApiService? api}) : _api = api { - // Wire minimal player events to callbacks - _player.onPlayerComplete.listen((_) => _onAudioComplete()); - _player.onPlayerStateChanged.listen((state) { - switch (state) { - case PlayerState.playing: - _handleStart(); - break; - case PlayerState.paused: - _handlePause(); - break; - default: - break; - } - }); + /// Whether TTS is available. + bool get isAvailable => TtsManager.instance.isAvailable; - if (!kIsWeb && Platform.isAndroid) { - _player.setAudioContext( - AudioContext( - android: const AudioContextAndroid(), - ), - ); + /// Whether device TTS is available. 
+ bool get deviceEngineAvailable => TtsManager.instance.deviceAvailable; + + /// Whether server TTS is available. + bool get serverEngineAvailable => TtsManager.instance.serverAvailable; + + /// Whether server TTS is preferred and available. + bool get prefersServerEngine { + final config = TtsManager.instance.config; + if (config.preferServer && TtsManager.instance.serverAvailable) { + return true; } + return !TtsManager.instance.deviceAvailable && + TtsManager.instance.serverAvailable; } - Future _configureDeviceEngine({ - required String? voice, - required double speechRate, - required double pitch, - required double volume, - }) async { - _deviceEngineAvailable = false; - try { - await _ensureAndroidDefaultEngine(); - // Ensure speak() futures complete only after playback finishes. - // This avoids race conditions where completion callbacks fire - // early in release builds (especially on iOS), which can cause - // our voice-call pipeline to resume listening and cut off speech. - await _tts.awaitSpeakCompletion(true); - await _tts.setVolume(volume); - await _tts.setSpeechRate(speechRate); - await _tts.setPitch(pitch); - - if (!kIsWeb && Platform.isIOS) { - await _tts.setSharedInstance(true); - // Rely on the native VoiceBackgroundAudioManager for iOS - // audio session configuration to avoid routing conflicts. 
- } - - if (_engine != TtsEngine.server) { - await _setVoiceByName(_preferredVoice); - } else { - _voiceConfigured = false; - } - - _deviceEngineAvailable = true; - } catch (e) { - _voiceConfigured = false; - _deviceEngineAvailable = false; - rethrow; - } - } - - Future _ensureAndroidDefaultEngine() async { - if (kIsWeb || !Platform.isAndroid) { - return; - } - try { - final engine = await _tts.getDefaultEngine; - if (engine is String && engine.isNotEmpty) { - await _tts.setEngine(engine); - } - } catch (e) { - _onError?.call(e.toString()); - } - } - - bool _computeAvailability() { - final serverAvailable = _api != null; - switch (_engine) { - case TtsEngine.device: - return _deviceEngineAvailable || serverAvailable; - case TtsEngine.server: - return serverAvailable; - } - } - - bool _shouldUseServer() { - if (_engine == TtsEngine.server) { - return _api != null; - } - // Device preference with graceful fallback to server if available. - if (_deviceEngineAvailable) { - return false; - } - return _api != null; - } - - /// Register callbacks for TTS lifecycle events + /// Registers callbacks for TTS lifecycle events. void bindHandlers({ VoidCallback? onStart, VoidCallback? onComplete, @@ -171,23 +76,9 @@ class TextToSpeechService { _onError = onError; _onSentenceIndex = onSentenceIndex; _onDeviceWordProgress = onDeviceWordProgress; - - _tts.setStartHandler(_handleStart); - _tts.setCompletionHandler(_handleComplete); - _tts.setCancelHandler(_handleCancel); - _tts.setPauseHandler(_handlePause); - _tts.setContinueHandler(_handleContinue); - _tts.setErrorHandler(_handleError); - try { - _tts.setProgressHandler((String text, int start, int end, String word) { - _onDeviceWordProgress?.call(start, end); - }); - } catch (_) { - // Some platforms may not support progress handler - } } - /// Initialize the native TTS engine lazily + /// Initializes the TTS engine. Future initialize({ String? deviceVoice, String? 
serverVoice, @@ -197,959 +88,157 @@ class TextToSpeechService { TtsEngine engine = TtsEngine.device, }) async { if (_initialized) { - _engine = engine; - _speechRate = speechRate; - if (deviceVoice != null) { - _preferredVoice = deviceVoice; - _voiceConfigured = false; - } - if (serverVoice != null) { - _serverPreferredVoice = serverVoice; - } - _available = _computeAvailability(); - return _available; - } - - _engine = engine; - _speechRate = speechRate; - _preferredVoice = deviceVoice; - _serverPreferredVoice = serverVoice; - _voiceConfigured = false; - - if (_engine != TtsEngine.server || _api == null) { - try { - await _configureDeviceEngine( + // Update config if already initialized + await TtsManager.instance.updateConfig( + TtsConfig( voice: deviceVoice, + serverVoice: serverVoice, speechRate: speechRate, pitch: pitch, volume: volume, - ); - } catch (e) { - if (_engine == TtsEngine.device) { - _available = false; - _onError?.call(e.toString()); - _initialized = true; - return _available; - } - } - } else { - _deviceEngineAvailable = false; - try { - await _tts.awaitSpeakCompletion(false); - await _tts.setVolume(volume); - await _tts.setSpeechRate(speechRate); - await _tts.setPitch(pitch); - } catch (_) {} + preferServer: engine == TtsEngine.server, + ), + ); + return isAvailable; } - _available = _computeAvailability(); + final available = await TtsManager.instance.initialize( + config: TtsConfig( + voice: deviceVoice, + serverVoice: serverVoice, + speechRate: speechRate, + pitch: pitch, + volume: volume, + preferServer: engine == TtsEngine.server, + ), + ); + _initialized = true; - return _available; + return available; } + /// Speaks the given text. 
Future speak(String text) async { if (text.trim().isEmpty) { throw ArgumentError('Cannot speak empty text'); } if (!_initialized) { - await initialize( - deviceVoice: _preferredVoice, - serverVoice: _serverPreferredVoice, - engine: _engine, - ); + await initialize(); } - final bool useServer = _shouldUseServer(); - - if (useServer) { - if (_api == null) { - if (_deviceEngineAvailable) { - await _speakOnDevice(text); - return; - } - throw StateError('Server text-to-speech is unavailable'); - } - // Server-backed TTS with sentence chunking & queued playback - try { - await _startServerChunkedPlayback(text); - } catch (e) { - _onError?.call(e.toString()); - if (_deviceEngineAvailable) { - await _speakOnDevice(text); - } else { - throw StateError('Server text-to-speech failed: $e'); - } - } - return; - } - - // Device TTS path - await _speakOnDevice(text); - } - - Future _speakOnDevice(String text) async { - if (!_deviceEngineAvailable) { - throw StateError('Device text-to-speech is unavailable'); - } - await _tts.stop(); - if (!_voiceConfigured) { - await _configurePreferredVoice(); - } - final result = await _tts.speak(text); - if (result is int && result != 1) { - _onError?.call('Text-to-speech engine returned code $result'); - } - _onSentenceIndex?.call(0); - } - - Future synthesizeServerSpeechChunk(String text) async { - if (text.trim().isEmpty) { - throw ArgumentError('Cannot synthesize empty text'); - } - if (_api == null) { - throw StateError('Server text-to-speech is unavailable'); - } - if (!_initialized) { - await initialize( - deviceVoice: _preferredVoice, - serverVoice: _serverPreferredVoice, - engine: _engine, - ); - } - final voice = await _resolveServerVoice(); - final chunk = await _api.generateSpeech( - text: text, - voice: voice, - speed: _speechRate, - ); - return SpeechAudioChunk(bytes: chunk.bytes, mimeType: chunk.mimeType); + await TtsManager.instance.speak(text); } + /// Pauses the current playback. 
Future pause() async { - if (!_initialized) return; - try { - if (_shouldUseServer()) { - await _player.pause(); - _handlePause(); - } else if (_deviceEngineAvailable) { - await _tts.pause(); - } - } catch (e) { - _onError?.call(e.toString()); - } + await TtsManager.instance.pause(); } + /// Resumes paused playback. Future resume() async { - if (!_initialized) return; - try { - if (_shouldUseServer()) { - if (_waitingNext && (_currentIndex + 1) < _buffered.length) { - _waitingNext = false; - await _playNextIfBuffered(_session); - } else { - await _player.resume(); - } - } - } catch (e) { - _onError?.call(e.toString()); - } + await TtsManager.instance.resume(); } + /// Stops the current playback. Future stop() async { - if (!_initialized) { - return; - } - - try { - // Cancel any in-flight server work - _session++; - _buffered.clear(); - _expectedChunks = 0; - _currentIndex = -1; - _waitingNext = false; - if (_shouldUseServer()) { - await _player.stop(); - _handleCancel(); - } else { - await _tts.stop(); - } - } catch (e) { - _onError?.call(e.toString()); - } + await TtsManager.instance.stop(); } + /// Disposes the service. Future dispose() async { - await stop(); - await _player.dispose(); + await _eventSubscription?.cancel(); + _eventSubscription = null; } - /// Update TTS settings on-the-fly + /// Updates TTS settings. Future updateSettings({ - Object? voice = const _VoiceNotProvided(), - Object? serverVoice = const _VoiceNotProvided(), + Object? voice = const _NotProvided(), + Object? serverVoice = const _NotProvided(), double? speechRate, double? pitch, double? volume, TtsEngine? engine, }) async { - final voiceProvided = voice is! _VoiceNotProvided; - final serverVoiceProvided = serverVoice is! _VoiceNotProvided; - final voiceValue = voiceProvided ? voice as String? : null; - final serverVoiceValue = serverVoiceProvided - ? serverVoice as String? 
- : null; - if (!_initialized || !_available) { - // Allow engine and voice to update before init - if (engine != null) _engine = engine; - if (voiceProvided) _preferredVoice = voiceValue; - if (serverVoiceProvided) _serverPreferredVoice = serverVoiceValue; - if (speechRate != null) _speechRate = speechRate; - return; - } + final current = TtsManager.instance.config; - try { - if (engine != null) { - _engine = engine; - } - if (voiceProvided) { - _preferredVoice = voiceValue; - } - if (serverVoiceProvided) { - _serverPreferredVoice = serverVoiceValue; - } - if (volume != null) { - await _tts.setVolume(volume); - } - if (speechRate != null) { - _speechRate = speechRate; - await _tts.setSpeechRate(speechRate); - } - if (pitch != null) { - await _tts.setPitch(pitch); - } - // Set specific voice by name on device-capable engines - if (_engine != TtsEngine.server && voiceProvided) { - await _setVoiceByName(_preferredVoice); - } - } catch (e) { - _onError?.call(e.toString()); - } - - _available = _computeAvailability(); - } - - /// Set voice by name, or use system default if null - Future _setVoiceByName(String? voiceName) async { - if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) { - return; - } - - try { - if (voiceName == null) { - // Use system default - reset voice configuration - _voiceConfigured = false; - await _configurePreferredVoice(); - return; - } - - // Get all available voices - final voicesRaw = await _tts.getVoices; - if (voicesRaw is! List) { - return; - } - - // Find the voice by name - Map? 
targetVoice; - for (final entry in voicesRaw) { - if (entry is Map) { - final normalized = _normalizeVoiceEntry(entry); - final name = normalized['name'] as String?; - if (name == voiceName) { - targetVoice = normalized; - break; - } - } - } - - // Set the voice if found - if (targetVoice != null) { - await _tts.setVoice(_voiceCommandFrom(targetVoice)); - _voiceConfigured = true; - } else { - // Voice not found, fall back to default - _voiceConfigured = false; - await _configurePreferredVoice(); - } - } catch (e) { - _onError?.call(e.toString()); - } - } - - /// Get available voices from the TTS engine - Future>> getAvailableVoices() async { - if (!_initialized) { - await initialize( - deviceVoice: _preferredVoice, - serverVoice: _serverPreferredVoice, - engine: _engine, - ); - } - - if (_engine == TtsEngine.server && _api != null) { - try { - final serverVoices = await _api.getAvailableServerVoices(); - final mapped = serverVoices - .map((v) { - final id = (v['id'] ?? v['name'] ?? '').toString(); - final name = (v['name'] ?? v['id'] ?? '').toString(); - final localeValue = (v['locale'] ?? v['language'] ?? '') - .toString(); - return {'id': id, 'name': name, 'locale': localeValue}; - }) - .where((entry) { - final name = entry['name']; - return name is String && name.trim().isNotEmpty; - }) - .toList(); - - final defaultVoice = await _getServerDefaultVoice(); - if (defaultVoice != null && defaultVoice.isNotEmpty) { - final normalized = defaultVoice.toLowerCase(); - final exists = mapped.any((voice) { - final name = voice['name']; - final id = voice['id']; - final lowerName = name is String ? name.toLowerCase() : ''; - final lowerId = id is String ? 
id.toLowerCase() : ''; - return lowerName == normalized || lowerId == normalized; - }); - if (!exists) { - mapped.insert(0, { - 'id': defaultVoice, - 'name': defaultVoice, - 'locale': '', - }); - } - } - - if (mapped.isEmpty) { - if (defaultVoice != null && defaultVoice.isNotEmpty) { - return [ - {'id': defaultVoice, 'name': defaultVoice, 'locale': ''}, - ]; - } - return const []; - } - return mapped; - } catch (e) { - _onError?.call(e.toString()); - // Fall back to device voices - } - } - - if (!_available) { - return []; - } - - try { - final voicesRaw = await _tts.getVoices; - if (voicesRaw is! List) { - return []; - } - - final parsedVoices = >[]; - for (final entry in voicesRaw) { - if (entry is Map) { - final normalized = _normalizeVoiceEntry(entry); - if (normalized.isNotEmpty) { - parsedVoices.add(normalized); - } - } - } - - return parsedVoices; - } catch (e) { - _onError?.call(e.toString()); - return []; - } - } - - Future _resolveServerVoice() async { - final serverSelected = _serverPreferredVoice?.trim(); - if (serverSelected != null && serverSelected.isNotEmpty) { - return serverSelected; - } - final selected = _preferredVoice?.trim(); - if (selected != null && selected.isNotEmpty) { - return selected; - } - final configVoice = await _getServerDefaultVoice(); - if (configVoice != null && configVoice.isNotEmpty) { - return configVoice; - } - return null; - } - - Future _getServerDefaultVoice() async { - if (_api == null) { - return null; - } - if (_serverDefaultVoice != null) { - return _serverDefaultVoice; - } - final pending = _serverDefaultVoiceFuture; - if (pending != null) { - return pending; - } - - final future = _api.getDefaultServerVoice(); - _serverDefaultVoiceFuture = future; - - try { - final voice = await future; - final trimmed = voice?.trim(); - if (trimmed != null && trimmed.isNotEmpty) { - _serverDefaultVoice = trimmed; - return _serverDefaultVoice; - } - return null; - } catch (e) { - _onError?.call(e.toString()); - return null; - } 
finally { - _serverDefaultVoiceFuture = null; - } - } - - Future preloadServerDefaults() async { - if (_api == null) { - return; - } - try { - await _getServerDefaultVoice(); - } catch (_) {} - } - - // ===== Server chunked playback ===== - - Future _startServerChunkedPlayback(String text) async { - final resolvedVoice = await _resolveServerVoice(); - final effectiveVoice = resolvedVoice; - - // Reset queue and create a new session - _session++; - final session = _session; - _buffered.clear(); - _expectedChunks = 0; - _currentIndex = -1; - _waitingNext = false; - - final chunks = _splitForTts(text); - if (chunks.isEmpty) return; - _expectedChunks = chunks.length; - - // Fetch first chunk to start playback quickly - final firstChunk = await _fetchServerAudio( - chunks.first, - effectiveVoice, - session, - ); - if (session != _session) return; // canceled - if (firstChunk.bytes.isEmpty) { - throw Exception('Empty audio response'); - } - - await _player.stop(); - final bufferedFirst = _cloneChunk(firstChunk); - _buffered.add(bufferedFirst); - _currentIndex = 0; - await _player.play( - BytesSource(bufferedFirst.bytes, mimeType: bufferedFirst.mimeType), - ); - _onSentenceIndex?.call(0); - - // Prefetch the rest in background - unawaited( - _prefetchRemainingChunks( - chunks.skip(1).toList(), - effectiveVoice, - session, + await TtsManager.instance.updateConfig( + TtsConfig( + voice: voice is _NotProvided ? current.voice : voice as String?, + serverVoice: serverVoice is _NotProvided + ? current.serverVoice + : serverVoice as String?, + speechRate: speechRate ?? current.speechRate, + pitch: pitch ?? current.pitch, + volume: volume ?? current.volume, + preferServer: engine != null + ? engine == TtsEngine.server + : current.preferServer, ), ); } - Future _prefetchRemainingChunks( - List remaining, - String? 
voice, - int session, - ) async { - for (final chunk in remaining) { - if (session != _session) return; // canceled - try { - final audioChunk = await _fetchServerAudio(chunk, voice, session); - if (session != _session) return; - if (audioChunk.bytes.isNotEmpty) { - _buffered.add(_cloneChunk(audioChunk)); - // If the player finished the previous chunk and is waiting, start now - if (_waitingNext && (_currentIndex + 1) < _buffered.length) { - _waitingNext = false; - await _playNextIfBuffered(session); - } - } - } catch (e) { - _onError?.call(e.toString()); - // continue with other chunks - } + /// Gets available voices from the device TTS engine. + Future>> getAvailableVoices() async { + if (!_initialized) { + await initialize(); } + + final config = TtsManager.instance.config; + if (config.preferServer && TtsManager.instance.serverAvailable) { + return TtsManager.instance.getServerVoices(); + } + + return TtsManager.instance.getDeviceVoices(); } - Future<_SpeechChunk> _fetchServerAudio( - String text, - String? voice, - int session, - ) async { - return await _api!.generateSpeech( - text: text, - voice: voice, - speed: _speechRate, - ); + /// Splits text into chunks for TTS playback. + List splitTextForSpeech(String text) { + return TtsManager.instance.splitTextForSpeech(text); } - /// Splits [text] into the chunks used for playback sequencing. - /// - /// This mirrors the server-side streaming behavior so UI consumers can stay - /// in sync with sentence indices reported during playback. 
- List splitTextForSpeech(String text) => _splitForTts(text); - - Future _onAudioComplete() async { - final session = _session; - // If there are more expected chunks - if ((_currentIndex + 1) < _expectedChunks) { - // If next chunk is already buffered, play it - if ((_currentIndex + 1) < _buffered.length) { - await _playNextIfBuffered(session); - } else { - // Wait for prefetch to provide it - _waitingNext = true; - } - return; - } - // No more chunks – this is the real completion - _handleComplete(); + /// Preloads server default voice configuration. + Future preloadServerDefaults() async { + await TtsManager.instance.preloadServerDefaults(); } - Future _playNextIfBuffered(int session) async { - if (session != _session) return; - final nextIndex = _currentIndex + 1; - if (nextIndex < 0 || nextIndex >= _buffered.length) return; - _currentIndex = nextIndex; - final chunk = _buffered[nextIndex]; - await _player.play(BytesSource(chunk.bytes, mimeType: chunk.mimeType)); - _onSentenceIndex?.call(_currentIndex); + /// Synthesizes a single chunk of text to audio (server TTS only). + Future synthesizeServerSpeechChunk(String text) async { + final result = await TtsManager.instance.synthesizeChunk(text); + return SpeechAudioChunk(bytes: result.bytes, mimeType: result.mimeType); } - _SpeechChunk _cloneChunk(_SpeechChunk chunk) { - return (bytes: Uint8List.fromList(chunk.bytes), mimeType: chunk.mimeType); - } - - List _splitForTts(String text) { - // Mirrors OpenWebUI's extractSentencesForAudio implementation - // See: src/lib/utils/index.ts lines 953-970, 907-928 - - // 1. 
Preserve code blocks (replace with placeholders) - final codeBlocks = []; - var processed = text; - var codeBlockIndex = 0; - - // Match triple backticks code blocks - final codeBlockRegex = RegExp(r'```[\s\S]*?```', multiLine: true); - processed = processed.replaceAllMapped(codeBlockRegex, (match) { - final placeholder = '\u0000$codeBlockIndex\u0000'; - codeBlocks.add(match.group(0)!); - codeBlockIndex++; - return placeholder; - }); - - // 2. Split on sentence-ending punctuation: .!? - // OpenWebUI uses: /(?<=[.!?])\s+/ - final sentences = processed - .split(RegExp(r'(?<=[.!?])\s+')) - .map((s) => s.trim()) - .where((s) => s.isNotEmpty) - .toList(); - - // 3. Restore code blocks from placeholders - final restoredSentences = sentences - .map((sentence) { - return sentence.replaceAllMapped(RegExp(r'\u0000(\d+)\u0000'), ( - match, - ) { - final idx = int.parse(match.group(1)!); - return idx < codeBlocks.length ? codeBlocks[idx] : ''; - }); - }) - .where((s) => s.isNotEmpty) - .toList(); - - // 4. 
Merge short sentences (< 4 words OR < 50 chars) - // OpenWebUI logic from extractSentencesForAudio - final mergedChunks = []; - for (final sentence in restoredSentences) { - if (mergedChunks.isEmpty) { - mergedChunks.add(sentence); - } else { - final lastIndex = mergedChunks.length - 1; - final previousText = mergedChunks[lastIndex]; - final wordCount = previousText.split(RegExp(r'\s+')).length; - final charCount = previousText.length; - - // Merge if previous chunk is too short - if (wordCount < 4 || charCount < 50) { - mergedChunks[lastIndex] = '$previousText $sentence'; - } else { - mergedChunks.add(sentence); - } - } + void _handleEvent(TtsEvent event) { + switch (event) { + case TtsStarted(): + _onStart?.call(); + case TtsChunkStarted(:final chunkIndex): + _onSentenceIndex?.call(chunkIndex); + case TtsWordProgress(:final start, :final end): + _onDeviceWordProgress?.call(start, end); + case TtsCompleted(): + _onComplete?.call(); + case TtsCancelled(): + _onCancel?.call(); + case TtsPaused(): + _onPause?.call(); + case TtsResumed(): + _onContinue?.call(); + case TtsError(:final message): + _onError?.call(message); } - - return mergedChunks.isEmpty ? [text.trim()] : mergedChunks; - } - - Future _configurePreferredVoice() async { - if (_voiceConfigured) { - return; - } - if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) { - _voiceConfigured = true; - return; - } - - var configured = false; - try { - await _ensureAndroidDefaultEngine(); - Map? defaultVoice; - bool voiceSet = false; - - if (Platform.isIOS) { - try { - final rawDefault = await _tts.getDefaultVoice; - if (rawDefault is Map) { - defaultVoice = _normalizeVoiceEntry(rawDefault); - await _tts.setVoice(_voiceCommandFrom(defaultVoice)); - configured = true; - voiceSet = true; - } - } catch (_) { - defaultVoice = null; - } - } - - if (voiceSet) { - return; - } - - final voicesRaw = await _tts.getVoices; - if (voicesRaw is! 
List) { - return; - } - - final parsedVoices = >[]; - for (final entry in voicesRaw) { - if (entry is Map) { - final normalized = _normalizeVoiceEntry(entry); - if (normalized.isNotEmpty) { - parsedVoices.add(normalized); - } - } - } - - if (parsedVoices.isEmpty) { - return; - } - - final localeTag = WidgetsBinding.instance.platformDispatcher.locale - .toLanguageTag() - .toLowerCase(); - final preferred = _selectPreferredVoice( - parsedVoices, - localeTag, - defaultVoice: defaultVoice, - ); - if (preferred == null) { - if (Platform.isIOS) { - configured = true; // Allow system default voice to be used - } - return; - } - - await _tts.setVoice(_voiceCommandFrom(preferred)); - configured = true; - } catch (e) { - _onError?.call(e.toString()); - } finally { - _voiceConfigured = configured || _voiceConfigured; - } - } - - Map _normalizeVoiceEntry(Map entry) { - final normalized = {}; - entry.forEach((key, value) { - if (key != null) { - normalized[key.toString()] = value; - } - }); - return normalized; - } - - Map _voiceCommandFrom(Map voice) { - final command = {}; - for (final key in [ - 'name', - 'locale', - 'identifier', - 'id', - 'voiceIdentifier', - 'engine', - ]) { - final value = voice[key]; - if (value != null) { - command[key] = value.toString(); - } - } - if (!command.containsKey('name') && voice['name'] != null) { - command['name'] = voice['name'].toString(); - } - if (!command.containsKey('locale') && voice['locale'] != null) { - command['locale'] = voice['locale'].toString(); - } - return command; - } - - int _iosVoiceScore(Map voice) { - final identifier = - voice['identifier']?.toString().toLowerCase() ?? - voice['id']?.toString().toLowerCase() ?? - ''; - final name = voice['name']?.toString().toLowerCase() ?? 
''; - - int score = 0; - if (identifier.contains('premium')) { - score += 400; - } else if (identifier.contains('enhanced')) { - score += 250; - } else if (identifier.contains('compact')) { - score += 50; - } - - if (identifier.contains('siri') || name.contains('siri')) { - score += 150; - } - - if (identifier.contains('female') || name.contains('female')) { - score += 15; - } - if (identifier.contains('male') || name.contains('male')) { - score += 10; - } - - // Prefer non-compact by default when no other hints are present - if (!identifier.contains('compact')) { - score += 25; - } - - return score; - } - - Map? _selectPreferredVoice( - List> voices, - String localeTag, { - Map? defaultVoice, - }) { - Map? matchesLocale(Iterable> input) { - for (final voice in input) { - final locale = voice['locale']?.toString().toLowerCase(); - if (locale == null) continue; - if (locale == localeTag) { - return voice; - } - final localePrimary = locale.split(RegExp('[-_]')).first; - final tagPrimary = localeTag.split(RegExp('[-_]')).first; - if (localePrimary == tagPrimary) { - return voice; - } - } - return null; - } - - Map? 
matchDefaultVoice() { - final dv = defaultVoice; - if (dv == null) { - return null; - } - - final identifiers = {}; - for (final key in ['identifier', 'id', 'voiceIdentifier', 'voice']) { - final value = dv[key]?.toString(); - if (value != null && value.isNotEmpty) { - identifiers.add(value.toLowerCase()); - } - } - - if (identifiers.isNotEmpty) { - for (final voice in voices) { - for (final key in ['identifier', 'id', 'voiceIdentifier', 'voice']) { - final value = voice[key]?.toString(); - if (value != null && identifiers.contains(value.toLowerCase())) { - return voice; - } - } - } - } - - final defaultName = dv['name']?.toString(); - final defaultLocale = dv['locale']?.toString(); - if (defaultName != null && defaultLocale != null) { - final lowerName = defaultName.toLowerCase(); - final lowerLocale = defaultLocale.toLowerCase(); - for (final voice in voices) { - final name = voice['name']?.toString(); - final locale = voice['locale']?.toString(); - if (name != null && - locale != null && - name.toLowerCase() == lowerName && - locale.toLowerCase() == lowerLocale) { - return voice; - } - } - } - - return null; - } - - Map? pickIosVoice() { - final userDefault = matchDefaultVoice(); - if (userDefault != null) { - return userDefault; - } - - final siriCandidates = voices.where((voice) { - final name = voice['name']?.toString().toLowerCase() ?? ''; - final identifier = voice['identifier']?.toString().toLowerCase() ?? ''; - final voiceId = voice['id']?.toString().toLowerCase() ?? 
''; - return name.contains('siri') || - identifier.contains('siri') || - voiceId.contains('siri'); - }).toList(); - - if (siriCandidates.isNotEmpty) { - siriCandidates.sort((a, b) => _iosVoiceScore(b) - _iosVoiceScore(a)); - final localeMatch = matchesLocale(siriCandidates); - if (localeMatch != null) { - return localeMatch; - } - return siriCandidates.first; - } - - final ranked = [...voices]; - ranked.sort((a, b) => _iosVoiceScore(b) - _iosVoiceScore(a)); - final localeMatch = matchesLocale(ranked); - if (localeMatch != null) { - return localeMatch; - } - return ranked.isNotEmpty ? ranked.first : null; - } - - Map? pickAndroidVoice() { - int qualityScore(String? quality) { - switch ((quality ?? '').toLowerCase()) { - case 'very_high': - case 'very-high': - return 3; - case 'high': - return 2; - case 'normal': - return 1; - default: - return 0; - } - } - - final preferredEngineVoices = voices - .where( - (voice) => - (voice['engine']?.toString() ?? '').toLowerCase().contains( - 'google', - ) || - voice['engine'] is! String, - ) - .toList(); - - preferredEngineVoices.sort((a, b) { - final qualityDiff = - qualityScore(b['quality']?.toString()) - - qualityScore(a['quality']?.toString()); - if (qualityDiff != 0) { - return qualityDiff; - } - final latencyA = a['latency']?.toString() ?? ''; - final latencyB = b['latency']?.toString() ?? ''; - return latencyA.compareTo(latencyB); - }); - - final ordered = preferredEngineVoices.isEmpty - ? voices - : preferredEngineVoices; - return matchesLocale(ordered) ?? matchesLocale(voices); - } - - Map? 
selected; - if (Platform.isIOS) { - selected = pickIosVoice(); - } else if (Platform.isAndroid) { - selected = pickAndroidVoice(); - } - - if (selected == null) { - return null; - } - - final name = selected['name']?.toString(); - final locale = selected['locale']?.toString(); - if (name == null || locale == null) { - return null; - } - - return selected; - } - - void _handleStart() { - _onStart?.call(); - } - - void _handleComplete() { - _onComplete?.call(); - } - - void _handleCancel() { - _onCancel?.call(); - } - - void _handlePause() { - _onPause?.call(); - } - - void _handleContinue() { - _onContinue?.call(); - } - - void _handleError(dynamic message) { - final safeMessage = message == null - ? 'Unknown TTS error' - : message.toString(); - _onError?.call(safeMessage); } } -class _VoiceNotProvided { - const _VoiceNotProvided(); +/// Marker class to distinguish "not provided" from null. +class _NotProvided { + const _NotProvided(); +} + +/// Audio chunk for server TTS synthesis. +class SpeechAudioChunk { + const SpeechAudioChunk({required this.bytes, required this.mimeType}); + + final Uint8List bytes; + final String mimeType; } diff --git a/lib/features/chat/services/tts_manager.dart b/lib/features/chat/services/tts_manager.dart new file mode 100644 index 0000000..950de14 --- /dev/null +++ b/lib/features/chat/services/tts_manager.dart @@ -0,0 +1,916 @@ +import 'dart:async'; +import 'dart:io' show Platform; + +import 'package:audioplayers/audioplayers.dart'; +import 'package:flutter/foundation.dart'; +import 'package:flutter_tts/flutter_tts.dart'; + +import '../../../core/services/api_service.dart'; + +// ============================================================================= +// TTS Events +// ============================================================================= + +/// Base class for all TTS events. +sealed class TtsEvent { + const TtsEvent(); +} + +/// Emitted when TTS playback starts. 
class TtsStarted extends TtsEvent {
  const TtsStarted();
}

/// Emitted when a new chunk starts playing.
class TtsChunkStarted extends TtsEvent {
  const TtsChunkStarted(this.chunkIndex);

  /// Zero-based index into [TtsPlaybackSession.chunks].
  final int chunkIndex;
}

/// Emitted for word-level progress (device TTS only).
///
/// The offsets are forwarded unchanged from the device engine's progress
/// callback for the utterance currently being spoken.
class TtsWordProgress extends TtsEvent {
  const TtsWordProgress(this.start, this.end);

  /// Start offset of the current word.
  final int start;

  /// End offset of the current word.
  final int end;
}

/// Emitted when all chunks have finished playing.
class TtsCompleted extends TtsEvent {
  const TtsCompleted();
}

/// Emitted when playback is cancelled.
class TtsCancelled extends TtsEvent {
  const TtsCancelled();
}

/// Emitted when playback is paused.
class TtsPaused extends TtsEvent {
  const TtsPaused();
}

/// Emitted when playback resumes from pause.
class TtsResumed extends TtsEvent {
  const TtsResumed();
}

/// Emitted when an error occurs.
class TtsError extends TtsEvent {
  const TtsError(this.message);

  /// Human-readable description of the failure.
  final String message;
}

// =============================================================================
// Playback Session
// =============================================================================

/// Represents a single TTS playback session.
class TtsPlaybackSession {
  TtsPlaybackSession._({
    required this.id,
    required this.chunks,
    required this.useServerTts,
  });

  /// Unique session identifier.
  final int id;

  /// Text chunks to be spoken.
  final List<String> chunks;

  /// Whether to use server TTS (true) or device TTS (false).
  final bool useServerTts;
}

// =============================================================================
// TTS Configuration
// =============================================================================

/// Configuration for TTS playback.
class TtsConfig {
  const TtsConfig({
    this.voice,
    this.serverVoice,
    this.speechRate = 0.5,
    this.pitch = 1.0,
    this.volume = 1.0,
    this.preferServer = false,
  });

  /// Name of the preferred device voice, or null for the system default.
  final String? voice;

  /// Preferred server voice; takes precedence over [voice] for server TTS.
  final String? serverVoice;

  /// Speech rate passed to the device engine and to the server as `speed`.
  final double speechRate;

  /// Pitch passed to the device engine.
  final double pitch;

  /// Volume passed to the device engine.
  final double volume;

  /// Whether server TTS should be preferred when an API service is set.
  final bool preferServer;

  /// Returns a copy with the given fields replaced.
  ///
  /// A null argument means "keep the current value", so this method cannot
  /// reset [voice] or [serverVoice] back to null; construct a new
  /// [TtsConfig] for that.
  TtsConfig copyWith({
    String? voice,
    String? serverVoice,
    double? speechRate,
    double? pitch,
    double? volume,
    bool? preferServer,
  }) {
    return TtsConfig(
      voice: voice ?? this.voice,
      serverVoice: serverVoice ?? this.serverVoice,
      speechRate: speechRate ?? this.speechRate,
      pitch: pitch ?? this.pitch,
      volume: volume ?? this.volume,
      preferServer: preferServer ?? this.preferServer,
    );
  }
}

// =============================================================================
// TTS Manager
// =============================================================================

/// Single global manager for all TTS operations.
///
/// This manager owns the FlutterTts and AudioPlayer instances and ensures
/// only one playback session is active at a time. Events are emitted via
/// a stream that consumers can listen to.
class TtsManager {
  TtsManager._();

  /// The shared manager instance.
  static final TtsManager instance = TtsManager._();

  // Patterns used by [splitTextForSpeech]; compiled once.
  static final RegExp _codeBlockPattern = RegExp(
    r'```[\s\S]*?```',
    multiLine: true,
  );
  static final RegExp _sentenceBoundaryPattern = RegExp(r'(?<=[.!?])\s+');
  static final RegExp _placeholderPattern = RegExp(r'\u0000(\d+)\u0000');
  static final RegExp _whitespacePattern = RegExp(r'\s+');

  // FlutterTts instance (lazy initialized)
  FlutterTts? _tts;
  bool _ttsInitialized = false;
  bool _handlersSet = false;

  // In-flight initialization shared by concurrent callers.
  Future<void>? _initFuture;

  // AudioPlayer for server TTS
  final AudioPlayer _player = AudioPlayer();
  bool _playerConfigured = false;
  final List<StreamSubscription<dynamic>> _playerSubscriptions = [];

  // API service for server TTS (must be set before using server TTS)
  ApiService? _apiService;

  // Configuration
  TtsConfig _config = const TtsConfig();
  bool _deviceEngineAvailable = false;
  bool _voiceConfigured = false;

  // Session management
  int _sessionCounter = 0;
  TtsPlaybackSession? _activeSession;

  // Device TTS state
  int _currentChunkIndex = -1;

  // Server TTS state. The buffer index always equals the chunk index; a null
  // entry marks a chunk whose synthesis failed and must be skipped.
  final List<_AudioChunk?> _serverAudioBuffer = [];
  int _serverCurrentIndex = -1;
  bool _serverWaitingForNext = false;

  // Event stream
  final StreamController<TtsEvent> _eventController =
      StreamController<TtsEvent>.broadcast();

  // Cached server default voice
  String? _serverDefaultVoice;
  Future<String?>? _serverDefaultVoiceFuture;

  /// Stream of TTS events.
  Stream<TtsEvent> get events => _eventController.stream;

  /// Whether device TTS is available.
  bool get deviceAvailable => _deviceEngineAvailable;

  /// Whether server TTS is available.
  bool get serverAvailable => _apiService != null;

  /// Whether any TTS is available.
  bool get isAvailable => _deviceEngineAvailable || serverAvailable;

  /// Whether a session is currently active.
  bool get isPlaying => _activeSession != null;

  /// Current configuration.
  TtsConfig get config => _config;

  /// Sets the API service for server TTS.
  ///
  /// Switching to a different service drops the cached server default voice,
  /// since it was resolved against the previous service.
  void setApiService(ApiService? api) {
    if (!identical(api, _apiService)) {
      _serverDefaultVoice = null;
      _serverDefaultVoiceFuture = null;
    }
    _apiService = api;
  }

  /// Updates the TTS configuration.
  ///
  /// When the device engine is already initialized, volume, rate, pitch and
  /// (if set) the voice are applied to it immediately.
  Future<void> updateConfig(TtsConfig config) async {
    _config = config;

    final tts = _tts;
    if (tts == null || !_ttsInitialized) return;

    await tts.setVolume(config.volume);
    await tts.setSpeechRate(config.speechRate);
    await tts.setPitch(config.pitch);

    if (config.voice != null) {
      await _setVoiceByName(config.voice);
    }
  }

  /// Initializes the TTS engine.
  ///
  /// This must be called before any TTS operations. Returns [isAvailable].
  Future<bool> initialize({TtsConfig? config}) async {
    if (config != null) {
      _config = config;
    }

    // Initialize FlutterTts
    await _ensureTtsInitialized();

    // Configure AudioPlayer for all platforms
    if (!_playerConfigured) {
      // Flag first: there is an await below, and overlapping initialize()
      // calls must not register the listeners twice.
      _playerConfigured = true;
      _playerSubscriptions
        ..add(_player.onPlayerComplete.listen((_) => _onServerAudioComplete()))
        ..add(
          _player.onPlayerStateChanged.listen((state) {
            if (state == PlayerState.playing) {
              _emitEvent(const TtsStarted());
            } else if (state == PlayerState.paused) {
              _emitEvent(const TtsPaused());
            }
          }),
        );
      // Android-specific audio context configuration
      if (!kIsWeb && Platform.isAndroid) {
        await _player.setAudioContext(
          AudioContext(android: const AudioContextAndroid()),
        );
      }
    }

    return isAvailable;
  }

  /// Speaks the given text.
  ///
  /// Returns the playback session, or null when [text] is blank or playback
  /// could not be started. If another session is active, it will be
  /// cancelled first. When server playback fails to start and the device
  /// engine is available, playback falls back to device TTS.
  Future<TtsPlaybackSession?> speak(String text, {bool? useServer}) async {
    if (text.trim().isEmpty) {
      return null;
    }

    // Cancel any existing session
    await stop();

    // Ensure TTS is initialized
    await _ensureTtsInitialized();

    // Determine whether to use server or device TTS
    final shouldUseServer = useServer ?? _shouldUseServer();

    // Split text into chunks
    final chunks = splitTextForSpeech(text);
    if (chunks.isEmpty) {
      return null;
    }

    // Create new session
    final session = TtsPlaybackSession._(
      id: ++_sessionCounter,
      chunks: chunks,
      useServerTts: shouldUseServer,
    );
    _activeSession = session;

    // Start playback
    try {
      if (shouldUseServer) {
        await _startServerPlayback(session);
      } else {
        await _startDevicePlayback(session);
      }
      return session;
    } catch (e) {
      // stop() or a newer speak() ran while we were awaiting: this session is
      // stale, so it must neither touch the current state nor start fallback
      // audio the user no longer wants.
      if (!_isActive(session)) return null;

      _emitEvent(TtsError(e.toString()));

      // Try fallback to device TTS if server fails
      if (shouldUseServer && _deviceEngineAvailable) {
        try {
          // Create a new session with useServerTts: false so device TTS
          // handlers emit events correctly
          final fallbackSession = TtsPlaybackSession._(
            id: session.id,
            chunks: session.chunks,
            useServerTts: false,
          );
          _activeSession = fallbackSession;
          await _startDevicePlayback(fallbackSession);
          return fallbackSession;
        } catch (e2) {
          if (!_isActive(session)) return null;
          _emitEvent(TtsError(e2.toString()));
        }
      }

      _activeSession = null;
      _resetPlaybackState();
      return null;
    }
  }

  /// Pauses the current playback.
  Future<void> pause() async {
    final session = _activeSession;
    if (session == null) return;

    try {
      if (session.useServerTts) {
        await _player.pause();
      } else {
        await _tts?.pause();
      }
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
    }
  }

  /// Resumes paused playback.
  ///
  /// Only server playback is resumed here; for device TTS this method makes
  /// no engine call and [TtsResumed] comes from the engine's continue handler.
  Future<void> resume() async {
    final session = _activeSession;
    if (session == null) return;

    try {
      if (session.useServerTts) {
        await _player.resume();
        _emitEvent(const TtsResumed());
      } else {
        // Device TTS resume is handled by the native handler
      }
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
    }
  }

  /// Stops the current playback.
  ///
  /// Emits [TtsCancelled] whenever a session was active, even if stopping the
  /// underlying engine fails (in which case a [TtsError] is emitted first).
  Future<void> stop() async {
    final session = _activeSession;
    if (session == null) return;

    // Clear state before awaiting so engine callbacks fired by the stop call
    // see no active session and do not emit a second cancellation.
    _activeSession = null;
    _resetPlaybackState();

    try {
      if (session.useServerTts) {
        await _player.stop();
      } else {
        await _tts?.stop();
      }
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
    } finally {
      // The session is gone either way; listeners must be told.
      _emitEvent(const TtsCancelled());
    }
  }

  /// Disposes the manager and releases resources.
  Future<void> dispose() async {
    await stop();
    for (final subscription in _playerSubscriptions) {
      await subscription.cancel();
    }
    _playerSubscriptions.clear();
    await _player.dispose();
    await _eventController.close();
  }

  /// Splits text into chunks for TTS playback.
  ///
  /// This mirrors OpenWebUI's extractSentencesForAudio implementation.
  /// Returns an empty list when [text] is empty or only whitespace.
  List<String> splitTextForSpeech(String text) {
    // 1. Preserve code blocks (replace with placeholders) so punctuation
    //    inside them does not split a block across chunks.
    final codeBlocks = <String>[];
    final processed = text.replaceAllMapped(_codeBlockPattern, (match) {
      final placeholder = '\u0000${codeBlocks.length}\u0000';
      codeBlocks.add(match.group(0)!);
      return placeholder;
    });

    // 2. Split on sentence-ending punctuation: .!?
    // 3. Restore code blocks from placeholders
    final sentences = processed
        .split(_sentenceBoundaryPattern)
        .map((s) => s.trim())
        .where((s) => s.isNotEmpty)
        .map(
          (s) => s.replaceAllMapped(_placeholderPattern, (m) {
            final idx = int.parse(m.group(1)!);
            return idx < codeBlocks.length ? codeBlocks[idx] : '';
          }),
        )
        .where((s) => s.isNotEmpty);

    // 4. Merge short sentences (< 4 words OR < 50 chars): a sentence is
    //    appended to the previous chunk while that chunk is still short.
    final mergedChunks = <String>[];
    for (final sentence in sentences) {
      if (mergedChunks.isEmpty) {
        mergedChunks.add(sentence);
        continue;
      }
      final previousText = mergedChunks.last;
      final wordCount = previousText.split(_whitespacePattern).length;
      if (wordCount < 4 || previousText.length < 50) {
        mergedChunks[mergedChunks.length - 1] = '$previousText $sentence';
      } else {
        mergedChunks.add(sentence);
      }
    }

    if (mergedChunks.isNotEmpty) return mergedChunks;

    // Nothing survived splitting: speak the text as one chunk, but never
    // hand back an empty utterance.
    final trimmed = text.trim();
    return trimmed.isEmpty ? <String>[] : [trimmed];
  }

  /// Gets available voices from the device TTS engine.
  ///
  /// Returns an empty list (and emits [TtsError]) if the query fails.
  Future<List<Map<String, dynamic>>> getDeviceVoices() async {
    await _ensureTtsInitialized();
    final tts = _tts;
    if (tts == null) return [];

    try {
      final voicesRaw = await tts.getVoices;
      if (voicesRaw is! List) return [];

      return voicesRaw
          .whereType<Map>()
          .map(_normalizeVoiceEntry)
          .where((e) => e.isNotEmpty)
          .toList();
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
      return [];
    }
  }

  /// Gets available voices from the server.
  ///
  /// Each entry has `id`, `name` and `locale` keys; entries with a blank name
  /// are dropped. Returns an empty list (and emits [TtsError]) on failure.
  Future<List<Map<String, dynamic>>> getServerVoices() async {
    final api = _apiService;
    if (api == null) return [];

    try {
      final serverVoices = await api.getAvailableServerVoices();
      return serverVoices
          .map((v) {
            final id = (v['id'] ?? v['name'] ?? '').toString();
            final name = (v['name'] ?? v['id'] ?? '').toString();
            final locale = (v['locale'] ?? v['language'] ?? '').toString();
            return <String, dynamic>{'id': id, 'name': name, 'locale': locale};
          })
          .where((e) => e['name']?.toString().trim().isNotEmpty ?? false)
          .toList();
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
      return [];
    }
  }

  /// Preloads server default voice configuration.
  Future<void> preloadServerDefaults() async {
    if (_apiService == null) return;
    try {
      await _getServerDefaultVoice();
    } catch (_) {
      // Best-effort warm-up: failures are already reported as TtsError by
      // _getServerDefaultVoice and must not break the caller.
    }
  }

  /// Synthesizes a single text chunk to audio without playing it.
  ///
  /// This is used by [VoiceCallService] for its own audio playback pipeline.
  /// Returns the audio bytes and mime type.
  ///
  /// Throws [StateError] when no API service is set and [ArgumentError] when
  /// [text] is blank.
  Future<({Uint8List bytes, String mimeType})> synthesizeChunk(
    String text,
  ) async {
    final api = _apiService;
    if (api == null) {
      throw StateError('Server TTS is not available');
    }
    if (text.trim().isEmpty) {
      throw ArgumentError('Cannot synthesize empty text');
    }

    final voice = await _resolveServerVoice();
    final result = await api.generateSpeech(
      text: text,
      voice: voice,
      speed: _config.speechRate,
    );
    return (bytes: result.bytes, mimeType: result.mimeType);
  }

  // ===========================================================================
  // Private: Initialization
  // ===========================================================================

  /// Initializes the device engine once; concurrent callers share one run.
  ///
  /// Every caller awaits the same future, so a failure is delivered to all of
  /// them instead of surfacing as an unhandled asynchronous error. After a
  /// failure the next call retries.
  Future<void> _ensureTtsInitialized() async {
    if (_ttsInitialized) return;

    final pending = _initFuture ??= _initializeTts();
    try {
      await pending;
    } finally {
      if (identical(_initFuture, pending)) {
        _initFuture = null;
      }
    }
  }

  Future<void> _initializeTts() async {
    // Reuse the instance on retry: handlers are installed only once, so a
    // fresh instance would be left without them.
    final tts = _tts ??= FlutterTts();

    // Wait for native TTS to be fully initialized before setting handlers.
    // The flutter_tts plugin has a bug where setting handlers during onInit
    // causes ConcurrentModificationException.
    await Future<void>.delayed(const Duration(milliseconds: 500));

    if (!_handlersSet) {
      _setupTtsHandlers(tts);
      _handlersSet = true;
    }

    // Configure device engine
    await _configureDeviceEngine();

    _ttsInitialized = true;
  }

  /// Wires engine callbacks to [TtsEvent]s for device sessions.
  void _setupTtsHandlers(FlutterTts tts) {
    tts.setStartHandler(() {
      if (_hasActiveDeviceSession) {
        _emitEvent(const TtsStarted());
      }
    });

    tts.setCompletionHandler(_onDeviceChunkComplete);

    tts.setCancelHandler(() {
      if (_hasActiveDeviceSession) {
        _activeSession = null;
        _resetPlaybackState();
        _emitEvent(const TtsCancelled());
      }
    });

    tts.setPauseHandler(() {
      if (_hasActiveDeviceSession) {
        _emitEvent(const TtsPaused());
      }
    });

    tts.setContinueHandler(() {
      if (_hasActiveDeviceSession) {
        _emitEvent(const TtsResumed());
      }
    });

    tts.setErrorHandler((msg) {
      _emitEvent(TtsError(msg.toString()));
    });

    try {
      tts.setProgressHandler((String text, int start, int end, String word) {
        if (_hasActiveDeviceSession) {
          _emitEvent(TtsWordProgress(start, end));
        }
      });
    } catch (_) {
      // Some platforms may not support progress handler
    }
  }

  /// Applies the current config to the device engine and records whether the
  /// engine is usable in [_deviceEngineAvailable].
  Future<void> _configureDeviceEngine() async {
    final tts = _tts;
    if (tts == null) return;

    _deviceEngineAvailable = false;
    try {
      // Set default engine on Android
      if (!kIsWeb && Platform.isAndroid) {
        try {
          final engine = await tts.getDefaultEngine;
          if (engine is String && engine.isNotEmpty) {
            await tts.setEngine(engine);
          }
        } catch (_) {
          // Best effort: keep whatever engine the plugin selected.
        }
      }

      await tts.awaitSpeakCompletion(true);
      await tts.setVolume(_config.volume);
      await tts.setSpeechRate(_config.speechRate);
      await tts.setPitch(_config.pitch);

      if (!kIsWeb && Platform.isIOS) {
        await tts.setSharedInstance(true);
      }

      _deviceEngineAvailable = true;
    } catch (e) {
      _deviceEngineAvailable = false;
      _emitEvent(TtsError(e.toString()));
    }
  }

  // ===========================================================================
  // Private: Device TTS Playback
  // ===========================================================================

  /// Speaks the first chunk; later chunks are chained from the engine's
  /// completion handler via [_onDeviceChunkComplete].
  Future<void> _startDevicePlayback(TtsPlaybackSession session) async {
    final tts = _tts;
    if (!_deviceEngineAvailable || tts == null) {
      throw StateError('Device TTS is not available');
    }

    _currentChunkIndex = 0;

    // Configure voice if needed
    if (!_voiceConfigured) {
      await _configurePreferredVoice();
    }

    // Speak first chunk
    _emitEvent(const TtsChunkStarted(0));
    final result = await tts.speak(session.chunks.first);
    if (result is int && result != 1) {
      throw StateError('TTS engine returned error code $result');
    }
  }

  void _onDeviceChunkComplete() {
    final session = _activeSession;
    if (session == null || session.useServerTts) return;

    final nextIndex = _currentChunkIndex + 1;

    // Check if there are more chunks
    if (nextIndex >= session.chunks.length) {
      _finishSession();
      return;
    }

    // Play next chunk
    _currentChunkIndex = nextIndex;
    _emitEvent(TtsChunkStarted(nextIndex));
    unawaited(_speakDeviceChunk(session.chunks[nextIndex]));
  }

  /// Speaks one follow-up chunk, reporting failures as [TtsError] instead of
  /// letting them escape as unhandled asynchronous errors.
  Future<void> _speakDeviceChunk(String chunk) async {
    try {
      final result = await _tts?.speak(chunk);
      if (result is int && result != 1) {
        _emitEvent(TtsError('TTS engine returned error code $result'));
      }
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
    }
  }

  // ===========================================================================
  // Private: Server TTS Playback
  // ===========================================================================

  /// Fetches and plays the first chunk, then prefetches the rest in the
  /// background. Errors for the first chunk propagate to the caller.
  Future<void> _startServerPlayback(TtsPlaybackSession session) async {
    if (_apiService == null) {
      throw StateError('Server TTS is not available');
    }

    _serverCurrentIndex = -1;
    _serverAudioBuffer.clear();
    _serverWaitingForNext = false;

    final voice = await _resolveServerVoice();

    // Fetch and play first chunk
    final firstChunk = await _fetchServerAudio(session.chunks.first, voice);
    if (!_isActive(session)) return; // Cancelled

    _serverAudioBuffer.add(firstChunk);
    _serverCurrentIndex = 0;

    await _player.stop();
    if (!_isActive(session)) return; // Cancelled while stopping the player

    await _player.play(
      BytesSource(firstChunk.bytes, mimeType: firstChunk.mimeType),
    );
    _emitEvent(const TtsChunkStarted(0));

    // Prefetch remaining chunks in background
    unawaited(_prefetchServerChunks(session, voice, 1));
  }

  /// Synthesizes chunks from [startIndex] onward, in order.
  ///
  /// A failed chunk is recorded as a null buffer entry so that buffer
  /// positions keep matching chunk indices and playback can skip it rather
  /// than wait for audio that will never arrive.
  Future<void> _prefetchServerChunks(
    TtsPlaybackSession session,
    String? voice,
    int startIndex,
  ) async {
    for (var i = startIndex; i < session.chunks.length; i++) {
      if (!_isActive(session)) return; // Cancelled

      _AudioChunk? chunk;
      try {
        chunk = await _fetchServerAudio(session.chunks[i], voice);
      } catch (e) {
        if (!_isActive(session)) return;
        _emitEvent(TtsError(e.toString()));
      }
      if (!_isActive(session)) return;

      _serverAudioBuffer.add(chunk);

      // If player was waiting for this chunk, continue playback now
      if (_serverWaitingForNext) {
        _serverWaitingForNext = false;
        await _advanceServerPlayback(session);
      }
    }
  }

  Future<_AudioChunk> _fetchServerAudio(String text, String? voice) async {
    final result = await _apiService!.generateSpeech(
      text: text,
      voice: voice,
      speed: _config.speechRate,
    );
    return _AudioChunk(bytes: result.bytes, mimeType: result.mimeType);
  }

  void _onServerAudioComplete() {
    final session = _activeSession;
    if (session == null || !session.useServerTts) return;
    unawaited(_advanceServerPlayback(session));
  }

  /// Moves server playback to the next playable chunk.
  ///
  /// Completes the session when no chunks remain, parks in the waiting state
  /// when the next chunk has not been fetched yet, and skips chunks that
  /// failed to synthesize or to play.
  Future<void> _advanceServerPlayback(TtsPlaybackSession session) async {
    for (var index = _serverCurrentIndex + 1; ; index++) {
      if (!_isActive(session)) return;

      // Check if all chunks are done
      if (index >= session.chunks.length) {
        _finishSession();
        return;
      }

      // Next chunk not fetched yet: the prefetch loop resumes playback.
      if (index >= _serverAudioBuffer.length) {
        _serverCurrentIndex = index - 1;
        _serverWaitingForNext = true;
        return;
      }

      final chunk = _serverAudioBuffer[index];
      _serverCurrentIndex = index;
      if (chunk == null) continue; // Synthesis failed; skip it

      try {
        await _player.play(BytesSource(chunk.bytes, mimeType: chunk.mimeType));
        if (_isActive(session)) {
          _emitEvent(TtsChunkStarted(index));
        }
        return;
      } catch (e) {
        if (!_isActive(session)) return;
        _emitEvent(TtsError(e.toString()));
        // Fall through to try the following chunk.
      }
    }
  }

  /// Picks the server voice: explicit server voice, then the configured
  /// voice, then the server's default.
  Future<String?> _resolveServerVoice() async {
    final serverSelected = _config.serverVoice?.trim();
    if (serverSelected != null && serverSelected.isNotEmpty) {
      return serverSelected;
    }
    final selected = _config.voice?.trim();
    if (selected != null && selected.isNotEmpty) {
      return selected;
    }
    return _getServerDefaultVoice();
  }

  /// Returns the server's default voice, caching the first non-empty answer.
  ///
  /// Concurrent callers share a single request. Returns null (and emits
  /// [TtsError]) when the request fails.
  Future<String?> _getServerDefaultVoice() async {
    final api = _apiService;
    if (api == null) return null;

    final cached = _serverDefaultVoice;
    if (cached != null) return cached;

    final pending = _serverDefaultVoiceFuture ??= api.getDefaultServerVoice();
    try {
      final trimmed = (await pending)?.trim();
      // Treat a blank default like "no default", matching how blank
      // configured voices are handled in _resolveServerVoice.
      final voice = (trimmed == null || trimmed.isEmpty) ? null : trimmed;
      // Do not cache an answer from a service that has since been replaced.
      if (identical(_apiService, api)) {
        _serverDefaultVoice = voice;
      }
      return voice;
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
      return null;
    } finally {
      if (identical(_serverDefaultVoiceFuture, pending)) {
        _serverDefaultVoiceFuture = null;
      }
    }
  }

  // ===========================================================================
  // Private: Helpers
  // ===========================================================================

  bool get _hasActiveDeviceSession {
    final session = _activeSession;
    return session != null && !session.useServerTts;
  }

  /// Whether [session] is still the one being played (compared by id, so a
  /// device fallback created from it counts as the same session).
  bool _isActive(TtsPlaybackSession session) =>
      _activeSession?.id == session.id;

  /// Ends the active session normally and notifies listeners.
  void _finishSession() {
    _activeSession = null;
    _resetPlaybackState();
    _emitEvent(const TtsCompleted());
  }

  bool _shouldUseServer() {
    if (_config.preferServer && _apiService != null) {
      return true;
    }
    if (_deviceEngineAvailable) {
      return false;
    }
    return _apiService != null;
  }

  void _resetPlaybackState() {
    _currentChunkIndex = -1;
    _serverCurrentIndex = -1;
    _serverAudioBuffer.clear();
    _serverWaitingForNext = false;
  }

  void _emitEvent(TtsEvent event) {
    if (!_eventController.isClosed) {
      _eventController.add(event);
    }
  }

  /// Selects the device voice whose `name` equals [voiceName], if any.
  ///
  /// Does nothing on web or on platforms other than iOS and Android.
  Future<void> _setVoiceByName(String? voiceName) async {
    final tts = _tts;
    if (tts == null || voiceName == null) return;
    if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) return;

    try {
      final voicesRaw = await tts.getVoices;
      if (voicesRaw is! List) return;

      for (final entry in voicesRaw.whereType<Map>()) {
        final normalized = _normalizeVoiceEntry(entry);
        if (normalized['name'] == voiceName) {
          await tts.setVoice(_voiceCommandFrom(normalized));
          _voiceConfigured = true;
          return;
        }
      }
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
    }
  }

  /// Applies the configured voice once; falls back to the system default
  /// when no voice is configured or it cannot be found.
  Future<void> _configurePreferredVoice() async {
    if (_voiceConfigured || _tts == null) return;
    if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) {
      _voiceConfigured = true;
      return;
    }

    try {
      // Try to use configured voice
      if (_config.voice != null) {
        await _setVoiceByName(_config.voice);
        if (_voiceConfigured) return;
      }

      // Fall back to system default
      _voiceConfigured = true;
    } catch (e) {
      _emitEvent(TtsError(e.toString()));
      _voiceConfigured = true;
    }
  }

  /// Copies [entry] into a string-keyed map, dropping null keys.
  Map<String, dynamic> _normalizeVoiceEntry(Map<dynamic, dynamic> entry) {
    return {
      for (final MapEntry(:key, :value) in entry.entries)
        if (key != null) key.toString(): value,
    };
  }

  /// Builds the string-only map passed to the engine's `setVoice` from the
  /// identifying fields present in [voice].
  Map<String, String> _voiceCommandFrom(Map<String, dynamic> voice) {
    const keys = [
      'name',
      'locale',
      'identifier',
      'id',
      'voiceIdentifier',
      'engine',
    ];
    return {
      for (final key in keys)
        if (voice[key] != null) key: voice[key].toString(),
    };
  }
}

// =============================================================================
// Internal Types
// =============================================================================

/// Synthesized audio for one text chunk.
class _AudioChunk {
  const _AudioChunk({required this.bytes, required this.mimeType});

  final Uint8List bytes;
  final String mimeType;
}