Merge pull request #249 from cogwheel0/refactor-tts-service-migration

refactor(tts): Simplify text-to-speech service by migrating to TtsManager
2025-12-08 13:28:18 +08:00
parent df4bb346b4 08d5de8a68
commit fbeaebe0e8
3 changed files with 1055 additions and 1061 deletions
@@ -391,23 +391,12 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {

  void _handleDeviceWordProgress(int start, int end) {
    if (!ref.mounted) return;
-    // Map global offsets to sentence index
-    final offsets = state.sentenceOffsets;
-    if (offsets.isEmpty) return;
-    int idx = 0;
-    for (var i = 0; i < offsets.length; i++) {
-      final sStart = offsets[i];
-      final sEnd = i + 1 < offsets.length ? offsets[i + 1] : 1 << 30;
-      if (start >= sStart && start < sEnd) {
-        idx = i;
-        break;
-      }
-    }
-    final sentenceStart = offsets[idx];
+    // Word progress offsets are relative to the current chunk/sentence being
+    // spoken, NOT the full original text. TtsChunkStarted already sets the
+    // correct activeSentenceIndex, so we only update word highlighting here.
    state = state.copyWith(
-      activeSentenceIndex: idx,
-      wordStartInSentence: (start - sentenceStart).clamp(0, 1 << 20),
-      wordEndInSentence: (end - sentenceStart).clamp(0, 1 << 20),
+      wordStartInSentence: start.clamp(0, 1 << 20),
+      wordEndInSentence: end.clamp(0, 1 << 20),
    );
  }
 }
@@ -0,0 +1,916 @@
+import 'dart:async';
+import 'dart:io' show Platform;
+
+import 'package:audioplayers/audioplayers.dart';
+import 'package:flutter/foundation.dart';
+import 'package:flutter_tts/flutter_tts.dart';
+
+import '../../../core/services/api_service.dart';
+
+// =============================================================================
+// TTS Events
+// =============================================================================
+
+/// Base class for all TTS events.
+sealed class TtsEvent {
+  const TtsEvent();
+}
+
+/// Emitted when TTS playback starts.
+class TtsStarted extends TtsEvent {
+  const TtsStarted();
+}
+
+/// Emitted when a new chunk starts playing.
+class TtsChunkStarted extends TtsEvent {
+  const TtsChunkStarted(this.chunkIndex);
+  final int chunkIndex;
+}
+
+/// Emitted for word-level progress (device TTS only).
+class TtsWordProgress extends TtsEvent {
+  const TtsWordProgress(this.start, this.end);
+  final int start;
+  final int end;
+}
+
+/// Emitted when all chunks have finished playing.
+class TtsCompleted extends TtsEvent {
+  const TtsCompleted();
+}
+
+/// Emitted when playback is cancelled.
+class TtsCancelled extends TtsEvent {
+  const TtsCancelled();
+}
+
+/// Emitted when playback is paused.
+class TtsPaused extends TtsEvent {
+  const TtsPaused();
+}
+
+/// Emitted when playback resumes from pause.
+class TtsResumed extends TtsEvent {
+  const TtsResumed();
+}
+
+/// Emitted when an error occurs.
+class TtsError extends TtsEvent {
+  const TtsError(this.message);
+  final String message;
+}
+
+// =============================================================================
+// Playback Session
+// =============================================================================
+
+/// Represents a single TTS playback session.
+class TtsPlaybackSession {
+  TtsPlaybackSession._({
+    required this.id,
+    required this.chunks,
+    required this.useServerTts,
+  });
+
+  /// Unique session identifier.
+  final int id;
+
+  /// Text chunks to be spoken.
+  final List<String> chunks;
+
+  /// Whether to use server TTS (true) or device TTS (false).
+  final bool useServerTts;
+}
+
+// =============================================================================
+// TTS Configuration
+// =============================================================================
+
+/// Configuration for TTS playback.
+class TtsConfig {
+  const TtsConfig({
+    this.voice,
+    this.serverVoice,
+    this.speechRate = 0.5,
+    this.pitch = 1.0,
+    this.volume = 1.0,
+    this.preferServer = false,
+  });
+
+  final String? voice;
+  final String? serverVoice;
+  final double speechRate;
+  final double pitch;
+  final double volume;
+  final bool preferServer;
+
+  TtsConfig copyWith({
+    String? voice,
+    String? serverVoice,
+    double? speechRate,
+    double? pitch,
+    double? volume,
+    bool? preferServer,
+  }) {
+    return TtsConfig(
+      voice: voice ?? this.voice,
+      serverVoice: serverVoice ?? this.serverVoice,
+      speechRate: speechRate ?? this.speechRate,
+      pitch: pitch ?? this.pitch,
+      volume: volume ?? this.volume,
+      preferServer: preferServer ?? this.preferServer,
+    );
+  }
+}
+
+// =============================================================================
+// TTS Manager
+// =============================================================================
+
+/// Single global manager for all TTS operations.
+///
+/// This manager owns the FlutterTts and AudioPlayer instances and ensures
+/// only one playback session is active at a time. Events are emitted via
+/// a stream that consumers can listen to.
+class TtsManager {
+  TtsManager._();
+  static final instance = TtsManager._();
+
+  // FlutterTts instance (lazy initialized)
+  FlutterTts? _tts;
+  bool _ttsInitialized = false;
+  bool _handlersSet = false;
+  Completer<void>? _initCompleter;
+
+  // AudioPlayer for server TTS
+  final AudioPlayer _player = AudioPlayer();
+  bool _playerConfigured = false;
+
+  // API service for server TTS (must be set before using server TTS)
+  ApiService? _apiService;
+
+  // Configuration
+  TtsConfig _config = const TtsConfig();
+  bool _deviceEngineAvailable = false;
+  bool _voiceConfigured = false;
+
+  // Session management
+  int _sessionCounter = 0;
+  TtsPlaybackSession? _activeSession;
+
+  // Device TTS state
+  int _currentChunkIndex = -1;
+
+  // Server TTS state
+  final List<_AudioChunk> _serverAudioBuffer = [];
+  int _serverCurrentIndex = -1;
+  bool _serverWaitingForNext = false;
+
+  // Event stream
+  final _eventController = StreamController<TtsEvent>.broadcast();
+
+  // Cached server default voice
+  String? _serverDefaultVoice;
+  Future<String?>? _serverDefaultVoiceFuture;
+
+  /// Stream of TTS events.
+  Stream<TtsEvent> get events => _eventController.stream;
+
+  /// Whether device TTS is available.
+  bool get deviceAvailable => _deviceEngineAvailable;
+
+  /// Whether server TTS is available.
+  bool get serverAvailable => _apiService != null;
+
+  /// Whether any TTS is available.
+  bool get isAvailable => _deviceEngineAvailable || serverAvailable;
+
+  /// Whether a session is currently active.
+  bool get isPlaying => _activeSession != null;
+
+  /// Current configuration.
+  TtsConfig get config => _config;
+
+  /// Sets the API service for server TTS.
+  void setApiService(ApiService? api) {
+    _apiService = api;
+  }
+
+  /// Updates the TTS configuration.
+  Future<void> updateConfig(TtsConfig config) async {
+    _config = config;
+
+    if (_tts != null && _ttsInitialized) {
+      await _tts!.setVolume(config.volume);
+      await _tts!.setSpeechRate(config.speechRate);
+      await _tts!.setPitch(config.pitch);
+
+      if (config.voice != null) {
+        await _setVoiceByName(config.voice);
+      }
+    }
+  }
+
+  /// Initializes the TTS engine.
+  ///
+  /// This must be called before any TTS operations.
+  Future<bool> initialize({TtsConfig? config}) async {
+    if (config != null) {
+      _config = config;
+    }
+
+    // Initialize FlutterTts
+    await _ensureTtsInitialized();
+
+    // Configure AudioPlayer for all platforms
+    if (!_playerConfigured) {
+      _player.onPlayerComplete.listen((_) => _onServerAudioComplete());
+      _player.onPlayerStateChanged.listen((state) {
+        if (state == PlayerState.playing) {
+          _emitEvent(const TtsStarted());
+        } else if (state == PlayerState.paused) {
+          _emitEvent(const TtsPaused());
+        }
+      });
+      // Android-specific audio context configuration
+      if (!kIsWeb && Platform.isAndroid) {
+        await _player.setAudioContext(
+          AudioContext(android: const AudioContextAndroid()),
+        );
+      }
+      _playerConfigured = true;
+    }
+
+    return isAvailable;
+  }
+
+  /// Speaks the given text.
+  ///
+  /// Returns the playback session. If another session is active, it will be
+  /// cancelled first.
+  Future<TtsPlaybackSession?> speak(String text, {bool? useServer}) async {
+    if (text.trim().isEmpty) {
+      return null;
+    }
+
+    // Cancel any existing session
+    await stop();
+
+    // Ensure TTS is initialized
+    await _ensureTtsInitialized();
+
+    // Determine whether to use server or device TTS
+    final shouldUseServer = useServer ?? _shouldUseServer();
+
+    // Split text into chunks
+    final chunks = splitTextForSpeech(text);
+    if (chunks.isEmpty) {
+      return null;
+    }
+
+    // Create new session
+    _sessionCounter++;
+    final session = TtsPlaybackSession._(
+      id: _sessionCounter,
+      chunks: chunks,
+      useServerTts: shouldUseServer,
+    );
+    _activeSession = session;
+
+    // Start playback
+    try {
+      if (shouldUseServer) {
+        await _startServerPlayback(session);
+      } else {
+        await _startDevicePlayback(session);
+      }
+      return session;
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+
+      // Try fallback to device TTS if server fails
+      if (shouldUseServer && _deviceEngineAvailable) {
+        try {
+          // Create a new session with useServerTts: false so device TTS
+          // handlers emit events correctly
+          final fallbackSession = TtsPlaybackSession._(
+            id: session.id,
+            chunks: session.chunks,
+            useServerTts: false,
+          );
+          _activeSession = fallbackSession;
+          await _startDevicePlayback(fallbackSession);
+          return fallbackSession;
+        } catch (e2) {
+          _emitEvent(TtsError(e2.toString()));
+        }
+      }
+
+      _activeSession = null;
+      return null;
+    }
+  }
+
+  /// Pauses the current playback.
+  Future<void> pause() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    try {
+      if (session.useServerTts) {
+        await _player.pause();
+      } else {
+        await _tts?.pause();
+      }
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  /// Resumes paused playback.
+  Future<void> resume() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    try {
+      if (session.useServerTts) {
+        await _player.resume();
+        _emitEvent(const TtsResumed());
+      } else {
+        // Device TTS resume is handled by the native handler
+      }
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  /// Stops the current playback.
+  Future<void> stop() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    _activeSession = null;
+    _resetPlaybackState();
+
+    try {
+      if (session.useServerTts) {
+        await _player.stop();
+      } else {
+        await _tts?.stop();
+      }
+      _emitEvent(const TtsCancelled());
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  /// Disposes the manager and releases resources.
+  Future<void> dispose() async {
+    await stop();
+    await _player.dispose();
+    await _eventController.close();
+  }
+
+  /// Splits text into chunks for TTS playback.
+  ///
+  /// This mirrors OpenWebUI's extractSentencesForAudio implementation.
+  List<String> splitTextForSpeech(String text) {
+    // 1. Preserve code blocks (replace with placeholders)
+    final codeBlocks = <String>[];
+    var processed = text;
+    var codeBlockIndex = 0;
+
+    final codeBlockRegex = RegExp(r'```[\s\S]*?```', multiLine: true);
+    processed = processed.replaceAllMapped(codeBlockRegex, (match) {
+      final placeholder = '\u0000$codeBlockIndex\u0000';
+      codeBlocks.add(match.group(0)!);
+      codeBlockIndex++;
+      return placeholder;
+    });
+
+    // 2. Split on sentence-ending punctuation: .!?
+    final sentences = processed
+        .split(RegExp(r'(?<=[.!?])\s+'))
+        .map((s) => s.trim())
+        .where((s) => s.isNotEmpty)
+        .toList();
+
+    // 3. Restore code blocks from placeholders
+    final restoredSentences = sentences
+        .map((sentence) {
+          return sentence.replaceAllMapped(RegExp(r'\u0000(\d+)\u0000'), (m) {
+            final idx = int.parse(m.group(1)!);
+            return idx < codeBlocks.length ? codeBlocks[idx] : '';
+          });
+        })
+        .where((s) => s.isNotEmpty)
+        .toList();
+
+    // 4. Merge short sentences (< 4 words OR < 50 chars)
+    final mergedChunks = <String>[];
+    for (final sentence in restoredSentences) {
+      if (mergedChunks.isEmpty) {
+        mergedChunks.add(sentence);
+      } else {
+        final lastIndex = mergedChunks.length - 1;
+        final previousText = mergedChunks[lastIndex];
+        final wordCount = previousText.split(RegExp(r'\s+')).length;
+        final charCount = previousText.length;
+
+        if (wordCount < 4 || charCount < 50) {
+          mergedChunks[lastIndex] = '$previousText $sentence';
+        } else {
+          mergedChunks.add(sentence);
+        }
+      }
+    }
+
+    return mergedChunks.isEmpty ? [text.trim()] : mergedChunks;
+  }
+
+  /// Gets available voices from the device TTS engine.
+  Future<List<Map<String, dynamic>>> getDeviceVoices() async {
+    await _ensureTtsInitialized();
+    if (_tts == null) return [];
+
+    try {
+      final voicesRaw = await _tts!.getVoices;
+      if (voicesRaw is! List) return [];
+
+      return voicesRaw
+          .whereType<Map>()
+          .map((e) => _normalizeVoiceEntry(e))
+          .where((e) => e.isNotEmpty)
+          .toList();
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+      return [];
+    }
+  }
+
+  /// Gets available voices from the server.
+  Future<List<Map<String, dynamic>>> getServerVoices() async {
+    if (_apiService == null) return [];
+
+    try {
+      final serverVoices = await _apiService!.getAvailableServerVoices();
+      return serverVoices
+          .map((v) {
+            final id = (v['id'] ?? v['name'] ?? '').toString();
+            final name = (v['name'] ?? v['id'] ?? '').toString();
+            final locale = (v['locale'] ?? v['language'] ?? '').toString();
+            return {'id': id, 'name': name, 'locale': locale};
+          })
+          .where((e) => e['name']?.toString().trim().isNotEmpty ?? false)
+          .toList();
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+      return [];
+    }
+  }
+
+  /// Preloads server default voice configuration.
+  Future<void> preloadServerDefaults() async {
+    if (_apiService == null) return;
+    try {
+      await _getServerDefaultVoice();
+    } catch (_) {}
+  }
+
+  /// Synthesizes a single text chunk to audio without playing it.
+  ///
+  /// This is used by [VoiceCallService] for its own audio playback pipeline.
+  /// Returns the audio bytes and mime type.
+  Future<({Uint8List bytes, String mimeType})> synthesizeChunk(
+    String text,
+  ) async {
+    if (_apiService == null) {
+      throw StateError('Server TTS is not available');
+    }
+    if (text.trim().isEmpty) {
+      throw ArgumentError('Cannot synthesize empty text');
+    }
+
+    final voice = await _resolveServerVoice();
+    final result = await _apiService!.generateSpeech(
+      text: text,
+      voice: voice,
+      speed: _config.speechRate,
+    );
+    return (bytes: result.bytes, mimeType: result.mimeType);
+  }
+
+  // ===========================================================================
+  // Private: Initialization
+  // ===========================================================================
+
+  Future<void> _ensureTtsInitialized() async {
+    if (_ttsInitialized) return;
+
+    // Prevent concurrent initialization
+    if (_initCompleter != null) {
+      await _initCompleter!.future;
+      return;
+    }
+
+    _initCompleter = Completer<void>();
+
+    try {
+      final tts = FlutterTts();
+      _tts = tts;
+
+      // Wait for native TTS to be fully initialized before setting handlers.
+      // The flutter_tts plugin has a bug where setting handlers during onInit
+      // causes ConcurrentModificationException.
+      await Future<void>.delayed(const Duration(milliseconds: 500));
+
+      if (!_handlersSet) {
+        _setupTtsHandlers(tts);
+        _handlersSet = true;
+      }
+
+      // Configure device engine
+      await _configureDeviceEngine();
+
+      _ttsInitialized = true;
+      _initCompleter!.complete();
+    } catch (e) {
+      _initCompleter!.completeError(e);
+      _initCompleter = null;
+      rethrow;
+    }
+  }
+
+  void _setupTtsHandlers(FlutterTts tts) {
+    tts.setStartHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _emitEvent(const TtsStarted());
+      }
+    });
+
+    tts.setCompletionHandler(() {
+      _onDeviceChunkComplete();
+    });
+
+    tts.setCancelHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _activeSession = null;
+        _resetPlaybackState();
+        _emitEvent(const TtsCancelled());
+      }
+    });
+
+    tts.setPauseHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _emitEvent(const TtsPaused());
+      }
+    });
+
+    tts.setContinueHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _emitEvent(const TtsResumed());
+      }
+    });
+
+    tts.setErrorHandler((msg) {
+      _emitEvent(TtsError(msg.toString()));
+    });
+
+    try {
+      tts.setProgressHandler((String text, int start, int end, String word) {
+        if (_activeSession != null && !_activeSession!.useServerTts) {
+          _emitEvent(TtsWordProgress(start, end));
+        }
+      });
+    } catch (_) {
+      // Some platforms may not support progress handler
+    }
+  }
+
+  Future<void> _configureDeviceEngine() async {
+    if (_tts == null) return;
+
+    _deviceEngineAvailable = false;
+    try {
+      // Set default engine on Android
+      if (!kIsWeb && Platform.isAndroid) {
+        try {
+          final engine = await _tts!.getDefaultEngine;
+          if (engine is String && engine.isNotEmpty) {
+            await _tts!.setEngine(engine);
+          }
+        } catch (_) {}
+      }
+
+      await _tts!.awaitSpeakCompletion(true);
+      await _tts!.setVolume(_config.volume);
+      await _tts!.setSpeechRate(_config.speechRate);
+      await _tts!.setPitch(_config.pitch);
+
+      if (!kIsWeb && Platform.isIOS) {
+        await _tts!.setSharedInstance(true);
+      }
+
+      _deviceEngineAvailable = true;
+    } catch (e) {
+      _deviceEngineAvailable = false;
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  // ===========================================================================
+  // Private: Device TTS Playback
+  // ===========================================================================
+
+  Future<void> _startDevicePlayback(TtsPlaybackSession session) async {
+    if (!_deviceEngineAvailable || _tts == null) {
+      throw StateError('Device TTS is not available');
+    }
+
+    _currentChunkIndex = 0;
+
+    // Configure voice if needed
+    if (!_voiceConfigured) {
+      await _configurePreferredVoice();
+    }
+
+    // Speak first chunk
+    _emitEvent(const TtsChunkStarted(0));
+    final result = await _tts!.speak(session.chunks.first);
+    if (result is int && result != 1) {
+      throw StateError('TTS engine returned error code $result');
+    }
+  }
+
+  void _onDeviceChunkComplete() {
+    final session = _activeSession;
+    if (session == null || session.useServerTts) return;
+
+    final nextIndex = _currentChunkIndex + 1;
+
+    // Check if there are more chunks
+    if (nextIndex >= session.chunks.length) {
+      _activeSession = null;
+      _resetPlaybackState();
+      _emitEvent(const TtsCompleted());
+      return;
+    }
+
+    // Play next chunk
+    _currentChunkIndex = nextIndex;
+    _emitEvent(TtsChunkStarted(nextIndex));
+
+    _tts?.speak(session.chunks[nextIndex]).then((result) {
+      if (result is int && result != 1) {
+        _emitEvent(TtsError('TTS engine returned error code $result'));
+      }
+    });
+  }
+
+  // ===========================================================================
+  // Private: Server TTS Playback
+  // ===========================================================================
+
+  Future<void> _startServerPlayback(TtsPlaybackSession session) async {
+    if (_apiService == null) {
+      throw StateError('Server TTS is not available');
+    }
+
+    _serverCurrentIndex = -1;
+    _serverAudioBuffer.clear();
+    _serverWaitingForNext = false;
+
+    final voice = await _resolveServerVoice();
+
+    // Fetch and play first chunk
+    final firstChunk = await _fetchServerAudio(session.chunks.first, voice);
+    if (_activeSession?.id != session.id) return; // Cancelled
+
+    _serverAudioBuffer.add(firstChunk);
+    _serverCurrentIndex = 0;
+
+    await _player.stop();
+    await _player.play(
+      BytesSource(firstChunk.bytes, mimeType: firstChunk.mimeType),
+    );
+    _emitEvent(const TtsChunkStarted(0));
+
+    // Prefetch remaining chunks in background
+    unawaited(_prefetchServerChunks(session, voice, 1));
+  }
+
+  Future<void> _prefetchServerChunks(
+    TtsPlaybackSession session,
+    String? voice,
+    int startIndex,
+  ) async {
+    for (var i = startIndex; i < session.chunks.length; i++) {
+      if (_activeSession?.id != session.id) return; // Cancelled
+
+      try {
+        final chunk = await _fetchServerAudio(session.chunks[i], voice);
+        if (_activeSession?.id != session.id) return;
+
+        _serverAudioBuffer.add(chunk);
+
+        // If player was waiting for this chunk, play it now
+        if (_serverWaitingForNext &&
+            _serverCurrentIndex + 1 < _serverAudioBuffer.length) {
+          _serverWaitingForNext = false;
+          await _playNextServerChunk();
+        }
+      } catch (e) {
+        _emitEvent(TtsError(e.toString()));
+      }
+    }
+  }
+
+  Future<_AudioChunk> _fetchServerAudio(String text, String? voice) async {
+    final result = await _apiService!.generateSpeech(
+      text: text,
+      voice: voice,
+      speed: _config.speechRate,
+    );
+    return _AudioChunk(bytes: result.bytes, mimeType: result.mimeType);
+  }
+
+  void _onServerAudioComplete() {
+    final session = _activeSession;
+    if (session == null || !session.useServerTts) return;
+
+    final nextIndex = _serverCurrentIndex + 1;
+
+    // Check if all chunks are done
+    if (nextIndex >= session.chunks.length) {
+      _activeSession = null;
+      _resetPlaybackState();
+      _emitEvent(const TtsCompleted());
+      return;
+    }
+
+    // Check if next chunk is buffered
+    if (nextIndex < _serverAudioBuffer.length) {
+      unawaited(_playNextServerChunk());
+    } else {
+      _serverWaitingForNext = true;
+    }
+  }
+
+  Future<void> _playNextServerChunk() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    final nextIndex = _serverCurrentIndex + 1;
+    if (nextIndex >= _serverAudioBuffer.length) return;
+
+    _serverCurrentIndex = nextIndex;
+    final chunk = _serverAudioBuffer[nextIndex];
+
+    await _player.play(BytesSource(chunk.bytes, mimeType: chunk.mimeType));
+    _emitEvent(TtsChunkStarted(nextIndex));
+  }
+
+  Future<String?> _resolveServerVoice() async {
+    final serverSelected = _config.serverVoice?.trim();
+    if (serverSelected != null && serverSelected.isNotEmpty) {
+      return serverSelected;
+    }
+    final selected = _config.voice?.trim();
+    if (selected != null && selected.isNotEmpty) {
+      return selected;
+    }
+    return await _getServerDefaultVoice();
+  }
+
+  Future<String?> _getServerDefaultVoice() async {
+    if (_apiService == null) return null;
+    if (_serverDefaultVoice != null) return _serverDefaultVoice;
+
+    if (_serverDefaultVoiceFuture != null) {
+      return _serverDefaultVoiceFuture;
+    }
+
+    _serverDefaultVoiceFuture = _apiService!.getDefaultServerVoice();
+    try {
+      final voice = await _serverDefaultVoiceFuture;
+      _serverDefaultVoice = voice?.trim();
+      return _serverDefaultVoice;
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+      return null;
+    } finally {
+      _serverDefaultVoiceFuture = null;
+    }
+  }
+
+  // ===========================================================================
+  // Private: Helpers
+  // ===========================================================================
+
+  bool _shouldUseServer() {
+    if (_config.preferServer && _apiService != null) {
+      return true;
+    }
+    if (_deviceEngineAvailable) {
+      return false;
+    }
+    return _apiService != null;
+  }
+
+  void _resetPlaybackState() {
+    _currentChunkIndex = -1;
+    _serverCurrentIndex = -1;
+    _serverAudioBuffer.clear();
+    _serverWaitingForNext = false;
+  }
+
+  void _emitEvent(TtsEvent event) {
+    if (!_eventController.isClosed) {
+      _eventController.add(event);
+    }
+  }
+
+  Future<void> _setVoiceByName(String? voiceName) async {
+    if (_tts == null || voiceName == null) return;
+    if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) return;
+
+    try {
+      final voicesRaw = await _tts!.getVoices;
+      if (voicesRaw is! List) return;
+
+      for (final entry in voicesRaw) {
+        if (entry is Map) {
+          final normalized = _normalizeVoiceEntry(entry);
+          final name = normalized['name'] as String?;
+          if (name == voiceName) {
+            await _tts!.setVoice(_voiceCommandFrom(normalized));
+            _voiceConfigured = true;
+            return;
+          }
+        }
+      }
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  Future<void> _configurePreferredVoice() async {
+    if (_voiceConfigured || _tts == null) return;
+    if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) {
+      _voiceConfigured = true;
+      return;
+    }
+
+    try {
+      // Try to use configured voice
+      if (_config.voice != null) {
+        await _setVoiceByName(_config.voice);
+        if (_voiceConfigured) return;
+      }
+
+      // Fall back to system default
+      _voiceConfigured = true;
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+      _voiceConfigured = true;
+    }
+  }
+
+  Map<String, dynamic> _normalizeVoiceEntry(Map<dynamic, dynamic> entry) {
+    final normalized = <String, dynamic>{};
+    entry.forEach((key, value) {
+      if (key != null) {
+        normalized[key.toString()] = value;
+      }
+    });
+    return normalized;
+  }
+
+  Map<String, String> _voiceCommandFrom(Map<String, dynamic> voice) {
+    final command = <String, String>{};
+    for (final key in [
+      'name',
+      'locale',
+      'identifier',
+      'id',
+      'voiceIdentifier',
+      'engine',
+    ]) {
+      final value = voice[key];
+      if (value != null) {
+        command[key] = value.toString();
+      }
+    }
+    return command;
+  }
+}
+
+// =============================================================================
+// Internal Types
+// =============================================================================
+
+class _AudioChunk {
+  const _AudioChunk({required this.bytes, required this.mimeType});
+  final Uint8List bytes;
+  final String mimeType;
+}