From 08d5de8a684703f5f9de5b03160a9c6206bc0dbe Mon Sep 17 00:00:00 2001
From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com>
Date: Mon, 8 Dec 2025 10:28:59 +0530
Subject: [PATCH] refactor(tts): Simplify text-to-speech service by migrating
 to TtsManager

---
 .../providers/text_to_speech_provider.dart    |   21 +-
 .../chat/services/text_to_speech_service.dart | 1179 ++---------------
 lib/features/chat/services/tts_manager.dart   |  916 +++++++++++++
 3 files changed, 1055 insertions(+), 1061 deletions(-)
 create mode 100644 lib/features/chat/services/tts_manager.dart

diff --git a/lib/features/chat/providers/text_to_speech_provider.dart b/lib/features/chat/providers/text_to_speech_provider.dart
index 7992596..759a19f 100644
--- a/lib/features/chat/providers/text_to_speech_provider.dart
+++ b/lib/features/chat/providers/text_to_speech_provider.dart
@@ -391,23 +391,12 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
 
   void _handleDeviceWordProgress(int start, int end) {
     if (!ref.mounted) return;
-    // Map global offsets to sentence index
-    final offsets = state.sentenceOffsets;
-    if (offsets.isEmpty) return;
-    int idx = 0;
-    for (var i = 0; i < offsets.length; i++) {
-      final sStart = offsets[i];
-      final sEnd = i + 1 < offsets.length ? offsets[i + 1] : 1 << 30;
-      if (start >= sStart && start < sEnd) {
-        idx = i;
-        break;
-      }
-    }
-    final sentenceStart = offsets[idx];
+    // Word progress offsets are relative to the current chunk/sentence being
+    // spoken, NOT the full original text. TtsChunkStarted already sets the
+    // correct activeSentenceIndex, so we only update word highlighting here.
     state = state.copyWith(
-      activeSentenceIndex: idx,
-      wordStartInSentence: (start - sentenceStart).clamp(0, 1 << 20),
-      wordEndInSentence: (end - sentenceStart).clamp(0, 1 << 20),
+      wordStartInSentence: start.clamp(0, 1 << 20),
+      wordEndInSentence: end.clamp(0, 1 << 20),
     );
   }
 }
diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart
index 831befc..ee7c036 100644
--- a/lib/features/chat/services/text_to_speech_service.dart
+++ b/lib/features/chat/services/text_to_speech_service.dart
@@ -1,44 +1,31 @@
 import 'dart:async';
-import 'dart:io' show Platform;
 
-import 'package:audioplayers/audioplayers.dart';
 import 'package:flutter/foundation.dart';
-import 'package:flutter/widgets.dart';
-import 'package:flutter_tts/flutter_tts.dart';
 
 import '../../../core/services/api_service.dart';
 import '../../../core/services/settings_service.dart';
+import 'tts_manager.dart';
 
-typedef _SpeechChunk = ({Uint8List bytes, String mimeType});
+export 'tts_manager.dart' show TtsEvent, TtsPlaybackSession;
 
-class SpeechAudioChunk {
-  const SpeechAudioChunk({required this.bytes, required this.mimeType});
-
-  final Uint8List bytes;
-  final String mimeType;
-}
-
-/// Lightweight wrapper around FlutterTts to centralize configuration
+/// Wrapper around [TtsManager] that provides a callback-based API.
+///
+/// This service is used by the [TextToSpeechController] and [VoiceCallService]
+/// to interact with TTS. It translates [TtsEvent]s from the manager into
+/// callbacks for backward compatibility.
 class TextToSpeechService {
-  final FlutterTts _tts = FlutterTts();
-  final AudioPlayer _player = AudioPlayer();
-  final ApiService? _api;
-  TtsEngine _engine = TtsEngine.device;
-  String? _preferredVoice;
-  String? _serverPreferredVoice;
-  double _speechRate = 0.5;
-  bool _initialized = false;
-  bool _available = false;
-  bool _voiceConfigured = false;
-  int _session = 0; // increments to cancel in-flight work
-  final List<_SpeechChunk> _buffered = <_SpeechChunk>[]; // server chunks
-  int _expectedChunks = 0;
-  int _currentIndex = -1;
-  bool _waitingNext = false;
-  bool _deviceEngineAvailable = false;
-  String? _serverDefaultVoice;
-  Future<String?>? _serverDefaultVoiceFuture;
+  TextToSpeechService({ApiService? api}) {
+    // Set the API service on the manager
+    TtsManager.instance.setApiService(api);
 
+    // Listen to TTS events and route to callbacks
+    _eventSubscription = TtsManager.instance.events.listen(_handleEvent);
+  }
+
+  StreamSubscription<TtsEvent>? _eventSubscription;
+  bool _initialized = false;
+
+  // Callbacks
   VoidCallback? _onStart;
   VoidCallback? _onComplete;
   VoidCallback? _onCancel;
@@ -48,111 +35,29 @@ class TextToSpeechService {
   void Function(int sentenceIndex)? _onSentenceIndex;
   void Function(int start, int end)? _onDeviceWordProgress;
 
+  /// Whether the service has been initialized.
   bool get isInitialized => _initialized;
-  bool get isAvailable => _available;
-  bool get deviceEngineAvailable => _deviceEngineAvailable;
-  bool get serverEngineAvailable => _api != null;
-  bool get prefersServerEngine => _shouldUseServer();
 
-  TextToSpeechService({ApiService? api}) : _api = api {
-    // Wire minimal player events to callbacks
-    _player.onPlayerComplete.listen((_) => _onAudioComplete());
-    _player.onPlayerStateChanged.listen((state) {
-      switch (state) {
-        case PlayerState.playing:
-          _handleStart();
-          break;
-        case PlayerState.paused:
-          _handlePause();
-          break;
-        default:
-          break;
-      }
-    });
+  /// Whether TTS is available.
+  bool get isAvailable => TtsManager.instance.isAvailable;
 
-    if (!kIsWeb && Platform.isAndroid) {
-      _player.setAudioContext(
-        AudioContext(
-          android: const AudioContextAndroid(),
-        ),
-      );
+  /// Whether device TTS is available.
+  bool get deviceEngineAvailable => TtsManager.instance.deviceAvailable;
+
+  /// Whether server TTS is available.
+  bool get serverEngineAvailable => TtsManager.instance.serverAvailable;
+
+  /// Whether server TTS is available and is preferred or the only engine.
+  bool get prefersServerEngine {
+    final config = TtsManager.instance.config;
+    if (config.preferServer && TtsManager.instance.serverAvailable) {
+      return true;
     }
+    return !TtsManager.instance.deviceAvailable &&
+        TtsManager.instance.serverAvailable;
   }
 
-  Future<void> _configureDeviceEngine({
-    required String? voice,
-    required double speechRate,
-    required double pitch,
-    required double volume,
-  }) async {
-    _deviceEngineAvailable = false;
-    try {
-      await _ensureAndroidDefaultEngine();
-      // Ensure speak() futures complete only after playback finishes.
-      // This avoids race conditions where completion callbacks fire
-      // early in release builds (especially on iOS), which can cause
-      // our voice-call pipeline to resume listening and cut off speech.
-      await _tts.awaitSpeakCompletion(true);
-      await _tts.setVolume(volume);
-      await _tts.setSpeechRate(speechRate);
-      await _tts.setPitch(pitch);
-
-      if (!kIsWeb && Platform.isIOS) {
-        await _tts.setSharedInstance(true);
-        // Rely on the native VoiceBackgroundAudioManager for iOS
-        // audio session configuration to avoid routing conflicts.
-      }
-
-      if (_engine != TtsEngine.server) {
-        await _setVoiceByName(_preferredVoice);
-      } else {
-        _voiceConfigured = false;
-      }
-
-      _deviceEngineAvailable = true;
-    } catch (e) {
-      _voiceConfigured = false;
-      _deviceEngineAvailable = false;
-      rethrow;
-    }
-  }
-
-  Future<void> _ensureAndroidDefaultEngine() async {
-    if (kIsWeb || !Platform.isAndroid) {
-      return;
-    }
-    try {
-      final engine = await _tts.getDefaultEngine;
-      if (engine is String && engine.isNotEmpty) {
-        await _tts.setEngine(engine);
-      }
-    } catch (e) {
-      _onError?.call(e.toString());
-    }
-  }
-
-  bool _computeAvailability() {
-    final serverAvailable = _api != null;
-    switch (_engine) {
-      case TtsEngine.device:
-        return _deviceEngineAvailable || serverAvailable;
-      case TtsEngine.server:
-        return serverAvailable;
-    }
-  }
-
-  bool _shouldUseServer() {
-    if (_engine == TtsEngine.server) {
-      return _api != null;
-    }
-    // Device preference with graceful fallback to server if available.
-    if (_deviceEngineAvailable) {
-      return false;
-    }
-    return _api != null;
-  }
-
-  /// Register callbacks for TTS lifecycle events
+  /// Registers callbacks for TTS lifecycle events.
   void bindHandlers({
     VoidCallback? onStart,
     VoidCallback? onComplete,
@@ -171,23 +76,9 @@ class TextToSpeechService {
     _onError = onError;
     _onSentenceIndex = onSentenceIndex;
     _onDeviceWordProgress = onDeviceWordProgress;
-
-    _tts.setStartHandler(_handleStart);
-    _tts.setCompletionHandler(_handleComplete);
-    _tts.setCancelHandler(_handleCancel);
-    _tts.setPauseHandler(_handlePause);
-    _tts.setContinueHandler(_handleContinue);
-    _tts.setErrorHandler(_handleError);
-    try {
-      _tts.setProgressHandler((String text, int start, int end, String word) {
-        _onDeviceWordProgress?.call(start, end);
-      });
-    } catch (_) {
-      // Some platforms may not support progress handler
-    }
   }
 
-  /// Initialize the native TTS engine lazily
+  /// Initializes the TTS engine.
   Future<bool> initialize({
     String? deviceVoice,
     String? serverVoice,
@@ -197,959 +88,157 @@ class TextToSpeechService {
     TtsEngine engine = TtsEngine.device,
   }) async {
     if (_initialized) {
-      _engine = engine;
-      _speechRate = speechRate;
-      if (deviceVoice != null) {
-        _preferredVoice = deviceVoice;
-        _voiceConfigured = false;
-      }
-      if (serverVoice != null) {
-        _serverPreferredVoice = serverVoice;
-      }
-      _available = _computeAvailability();
-      return _available;
-    }
-
-    _engine = engine;
-    _speechRate = speechRate;
-    _preferredVoice = deviceVoice;
-    _serverPreferredVoice = serverVoice;
-    _voiceConfigured = false;
-
-    if (_engine != TtsEngine.server || _api == null) {
-      try {
-        await _configureDeviceEngine(
+      // Update config if already initialized
+      await TtsManager.instance.updateConfig(
+        TtsConfig(
           voice: deviceVoice,
+          serverVoice: serverVoice,
           speechRate: speechRate,
           pitch: pitch,
           volume: volume,
-        );
-      } catch (e) {
-        if (_engine == TtsEngine.device) {
-          _available = false;
-          _onError?.call(e.toString());
-          _initialized = true;
-          return _available;
-        }
-      }
-    } else {
-      _deviceEngineAvailable = false;
-      try {
-        await _tts.awaitSpeakCompletion(false);
-        await _tts.setVolume(volume);
-        await _tts.setSpeechRate(speechRate);
-        await _tts.setPitch(pitch);
-      } catch (_) {}
+          preferServer: engine == TtsEngine.server,
+        ),
+      );
+      return isAvailable;
     }
 
-    _available = _computeAvailability();
+    final available = await TtsManager.instance.initialize(
+      config: TtsConfig(
+        voice: deviceVoice,
+        serverVoice: serverVoice,
+        speechRate: speechRate,
+        pitch: pitch,
+        volume: volume,
+        preferServer: engine == TtsEngine.server,
+      ),
+    );
+
     _initialized = true;
-    return _available;
+    return available;
   }
 
+  /// Speaks the given text.
   Future<void> speak(String text) async {
     if (text.trim().isEmpty) {
       throw ArgumentError('Cannot speak empty text');
     }
 
     if (!_initialized) {
-      await initialize(
-        deviceVoice: _preferredVoice,
-        serverVoice: _serverPreferredVoice,
-        engine: _engine,
-      );
+      await initialize();
     }
 
-    final bool useServer = _shouldUseServer();
-
-    if (useServer) {
-      if (_api == null) {
-        if (_deviceEngineAvailable) {
-          await _speakOnDevice(text);
-          return;
-        }
-        throw StateError('Server text-to-speech is unavailable');
-      }
-      // Server-backed TTS with sentence chunking & queued playback
-      try {
-        await _startServerChunkedPlayback(text);
-      } catch (e) {
-        _onError?.call(e.toString());
-        if (_deviceEngineAvailable) {
-          await _speakOnDevice(text);
-        } else {
-          throw StateError('Server text-to-speech failed: $e');
-        }
-      }
-      return;
-    }
-
-    // Device TTS path
-    await _speakOnDevice(text);
-  }
-
-  Future<void> _speakOnDevice(String text) async {
-    if (!_deviceEngineAvailable) {
-      throw StateError('Device text-to-speech is unavailable');
-    }
-    await _tts.stop();
-    if (!_voiceConfigured) {
-      await _configurePreferredVoice();
-    }
-    final result = await _tts.speak(text);
-    if (result is int && result != 1) {
-      _onError?.call('Text-to-speech engine returned code $result');
-    }
-    _onSentenceIndex?.call(0);
-  }
-
-  Future<SpeechAudioChunk> synthesizeServerSpeechChunk(String text) async {
-    if (text.trim().isEmpty) {
-      throw ArgumentError('Cannot synthesize empty text');
-    }
-    if (_api == null) {
-      throw StateError('Server text-to-speech is unavailable');
-    }
-    if (!_initialized) {
-      await initialize(
-        deviceVoice: _preferredVoice,
-        serverVoice: _serverPreferredVoice,
-        engine: _engine,
-      );
-    }
-    final voice = await _resolveServerVoice();
-    final chunk = await _api.generateSpeech(
-      text: text,
-      voice: voice,
-      speed: _speechRate,
-    );
-    return SpeechAudioChunk(bytes: chunk.bytes, mimeType: chunk.mimeType);
+    await TtsManager.instance.speak(text);
   }
 
+  /// Pauses the current playback.
   Future<void> pause() async {
-    if (!_initialized) return;
-    try {
-      if (_shouldUseServer()) {
-        await _player.pause();
-        _handlePause();
-      } else if (_deviceEngineAvailable) {
-        await _tts.pause();
-      }
-    } catch (e) {
-      _onError?.call(e.toString());
-    }
+    await TtsManager.instance.pause();
   }
 
+  /// Resumes paused playback.
   Future<void> resume() async {
-    if (!_initialized) return;
-    try {
-      if (_shouldUseServer()) {
-        if (_waitingNext && (_currentIndex + 1) < _buffered.length) {
-          _waitingNext = false;
-          await _playNextIfBuffered(_session);
-        } else {
-          await _player.resume();
-        }
-      }
-    } catch (e) {
-      _onError?.call(e.toString());
-    }
+    await TtsManager.instance.resume();
   }
 
+  /// Stops the current playback.
   Future<void> stop() async {
-    if (!_initialized) {
-      return;
-    }
-
-    try {
-      // Cancel any in-flight server work
-      _session++;
-      _buffered.clear();
-      _expectedChunks = 0;
-      _currentIndex = -1;
-      _waitingNext = false;
-      if (_shouldUseServer()) {
-        await _player.stop();
-        _handleCancel();
-      } else {
-        await _tts.stop();
-      }
-    } catch (e) {
-      _onError?.call(e.toString());
-    }
+    await TtsManager.instance.stop();
   }
 
+  /// Cancels the event subscription; does not stop manager playback.
   Future<void> dispose() async {
-    await stop();
-    await _player.dispose();
+    await _eventSubscription?.cancel();
+    _eventSubscription = null;
   }
 
-  /// Update TTS settings on-the-fly
+  /// Updates TTS settings.
   Future<void> updateSettings({
-    Object? voice = const _VoiceNotProvided(),
-    Object? serverVoice = const _VoiceNotProvided(),
+    Object? voice = const _NotProvided(),
+    Object? serverVoice = const _NotProvided(),
     double? speechRate,
     double? pitch,
     double? volume,
     TtsEngine? engine,
   }) async {
-    final voiceProvided = voice is! _VoiceNotProvided;
-    final serverVoiceProvided = serverVoice is! _VoiceNotProvided;
-    final voiceValue = voiceProvided ? voice as String? : null;
-    final serverVoiceValue = serverVoiceProvided
-        ? serverVoice as String?
-        : null;
-    if (!_initialized || !_available) {
-      // Allow engine and voice to update before init
-      if (engine != null) _engine = engine;
-      if (voiceProvided) _preferredVoice = voiceValue;
-      if (serverVoiceProvided) _serverPreferredVoice = serverVoiceValue;
-      if (speechRate != null) _speechRate = speechRate;
-      return;
-    }
+    final current = TtsManager.instance.config;
 
-    try {
-      if (engine != null) {
-        _engine = engine;
-      }
-      if (voiceProvided) {
-        _preferredVoice = voiceValue;
-      }
-      if (serverVoiceProvided) {
-        _serverPreferredVoice = serverVoiceValue;
-      }
-      if (volume != null) {
-        await _tts.setVolume(volume);
-      }
-      if (speechRate != null) {
-        _speechRate = speechRate;
-        await _tts.setSpeechRate(speechRate);
-      }
-      if (pitch != null) {
-        await _tts.setPitch(pitch);
-      }
-      // Set specific voice by name on device-capable engines
-      if (_engine != TtsEngine.server && voiceProvided) {
-        await _setVoiceByName(_preferredVoice);
-      }
-    } catch (e) {
-      _onError?.call(e.toString());
-    }
-
-    _available = _computeAvailability();
-  }
-
-  /// Set voice by name, or use system default if null
-  Future<void> _setVoiceByName(String? voiceName) async {
-    if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) {
-      return;
-    }
-
-    try {
-      if (voiceName == null) {
-        // Use system default - reset voice configuration
-        _voiceConfigured = false;
-        await _configurePreferredVoice();
-        return;
-      }
-
-      // Get all available voices
-      final voicesRaw = await _tts.getVoices;
-      if (voicesRaw is! List) {
-        return;
-      }
-
-      // Find the voice by name
-      Map<String, dynamic>? targetVoice;
-      for (final entry in voicesRaw) {
-        if (entry is Map) {
-          final normalized = _normalizeVoiceEntry(entry);
-          final name = normalized['name'] as String?;
-          if (name == voiceName) {
-            targetVoice = normalized;
-            break;
-          }
-        }
-      }
-
-      // Set the voice if found
-      if (targetVoice != null) {
-        await _tts.setVoice(_voiceCommandFrom(targetVoice));
-        _voiceConfigured = true;
-      } else {
-        // Voice not found, fall back to default
-        _voiceConfigured = false;
-        await _configurePreferredVoice();
-      }
-    } catch (e) {
-      _onError?.call(e.toString());
-    }
-  }
-
-  /// Get available voices from the TTS engine
-  Future<List<Map<String, dynamic>>> getAvailableVoices() async {
-    if (!_initialized) {
-      await initialize(
-        deviceVoice: _preferredVoice,
-        serverVoice: _serverPreferredVoice,
-        engine: _engine,
-      );
-    }
-
-    if (_engine == TtsEngine.server && _api != null) {
-      try {
-        final serverVoices = await _api.getAvailableServerVoices();
-        final mapped = serverVoices
-            .map((v) {
-              final id = (v['id'] ?? v['name'] ?? '').toString();
-              final name = (v['name'] ?? v['id'] ?? '').toString();
-              final localeValue = (v['locale'] ?? v['language'] ?? '')
-                  .toString();
-              return {'id': id, 'name': name, 'locale': localeValue};
-            })
-            .where((entry) {
-              final name = entry['name'];
-              return name is String && name.trim().isNotEmpty;
-            })
-            .toList();
-
-        final defaultVoice = await _getServerDefaultVoice();
-        if (defaultVoice != null && defaultVoice.isNotEmpty) {
-          final normalized = defaultVoice.toLowerCase();
-          final exists = mapped.any((voice) {
-            final name = voice['name'];
-            final id = voice['id'];
-            final lowerName = name is String ? name.toLowerCase() : '';
-            final lowerId = id is String ? id.toLowerCase() : '';
-            return lowerName == normalized || lowerId == normalized;
-          });
-          if (!exists) {
-            mapped.insert(0, {
-              'id': defaultVoice,
-              'name': defaultVoice,
-              'locale': '',
-            });
-          }
-        }
-
-        if (mapped.isEmpty) {
-          if (defaultVoice != null && defaultVoice.isNotEmpty) {
-            return [
-              {'id': defaultVoice, 'name': defaultVoice, 'locale': ''},
-            ];
-          }
-          return const [];
-        }
-        return mapped;
-      } catch (e) {
-        _onError?.call(e.toString());
-        // Fall back to device voices
-      }
-    }
-
-    if (!_available) {
-      return [];
-    }
-
-    try {
-      final voicesRaw = await _tts.getVoices;
-      if (voicesRaw is! List) {
-        return [];
-      }
-
-      final parsedVoices = <Map<String, dynamic>>[];
-      for (final entry in voicesRaw) {
-        if (entry is Map) {
-          final normalized = _normalizeVoiceEntry(entry);
-          if (normalized.isNotEmpty) {
-            parsedVoices.add(normalized);
-          }
-        }
-      }
-
-      return parsedVoices;
-    } catch (e) {
-      _onError?.call(e.toString());
-      return [];
-    }
-  }
-
-  Future<String?> _resolveServerVoice() async {
-    final serverSelected = _serverPreferredVoice?.trim();
-    if (serverSelected != null && serverSelected.isNotEmpty) {
-      return serverSelected;
-    }
-    final selected = _preferredVoice?.trim();
-    if (selected != null && selected.isNotEmpty) {
-      return selected;
-    }
-    final configVoice = await _getServerDefaultVoice();
-    if (configVoice != null && configVoice.isNotEmpty) {
-      return configVoice;
-    }
-    return null;
-  }
-
-  Future<String?> _getServerDefaultVoice() async {
-    if (_api == null) {
-      return null;
-    }
-    if (_serverDefaultVoice != null) {
-      return _serverDefaultVoice;
-    }
-    final pending = _serverDefaultVoiceFuture;
-    if (pending != null) {
-      return pending;
-    }
-
-    final future = _api.getDefaultServerVoice();
-    _serverDefaultVoiceFuture = future;
-
-    try {
-      final voice = await future;
-      final trimmed = voice?.trim();
-      if (trimmed != null && trimmed.isNotEmpty) {
-        _serverDefaultVoice = trimmed;
-        return _serverDefaultVoice;
-      }
-      return null;
-    } catch (e) {
-      _onError?.call(e.toString());
-      return null;
-    } finally {
-      _serverDefaultVoiceFuture = null;
-    }
-  }
-
-  Future<void> preloadServerDefaults() async {
-    if (_api == null) {
-      return;
-    }
-    try {
-      await _getServerDefaultVoice();
-    } catch (_) {}
-  }
-
-  // ===== Server chunked playback =====
-
-  Future<void> _startServerChunkedPlayback(String text) async {
-    final resolvedVoice = await _resolveServerVoice();
-    final effectiveVoice = resolvedVoice;
-
-    // Reset queue and create a new session
-    _session++;
-    final session = _session;
-    _buffered.clear();
-    _expectedChunks = 0;
-    _currentIndex = -1;
-    _waitingNext = false;
-
-    final chunks = _splitForTts(text);
-    if (chunks.isEmpty) return;
-    _expectedChunks = chunks.length;
-
-    // Fetch first chunk to start playback quickly
-    final firstChunk = await _fetchServerAudio(
-      chunks.first,
-      effectiveVoice,
-      session,
-    );
-    if (session != _session) return; // canceled
-    if (firstChunk.bytes.isEmpty) {
-      throw Exception('Empty audio response');
-    }
-
-    await _player.stop();
-    final bufferedFirst = _cloneChunk(firstChunk);
-    _buffered.add(bufferedFirst);
-    _currentIndex = 0;
-    await _player.play(
-      BytesSource(bufferedFirst.bytes, mimeType: bufferedFirst.mimeType),
-    );
-    _onSentenceIndex?.call(0);
-
-    // Prefetch the rest in background
-    unawaited(
-      _prefetchRemainingChunks(
-        chunks.skip(1).toList(),
-        effectiveVoice,
-        session,
+    await TtsManager.instance.updateConfig(
+      TtsConfig(
+        voice: voice is _NotProvided ? current.voice : voice as String?,
+        serverVoice: serverVoice is _NotProvided
+            ? current.serverVoice
+            : serverVoice as String?,
+        speechRate: speechRate ?? current.speechRate,
+        pitch: pitch ?? current.pitch,
+        volume: volume ?? current.volume,
+        preferServer: engine != null
+            ? engine == TtsEngine.server
+            : current.preferServer,
       ),
     );
   }
 
-  Future<void> _prefetchRemainingChunks(
-    List<String> remaining,
-    String? voice,
-    int session,
-  ) async {
-    for (final chunk in remaining) {
-      if (session != _session) return; // canceled
-      try {
-        final audioChunk = await _fetchServerAudio(chunk, voice, session);
-        if (session != _session) return;
-        if (audioChunk.bytes.isNotEmpty) {
-          _buffered.add(_cloneChunk(audioChunk));
-          // If the player finished the previous chunk and is waiting, start now
-          if (_waitingNext && (_currentIndex + 1) < _buffered.length) {
-            _waitingNext = false;
-            await _playNextIfBuffered(session);
-          }
-        }
-      } catch (e) {
-        _onError?.call(e.toString());
-        // continue with other chunks
-      }
+  /// Gets server voices if server TTS is preferred and available, else device.
+  Future<List<Map<String, dynamic>>> getAvailableVoices() async {
+    if (!_initialized) {
+      await initialize();
     }
+
+    final config = TtsManager.instance.config;
+    if (config.preferServer && TtsManager.instance.serverAvailable) {
+      return TtsManager.instance.getServerVoices();
+    }
+
+    return TtsManager.instance.getDeviceVoices();
   }
 
-  Future<_SpeechChunk> _fetchServerAudio(
-    String text,
-    String? voice,
-    int session,
-  ) async {
-    return await _api!.generateSpeech(
-      text: text,
-      voice: voice,
-      speed: _speechRate,
-    );
+  /// Splits text into chunks for TTS playback.
+  List<String> splitTextForSpeech(String text) {
+    return TtsManager.instance.splitTextForSpeech(text);
   }
 
-  /// Splits [text] into the chunks used for playback sequencing.
-  ///
-  /// This mirrors the server-side streaming behavior so UI consumers can stay
-  /// in sync with sentence indices reported during playback.
-  List<String> splitTextForSpeech(String text) => _splitForTts(text);
-
-  Future<void> _onAudioComplete() async {
-    final session = _session;
-    // If there are more expected chunks
-    if ((_currentIndex + 1) < _expectedChunks) {
-      // If next chunk is already buffered, play it
-      if ((_currentIndex + 1) < _buffered.length) {
-        await _playNextIfBuffered(session);
-      } else {
-        // Wait for prefetch to provide it
-        _waitingNext = true;
-      }
-      return;
-    }
-    // No more chunks – this is the real completion
-    _handleComplete();
+  /// Preloads server default voice configuration.
+  Future<void> preloadServerDefaults() async {
+    await TtsManager.instance.preloadServerDefaults();
   }
 
-  Future<void> _playNextIfBuffered(int session) async {
-    if (session != _session) return;
-    final nextIndex = _currentIndex + 1;
-    if (nextIndex < 0 || nextIndex >= _buffered.length) return;
-    _currentIndex = nextIndex;
-    final chunk = _buffered[nextIndex];
-    await _player.play(BytesSource(chunk.bytes, mimeType: chunk.mimeType));
-    _onSentenceIndex?.call(_currentIndex);
+  /// Synthesizes a single chunk of text to audio (server TTS only).
+  Future<SpeechAudioChunk> synthesizeServerSpeechChunk(String text) async {
+    final result = await TtsManager.instance.synthesizeChunk(text);
+    return SpeechAudioChunk(bytes: result.bytes, mimeType: result.mimeType);
   }
 
-  _SpeechChunk _cloneChunk(_SpeechChunk chunk) {
-    return (bytes: Uint8List.fromList(chunk.bytes), mimeType: chunk.mimeType);
-  }
-
-  List<String> _splitForTts(String text) {
-    // Mirrors OpenWebUI's extractSentencesForAudio implementation
-    // See: src/lib/utils/index.ts lines 953-970, 907-928
-
-    // 1. Preserve code blocks (replace with placeholders)
-    final codeBlocks = <String>[];
-    var processed = text;
-    var codeBlockIndex = 0;
-
-    // Match triple backticks code blocks
-    final codeBlockRegex = RegExp(r'```[\s\S]*?```', multiLine: true);
-    processed = processed.replaceAllMapped(codeBlockRegex, (match) {
-      final placeholder = '\u0000$codeBlockIndex\u0000';
-      codeBlocks.add(match.group(0)!);
-      codeBlockIndex++;
-      return placeholder;
-    });
-
-    // 2. Split on sentence-ending punctuation: .!?
-    // OpenWebUI uses: /(?<=[.!?])\s+/
-    final sentences = processed
-        .split(RegExp(r'(?<=[.!?])\s+'))
-        .map((s) => s.trim())
-        .where((s) => s.isNotEmpty)
-        .toList();
-
-    // 3. Restore code blocks from placeholders
-    final restoredSentences = sentences
-        .map((sentence) {
-          return sentence.replaceAllMapped(RegExp(r'\u0000(\d+)\u0000'), (
-            match,
-          ) {
-            final idx = int.parse(match.group(1)!);
-            return idx < codeBlocks.length ? codeBlocks[idx] : '';
-          });
-        })
-        .where((s) => s.isNotEmpty)
-        .toList();
-
-    // 4. Merge short sentences (< 4 words OR < 50 chars)
-    // OpenWebUI logic from extractSentencesForAudio
-    final mergedChunks = <String>[];
-    for (final sentence in restoredSentences) {
-      if (mergedChunks.isEmpty) {
-        mergedChunks.add(sentence);
-      } else {
-        final lastIndex = mergedChunks.length - 1;
-        final previousText = mergedChunks[lastIndex];
-        final wordCount = previousText.split(RegExp(r'\s+')).length;
-        final charCount = previousText.length;
-
-        // Merge if previous chunk is too short
-        if (wordCount < 4 || charCount < 50) {
-          mergedChunks[lastIndex] = '$previousText $sentence';
-        } else {
-          mergedChunks.add(sentence);
-        }
-      }
+  void _handleEvent(TtsEvent event) {
+    switch (event) {
+      case TtsStarted():
+        _onStart?.call();
+      case TtsChunkStarted(:final chunkIndex):
+        _onSentenceIndex?.call(chunkIndex);
+      case TtsWordProgress(:final start, :final end):
+        _onDeviceWordProgress?.call(start, end);
+      case TtsCompleted():
+        _onComplete?.call();
+      case TtsCancelled():
+        _onCancel?.call();
+      case TtsPaused():
+        _onPause?.call();
+      case TtsResumed():
+        _onContinue?.call();
+      case TtsError(:final message):
+        _onError?.call(message);
     }
-
-    return mergedChunks.isEmpty ? [text.trim()] : mergedChunks;
-  }
-
-  Future<void> _configurePreferredVoice() async {
-    if (_voiceConfigured) {
-      return;
-    }
-    if (kIsWeb || (!Platform.isIOS && !Platform.isAndroid)) {
-      _voiceConfigured = true;
-      return;
-    }
-
-    var configured = false;
-    try {
-      await _ensureAndroidDefaultEngine();
-      Map<String, dynamic>? defaultVoice;
-      bool voiceSet = false;
-
-      if (Platform.isIOS) {
-        try {
-          final rawDefault = await _tts.getDefaultVoice;
-          if (rawDefault is Map) {
-            defaultVoice = _normalizeVoiceEntry(rawDefault);
-            await _tts.setVoice(_voiceCommandFrom(defaultVoice));
-            configured = true;
-            voiceSet = true;
-          }
-        } catch (_) {
-          defaultVoice = null;
-        }
-      }
-
-      if (voiceSet) {
-        return;
-      }
-
-      final voicesRaw = await _tts.getVoices;
-      if (voicesRaw is! List) {
-        return;
-      }
-
-      final parsedVoices = <Map<String, dynamic>>[];
-      for (final entry in voicesRaw) {
-        if (entry is Map) {
-          final normalized = _normalizeVoiceEntry(entry);
-          if (normalized.isNotEmpty) {
-            parsedVoices.add(normalized);
-          }
-        }
-      }
-
-      if (parsedVoices.isEmpty) {
-        return;
-      }
-
-      final localeTag = WidgetsBinding.instance.platformDispatcher.locale
-          .toLanguageTag()
-          .toLowerCase();
-      final preferred = _selectPreferredVoice(
-        parsedVoices,
-        localeTag,
-        defaultVoice: defaultVoice,
-      );
-      if (preferred == null) {
-        if (Platform.isIOS) {
-          configured = true; // Allow system default voice to be used
-        }
-        return;
-      }
-
-      await _tts.setVoice(_voiceCommandFrom(preferred));
-      configured = true;
-    } catch (e) {
-      _onError?.call(e.toString());
-    } finally {
-      _voiceConfigured = configured || _voiceConfigured;
-    }
-  }
-
-  Map<String, dynamic> _normalizeVoiceEntry(Map<dynamic, dynamic> entry) {
-    final normalized = <String, dynamic>{};
-    entry.forEach((key, value) {
-      if (key != null) {
-        normalized[key.toString()] = value;
-      }
-    });
-    return normalized;
-  }
-
-  Map<String, String> _voiceCommandFrom(Map<String, dynamic> voice) {
-    final command = <String, String>{};
-    for (final key in [
-      'name',
-      'locale',
-      'identifier',
-      'id',
-      'voiceIdentifier',
-      'engine',
-    ]) {
-      final value = voice[key];
-      if (value != null) {
-        command[key] = value.toString();
-      }
-    }
-    if (!command.containsKey('name') && voice['name'] != null) {
-      command['name'] = voice['name'].toString();
-    }
-    if (!command.containsKey('locale') && voice['locale'] != null) {
-      command['locale'] = voice['locale'].toString();
-    }
-    return command;
-  }
-
-  int _iosVoiceScore(Map<String, dynamic> voice) {
-    final identifier =
-        voice['identifier']?.toString().toLowerCase() ??
-        voice['id']?.toString().toLowerCase() ??
-        '';
-    final name = voice['name']?.toString().toLowerCase() ?? '';
-
-    int score = 0;
-    if (identifier.contains('premium')) {
-      score += 400;
-    } else if (identifier.contains('enhanced')) {
-      score += 250;
-    } else if (identifier.contains('compact')) {
-      score += 50;
-    }
-
-    if (identifier.contains('siri') || name.contains('siri')) {
-      score += 150;
-    }
-
-    if (identifier.contains('female') || name.contains('female')) {
-      score += 15;
-    }
-    if (identifier.contains('male') || name.contains('male')) {
-      score += 10;
-    }
-
-    // Prefer non-compact by default when no other hints are present
-    if (!identifier.contains('compact')) {
-      score += 25;
-    }
-
-    return score;
-  }
-
-  Map<String, dynamic>? _selectPreferredVoice(
-    List<Map<String, dynamic>> voices,
-    String localeTag, {
-    Map<String, dynamic>? defaultVoice,
-  }) {
-    Map<String, dynamic>? matchesLocale(Iterable<Map<String, dynamic>> input) {
-      for (final voice in input) {
-        final locale = voice['locale']?.toString().toLowerCase();
-        if (locale == null) continue;
-        if (locale == localeTag) {
-          return voice;
-        }
-        final localePrimary = locale.split(RegExp('[-_]')).first;
-        final tagPrimary = localeTag.split(RegExp('[-_]')).first;
-        if (localePrimary == tagPrimary) {
-          return voice;
-        }
-      }
-      return null;
-    }
-
-    Map<String, dynamic>? matchDefaultVoice() {
-      final dv = defaultVoice;
-      if (dv == null) {
-        return null;
-      }
-
-      final identifiers = <String>{};
-      for (final key in ['identifier', 'id', 'voiceIdentifier', 'voice']) {
-        final value = dv[key]?.toString();
-        if (value != null && value.isNotEmpty) {
-          identifiers.add(value.toLowerCase());
-        }
-      }
-
-      if (identifiers.isNotEmpty) {
-        for (final voice in voices) {
-          for (final key in ['identifier', 'id', 'voiceIdentifier', 'voice']) {
-            final value = voice[key]?.toString();
-            if (value != null && identifiers.contains(value.toLowerCase())) {
-              return voice;
-            }
-          }
-        }
-      }
-
-      final defaultName = dv['name']?.toString();
-      final defaultLocale = dv['locale']?.toString();
-      if (defaultName != null && defaultLocale != null) {
-        final lowerName = defaultName.toLowerCase();
-        final lowerLocale = defaultLocale.toLowerCase();
-        for (final voice in voices) {
-          final name = voice['name']?.toString();
-          final locale = voice['locale']?.toString();
-          if (name != null &&
-              locale != null &&
-              name.toLowerCase() == lowerName &&
-              locale.toLowerCase() == lowerLocale) {
-            return voice;
-          }
-        }
-      }
-
-      return null;
-    }
-
-    Map<String, dynamic>? pickIosVoice() {
-      final userDefault = matchDefaultVoice();
-      if (userDefault != null) {
-        return userDefault;
-      }
-
-      final siriCandidates = voices.where((voice) {
-        final name = voice['name']?.toString().toLowerCase() ?? '';
-        final identifier = voice['identifier']?.toString().toLowerCase() ?? '';
-        final voiceId = voice['id']?.toString().toLowerCase() ?? '';
-        return name.contains('siri') ||
-            identifier.contains('siri') ||
-            voiceId.contains('siri');
-      }).toList();
-
-      if (siriCandidates.isNotEmpty) {
-        siriCandidates.sort((a, b) => _iosVoiceScore(b) - _iosVoiceScore(a));
-        final localeMatch = matchesLocale(siriCandidates);
-        if (localeMatch != null) {
-          return localeMatch;
-        }
-        return siriCandidates.first;
-      }
-
-      final ranked = [...voices];
-      ranked.sort((a, b) => _iosVoiceScore(b) - _iosVoiceScore(a));
-      final localeMatch = matchesLocale(ranked);
-      if (localeMatch != null) {
-        return localeMatch;
-      }
-      return ranked.isNotEmpty ? ranked.first : null;
-    }
-
-    Map<String, dynamic>? pickAndroidVoice() {
-      int qualityScore(String? quality) {
-        switch ((quality ?? '').toLowerCase()) {
-          case 'very_high':
-          case 'very-high':
-            return 3;
-          case 'high':
-            return 2;
-          case 'normal':
-            return 1;
-          default:
-            return 0;
-        }
-      }
-
-      final preferredEngineVoices = voices
-          .where(
-            (voice) =>
-                (voice['engine']?.toString() ?? '').toLowerCase().contains(
-                  'google',
-                ) ||
-                voice['engine'] is! String,
-          )
-          .toList();
-
-      preferredEngineVoices.sort((a, b) {
-        final qualityDiff =
-            qualityScore(b['quality']?.toString()) -
-            qualityScore(a['quality']?.toString());
-        if (qualityDiff != 0) {
-          return qualityDiff;
-        }
-        final latencyA = a['latency']?.toString() ?? '';
-        final latencyB = b['latency']?.toString() ?? '';
-        return latencyA.compareTo(latencyB);
-      });
-
-      final ordered = preferredEngineVoices.isEmpty
-          ? voices
-          : preferredEngineVoices;
-      return matchesLocale(ordered) ?? matchesLocale(voices);
-    }
-
-    Map<String, dynamic>? selected;
-    if (Platform.isIOS) {
-      selected = pickIosVoice();
-    } else if (Platform.isAndroid) {
-      selected = pickAndroidVoice();
-    }
-
-    if (selected == null) {
-      return null;
-    }
-
-    final name = selected['name']?.toString();
-    final locale = selected['locale']?.toString();
-    if (name == null || locale == null) {
-      return null;
-    }
-
-    return selected;
-  }
-
-  void _handleStart() {
-    _onStart?.call();
-  }
-
-  void _handleComplete() {
-    _onComplete?.call();
-  }
-
-  void _handleCancel() {
-    _onCancel?.call();
-  }
-
-  void _handlePause() {
-    _onPause?.call();
-  }
-
-  void _handleContinue() {
-    _onContinue?.call();
-  }
-
-  void _handleError(dynamic message) {
-    final safeMessage = message == null
-        ? 'Unknown TTS error'
-        : message.toString();
-    _onError?.call(safeMessage);
   }
 }
 
-class _VoiceNotProvided {
-  const _VoiceNotProvided();
+/// Marker class to distinguish "not provided" from null.
+class _NotProvided {
+  const _NotProvided();
+}
+
+/// Audio chunk for server TTS synthesis.
+class SpeechAudioChunk {
+  const SpeechAudioChunk({required this.bytes, required this.mimeType});
+
+  final Uint8List bytes;
+  final String mimeType;
 }
diff --git a/lib/features/chat/services/tts_manager.dart b/lib/features/chat/services/tts_manager.dart
new file mode 100644
index 0000000..950de14
--- /dev/null
+++ b/lib/features/chat/services/tts_manager.dart
@@ -0,0 +1,916 @@
+import 'dart:async';
+import 'dart:io' show Platform;
+
+import 'package:audioplayers/audioplayers.dart';
+import 'package:flutter/foundation.dart';
+import 'package:flutter_tts/flutter_tts.dart';
+
+import '../../../core/services/api_service.dart';
+
+// =============================================================================
+// TTS Events
+// =============================================================================
+
+/// Base class for all TTS events.
+sealed class TtsEvent {
+  const TtsEvent();
+}
+
+/// Emitted when TTS playback starts.
+class TtsStarted extends TtsEvent {
+  const TtsStarted();
+}
+
+/// Emitted when a new chunk starts playing.
+class TtsChunkStarted extends TtsEvent {
+  const TtsChunkStarted(this.chunkIndex);
+  final int chunkIndex;
+}
+
+/// Emitted for word-level progress (device TTS only).
+class TtsWordProgress extends TtsEvent {
+  const TtsWordProgress(this.start, this.end);
+  final int start;
+  final int end;
+}
+
+/// Emitted when all chunks have finished playing.
+class TtsCompleted extends TtsEvent {
+  const TtsCompleted();
+}
+
+/// Emitted when playback is cancelled.
+class TtsCancelled extends TtsEvent {
+  const TtsCancelled();
+}
+
+/// Emitted when playback is paused.
+class TtsPaused extends TtsEvent {
+  const TtsPaused();
+}
+
+/// Emitted when playback resumes from pause.
+class TtsResumed extends TtsEvent {
+  const TtsResumed();
+}
+
+/// Emitted when an error occurs.
+class TtsError extends TtsEvent {
+  const TtsError(this.message);
+  final String message;
+}
+
+// =============================================================================
+// Playback Session
+// =============================================================================
+
+/// Represents a single TTS playback session.
+class TtsPlaybackSession {
+  TtsPlaybackSession._({
+    required this.id,
+    required this.chunks,
+    required this.useServerTts,
+  });
+
+  /// Unique session identifier.
+  final int id;
+
+  /// Text chunks to be spoken.
+  final List<String> chunks;
+
+  /// Whether to use server TTS (true) or device TTS (false).
+  final bool useServerTts;
+}
+
+// =============================================================================
+// TTS Configuration
+// =============================================================================
+
+/// Configuration for TTS playback.
+class TtsConfig {
+  const TtsConfig({
+    this.voice,
+    this.serverVoice,
+    this.speechRate = 0.5,
+    this.pitch = 1.0,
+    this.volume = 1.0,
+    this.preferServer = false,
+  });
+
+  final String? voice;
+  final String? serverVoice;
+  final double speechRate;
+  final double pitch;
+  final double volume;
+  final bool preferServer;
+
+  TtsConfig copyWith({
+    String? voice,
+    String? serverVoice,
+    double? speechRate,
+    double? pitch,
+    double? volume,
+    bool? preferServer,
+  }) {
+    return TtsConfig(
+      voice: voice ?? this.voice,
+      serverVoice: serverVoice ?? this.serverVoice,
+      speechRate: speechRate ?? this.speechRate,
+      pitch: pitch ?? this.pitch,
+      volume: volume ?? this.volume,
+      preferServer: preferServer ?? this.preferServer,
+    );
+  }
+}
+
+// =============================================================================
+// TTS Manager
+// =============================================================================
+
+/// Single global manager for all TTS operations.
+///
+/// This manager owns the FlutterTts and AudioPlayer instances and ensures
+/// only one playback session is active at a time. Events are emitted via
+/// a stream that consumers can listen to.
+class TtsManager {
+  TtsManager._();
+  static final instance = TtsManager._();
+
+  // FlutterTts instance (lazy initialized)
+  FlutterTts? _tts;
+  bool _ttsInitialized = false;
+  bool _handlersSet = false;
+  Completer<void>? _initCompleter;
+
+  // AudioPlayer for server TTS
+  final AudioPlayer _player = AudioPlayer();
+  bool _playerConfigured = false;
+
+  // API service for server TTS (must be set before using server TTS)
+  ApiService? _apiService;
+
+  // Configuration
+  TtsConfig _config = const TtsConfig();
+  bool _deviceEngineAvailable = false;
+  bool _voiceConfigured = false;
+
+  // Session management
+  int _sessionCounter = 0;
+  TtsPlaybackSession? _activeSession;
+
+  // Device TTS state
+  int _currentChunkIndex = -1;
+
+  // Server TTS state
+  final List<_AudioChunk> _serverAudioBuffer = [];
+  int _serverCurrentIndex = -1;
+  bool _serverWaitingForNext = false;
+
+  // Event stream
+  final _eventController = StreamController<TtsEvent>.broadcast();
+
+  // Cached server default voice
+  String? _serverDefaultVoice;
+  Future<String?>? _serverDefaultVoiceFuture;
+
+  /// Stream of TTS events.
+  Stream<TtsEvent> get events => _eventController.stream;
+
+  /// Whether device TTS is available.
+  bool get deviceAvailable => _deviceEngineAvailable;
+
+  /// Whether server TTS is available.
+  bool get serverAvailable => _apiService != null;
+
+  /// Whether any TTS is available.
+  bool get isAvailable => _deviceEngineAvailable || serverAvailable;
+
+  /// Whether a session is currently active.
+  bool get isPlaying => _activeSession != null;
+
+  /// Current configuration.
+  TtsConfig get config => _config;
+
+  /// Sets the API service for server TTS.
+  void setApiService(ApiService? api) {
+    _apiService = api;
+  }
+
+  /// Updates the TTS configuration.
+  Future<void> updateConfig(TtsConfig config) async {
+    _config = config;
+
+    if (_tts != null && _ttsInitialized) {
+      await _tts!.setVolume(config.volume);
+      await _tts!.setSpeechRate(config.speechRate);
+      await _tts!.setPitch(config.pitch);
+
+      if (config.voice != null) {
+        await _setVoiceByName(config.voice);
+      }
+    }
+  }
+
+  /// Initializes the device TTS engine and the server audio player.
+  ///
+  /// Call before any TTS operation; returns whether any backend is available.
+  Future<bool> initialize({TtsConfig? config}) async {
+    if (config != null) {
+      _config = config;
+    }
+
+    // Initialize FlutterTts
+    await _ensureTtsInitialized();
+
+    // Configure AudioPlayer once. The flag is set before the await below so
+    // an overlapping or retried initialize() cannot attach listeners twice.
+    if (!_playerConfigured) {
+      _playerConfigured = true;
+      _player.onPlayerComplete.listen((_) => _onServerAudioComplete());
+      _player.onPlayerStateChanged.listen((state) {
+        if (state == PlayerState.playing) {
+          _emitEvent(const TtsStarted());
+        } else if (state == PlayerState.paused) {
+          _emitEvent(const TtsPaused());
+        }
+      });
+      if (!kIsWeb && Platform.isAndroid) {
+        await _player.setAudioContext(
+          AudioContext(android: const AudioContextAndroid()),
+        );
+      }
+    }
+
+    return isAvailable;
+  }
+
+  /// Speaks the given text.
+  ///
+  /// Returns the playback session. If another session is active, it will be
+  /// cancelled first.
+  Future<TtsPlaybackSession?> speak(String text, {bool? useServer}) async {
+    if (text.trim().isEmpty) {
+      return null;
+    }
+
+    // Cancel any existing session
+    await stop();
+
+    // Ensure TTS is initialized
+    await _ensureTtsInitialized();
+
+    // Determine whether to use server or device TTS
+    final shouldUseServer = useServer ?? _shouldUseServer();
+
+    // Split text into chunks
+    final chunks = splitTextForSpeech(text);
+    if (chunks.isEmpty) {
+      return null;
+    }
+
+    // Create new session
+    _sessionCounter++;
+    final session = TtsPlaybackSession._(
+      id: _sessionCounter,
+      chunks: chunks,
+      useServerTts: shouldUseServer,
+    );
+    _activeSession = session;
+
+    // Start playback
+    try {
+      if (shouldUseServer) {
+        await _startServerPlayback(session);
+      } else {
+        await _startDevicePlayback(session);
+      }
+      return session;
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+
+      // Try fallback to device TTS if server fails
+      if (shouldUseServer && _deviceEngineAvailable) {
+        try {
+          // Create a new session with useServerTts: false so device TTS
+          // handlers emit events correctly
+          final fallbackSession = TtsPlaybackSession._(
+            id: session.id,
+            chunks: session.chunks,
+            useServerTts: false,
+          );
+          _activeSession = fallbackSession;
+          await _startDevicePlayback(fallbackSession);
+          return fallbackSession;
+        } catch (e2) {
+          _emitEvent(TtsError(e2.toString()));
+        }
+      }
+
+      _activeSession = null;
+      return null;
+    }
+  }
+
+  /// Pauses the current playback.
+  Future<void> pause() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    try {
+      if (session.useServerTts) {
+        await _player.pause();
+      } else {
+        await _tts?.pause();
+      }
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  /// Resumes paused playback.
+  Future<void> resume() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    try {
+      if (session.useServerTts) {
+        await _player.resume();
+        _emitEvent(const TtsResumed());
+      } else {
+        // Device TTS resume is handled by the native handler
+      }
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  /// Stops the current playback.
+  Future<void> stop() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    _activeSession = null;
+    _resetPlaybackState();
+
+    try {
+      if (session.useServerTts) {
+        await _player.stop();
+      } else {
+        await _tts?.stop();
+      }
+      _emitEvent(const TtsCancelled());
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  /// Disposes the manager and releases resources.
+  Future<void> dispose() async {
+    await stop();
+    await _player.dispose();
+    await _eventController.close();
+  }
+
+  /// Splits text into chunks for TTS playback.
+  ///
+  /// This mirrors OpenWebUI's extractSentencesForAudio implementation.
+  List<String> splitTextForSpeech(String text) {
+    // 1. Preserve code blocks (replace with placeholders)
+    final codeBlocks = <String>[];
+    var processed = text;
+    var codeBlockIndex = 0;
+
+    final codeBlockRegex = RegExp(r'```[\s\S]*?```', multiLine: true);
+    processed = processed.replaceAllMapped(codeBlockRegex, (match) {
+      final placeholder = '\u0000$codeBlockIndex\u0000';
+      codeBlocks.add(match.group(0)!);
+      codeBlockIndex++;
+      return placeholder;
+    });
+
+    // 2. Split on sentence-ending punctuation: .!?
+    final sentences = processed
+        .split(RegExp(r'(?<=[.!?])\s+'))
+        .map((s) => s.trim())
+        .where((s) => s.isNotEmpty)
+        .toList();
+
+    // 3. Restore code blocks from placeholders
+    final restoredSentences = sentences
+        .map((sentence) {
+          return sentence.replaceAllMapped(RegExp(r'\u0000(\d+)\u0000'), (m) {
+            final idx = int.parse(m.group(1)!);
+            return idx < codeBlocks.length ? codeBlocks[idx] : '';
+          });
+        })
+        .where((s) => s.isNotEmpty)
+        .toList();
+
+    // 4. Merge short sentences (< 4 words OR < 50 chars)
+    final mergedChunks = <String>[];
+    for (final sentence in restoredSentences) {
+      if (mergedChunks.isEmpty) {
+        mergedChunks.add(sentence);
+      } else {
+        final lastIndex = mergedChunks.length - 1;
+        final previousText = mergedChunks[lastIndex];
+        final wordCount = previousText.split(RegExp(r'\s+')).length;
+        final charCount = previousText.length;
+
+        if (wordCount < 4 || charCount < 50) {
+          mergedChunks[lastIndex] = '$previousText $sentence';
+        } else {
+          mergedChunks.add(sentence);
+        }
+      }
+    }
+
+    return mergedChunks.isEmpty ? [text.trim()] : mergedChunks;
+  }
+
+  /// Gets available voices from the device TTS engine.
+  Future<List<Map<String, dynamic>>> getDeviceVoices() async {
+    await _ensureTtsInitialized();
+    if (_tts == null) return [];
+
+    try {
+      final voicesRaw = await _tts!.getVoices;
+      if (voicesRaw is! List) return [];
+
+      return voicesRaw
+          .whereType<Map>()
+          .map((e) => _normalizeVoiceEntry(e))
+          .where((e) => e.isNotEmpty)
+          .toList();
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+      return [];
+    }
+  }
+
+  /// Gets available voices from the server.
+  Future<List<Map<String, dynamic>>> getServerVoices() async {
+    if (_apiService == null) return [];
+
+    try {
+      final serverVoices = await _apiService!.getAvailableServerVoices();
+      return serverVoices
+          .map((v) {
+            final id = (v['id'] ?? v['name'] ?? '').toString();
+            final name = (v['name'] ?? v['id'] ?? '').toString();
+            final locale = (v['locale'] ?? v['language'] ?? '').toString();
+            return {'id': id, 'name': name, 'locale': locale};
+          })
+          .where((e) => e['name']?.toString().trim().isNotEmpty ?? false)
+          .toList();
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+      return [];
+    }
+  }
+
+  /// Preloads server default voice configuration.
+  Future<void> preloadServerDefaults() async {
+    if (_apiService == null) return;
+    try {
+      await _getServerDefaultVoice();
+    } catch (_) {}
+  }
+
+  /// Synthesizes a single text chunk to audio without playing it.
+  ///
+  /// This is used by [VoiceCallService] for its own audio playback pipeline.
+  /// Returns the audio bytes and mime type.
+  Future<({Uint8List bytes, String mimeType})> synthesizeChunk(
+    String text,
+  ) async {
+    if (_apiService == null) {
+      throw StateError('Server TTS is not available');
+    }
+    if (text.trim().isEmpty) {
+      throw ArgumentError('Cannot synthesize empty text');
+    }
+
+    final voice = await _resolveServerVoice();
+    final result = await _apiService!.generateSpeech(
+      text: text,
+      voice: voice,
+      speed: _config.speechRate,
+    );
+    return (bytes: result.bytes, mimeType: result.mimeType);
+  }
+
+  // ===========================================================================
+  // Private: Initialization
+  // ===========================================================================
+
+  /// Lazily sets up FlutterTts once; concurrent callers share one attempt.
+  Future<void> _ensureTtsInitialized() async {
+    if (_ttsInitialized) return;
+
+    // Prevent concurrent initialization
+    if (_initCompleter != null) {
+      await _initCompleter!.future;
+      return;
+    }
+
+    // ignore() stops a failure from being reported as an unhandled async
+    // error when nobody else awaits the future; waiters still receive it.
+    final completer = _initCompleter = Completer<void>()..future.ignore();
+    try {
+      final tts = FlutterTts();
+      _tts = tts;
+
+      // Wait for native TTS to be fully initialized before setting handlers.
+      // The flutter_tts plugin has a bug where setting handlers during onInit
+      // causes ConcurrentModificationException.
+      await Future<void>.delayed(const Duration(milliseconds: 500));
+
+      if (!_handlersSet) {
+        _setupTtsHandlers(tts);
+        _handlersSet = true;
+      }
+
+      await _configureDeviceEngine();
+      _ttsInitialized = true;
+      completer.complete();
+    } catch (e, stackTrace) {
+      completer.completeError(e, stackTrace);
+      _initCompleter = null;
+      rethrow;
+    }
+  }
+
+  void _setupTtsHandlers(FlutterTts tts) {
+    tts.setStartHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _emitEvent(const TtsStarted());
+      }
+    });
+
+    tts.setCompletionHandler(() {
+      _onDeviceChunkComplete();
+    });
+
+    tts.setCancelHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _activeSession = null;
+        _resetPlaybackState();
+        _emitEvent(const TtsCancelled());
+      }
+    });
+
+    tts.setPauseHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _emitEvent(const TtsPaused());
+      }
+    });
+
+    tts.setContinueHandler(() {
+      if (_activeSession != null && !_activeSession!.useServerTts) {
+        _emitEvent(const TtsResumed());
+      }
+    });
+
+    tts.setErrorHandler((msg) {
+      _emitEvent(TtsError(msg.toString()));
+    });
+
+    try {
+      tts.setProgressHandler((String text, int start, int end, String word) {
+        if (_activeSession != null && !_activeSession!.useServerTts) {
+          _emitEvent(TtsWordProgress(start, end));
+        }
+      });
+    } catch (_) {
+      // Some platforms may not support progress handler
+    }
+  }
+
+  Future<void> _configureDeviceEngine() async {
+    if (_tts == null) return;
+
+    _deviceEngineAvailable = false;
+    try {
+      // Set default engine on Android
+      if (!kIsWeb && Platform.isAndroid) {
+        try {
+          final engine = await _tts!.getDefaultEngine;
+          if (engine is String && engine.isNotEmpty) {
+            await _tts!.setEngine(engine);
+          }
+        } catch (_) {}
+      }
+
+      await _tts!.awaitSpeakCompletion(true);
+      await _tts!.setVolume(_config.volume);
+      await _tts!.setSpeechRate(_config.speechRate);
+      await _tts!.setPitch(_config.pitch);
+
+      if (!kIsWeb && Platform.isIOS) {
+        await _tts!.setSharedInstance(true);
+      }
+
+      _deviceEngineAvailable = true;
+    } catch (e) {
+      _deviceEngineAvailable = false;
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  // ===========================================================================
+  // Private: Device TTS Playback
+  // ===========================================================================
+
+  Future<void> _startDevicePlayback(TtsPlaybackSession session) async {
+    if (!_deviceEngineAvailable || _tts == null) {
+      throw StateError('Device TTS is not available');
+    }
+
+    _currentChunkIndex = 0;
+
+    // Configure voice if needed
+    if (!_voiceConfigured) {
+      await _configurePreferredVoice();
+    }
+
+    // Speak first chunk
+    _emitEvent(const TtsChunkStarted(0));
+    final result = await _tts!.speak(session.chunks.first);
+    if (result is int && result != 1) {
+      throw StateError('TTS engine returned error code $result');
+    }
+  }
+
+  /// Advances device playback: speaks the next chunk or ends the session.
+  void _onDeviceChunkComplete() {
+    final session = _activeSession;
+    if (session == null || session.useServerTts) return;
+
+    // Check if there are more chunks
+    final nextIndex = _currentChunkIndex + 1;
+    if (nextIndex >= session.chunks.length) {
+      _activeSession = null;
+      _resetPlaybackState();
+      _emitEvent(const TtsCompleted());
+      return;
+    }
+
+    // Play next chunk. speak() is not awaited, so a rejected future has to be
+    // routed to the event stream here or it becomes an unhandled async error.
+    _currentChunkIndex = nextIndex;
+    _emitEvent(TtsChunkStarted(nextIndex));
+    _tts?.speak(session.chunks[nextIndex]).then((result) {
+      if (result is int && result != 1) {
+        _emitEvent(TtsError('TTS engine returned error code $result'));
+      }
+    }, onError: (Object e) => _emitEvent(TtsError(e.toString())));
+  }
+
+  // ===========================================================================
+  // Private: Server TTS Playback
+  // ===========================================================================
+
+  Future<void> _startServerPlayback(TtsPlaybackSession session) async {
+    if (_apiService == null) {
+      throw StateError('Server TTS is not available');
+    }
+
+    _serverCurrentIndex = -1;
+    _serverAudioBuffer.clear();
+    _serverWaitingForNext = false;
+
+    final voice = await _resolveServerVoice();
+
+    // Fetch and play first chunk
+    final firstChunk = await _fetchServerAudio(session.chunks.first, voice);
+    if (_activeSession?.id != session.id) return; // Cancelled
+
+    _serverAudioBuffer.add(firstChunk);
+    _serverCurrentIndex = 0;
+
+    await _player.stop();
+    await _player.play(
+      BytesSource(firstChunk.bytes, mimeType: firstChunk.mimeType),
+    );
+    _emitEvent(const TtsChunkStarted(0));
+
+    // Prefetch remaining chunks in background
+    unawaited(_prefetchServerChunks(session, voice, 1));
+  }
+
+  /// Fetches chunks from [startIndex] onward and buffers them in order.
+  Future<void> _prefetchServerChunks(
+    TtsPlaybackSession session,
+    String? voice,
+    int startIndex,
+  ) async {
+    for (var i = startIndex; i < session.chunks.length; i++) {
+      if (_activeSession?.id != session.id) return; // Cancelled
+      try {
+        final chunk = await _fetchServerAudio(session.chunks[i], voice);
+        if (_activeSession?.id != session.id) return;
+        _serverAudioBuffer.add(chunk);
+        // If player was waiting for this chunk, play it now
+        if (_serverWaitingForNext &&
+            _serverCurrentIndex + 1 < _serverAudioBuffer.length) {
+          _serverWaitingForNext = false;
+          await _playNextServerChunk();
+        }
+      } catch (e) {
+        // A missing chunk would stall playback, so stop; the loop guard exits.
+        _emitEvent(TtsError(e.toString()));
+        if (_activeSession?.id == session.id) await stop();
+      }
+    }
+  }
+
+  Future<_AudioChunk> _fetchServerAudio(String text, String? voice) async {
+    final result = await _apiService!.generateSpeech(
+      text: text,
+      voice: voice,
+      speed: _config.speechRate,
+    );
+    return _AudioChunk(bytes: result.bytes, mimeType: result.mimeType);
+  }
+
+  /// Handles the end of a server chunk: finish, play the next one, or wait.
+  void _onServerAudioComplete() {
+    final session = _activeSession;
+    if (session == null || !session.useServerTts) return;
+    final nextIndex = _serverCurrentIndex + 1;
+    // Check if all chunks are done
+    if (nextIndex >= session.chunks.length) {
+      _activeSession = null;
+      _resetPlaybackState();
+      _emitEvent(const TtsCompleted());
+      return;
+    }
+    // Play the buffered chunk; report failures since nothing awaits this.
+    if (nextIndex < _serverAudioBuffer.length) {
+      unawaited(
+        _playNextServerChunk().onError((e, _) => _emitEvent(TtsError('$e'))),
+      );
+    } else {
+      _serverWaitingForNext = true;
+    }
+  }
+
+  Future<void> _playNextServerChunk() async {
+    final session = _activeSession;
+    if (session == null) return;
+
+    final nextIndex = _serverCurrentIndex + 1;
+    if (nextIndex >= _serverAudioBuffer.length) return;
+
+    _serverCurrentIndex = nextIndex;
+    final chunk = _serverAudioBuffer[nextIndex];
+
+    await _player.play(BytesSource(chunk.bytes, mimeType: chunk.mimeType));
+    _emitEvent(TtsChunkStarted(nextIndex));
+  }
+
+  Future<String?> _resolveServerVoice() async {
+    final serverSelected = _config.serverVoice?.trim();
+    if (serverSelected != null && serverSelected.isNotEmpty) {
+      return serverSelected;
+    }
+    final selected = _config.voice?.trim();
+    if (selected != null && selected.isNotEmpty) {
+      return selected;
+    }
+    return await _getServerDefaultVoice();
+  }
+
+  /// Returns the server's default voice, fetching and caching it on first use.
+  ///
+  /// Returns `null` when no API service is configured, when the server reports
+  /// no default voice (or a blank one), or when the request fails. A failure
+  /// is also reported as a [TtsError] event. Callers that arrive while a
+  /// request is in flight share that request's result.
+  Future<String?> _getServerDefaultVoice() async {
+    final api = _apiService;
+    if (api == null) return null;
+
+    final cached = _serverDefaultVoice;
+    if (cached != null) return cached;
+
+    final pending = _serverDefaultVoiceFuture;
+    if (pending != null) return pending;
+
+    // The shared future wraps trimming and error handling, so concurrent
+    // callers get the same normalized value and never see a raw API error.
+    Future<String?> fetch() async {
+      try {
+        final voice = (await api.getDefaultServerVoice())?.trim();
+        // A blank name is treated as "no default" and is not cached.
+        if (voice == null || voice.isEmpty) return null;
+        _serverDefaultVoice = voice;
+        return voice;
+      } catch (e) {
+        _emitEvent(TtsError(e.toString()));
+        return null;
+      }
+    }
+
+    final request = fetch();
+    _serverDefaultVoiceFuture = request;
+    try {
+      return await request;
+    } finally {
+      // Clear the in-flight marker so a later call can retry after a failure.
+      _serverDefaultVoiceFuture = null;
+    }
+  }
+
+  // ===========================================================================
+  // Private: Helpers
+  // ===========================================================================
+
+  /// Whether speech should be produced by the server instead of the device.
+  ///
+  /// The server is only eligible when an API service is set. It is then chosen
+  /// if the configuration prefers it or if no device engine is available.
+  bool _shouldUseServer() {
+    final serverAvailable = _apiService != null;
+    if (!serverAvailable) return false;
+    return _config.preferServer || !_deviceEngineAvailable;
+  }
+
+  /// Returns the playback bookkeeping to its idle values.
+  ///
+  /// Both chunk indices go back to `-1`, the buffered server audio is dropped
+  /// and the waiting-for-next-chunk flag is cleared.
+  void _resetPlaybackState() {
+    _serverWaitingForNext = false;
+    _serverAudioBuffer.clear();
+    _currentChunkIndex = _serverCurrentIndex = -1;
+  }
+
+  /// Adds [event] to the event controller, unless it has been closed.
+  void _emitEvent(TtsEvent event) {
+    if (_eventController.isClosed) return;
+    _eventController.add(event);
+  }
+
+  /// Selects the device voice whose `name` equals [voiceName].
+  ///
+  /// Only runs on iOS and Android, and only when the device engine exists and
+  /// [voiceName] is not null. On a match the voice is applied and
+  /// `_voiceConfigured` is set; without a match nothing changes. Any failure
+  /// is reported as a [TtsError] event rather than thrown.
+  Future<void> _setVoiceByName(String? voiceName) async {
+    if (_tts == null || voiceName == null) return;
+    // kIsWeb is tested first so that Platform is never queried on the web.
+    final onMobile = !kIsWeb && (Platform.isIOS || Platform.isAndroid);
+    if (!onMobile) return;
+
+    try {
+      final available = await _tts!.getVoices;
+      if (available is! List) return;
+
+      for (final raw in available.whereType<Map>()) {
+        final candidate = _normalizeVoiceEntry(raw);
+        if ((candidate['name'] as String?) != voiceName) continue;
+
+        // First match wins.
+        await _tts!.setVoice(_voiceCommandFrom(candidate));
+        _voiceConfigured = true;
+        return;
+      }
+    } catch (e) {
+      _emitEvent(TtsError(e.toString()));
+    }
+  }
+
+  /// Applies the configured voice once and marks voice setup as done.
+  ///
+  /// Skipped when the voice is already configured or there is no device
+  /// engine. On iOS and Android the configured voice, if any, is looked up by
+  /// name; on every other platform no lookup is attempted. In all of these
+  /// cases `_voiceConfigured` ends up `true`, so when no voice is applied the
+  /// system default stays in use. A failure is reported as a [TtsError] event.
+  Future<void> _configurePreferredVoice() async {
+    if (_voiceConfigured || _tts == null) return;
+
+    // kIsWeb is tested first so that Platform is never queried on the web.
+    final onMobile = !kIsWeb && (Platform.isIOS || Platform.isAndroid);
+    if (onMobile) {
+      try {
+        if (_config.voice != null) {
+          await _setVoiceByName(_config.voice);
+        }
+      } catch (e) {
+        _emitEvent(TtsError(e.toString()));
+      }
+    }
+
+    // Reached whether or not a voice was applied.
+    _voiceConfigured = true;
+  }
+
+  /// Copies [entry] into a map keyed by strings.
+  ///
+  /// Each non-null key is converted with `toString()`; entries with a null
+  /// key are dropped. Values are carried over unchanged.
+  Map<String, dynamic> _normalizeVoiceEntry(Map<dynamic, dynamic> entry) {
+    return <String, dynamic>{
+      for (final item in entry.entries)
+        if (item.key != null) item.key.toString(): item.value,
+    };
+  }
+
+  /// Builds the string map passed to `setVoice` from a normalized [voice].
+  ///
+  /// Only a fixed set of keys is forwarded. A key is included when [voice]
+  /// has a non-null value for it, and that value is converted to a string.
+  Map<String, String> _voiceCommandFrom(Map<String, dynamic> voice) {
+    const forwardedKeys = [
+      'name',
+      'locale',
+      'identifier',
+      'id',
+      'voiceIdentifier',
+      'engine',
+    ];
+    return <String, String>{
+      for (final key in forwardedKeys)
+        if (voice[key] != null) key: '${voice[key]}',
+    };
+  }
+}
+
+// =============================================================================
+// Internal Types
+// =============================================================================
+
+/// One piece of synthesized audio together with its MIME type.
+class _AudioChunk {
+  const _AudioChunk({
+    required this.bytes,
+    required this.mimeType,
+  });
+
+  /// The encoded audio data.
+  final Uint8List bytes;
+
+  /// The MIME type describing [bytes].
+  final String mimeType;
+}