From 561e7dd61662b31b778482ac8e9fd7b33a04ea96 Mon Sep 17 00:00:00 2001
From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com>
Date: Thu, 23 Oct 2025 16:31:15 +0530
Subject: [PATCH 1/3] feat(tts): server-backed TTS engine selection

Introduce server TTS support and engine selection while keeping
device TTS as the default.

- Add new persistence keys for storing TTS engine and selected
  server voice (ttsEngine, ttsServerVoiceId, ttsServerVoiceName).
- Extend TextToSpeechService to support two engines:
  TtsEngine.device (FlutterTts) and TtsEngine.server (remote audio).
- Wire in an AudioPlayer and optional ApiService to fetch raw
  audio bytes from the server and play them, with event hooks
  mapped to existing lifecycle callbacks.
- Implement fallback to device TTS on server errors or empty
  responses, and ensure player lifecycle (pause/stop/dispose)
  is handled when using server engine.
- Allow engine and preferred voice to be configured before
  initialization and updated at runtime via updateSettings.

This enables selecting a server-side voice and using a remote
TTS provider while preserving compatibility with the existing
device TTS implementation.
---
 ios/Podfile.lock                              |   6 +
 lib/core/persistence/persistence_keys.dart    |   3 +
 lib/core/providers/app_providers.dart         |   6 +-
 lib/core/services/api_service.dart            |  28 +++-
 lib/core/services/settings_service.dart       |  74 ++++++++++
 .../providers/text_to_speech_provider.dart    |  15 +-
 .../chat/services/text_to_speech_service.dart | 117 +++++++++++++--
 .../profile/views/app_customization_page.dart | 134 ++++++++++++++++--
 pubspec.lock                                  |  56 ++++++++
 pubspec.yaml                                  |   1 +
 10 files changed, 404 insertions(+), 36 deletions(-)
diff --git a/ios/Podfile.lock b/ios/Podfile.lock
index c990258..9a90a3c 100644
--- a/ios/Podfile.lock
+++ b/ios/Podfile.lock
@@ -1,4 +1,6 @@
 PODS:
+  - audioplayers_darwin (0.0.1):
+    - Flutter
   - connectivity_plus (0.0.1):
     - Flutter
   - DKImagePickerController/Core (4.3.9):
@@ -84,6 +86,7 @@ PODS:
     - FlutterMacOS
 
 DEPENDENCIES:
+  - audioplayers_darwin (from `.symlinks/plugins/audioplayers_darwin/ios`)
   - connectivity_plus (from `.symlinks/plugins/connectivity_plus/ios`)
   - file_picker (from `.symlinks/plugins/file_picker/ios`)
   - Flutter (from `Flutter`)
@@ -113,6 +116,8 @@ SPEC REPOS:
     - SwiftyGif
 
 EXTERNAL SOURCES:
+  audioplayers_darwin:
+    :path: ".symlinks/plugins/audioplayers_darwin/ios"
   connectivity_plus:
     :path: ".symlinks/plugins/connectivity_plus/ios"
   file_picker:
@@ -155,6 +160,7 @@ EXTERNAL SOURCES:
     :path: ".symlinks/plugins/webview_flutter_wkwebview/darwin"
 
 SPEC CHECKSUMS:
+  audioplayers_darwin: ccf9c770ee768abb07e26d90af093f7bab1c12ab
   connectivity_plus: cb623214f4e1f6ef8fe7403d580fdad517d2f7dd
   DKImagePickerController: 946cec48c7873164274ecc4624d19e3da4c1ef3c
   DKPhotoGallery: b3834fecb755ee09a593d7c9e389d8b5d6deed60
diff --git a/lib/core/persistence/persistence_keys.dart b/lib/core/persistence/persistence_keys.dart
index 80e58b7..6afce43 100644
--- a/lib/core/persistence/persistence_keys.dart
+++ b/lib/core/persistence/persistence_keys.dart
@@ -25,6 +25,9 @@ final class PreferenceKeys {
   static const String ttsSpeechRate = 'tts_speech_rate';
   static const String ttsPitch = 'tts_pitch';
   static const String ttsVolume = 'tts_volume';
+  static const String ttsEngine = 'tts_engine'; // 'device' | 'server'
+  static const String ttsServerVoiceId = 'tts_server_voice_id';
+  static const String ttsServerVoiceName = 'tts_server_voice_name';
 }
 
 final class LegacyPreferenceKeys {
diff --git a/lib/core/providers/app_providers.dart b/lib/core/providers/app_providers.dart
index 9257d86..8ba3794 100644
--- a/lib/core/providers/app_providers.dart
+++ b/lib/core/providers/app_providers.dart
@@ -1830,7 +1830,11 @@ Future<List<String>> availableVoices(Ref ref) async {
   if (api == null) return [];
 
   try {
-    return await api.getAvailableVoices();
+    final voices = await api.getAvailableServerVoices();
+    return voices
+        .map((v) => (v['name'] ?? v['id'] ?? '').toString())
+        .where((s) => s.isNotEmpty)
+        .toList();
   } catch (e) {
     DebugLogger.error('voices-failed', scope: 'voices', error: e);
     return [];
diff --git a/lib/core/services/api_service.dart b/lib/core/services/api_service.dart
index acc7a77..46044aa 100644
--- a/lib/core/services/api_service.dart
+++ b/lib/core/services/api_service.dart
@@ -2261,12 +2261,24 @@ class ApiService {
   }
 
   // Audio
-  Future<List<String>> getAvailableVoices() async {
-    _traceApi('Fetching available voices');
+  Future<List<Map<String, dynamic>>> getAvailableServerVoices() async {
+    _traceApi('Fetching server TTS voices');
     final response = await _dio.get('/api/v1/audio/voices');
     final data = response.data;
+    if (data is Map<String, dynamic>) {
+      final voices = data['voices'];
+      if (voices is List) {
+        return voices
+            .whereType<Map>()
+            .map((e) => e.cast<String, dynamic>())
+            .toList();
+      }
+    }
     if (data is List) {
-      return data.cast<String>();
+      // Fallback: plain list of ids
+      return data
+          .map((e) => {'id': e.toString(), 'name': e.toString()})
+          .toList();
     }
     return [];
   }
@@ -2279,13 +2291,15 @@ class ApiService {
     _traceApi('Generating speech for text: $textPreview...');
     final response = await _dio.post(
       '/api/v1/audio/speech',
-      data: {'text': text, if (voice != null) 'voice': voice},
+      data: {'input': text, if (voice != null) 'voice': voice},
+      options: Options(responseType: ResponseType.bytes),
     );
 
     // Return audio data as bytes
-    if (response.data is List) {
-      return (response.data as List).cast<int>();
-    }
+    final data = response.data;
+    if (data is List<int>) return data;
+    if (data is Uint8List) return data.toList();
+    if (data is List) return (data).cast<int>();
     return [];
   }
 
diff --git a/lib/core/services/settings_service.dart b/lib/core/services/settings_service.dart
index b21a168..b40b533 100644
--- a/lib/core/services/settings_service.dart
+++ b/lib/core/services/settings_service.dart
@@ -8,6 +8,9 @@ import 'animation_service.dart';
 
 part 'settings_service.g.dart';
 
+/// TTS engine selection
+enum TtsEngine { device, server }
+
 /// Service for managing app-wide settings including accessibility preferences
 class SettingsService {
   static const String _reduceMotionKey = PreferenceKeys.reduceMotion;
@@ -142,6 +145,12 @@ class SettingsService {
         ttsPitch: (box.get(PreferenceKeys.ttsPitch) as num?)?.toDouble() ?? 1.0,
         ttsVolume:
             (box.get(PreferenceKeys.ttsVolume) as num?)?.toDouble() ?? 1.0,
+        ttsEngine: _parseTtsEngine(
+          box.get(PreferenceKeys.ttsEngine) as String?,
+        ),
+        ttsServerVoiceId: box.get(PreferenceKeys.ttsServerVoiceId) as String?,
+        ttsServerVoiceName:
+            box.get(PreferenceKeys.ttsServerVoiceName) as String?,
       ),
     );
   }
@@ -164,6 +173,7 @@ class SettingsService {
       PreferenceKeys.ttsSpeechRate: settings.ttsSpeechRate,
       PreferenceKeys.ttsPitch: settings.ttsPitch,
       PreferenceKeys.ttsVolume: settings.ttsVolume,
+      PreferenceKeys.ttsEngine: settings.ttsEngine.name,
     };
 
     await box.putAll(updates);
@@ -185,6 +195,33 @@ class SettingsService {
     } else {
       await box.delete(PreferenceKeys.ttsVoice);
     }
+
+    // Server-specific voice id and friendly name
+    if (settings.ttsServerVoiceId != null &&
+        settings.ttsServerVoiceId!.isNotEmpty) {
+      await box.put(PreferenceKeys.ttsServerVoiceId, settings.ttsServerVoiceId);
+    } else {
+      await box.delete(PreferenceKeys.ttsServerVoiceId);
+    }
+    if (settings.ttsServerVoiceName != null &&
+        settings.ttsServerVoiceName!.isNotEmpty) {
+      await box.put(
+        PreferenceKeys.ttsServerVoiceName,
+        settings.ttsServerVoiceName,
+      );
+    } else {
+      await box.delete(PreferenceKeys.ttsServerVoiceName);
+    }
+  }
+
+  static TtsEngine _parseTtsEngine(String? raw) {
+    switch ((raw ?? '').toLowerCase()) {
+      case 'server':
+        return TtsEngine.server;
+      case 'device':
+      default:
+        return TtsEngine.device;
+    }
   }
 
   // Voice input specific settings
@@ -314,6 +351,9 @@ class AppSettings {
   final double ttsSpeechRate;
   final double ttsPitch;
   final double ttsVolume;
+  final TtsEngine ttsEngine;
+  final String? ttsServerVoiceId;
+  final String? ttsServerVoiceName;
   const AppSettings({
     this.reduceMotion = false,
     this.animationSpeed = 1.0,
@@ -332,6 +372,9 @@ class AppSettings {
     this.ttsSpeechRate = 0.5,
     this.ttsPitch = 1.0,
     this.ttsVolume = 1.0,
+    this.ttsEngine = TtsEngine.device,
+    this.ttsServerVoiceId,
+    this.ttsServerVoiceName,
   });
 
   AppSettings copyWith({
@@ -352,6 +395,9 @@ class AppSettings {
     double? ttsSpeechRate,
     double? ttsPitch,
     double? ttsVolume,
+    TtsEngine? ttsEngine,
+    Object? ttsServerVoiceId = const _DefaultValue(),
+    Object? ttsServerVoiceName = const _DefaultValue(),
   }) {
     return AppSettings(
       reduceMotion: reduceMotion ?? this.reduceMotion,
@@ -375,6 +421,13 @@ class AppSettings {
       ttsSpeechRate: ttsSpeechRate ?? this.ttsSpeechRate,
       ttsPitch: ttsPitch ?? this.ttsPitch,
       ttsVolume: ttsVolume ?? this.ttsVolume,
+      ttsEngine: ttsEngine ?? this.ttsEngine,
+      ttsServerVoiceId: ttsServerVoiceId is _DefaultValue
+          ? this.ttsServerVoiceId
+          : ttsServerVoiceId as String?,
+      ttsServerVoiceName: ttsServerVoiceName is _DefaultValue
+          ? this.ttsServerVoiceName
+          : ttsServerVoiceName as String?,
     );
   }
 
@@ -397,6 +450,9 @@ class AppSettings {
         other.ttsSpeechRate == ttsSpeechRate &&
         other.ttsPitch == ttsPitch &&
         other.ttsVolume == ttsVolume &&
+        other.ttsEngine == ttsEngine &&
+        other.ttsServerVoiceId == ttsServerVoiceId &&
+        other.ttsServerVoiceName == ttsServerVoiceName &&
         _listEquals(other.quickPills, quickPills);
     // socketTransportMode intentionally not included in == to avoid frequent rebuilds
   }
@@ -420,6 +476,9 @@ class AppSettings {
       ttsSpeechRate,
       ttsPitch,
       ttsVolume,
+      ttsEngine,
+      ttsServerVoiceId,
+      ttsServerVoiceName,
       Object.hashAllUnordered(quickPills),
     );
   }
@@ -543,6 +602,21 @@ class AppSettingsNotifier extends _$AppSettingsNotifier {
     await SettingsService.saveSettings(state);
   }
 
+  Future<void> setTtsEngine(TtsEngine engine) async {
+    state = state.copyWith(ttsEngine: engine);
+    await SettingsService.saveSettings(state);
+  }
+
+  Future<void> setTtsServerVoiceName(String? name) async {
+    state = state.copyWith(ttsServerVoiceName: name);
+    await SettingsService.saveSettings(state);
+  }
+
+  Future<void> setTtsServerVoiceId(String? id) async {
+    state = state.copyWith(ttsServerVoiceId: id);
+    await SettingsService.saveSettings(state);
+  }
+
   Future<void> resetToDefaults() async {
     const defaultSettings = AppSettings();
     await SettingsService.saveSettings(defaultSettings);
diff --git a/lib/features/chat/providers/text_to_speech_provider.dart b/lib/features/chat/providers/text_to_speech_provider.dart
index b25e341..dc234d9 100644
--- a/lib/features/chat/providers/text_to_speech_provider.dart
+++ b/lib/features/chat/providers/text_to_speech_provider.dart
@@ -3,6 +3,7 @@ import 'dart:async';
 import 'package:flutter_riverpod/flutter_riverpod.dart';
 
 import '../../../core/services/settings_service.dart';
+import '../../../core/providers/app_providers.dart';
 import '../../../core/utils/markdown_to_text.dart';
 import '../services/text_to_speech_service.dart';
 
@@ -79,11 +80,15 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
     // Listen to settings changes and update TTS when initialized
     ref.listen<AppSettings>(appSettingsProvider, (previous, next) {
       if (_service.isInitialized && _service.isAvailable) {
+        final selectedVoice = next.ttsEngine == TtsEngine.server
+            ? next.ttsServerVoiceId
+            : next.ttsVoice;
         _service.updateSettings(
-          voice: next.ttsVoice,
+          voice: selectedVoice,
           speechRate: next.ttsSpeechRate,
           pitch: next.ttsPitch,
           volume: next.ttsVolume,
+          engine: next.ttsEngine,
         );
       }
     }, fireImmediately: false);
@@ -105,10 +110,13 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
     final settings = ref.read(appSettingsProvider);
     final future = _service
         .initialize(
-          voice: settings.ttsVoice,
+          voice: settings.ttsEngine == TtsEngine.server
+              ? settings.ttsServerVoiceId
+              : settings.ttsVoice,
           speechRate: settings.ttsSpeechRate,
           pitch: settings.ttsPitch,
           volume: settings.ttsVolume,
+          engine: settings.ttsEngine,
         )
         .then((available) {
           if (!ref.mounted) {
@@ -289,7 +297,8 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
 }
 
 final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {
-  final service = TextToSpeechService();
+  final api = ref.watch(apiServiceProvider);
+  final service = TextToSpeechService(api: api);
   ref.onDispose(() {
     unawaited(service.dispose());
   });
diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart
index 6591f41..65aaa86 100644
--- a/lib/features/chat/services/text_to_speech_service.dart
+++ b/lib/features/chat/services/text_to_speech_service.dart
@@ -1,13 +1,21 @@
 import 'dart:async';
 import 'dart:io' show Platform;
 
+import 'package:audioplayers/audioplayers.dart';
 import 'package:flutter/foundation.dart';
 import 'package:flutter/widgets.dart';
 import 'package:flutter_tts/flutter_tts.dart';
 
+import '../../../core/services/api_service.dart';
+import '../../../core/services/settings_service.dart';
+
 /// Lightweight wrapper around FlutterTts to centralize configuration
 class TextToSpeechService {
   final FlutterTts _tts = FlutterTts();
+  final AudioPlayer _player = AudioPlayer();
+  final ApiService? _api;
+  TtsEngine _engine = TtsEngine.device;
+  String? _preferredVoice;
   bool _initialized = false;
   bool _available = false;
   bool _voiceConfigured = false;
@@ -22,6 +30,14 @@ class TextToSpeechService {
   bool get isInitialized => _initialized;
   bool get isAvailable => _available;
 
+  TextToSpeechService({ApiService? api}) : _api = api {
+    // Wire minimal player events to callbacks
+    _player.onPlayerComplete.listen((_) => _handleComplete());
+    _player.onPlayerStateChanged.listen((s) {
+      if (s == PlayerState.playing) _handleStart();
+    });
+  }
+
   /// Register callbacks for TTS lifecycle events
   void bindHandlers({
     VoidCallback? onStart,
@@ -52,12 +68,15 @@ class TextToSpeechService {
     double speechRate = 0.5,
     double pitch = 1.0,
     double volume = 1.0,
+    TtsEngine engine = TtsEngine.device,
   }) async {
     if (_initialized) {
       return _available;
     }
 
     try {
+      _engine = engine;
+      _preferredVoice = voice;
       await _tts.awaitSpeakCompletion(false);
 
       // Set volume
@@ -97,34 +116,61 @@ class TextToSpeechService {
     }
 
     if (!_initialized) {
-      await initialize();
+      await initialize(voice: _preferredVoice, engine: _engine);
     }
 
+    if (_engine == TtsEngine.server && _api != null) {
+      // Server-backed TTS path
+      try {
+        final effectiveVoice =
+            (_preferredVoice == null || _preferredVoice!.trim().isEmpty)
+            ? 'alloy'
+            : _preferredVoice!;
+
+        final bytes = await _api.generateSpeech(
+          text: text,
+          voice: effectiveVoice,
+        );
+        if (bytes.isEmpty) {
+          throw Exception('Empty audio response');
+        }
+        await _player.stop();
+        final data = Uint8List.fromList(bytes);
+        await _player.play(BytesSource(data));
+      } catch (e) {
+        _onError?.call(e.toString());
+        // Fallback to device TTS on failure
+        await _speakOnDevice(text);
+      }
+      return;
+    }
+
+    // Device TTS path
+    await _speakOnDevice(text);
+  }
+
+  Future<void> _speakOnDevice(String text) async {
     if (!_available) {
       throw StateError('Text-to-speech is unavailable on this device');
     }
-
     await _tts.stop();
     if (!_voiceConfigured) {
       await _configurePreferredVoice();
     }
     final result = await _tts.speak(text);
-    if (result == null) {
-      return;
-    }
-
     if (result is int && result != 1) {
       _onError?.call('Text-to-speech engine returned code $result');
     }
   }
 
   Future<void> pause() async {
-    if (!_initialized || !_available) {
-      return;
-    }
-
+    if (!_initialized) return;
     try {
-      await _tts.pause();
+      if (_engine == TtsEngine.server) {
+        await _player.pause();
+      } else if (_available) {
+        await _tts.pause();
+      }
     } catch (e) {
       _onError?.call(e.toString());
     }
@@ -136,7 +182,11 @@ class TextToSpeechService {
     }
 
     try {
-      await _tts.stop();
+      if (_engine == TtsEngine.server) {
+        await _player.stop();
+      } else {
+        await _tts.stop();
+      }
     } catch (e) {
       _onError?.call(e.toString());
     }
@@ -144,6 +194,7 @@ class TextToSpeechService {
 
   Future<void> dispose() async {
     await stop();
+    await _player.dispose();
   }
 
   /// Update TTS settings on-the-fly
@@ -152,12 +203,22 @@ class TextToSpeechService {
     double? speechRate,
     double? pitch,
     double? volume,
+    TtsEngine? engine,
   }) async {
     if (!_initialized || !_available) {
+      // Allow engine and voice to update before init
+      if (engine != null) _engine = engine;
+      if (voice != null) _preferredVoice = voice;
       return;
     }
 
     try {
+      if (engine != null) {
+        _engine = engine;
+      }
+      if (voice != null) {
+        _preferredVoice = voice;
+      }
       if (volume != null) {
         await _tts.setVolume(volume);
       }
@@ -167,8 +228,10 @@ class TextToSpeechService {
       if (pitch != null) {
         await _tts.setPitch(pitch);
       }
-      // Set specific voice by name
-      await _setVoiceByName(voice);
+      // Set specific voice by name on device engine
+      if (_engine == TtsEngine.device) {
+        await _setVoiceByName(_preferredVoice);
+      }
     } catch (e) {
       _onError?.call(e.toString());
     }
@@ -224,7 +287,31 @@ class TextToSpeechService {
   /// Get available voices from the TTS engine
   Future<List<Map<String, dynamic>>> getAvailableVoices() async {
     if (!_initialized) {
-      await initialize();
+      await initialize(voice: _preferredVoice, engine: _engine);
+    }
+
+    if (_engine == TtsEngine.server && _api != null) {
+      try {
+        final serverVoices = await _api.getAvailableServerVoices();
+        final mapped = serverVoices
+            .map(
+              (v) => {
+                'name': (v['name'] ?? v['id'] ?? '').toString(),
+                'locale': (v['locale'] ?? '').toString(),
+              },
+            )
+            .where((e) => (e['name'] as String).isNotEmpty)
+            .toList();
+        if (mapped.isEmpty) {
+          return [
+            {'name': 'alloy', 'locale': ''},
+          ];
+        }
+        return mapped;
+      } catch (e) {
+        _onError?.call(e.toString());
+        // Fall back to device voices
+      }
     }
 
     if (!_available) {
diff --git a/lib/features/profile/views/app_customization_page.dart b/lib/features/profile/views/app_customization_page.dart
index ba2c4ac..bc5e01d 100644
--- a/lib/features/profile/views/app_customization_page.dart
+++ b/lib/features/profile/views/app_customization_page.dart
@@ -441,10 +441,97 @@ class AppCustomizationPage extends ConsumerWidget {
               TextStyle(color: theme.sidebarForeground, fontSize: 18),
         ),
         const SizedBox(height: Spacing.sm),
+        ConduitCard(
+          padding: const EdgeInsets.all(Spacing.md),
+          child: Column(
+            crossAxisAlignment: CrossAxisAlignment.start,
+            children: [
+              Row(
+                children: [
+                  _buildIconBadge(
+                    context,
+                    UiUtils.platformIcon(
+                      ios: CupertinoIcons.settings,
+                      android: Icons.settings_voice,
+                    ),
+                    color: theme.buttonPrimary,
+                  ),
+                  const SizedBox(width: Spacing.sm),
+                  const Text('Engine'),
+                  const Spacer(),
+                  Wrap(
+                    spacing: Spacing.sm,
+                    children: [
+                      ChoiceChip(
+                        label: const Text('On Device'),
+                        selected: settings.ttsEngine == TtsEngine.device,
+                        showCheckmark: false,
+                        selectedColor: theme.buttonPrimary,
+                        backgroundColor: theme.cardBackground,
+                        side: BorderSide(
+                          color: settings.ttsEngine == TtsEngine.device
+                              ? theme.buttonPrimary.withValues(alpha: 0.6)
+                              : theme.textPrimary.withValues(alpha: 0.2),
+                        ),
+                        labelStyle: TextStyle(
+                          color: settings.ttsEngine == TtsEngine.device
+                              ? theme.buttonPrimaryText
+                              : theme.textPrimary,
+                          fontWeight: FontWeight.w600,
+                        ),
+                        onSelected: (v) {
+                          if (v) {
+                            final notifier = ref.read(
+                              appSettingsProvider.notifier,
+                            );
+                            notifier.setTtsEngine(TtsEngine.device);
+                            // Keep previous voice (device voices)
+                          }
+                        },
+                      ),
+                      ChoiceChip(
+                        label: const Text('Server'),
+                        selected: settings.ttsEngine == TtsEngine.server,
+                        showCheckmark: false,
+                        selectedColor: theme.buttonPrimary,
+                        backgroundColor: theme.cardBackground,
+                        side: BorderSide(
+                          color: settings.ttsEngine == TtsEngine.server
+                              ? theme.buttonPrimary.withValues(alpha: 0.6)
+                              : theme.textPrimary.withValues(alpha: 0.2),
+                        ),
+                        labelStyle: TextStyle(
+                          color: settings.ttsEngine == TtsEngine.server
+                              ? theme.buttonPrimaryText
+                              : theme.textPrimary,
+                          fontWeight: FontWeight.w600,
+                        ),
+                        onSelected: (v) {
+                          if (v) {
+                            final notifier = ref.read(
+                              appSettingsProvider.notifier,
+                            );
+                            // Clear device-specific voice so server can default
+                            notifier.setTtsVoice(null);
+                            notifier.setTtsEngine(TtsEngine.server);
+                          }
+                        },
+                      ),
+                    ],
+                  ),
+                ],
+              ),
+            ],
+          ),
+        ),
+        const SizedBox(height: Spacing.sm),
         _ExpandableCard(
           title: l10n.ttsVoice,
           subtitle: _getDisplayVoiceName(
-            settings.ttsVoice,
+            settings.ttsEngine == TtsEngine.server
+                ? ((settings.ttsServerVoiceName ?? settings.ttsServerVoiceId) ??
+                      '')
+                : (settings.ttsVoice ?? ''),
             l10n.ttsSystemDefault,
           ),
           icon: UiUtils.platformIcon(
@@ -466,7 +553,11 @@ class AppCustomizationPage extends ConsumerWidget {
                 ),
                 title: l10n.ttsVoice,
                 subtitle: _getDisplayVoiceName(
-                  settings.ttsVoice,
+                  settings.ttsEngine == TtsEngine.server
+                      ? ((settings.ttsServerVoiceName ??
+                                settings.ttsServerVoiceId) ??
+                            '')
+                      : (settings.ttsVoice ?? ''),
                   l10n.ttsSystemDefault,
                 ),
                 onTap: () => _showVoicePickerSheet(context, ref, settings),
@@ -616,7 +707,10 @@ class AppCustomizationPage extends ConsumerWidget {
     final theme = context.conduitTheme;
     final ttsService = ref.read(textToSpeechServiceProvider);
 
-    // Fetch available voices
+    // Ensure the service uses the currently selected engine before fetching
+    await ttsService.updateSettings(engine: settings.ttsEngine);
+
+    // Fetch available voices from the active engine
     final allVoices = await ttsService.getAvailableVoices();
 
     if (!context.mounted) return;
@@ -729,17 +823,29 @@ class AppCustomizationPage extends ConsumerWidget {
                     style:
                         theme.bodyMedium?.copyWith(
                           color: theme.sidebarForeground,
-                          fontWeight: settings.ttsVoice == null
+                          fontWeight:
+                              (settings.ttsEngine == TtsEngine.server
+                                  ? settings.ttsServerVoiceId == null
+                                  : settings.ttsVoice == null)
                               ? FontWeight.bold
                               : FontWeight.normal,
                         ) ??
                         TextStyle(color: theme.sidebarForeground),
                   ),
-                  trailing: settings.ttsVoice == null
+                  trailing:
+                      (settings.ttsEngine == TtsEngine.server
+                          ? settings.ttsServerVoiceId == null
+                          : settings.ttsVoice == null)
                       ? Icon(Icons.check, color: theme.buttonPrimary)
                       : null,
                   onTap: () {
-                    ref.read(appSettingsProvider.notifier).setTtsVoice(null);
+                    final notifier = ref.read(appSettingsProvider.notifier);
+                    if (settings.ttsEngine == TtsEngine.server) {
+                      notifier.setTtsServerVoiceId(null);
+                      notifier.setTtsServerVoiceName(null);
+                    } else {
+                      notifier.setTtsVoice(null);
+                    }
                     Navigator.of(sheetContext).pop();
                   },
                 ),
@@ -823,7 +929,9 @@ class AppCustomizationPage extends ConsumerWidget {
                       final voiceId = _getVoiceIdentifier(voice);
                       final displayName = _formatVoiceName(voice);
                       final subtitle = _getVoiceSubtitle(voice);
-                      final isSelected = settings.ttsVoice == voiceId;
+                      final isSelected = settings.ttsEngine == TtsEngine.server
+                          ? settings.ttsServerVoiceId == voiceId
+                          : settings.ttsVoice == voiceId;
 
                       return ListTile(
                         leading: Icon(
@@ -865,9 +973,15 @@ class AppCustomizationPage extends ConsumerWidget {
                             ? Icon(Icons.check, color: theme.buttonPrimary)
                             : null,
                         onTap: () {
-                          ref
-                              .read(appSettingsProvider.notifier)
-                              .setTtsVoice(voiceId);
+                          final notifier = ref.read(
+                            appSettingsProvider.notifier,
+                          );
+                          if (settings.ttsEngine == TtsEngine.server) {
+                            notifier.setTtsServerVoiceId(voiceId);
+                            notifier.setTtsServerVoiceName(displayName);
+                          } else {
+                            notifier.setTtsVoice(voiceId);
+                          }
                           Navigator.of(sheetContext).pop();
                         },
                       );
diff --git a/pubspec.lock b/pubspec.lock
index 9b612c1..a57444b 100644
--- a/pubspec.lock
+++ b/pubspec.lock
@@ -65,6 +65,62 @@ packages:
       url: "https://pub.dev"
     source: hosted
     version: "2.13.0"
+  audioplayers:
+    dependency: "direct main"
+    description:
+      name: audioplayers
+      sha256: c05c6147124cd63e725e861335a8b4d57300b80e6e92cea7c145c739223bbaef
+      url: "https://pub.dev"
+    source: hosted
+    version: "5.2.1"
+  audioplayers_android:
+    dependency: transitive
+    description:
+      name: audioplayers_android
+      sha256: b00e1a0e11365d88576320ec2d8c192bc21f1afb6c0e5995d1c57ae63156acb5
+      url: "https://pub.dev"
+    source: hosted
+    version: "4.0.3"
+  audioplayers_darwin:
+    dependency: transitive
+    description:
+      name: audioplayers_darwin
+      sha256: "3034e99a6df8d101da0f5082dcca0a2a99db62ab1d4ddb3277bed3f6f81afe08"
+      url: "https://pub.dev"
+    source: hosted
+    version: "5.0.2"
+  audioplayers_linux:
+    dependency: transitive
+    description:
+      name: audioplayers_linux
+      sha256: "60787e73fefc4d2e0b9c02c69885402177e818e4e27ef087074cf27c02246c9e"
+      url: "https://pub.dev"
+    source: hosted
+    version: "3.1.0"
+  audioplayers_platform_interface:
+    dependency: transitive
+    description:
+      name: audioplayers_platform_interface
+      sha256: "365c547f1bb9e77d94dd1687903a668d8f7ac3409e48e6e6a3668a1ac2982adb"
+      url: "https://pub.dev"
+    source: hosted
+    version: "6.1.0"
+  audioplayers_web:
+    dependency: transitive
+    description:
+      name: audioplayers_web
+      sha256: "22cd0173e54d92bd9b2c80b1204eb1eb159ece87475ab58c9788a70ec43c2a62"
+      url: "https://pub.dev"
+    source: hosted
+    version: "4.1.0"
+  audioplayers_windows:
+    dependency: transitive
+    description:
+      name: audioplayers_windows
+      sha256: "9536812c9103563644ada2ef45ae523806b0745f7a78e89d1b5fb1951de90e1a"
+      url: "https://pub.dev"
+    source: hosted
+    version: "3.1.0"
   boolean_selector:
     dependency: transitive
     description:
diff --git a/pubspec.yaml b/pubspec.yaml
index 4aa6980..8e2d10a 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -47,6 +47,7 @@ dependencies:
   record: ^6.1.1
   stts: ^1.2.5
   flutter_tts: ^4.2.3
+  audioplayers: ^5.2.1
   image_picker: ^1.2.0
   file_picker: ^10.3.3
   path_provider: ^2.1.4

From 8ec411d6aacb7b4e2110436dbddb7599d0569e0b Mon Sep 17 00:00:00 2001
From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com>
Date: Thu, 23 Oct 2025 16:46:24 +0530
Subject: [PATCH 2/3] feat(tts): server chunked playback queue on server path

Refactor the server-backed TTS path to perform sentence chunking and
queued playback via a dedicated _startServerChunkedPlayback method
instead of generating a single monolithic audio blob.

This change simplifies the server flow, avoids constructing an entire
audio buffer in memory, and enables smoother playback and error
recovery. On errors, the code still falls back to device TTS.
---
 .../chat/services/text_to_speech_service.dart | 175 ++++++++++++++++--
 1 file changed, 157 insertions(+), 18 deletions(-)

diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart
index 65aaa86..5d344e6 100644
--- a/lib/features/chat/services/text_to_speech_service.dart
+++ b/lib/features/chat/services/text_to_speech_service.dart
@@ -19,6 +19,11 @@ class TextToSpeechService {
   bool _initialized = false;
   bool _available = false;
   bool _voiceConfigured = false;
+  int _session = 0; // increments to cancel in-flight work
+  final List<Uint8List> _buffered = <Uint8List>[]; // server chunks
+  int _expectedChunks = 0;
+  int _currentIndex = -1;
+  bool _waitingNext = false;
 
   VoidCallback? _onStart;
   VoidCallback? _onComplete;
@@ -32,7 +37,7 @@ class TextToSpeechService {
 
   TextToSpeechService({ApiService? api}) : _api = api {
     // Wire minimal player events to callbacks
-    _player.onPlayerComplete.listen((_) => _handleComplete());
+    _player.onPlayerComplete.listen((_) => _onAudioComplete());
     _player.onPlayerStateChanged.listen((s) {
       if (s == PlayerState.playing) _handleStart();
     });
@@ -120,26 +125,11 @@ class TextToSpeechService {
     }
 
     if (_engine == TtsEngine.server && _api != null) {
-      // Server-backed TTS path
+      // Server-backed TTS with sentence chunking & queued playback
       try {
-        final effectiveVoice =
-            (_preferredVoice == null || _preferredVoice!.trim().isEmpty)
-            ? 'alloy'
-            : _preferredVoice!;
-
-        final bytes = await _api.generateSpeech(
-          text: text,
-          voice: effectiveVoice,
-        );
-        if (bytes.isEmpty) {
-          throw Exception('Empty audio response');
-        }
-        await _player.stop();
-        final data = Uint8List.fromList(bytes);
-        await _player.play(BytesSource(data));
+        await _startServerChunkedPlayback(text);
       } catch (e) {
         _onError?.call(e.toString());
-        // Fallback to device TTS on failure
         await _speakOnDevice(text);
       }
       return;
@@ -182,6 +172,12 @@ class TextToSpeechService {
     }
 
     try {
+      // Cancel any in-flight server work
+      _session++;
+      _buffered.clear();
+      _expectedChunks = 0;
+      _currentIndex = -1;
+      _waitingNext = false;
       if (_engine == TtsEngine.server) {
         await _player.stop();
       } else {
@@ -341,6 +337,149 @@ class TextToSpeechService {
     }
   }
 
+  // ===== Server chunked playback =====
+
+  Future<void> _startServerChunkedPlayback(String text) async {
+    final effectiveVoice =
+        (_preferredVoice == null || _preferredVoice!.trim().isEmpty)
+        ? 'alloy'
+        : _preferredVoice!;
+
+    // Reset queue and create a new session
+    _session++;
+    final session = _session;
+    _buffered.clear();
+    _expectedChunks = 0;
+    _currentIndex = -1;
+    _waitingNext = false;
+
+    final chunks = _splitForTts(text);
+    if (chunks.isEmpty) return;
+    _expectedChunks = chunks.length;
+
+    // Fetch first chunk to start playback quickly
+    final firstBytes = await _fetchServerAudio(
+      chunks.first,
+      effectiveVoice,
+      session,
+    );
+    if (session != _session) return; // canceled
+    if (firstBytes.isEmpty) throw Exception('Empty audio response');
+
+    await _player.stop();
+    _buffered.add(Uint8List.fromList(firstBytes));
+    _currentIndex = 0;
+    await _player.play(BytesSource(_buffered.first));
+
+    // Prefetch the rest in background
+    unawaited(
+      _prefetchRemainingChunks(
+        chunks.skip(1).toList(),
+        effectiveVoice,
+        session,
+      ),
+    );
+  }
+
+  Future<void> _prefetchRemainingChunks(
+    List<String> remaining,
+    String voice,
+    int session,
+  ) async {
+    for (final chunk in remaining) {
+      if (session != _session) return; // canceled
+      try {
+        final audio = await _fetchServerAudio(chunk, voice, session);
+        if (session != _session) return;
+        if (audio.isNotEmpty) {
+          _buffered.add(Uint8List.fromList(audio));
+          // If the player finished the previous chunk and is waiting, start now
+          if (_waitingNext && (_currentIndex + 1) < _buffered.length) {
+            _waitingNext = false;
+            await _playNextIfBuffered(session);
+          }
+        }
+      } catch (e) {
+        _onError?.call(e.toString());
+        // continue with other chunks
+      }
+    }
+  }
+
+  Future<List<int>> _fetchServerAudio(
+    String text,
+    String voice,
+    int session,
+  ) async {
+    return await _api!.generateSpeech(text: text, voice: voice);
+  }
+
+  Future<void> _onAudioComplete() async {
+    final session = _session;
+    // If there are more expected chunks
+    if ((_currentIndex + 1) < _expectedChunks) {
+      // If next chunk is already buffered, play it
+      if ((_currentIndex + 1) < _buffered.length) {
+        await _playNextIfBuffered(session);
+      } else {
+        // Wait for prefetch to provide it
+        _waitingNext = true;
+      }
+      return;
+    }
+    // No more chunks – this is the real completion
+    _handleComplete();
+  }
+
+  Future<void> _playNextIfBuffered(int session) async {
+    if (session != _session) return;
+    final nextIndex = _currentIndex + 1;
+    if (nextIndex < 0 || nextIndex >= _buffered.length) return;
+    _currentIndex = nextIndex;
+    final bytes = _buffered[nextIndex];
+    await _player.play(BytesSource(bytes));
+  }
+
+  List<String> _splitForTts(String text) {
+    // Collapse whitespace runs to single spaces before splitting
+    final normalized = text.replaceAll(RegExp(r"\s+"), ' ').trim();
+    if (normalized.isEmpty) return const [];
+
+    // A sentence ends at a [.!?] run followed by whitespace or end of input
+    final parts = <String>[];
+    final sentenceRegex = RegExp(r"(.+?[\.!?]+)(\s+|$)");
+    int index = 0;
+    for (final match in sentenceRegex.allMatches('$normalized ')) {
+      final s = match.group(1) ?? '';
+      if (s.trim().isNotEmpty) parts.add(s.trim());
+      index = match.end;
+    }
+    if (index < normalized.length) {
+      final tail = normalized.substring(index).trim();
+      if (tail.isNotEmpty) parts.add(tail);
+    }
+
+    // Break any sentence longer than maxLen into smaller pieces
+    const maxLen = 300;
+    final chunks = <String>[];
+    for (final p in parts.isEmpty ? [normalized] : parts) {
+      if (p.length <= maxLen) {
+        chunks.add(p);
+      } else {
+        // Cut at the last whitespace at or before maxLen, else hard-cut there
+        var remaining = p;
+        while (remaining.length > maxLen) {
+          int cut = remaining.lastIndexOf(RegExp(r",\s|\s"), maxLen);
+          cut = cut <= 0 ? maxLen : cut;
+          chunks.add(remaining.substring(0, cut).trim());
+          remaining = remaining.substring(cut).trim();
+        }
+        if (remaining.isNotEmpty) chunks.add(remaining);
+      }
+    }
+    return chunks;
+  }
+
   Future<void> _configurePreferredVoice() async {
     if (_voiceConfigured) {
       return;

From 56246507de5e4e1a95d37de871fd41059961a4c6 Mon Sep 17 00:00:00 2001
From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com>
Date: Thu, 23 Oct 2025 17:05:35 +0530
Subject: [PATCH 3/3] feat(tts): add karaoke-style TTS progress bar to
 assistant UI

Add rendering and support for a karaoke-style text-to-speech
progress bar in assistant messages so users can see the currently
spoken sentence and highlighted word during playback.

- Append TTS karaoke bar to AssistantMessageWidget when the message is
  the active TTS target and playback is speaking/paused/loading.
- Implement _buildKaraokeBar to render the active sentence with a
  highlighted word span, using ConduitCard and theme styles.
- Import conduit_components for shared UI primitives.
- Extend TextToSpeechState with sentence data:
  sentences, sentenceOffsets, activeSentenceIndex, and per-word
  progress (wordStartInSentence, wordEndInSentence).
- Add provider callbacks wiring: onSentenceIndex and
  onDeviceWordProgress handlers (hooked into TTS backend).
- Prepare sentence splitting and word-progress plumbing in the TTS
  provider (prepares data used to drive the karaoke display).

This change improves UX by visually indicating the spoken sentence
and current word during TTS playback, aiding comprehension and
accessibility.
---
 .../providers/text_to_speech_provider.dart    | 100 +++++++++++++++++-
 .../chat/services/text_to_speech_service.dart |  16 +++
 .../widgets/assistant_message_widget.dart     |  61 +++++++++++
 3 files changed, 176 insertions(+), 1 deletion(-)

diff --git a/lib/features/chat/providers/text_to_speech_provider.dart b/lib/features/chat/providers/text_to_speech_provider.dart
index dc234d9..a68aff4 100644
--- a/lib/features/chat/providers/text_to_speech_provider.dart
+++ b/lib/features/chat/providers/text_to_speech_provider.dart
@@ -15,6 +15,11 @@ class TextToSpeechState {
   final TtsPlaybackStatus status;
   final String? activeMessageId;
   final String? errorMessage;
+  final List<String> sentences;
+  final List<int> sentenceOffsets; // start indices in full text
+  final int activeSentenceIndex; // -1 when none
+  final int? wordStartInSentence; // nullable; only for on-device
+  final int? wordEndInSentence; // nullable; only for on-device
 
   const TextToSpeechState({
     this.initialized = false,
@@ -22,6 +27,11 @@ class TextToSpeechState {
     this.status = TtsPlaybackStatus.idle,
     this.activeMessageId,
     this.errorMessage,
+    this.sentences = const [],
+    this.sentenceOffsets = const [],
+    this.activeSentenceIndex = -1,
+    this.wordStartInSentence,
+    this.wordEndInSentence,
   });
 
   bool get isSpeaking => status == TtsPlaybackStatus.speaking;
@@ -37,6 +47,12 @@ class TextToSpeechState {
     bool clearActiveMessageId = false,
     String? errorMessage,
     bool clearErrorMessage = false,
+    List<String>? sentences,
+    List<int>? sentenceOffsets,
+    int? activeSentenceIndex,
+    bool clearWord = false,
+    int? wordStartInSentence,
+    int? wordEndInSentence,
   }) {
     return TextToSpeechState(
       initialized: initialized ?? this.initialized,
@@ -48,6 +64,15 @@ class TextToSpeechState {
       errorMessage: clearErrorMessage
           ? null
           : errorMessage ?? this.errorMessage,
+      sentences: sentences ?? this.sentences,
+      sentenceOffsets: sentenceOffsets ?? this.sentenceOffsets,
+      activeSentenceIndex: activeSentenceIndex ?? this.activeSentenceIndex,
+      wordStartInSentence: clearWord
+          ? null
+          : (wordStartInSentence ?? this.wordStartInSentence),
+      wordEndInSentence: clearWord
+          ? null
+          : (wordEndInSentence ?? this.wordEndInSentence),
     );
   }
 }
@@ -70,6 +95,8 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
         onPause: _handlePause,
         onContinue: _handleContinue,
         onError: _handleError,
+        onSentenceIndex: _handleSentenceIndex,
+        onDeviceWordProgress: _handleDeviceWordProgress,
       );
 
       ref.onDispose(() {
@@ -184,15 +211,23 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
       return;
     }
 
+    // Prepare sentence split for highlighting
+    final cleanText = MarkdownToText.convert(text);
+    final sentences = _splitForTts(cleanText);
+    final offsets = _computeOffsets(sentences);
+
     state = state.copyWith(
       status: TtsPlaybackStatus.loading,
       activeMessageId: messageId,
       clearErrorMessage: true,
+      sentences: sentences,
+      sentenceOffsets: offsets,
+      activeSentenceIndex: sentences.isEmpty ? -1 : 0,
+      clearWord: true,
     );
 
     try {
       // Convert markdown to clean text for TTS
-      final cleanText = MarkdownToText.convert(text);
       if (cleanText.isEmpty) {
         // No speakable content
         if (!ref.mounted) {
@@ -224,6 +259,34 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
     }
   }
 
+  /// Splits [text] into trimmed sentences after collapsing whitespace runs.
+  List<String> _splitForTts(String text) {
+    final normalized = text.replaceAll(RegExp(r"\s+"), ' ').trim();
+    if (normalized.isEmpty) return const [];
+    final parts = <String>[];
+    // A sentence ends at a [.!?] run followed by whitespace or end of input.
+    final sentenceRegex = RegExp(r"(.+?[\.!?]+)(\s+|$)");
+    int index = 0;
+    for (final match in sentenceRegex.allMatches('$normalized ')) {
+      final s = match.group(1) ?? '';
+      if (s.trim().isNotEmpty) parts.add(s.trim());
+      index = match.end;
+    }
+    final tail = index < normalized.length ? normalized.substring(index) : '';
+    if (tail.trim().isNotEmpty) parts.add(tail.trim());
+    return parts;
+  }
+
+  List<int> _computeOffsets(List<String> sentences) {
+    final offsets = <int>[];
+    int acc = 0;
+    for (final s in sentences) {
+      offsets.add(acc);
+      acc += s.length + 1; // assume a space or punctuation between
+    }
+    return offsets;
+  }
+
   Future<void> pause() async {
     if (!state.initialized || !state.available) {
       return;
@@ -294,6 +357,41 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
       clearActiveMessageId: true,
     );
   }
+
+  void _handleSentenceIndex(int index) {
+    if (!ref.mounted) return;
+    final clamped = index.clamp(
+      -1,
+      state.sentences.isEmpty ? -1 : state.sentences.length - 1,
+    );
+    state = state.copyWith(
+      activeSentenceIndex: clamped,
+      // clear per-word highlight when sentence switches (server or device)
+      clearWord: true,
+    );
+  }
+
+  void _handleDeviceWordProgress(int start, int end) {
+    if (!ref.mounted) return;
+    // Map global offsets to sentence index
+    final offsets = state.sentenceOffsets;
+    if (offsets.isEmpty) return;
+    int idx = 0;
+    for (var i = 0; i < offsets.length; i++) {
+      final sStart = offsets[i];
+      final sEnd = i + 1 < offsets.length ? offsets[i + 1] : 1 << 30;
+      if (start >= sStart && start < sEnd) {
+        idx = i;
+        break;
+      }
+    }
+    final sentenceStart = offsets[idx];
+    state = state.copyWith(
+      activeSentenceIndex: idx,
+      wordStartInSentence: (start - sentenceStart).clamp(0, 1 << 20),
+      wordEndInSentence: (end - sentenceStart).clamp(0, 1 << 20),
+    );
+  }
 }
 
 final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {
diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart
index 5d344e6..9f01ebb 100644
--- a/lib/features/chat/services/text_to_speech_service.dart
+++ b/lib/features/chat/services/text_to_speech_service.dart
@@ -31,6 +31,8 @@ class TextToSpeechService {
   VoidCallback? _onPause;
   VoidCallback? _onContinue;
   void Function(String message)? _onError;
+  void Function(int sentenceIndex)? _onSentenceIndex;
+  void Function(int start, int end)? _onDeviceWordProgress;
 
   bool get isInitialized => _initialized;
   bool get isAvailable => _available;
@@ -51,6 +53,8 @@ class TextToSpeechService {
     VoidCallback? onPause,
     VoidCallback? onContinue,
     void Function(String message)? onError,
+    void Function(int sentenceIndex)? onSentenceIndex,
+    void Function(int start, int end)? onDeviceWordProgress,
   }) {
     _onStart = onStart;
     _onComplete = onComplete;
@@ -58,6 +62,8 @@ class TextToSpeechService {
     _onPause = onPause;
     _onContinue = onContinue;
     _onError = onError;
+    _onSentenceIndex = onSentenceIndex;
+    _onDeviceWordProgress = onDeviceWordProgress;
 
     _tts.setStartHandler(_handleStart);
     _tts.setCompletionHandler(_handleComplete);
@@ -65,6 +71,13 @@ class TextToSpeechService {
     _tts.setPauseHandler(_handlePause);
     _tts.setContinueHandler(_handleContinue);
     _tts.setErrorHandler(_handleError);
+    try {
+      _tts.setProgressHandler((String text, int start, int end, String word) {
+        _onDeviceWordProgress?.call(start, end);
+      });
+    } catch (_) {
+      // Some platforms may not support progress handler
+    }
   }
 
   /// Initialize the native TTS engine lazily
@@ -151,6 +164,7 @@ class TextToSpeechService {
     if (result is int && result != 1) {
       _onError?.call('Text-to-speech engine returned code $result');
     }
+    _onSentenceIndex?.call(0);
   }
 
   Future<void> pause() async {
@@ -370,6 +384,7 @@ class TextToSpeechService {
     _buffered.add(Uint8List.fromList(firstBytes));
     _currentIndex = 0;
     await _player.play(BytesSource(_buffered.first));
+    _onSentenceIndex?.call(0);
 
     // Prefetch the rest in background
     unawaited(
@@ -438,6 +453,7 @@ class TextToSpeechService {
     _currentIndex = nextIndex;
     final bytes = _buffered[nextIndex];
     await _player.play(BytesSource(bytes));
+    _onSentenceIndex?.call(_currentIndex);
   }
 
   List<String> _splitForTts(String text) {
diff --git a/lib/features/chat/widgets/assistant_message_widget.dart b/lib/features/chat/widgets/assistant_message_widget.dart
index 370ee77..5d6a42b 100644
--- a/lib/features/chat/widgets/assistant_message_widget.dart
+++ b/lib/features/chat/widgets/assistant_message_widget.dart
@@ -18,6 +18,7 @@ import 'package:conduit/l10n/app_localizations.dart';
 import 'enhanced_attachment.dart';
 import 'package:conduit/shared/widgets/chat_action_button.dart';
 import '../../../shared/widgets/model_avatar.dart';
+import '../../../shared/widgets/conduit_components.dart';
 import 'package:url_launcher/url_launcher_string.dart';
 import '../providers/chat_providers.dart' show sendMessageWithContainer;
 import '../../../core/utils/debug_logger.dart';
@@ -457,12 +458,72 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
     }
 
     if (children.isEmpty) return const SizedBox.shrink();
+    // Append TTS karaoke bar if this is the active message
+    final ttsState = ref.watch(textToSpeechControllerProvider);
+    final isActive =
+        ttsState.activeMessageId == _messageId &&
+        (ttsState.status == TtsPlaybackStatus.speaking ||
+            ttsState.status == TtsPlaybackStatus.paused ||
+            ttsState.status == TtsPlaybackStatus.loading);
+    if (isActive && ttsState.activeSentenceIndex >= 0) {
+      children.add(const SizedBox(height: Spacing.sm));
+      children.add(_buildKaraokeBar(ttsState));
+    }
+
     return Column(
       crossAxisAlignment: CrossAxisAlignment.start,
       children: children,
     );
   }
 
+  Widget _buildKaraokeBar(TextToSpeechState ttsState) {
+    final theme = context.conduitTheme;
+    final idx = ttsState.activeSentenceIndex;
+    if (idx < 0 || idx >= ttsState.sentences.length) {
+      return const SizedBox.shrink();
+    }
+    final sentence = ttsState.sentences[idx];
+    final ws = ttsState.wordStartInSentence;
+    final we = ttsState.wordEndInSentence;
+
+    final baseStyle = TextStyle(
+      color: theme.textPrimary,
+      height: 1.2,
+      fontSize: 14,
+    );
+    final highlightStyle = baseStyle.copyWith(
+      backgroundColor: theme.buttonPrimary.withValues(alpha: 0.25),
+      color: theme.textPrimary,
+      fontWeight: FontWeight.w600,
+    );
+
+    InlineSpan buildSpans() {
+      if (ws == null ||
+          we == null ||
+          ws < 0 ||
+          we <= ws ||
+          ws >= sentence.length) {
+        return TextSpan(text: sentence, style: baseStyle);
+      }
+      final safeEnd = we.clamp(0, sentence.length);
+      final before = sentence.substring(0, ws);
+      final word = sentence.substring(ws, safeEnd);
+      final after = sentence.substring(safeEnd);
+      return TextSpan(
+        children: [
+          if (before.isNotEmpty) TextSpan(text: before, style: baseStyle),
+          TextSpan(text: word, style: highlightStyle),
+          if (after.isNotEmpty) TextSpan(text: after, style: baseStyle),
+        ],
+      );
+    }
+
+    return ConduitCard(
+      padding: const EdgeInsets.all(Spacing.sm),
+      child: RichText(text: buildSpans()),
+    );
+  }
+
   bool get _shouldShowTypingIndicator =>
       widget.isStreaming && _isAssistantResponseEmpty;