feat(tts): add karaoke-style TTS progress bar to assistant UI

Add rendering and support for a karaoke-style text-to-speech progress bar in assistant messages so users can see the currently spoken sentence and the highlighted word during playback.

- Append the TTS karaoke bar to AssistantMessageWidget when the message is the active TTS target and playback is speaking, paused, or loading.
- Implement _buildKaraokeBar to render the active sentence with a highlighted word span, using ConduitCard and theme styles.
- Import conduit_components for shared UI primitives.
- Extend TextToSpeechState with sentence data: sentences, sentenceOffsets, activeSentenceIndex, and per-word progress (wordStartInSentence, wordEndInSentence).
- Add provider callback wiring: onSentenceIndex and onDeviceWordProgress handlers (hooked into the TTS backend).
- Add sentence splitting and word-progress plumbing to the TTS provider; this produces the data that drives the karaoke display.

This change improves UX by visually indicating the spoken sentence and current word during TTS playback, aiding comprehension and accessibility.
2025-10-23 17:05:35 +05:30
parent 8ec411d6aa
commit 56246507de
3 changed files with 176 additions and 1 deletions
@@ -15,6 +15,11 @@ class TextToSpeechState {
  final TtsPlaybackStatus status;
  final String? activeMessageId;
  final String? errorMessage;
+  final List<String> sentences;
+  final List<int> sentenceOffsets; // start indices in full text
+  final int activeSentenceIndex; // -1 when none
+  final int? wordStartInSentence; // nullable; only for on-device
+  final int? wordEndInSentence; // nullable; only for on-device

  const TextToSpeechState({
    this.initialized = false,
@@ -22,6 +27,11 @@ class TextToSpeechState {
    this.status = TtsPlaybackStatus.idle,
    this.activeMessageId,
    this.errorMessage,
+    this.sentences = const [],
+    this.sentenceOffsets = const [],
+    this.activeSentenceIndex = -1,
+    this.wordStartInSentence,
+    this.wordEndInSentence,
  });

  bool get isSpeaking => status == TtsPlaybackStatus.speaking;
@@ -37,6 +47,12 @@ class TextToSpeechState {
    bool clearActiveMessageId = false,
    String? errorMessage,
    bool clearErrorMessage = false,
+    List<String>? sentences,
+    List<int>? sentenceOffsets,
+    int? activeSentenceIndex,
+    bool clearWord = false,
+    int? wordStartInSentence,
+    int? wordEndInSentence,
  }) {
    return TextToSpeechState(
      initialized: initialized ?? this.initialized,
@@ -48,6 +64,15 @@ class TextToSpeechState {
      errorMessage: clearErrorMessage
          ? null
          : errorMessage ?? this.errorMessage,
+      sentences: sentences ?? this.sentences,
+      sentenceOffsets: sentenceOffsets ?? this.sentenceOffsets,
+      activeSentenceIndex: activeSentenceIndex ?? this.activeSentenceIndex,
+      wordStartInSentence: clearWord
+          ? null
+          : (wordStartInSentence ?? this.wordStartInSentence),
+      wordEndInSentence: clearWord
+          ? null
+          : (wordEndInSentence ?? this.wordEndInSentence),
    );
  }
 }
@@ -70,6 +95,8 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
        onPause: _handlePause,
        onContinue: _handleContinue,
        onError: _handleError,
+        onSentenceIndex: _handleSentenceIndex,
+        onDeviceWordProgress: _handleDeviceWordProgress,
      );

      ref.onDispose(() {
@@ -184,15 +211,23 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
      return;
    }

+    // Prepare sentence split for highlighting
+    final cleanText = MarkdownToText.convert(text);
+    final sentences = _splitForTts(cleanText);
+    final offsets = _computeOffsets(sentences);
+
    state = state.copyWith(
      status: TtsPlaybackStatus.loading,
      activeMessageId: messageId,
      clearErrorMessage: true,
+      sentences: sentences,
+      sentenceOffsets: offsets,
+      activeSentenceIndex: sentences.isEmpty ? -1 : 0,
+      clearWord: true,
    );

    try {
      // Convert markdown to clean text for TTS
-      final cleanText = MarkdownToText.convert(text);
      if (cleanText.isEmpty) {
        // No speakable content
        if (!ref.mounted) {
@@ -224,6 +259,34 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
    }
  }

+  /// Splits [text] into sentence-sized chunks for karaoke highlighting.
+  ///
+  /// Whitespace runs are collapsed to a single space and the text is trimmed
+  /// first. A sentence ends at a run of `.`, `!` or `?` that is followed by
+  /// whitespace or the end of the text; any trailing text without closing
+  /// punctuation becomes the final chunk. Returns an empty list when [text]
+  /// contains no non-whitespace characters.
+  ///
+  /// NOTE(review): TextToSpeechService declares its own `_splitForTts`, and
+  /// the sentence indices it reports are applied to this list — confirm both
+  /// implementations split identically.
+  List<String> _splitForTts(String text) {
+    final normalized = text.replaceAll(RegExp(r'\s+'), ' ').trim();
+    if (normalized.isEmpty) return const [];
+    final parts = <String>[];
+    // `$` must stay unescaped: in a raw string `\$` matches a literal dollar
+    // sign, which split text such as "5.$" and silently dropped the `$`.
+    final sentenceRegex = RegExp(r'(.+?[.!?]+)(?:\s+|$)');
+    var consumed = 0;
+    for (final match in sentenceRegex.allMatches(normalized)) {
+      final sentence = match.group(1)?.trim() ?? '';
+      if (sentence.isNotEmpty) parts.add(sentence);
+      consumed = match.end;
+    }
+    // Whatever follows the last terminated sentence is kept as its own chunk.
+    if (consumed < normalized.length) {
+      final tail = normalized.substring(consumed).trim();
+      if (tail.isNotEmpty) parts.add(tail);
+    }
+    return parts;
+  }
+
+  /// Returns the start offset of each entry in [sentences].
+  ///
+  /// The first sentence starts at 0; every following one starts one
+  /// character after the previous sentence ends, i.e. exactly one separator
+  /// character is assumed between neighbouring sentences.
+  List<int> _computeOffsets(List<String> sentences) {
+    final starts = List<int>.filled(sentences.length, 0, growable: true);
+    for (var i = 1; i < sentences.length; i++) {
+      // Previous start + previous length + the single assumed separator.
+      starts[i] = starts[i - 1] + sentences[i - 1].length + 1;
+    }
+    return starts;
+  }
+
  Future<void> pause() async {
    if (!state.initialized || !state.available) {
      return;
@@ -294,6 +357,41 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
      clearActiveMessageId: true,
    );
  }
+
+  /// Records which sentence the TTS service reports as currently playing.
+  ///
+  /// [index] is clamped to `-1 .. sentences.length - 1`, so an empty sentence
+  /// list always yields -1. Any per-word highlight is cleared because it
+  /// belonged to the previously active sentence.
+  void _handleSentenceIndex(int index) {
+    if (!ref.mounted) return;
+    // For an empty list this is -1, which collapses the clamp range to -1.
+    final lastIndex = state.sentences.length - 1;
+    final activeIndex = index.clamp(-1, lastIndex);
+    state = state.copyWith(activeSentenceIndex: activeIndex, clearWord: true);
+  }
+
+  /// Translates a word-progress event into a sentence-relative highlight.
+  ///
+  /// [start] and [end] are the offsets passed along from the TTS progress
+  /// handler. They are matched against `state.sentenceOffsets` to pick the
+  /// active sentence, then rebased onto that sentence's start.
+  ///
+  /// NOTE(review): this presumes the reported offsets index into the same
+  /// text that the sentence offsets were computed from — confirm.
+  void _handleDeviceWordProgress(int start, int end) {
+    if (!ref.mounted) return;
+    final sentenceStarts = state.sentenceOffsets;
+    if (sentenceStarts.isEmpty) return;
+    // Pick the first sentence whose [start, nextStart) range contains
+    // `start`; the last sentence is treated as open-ended. Stays at 0 when
+    // no range matches.
+    var sentenceIndex = 0;
+    final lastIndex = sentenceStarts.length - 1;
+    for (var i = 0; i <= lastIndex; i++) {
+      final upperBound = i < lastIndex ? sentenceStarts[i + 1] : 1 << 30;
+      final containsStart = start >= sentenceStarts[i] && start < upperBound;
+      if (!containsStart) continue;
+      sentenceIndex = i;
+      break;
+    }
+    final base = sentenceStarts[sentenceIndex];
+    state = state.copyWith(
+      activeSentenceIndex: sentenceIndex,
+      wordStartInSentence: (start - base).clamp(0, 1 << 20),
+      wordEndInSentence: (end - base).clamp(0, 1 << 20),
+    );
+  }
 }

 final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {
@@ -31,6 +31,8 @@ class TextToSpeechService {
  VoidCallback? _onPause;
  VoidCallback? _onContinue;
  void Function(String message)? _onError;
+  void Function(int sentenceIndex)? _onSentenceIndex;
+  void Function(int start, int end)? _onDeviceWordProgress;

  bool get isInitialized => _initialized;
  bool get isAvailable => _available;
@@ -51,6 +53,8 @@ class TextToSpeechService {
    VoidCallback? onPause,
    VoidCallback? onContinue,
    void Function(String message)? onError,
+    void Function(int sentenceIndex)? onSentenceIndex,
+    void Function(int start, int end)? onDeviceWordProgress,
  }) {
    _onStart = onStart;
    _onComplete = onComplete;
@@ -58,6 +62,8 @@ class TextToSpeechService {
    _onPause = onPause;
    _onContinue = onContinue;
    _onError = onError;
+    _onSentenceIndex = onSentenceIndex;
+    _onDeviceWordProgress = onDeviceWordProgress;

    _tts.setStartHandler(_handleStart);
    _tts.setCompletionHandler(_handleComplete);
@@ -65,6 +71,13 @@ class TextToSpeechService {
    _tts.setPauseHandler(_handlePause);
    _tts.setContinueHandler(_handleContinue);
    _tts.setErrorHandler(_handleError);
+    try {
+      _tts.setProgressHandler((String text, int start, int end, String word) {
+        _onDeviceWordProgress?.call(start, end);
+      });
+    } catch (_) {
+      // Some platforms may not support progress handler
+    }
  }

  /// Initialize the native TTS engine lazily
@@ -151,6 +164,7 @@ class TextToSpeechService {
    if (result is int && result != 1) {
      _onError?.call('Text-to-speech engine returned code $result');
    }
+    _onSentenceIndex?.call(0);
  }

  Future<void> pause() async {
@@ -370,6 +384,7 @@ class TextToSpeechService {
    _buffered.add(Uint8List.fromList(firstBytes));
    _currentIndex = 0;
    await _player.play(BytesSource(_buffered.first));
+    _onSentenceIndex?.call(0);

    // Prefetch the rest in background
    unawaited(
@@ -438,6 +453,7 @@ class TextToSpeechService {
    _currentIndex = nextIndex;
    final bytes = _buffered[nextIndex];
    await _player.play(BytesSource(bytes));
+    _onSentenceIndex?.call(_currentIndex);
  }

  List<String> _splitForTts(String text) {
@@ -18,6 +18,7 @@ import 'package:conduit/l10n/app_localizations.dart';
 import 'enhanced_attachment.dart';
 import 'package:conduit/shared/widgets/chat_action_button.dart';
 import '../../../shared/widgets/model_avatar.dart';
+import '../../../shared/widgets/conduit_components.dart';
 import 'package:url_launcher/url_launcher_string.dart';
 import '../providers/chat_providers.dart' show sendMessageWithContainer;
 import '../../../core/utils/debug_logger.dart';
@@ -457,12 +458,72 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
    }

    if (children.isEmpty) return const SizedBox.shrink();
+    // Append TTS karaoke bar if this is the active message
+    final ttsState = ref.watch(textToSpeechControllerProvider);
+    final isActive =
+        ttsState.activeMessageId == _messageId &&
+        (ttsState.status == TtsPlaybackStatus.speaking ||
+            ttsState.status == TtsPlaybackStatus.paused ||
+            ttsState.status == TtsPlaybackStatus.loading);
+    if (isActive && ttsState.activeSentenceIndex >= 0) {
+      children.add(const SizedBox(height: Spacing.sm));
+      children.add(_buildKaraokeBar(ttsState));
+    }
+
    return Column(
      crossAxisAlignment: CrossAxisAlignment.start,
      children: children,
    );
  }

+  /// Builds the karaoke bar showing the sentence currently being spoken.
+  ///
+  /// Renders `ttsState.sentences[activeSentenceIndex]` inside a
+  /// [ConduitCard]. When a usable word range is present, the text is split
+  /// into before / word / after spans and the word is highlighted; otherwise
+  /// the sentence is drawn in the base style. Returns an empty box when the
+  /// active index does not point into the sentence list.
+  Widget _buildKaraokeBar(TextToSpeechState ttsState) {
+    final theme = context.conduitTheme;
+    final sentenceIndex = ttsState.activeSentenceIndex;
+    if (sentenceIndex < 0 || sentenceIndex >= ttsState.sentences.length) {
+      return const SizedBox.shrink();
+    }
+    final sentence = ttsState.sentences[sentenceIndex];
+    final wordStart = ttsState.wordStartInSentence;
+    final wordEnd = ttsState.wordEndInSentence;
+    final baseStyle = TextStyle(
+      color: theme.textPrimary,
+      height: 1.2,
+      fontSize: 14,
+    );
+    final highlightStyle = baseStyle.copyWith(
+      backgroundColor: theme.buttonPrimary.withValues(alpha: 0.25),
+      color: theme.textPrimary,
+      fontWeight: FontWeight.w600,
+    );
+    final InlineSpan karaokeText;
+    if (wordStart != null &&
+        wordEnd != null &&
+        wordStart >= 0 &&
+        wordEnd > wordStart &&
+        wordStart < sentence.length) {
+      // The end may run past the sentence; cap it before slicing.
+      final highlightEnd = wordEnd.clamp(0, sentence.length);
+      final leading = sentence.substring(0, wordStart);
+      final spoken = sentence.substring(wordStart, highlightEnd);
+      final trailing = sentence.substring(highlightEnd);
+      karaokeText = TextSpan(
+        children: [
+          if (leading.isNotEmpty) TextSpan(text: leading, style: baseStyle),
+          TextSpan(text: spoken, style: highlightStyle),
+          if (trailing.isNotEmpty) TextSpan(text: trailing, style: baseStyle),
+        ],
+      );
+    } else {
+      // No usable word range: show the whole sentence without a highlight.
+      karaokeText = TextSpan(text: sentence, style: baseStyle);
+    }
+    return ConduitCard(
+      padding: const EdgeInsets.all(Spacing.sm),
+      child: RichText(text: karaokeText),
+    );
+  }
+
  bool get _shouldShowTypingIndicator =>
      widget.isStreaming && _isAssistantResponseEmpty;