feat(tts): add karaoke-style TTS progress bar to assistant UI

Add rendering and support for a karaoke-style text-to-speech
progress bar in assistant messages so users can see the currently
spoken sentence and highlighted word during playback.

- Append TTS karaoke bar to AssistantMessageWidget when the message is
  the active TTS target and playback is speaking/paused/loading.
- Implement _buildKaraokeBar to render the active sentence with a
  highlighted word span, using ConduitCard and theme styles.
- Import conduit_components for shared UI primitives.
- Extend TextToSpeechState with sentence data:
  sentences, sentenceOffsets, activeSentenceIndex, and per-word
  progress (wordStartInSentence, wordEndInSentence).
- Add provider callbacks wiring: onSentenceIndex and
  onDeviceWordProgress handlers (hooked into TTS backend).
- Prepare sentence splitting and word-progress plumbing in the TTS
  provider (prepares data used to drive the karaoke display).

This change improves UX by visually indicating the spoken sentence
and current word during TTS playback, aiding comprehension and
accessibility.
This commit is contained in:
cogwheel0
2025-10-23 17:05:35 +05:30
parent 8ec411d6aa
commit 56246507de
3 changed files with 176 additions and 1 deletions

View File

@@ -15,6 +15,11 @@ class TextToSpeechState {
final TtsPlaybackStatus status;
final String? activeMessageId;
final String? errorMessage;
final List<String> sentences;
final List<int> sentenceOffsets; // start indices in full text
final int activeSentenceIndex; // -1 when none
final int? wordStartInSentence; // nullable; only for on-device
final int? wordEndInSentence; // nullable; only for on-device
const TextToSpeechState({
this.initialized = false,
@@ -22,6 +27,11 @@ class TextToSpeechState {
this.status = TtsPlaybackStatus.idle,
this.activeMessageId,
this.errorMessage,
this.sentences = const [],
this.sentenceOffsets = const [],
this.activeSentenceIndex = -1,
this.wordStartInSentence,
this.wordEndInSentence,
});
bool get isSpeaking => status == TtsPlaybackStatus.speaking;
@@ -37,6 +47,12 @@ class TextToSpeechState {
bool clearActiveMessageId = false,
String? errorMessage,
bool clearErrorMessage = false,
List<String>? sentences,
List<int>? sentenceOffsets,
int? activeSentenceIndex,
bool clearWord = false,
int? wordStartInSentence,
int? wordEndInSentence,
}) {
return TextToSpeechState(
initialized: initialized ?? this.initialized,
@@ -48,6 +64,15 @@ class TextToSpeechState {
errorMessage: clearErrorMessage
? null
: errorMessage ?? this.errorMessage,
sentences: sentences ?? this.sentences,
sentenceOffsets: sentenceOffsets ?? this.sentenceOffsets,
activeSentenceIndex: activeSentenceIndex ?? this.activeSentenceIndex,
wordStartInSentence: clearWord
? null
: (wordStartInSentence ?? this.wordStartInSentence),
wordEndInSentence: clearWord
? null
: (wordEndInSentence ?? this.wordEndInSentence),
);
}
}
@@ -70,6 +95,8 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
onPause: _handlePause,
onContinue: _handleContinue,
onError: _handleError,
onSentenceIndex: _handleSentenceIndex,
onDeviceWordProgress: _handleDeviceWordProgress,
);
ref.onDispose(() {
@@ -184,15 +211,23 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
return;
}
// Prepare sentence split for highlighting
final cleanText = MarkdownToText.convert(text);
final sentences = _splitForTts(cleanText);
final offsets = _computeOffsets(sentences);
state = state.copyWith(
status: TtsPlaybackStatus.loading,
activeMessageId: messageId,
clearErrorMessage: true,
sentences: sentences,
sentenceOffsets: offsets,
activeSentenceIndex: sentences.isEmpty ? -1 : 0,
clearWord: true,
);
try {
// Convert markdown to clean text for TTS
final cleanText = MarkdownToText.convert(text);
if (cleanText.isEmpty) {
// No speakable content
if (!ref.mounted) {
@@ -224,6 +259,34 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
}
}
/// Splits [text] into sentence-sized chunks used for karaoke highlighting.
///
/// Whitespace runs are collapsed to a single space first. A sentence ends at
/// one or more of `.`, `!` or `?` followed by whitespace or the end of the
/// text. Any trailing text without terminal punctuation becomes the final
/// chunk. Returns an empty list when [text] contains no non-whitespace
/// characters.
///
/// NOTE(review): the TTS service appears to have its own `_splitForTts`;
/// sentence indices reported by the service only line up with this list if
/// both splitters agree — confirm they stay in sync.
List<String> _splitForTts(String text) {
  final normalized = text.replaceAll(RegExp(r'\s+'), ' ').trim();
  if (normalized.isEmpty) return const [];

  // `$` must be the end-of-input anchor. The previous pattern used `\$`
  // inside a raw string, which matches a literal dollar sign and caused
  // input such as "Done.$5" to be split after the period with the "$"
  // silently dropped.
  final sentenceRegex = RegExp(r'(.+?[.!?]+)(?:\s+|$)');

  final parts = <String>[];
  var consumed = 0;
  for (final match in sentenceRegex.allMatches(normalized)) {
    final sentence = match.group(1)?.trim() ?? '';
    if (sentence.isNotEmpty) parts.add(sentence);
    consumed = match.end;
  }

  // Keep whatever follows the last terminated sentence (no closing
  // punctuation) as its own chunk.
  if (consumed < normalized.length) {
    final tail = normalized.substring(consumed).trim();
    if (tail.isNotEmpty) parts.add(tail);
  }
  return parts;
}
/// Returns the start offset of each entry of [sentences] within their
/// space-joined concatenation.
///
/// Entry `i` is the sum of the lengths of all preceding sentences plus one
/// separator character per preceding sentence; the first entry is always 0.
List<int> _computeOffsets(List<String> sentences) {
  final starts = List<int>.filled(sentences.length, 0, growable: true);
  for (var i = 1; i < sentences.length; i++) {
    // Previous start + previous sentence + one assumed separator character.
    starts[i] = starts[i - 1] + sentences[i - 1].length + 1;
  }
  return starts;
}
Future<void> pause() async {
if (!state.initialized || !state.available) {
return;
@@ -294,6 +357,41 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
clearActiveMessageId: true,
);
}
/// Records [index] as the active sentence.
///
/// The value is bounded to `-1 ..= sentences.length - 1` (so it is always
/// `-1` while no sentences are stored), and any per-word highlight is
/// cleared because it belonged to the previous sentence.
void _handleSentenceIndex(int index) {
  if (!ref.mounted) {
    return;
  }
  // `length - 1` is already -1 for an empty list, which is the "none" marker.
  final lastIndex = state.sentences.length - 1;
  final bounded = index.clamp(-1, lastIndex);
  state = state.copyWith(activeSentenceIndex: bounded, clearWord: true);
}
/// Translates a word range reported by the TTS progress callback into a
/// sentence index plus a sentence-relative word range, and stores both in
/// the state.
///
/// [start] and [end] are compared against the stored sentence start
/// offsets. When [start] falls in no sentence range, sentence 0 is used.
/// Nothing happens if the notifier is unmounted or no offsets are stored.
void _handleDeviceWordProgress(int start, int end) {
  if (!ref.mounted) return;

  final starts = state.sentenceOffsets;
  if (starts.isEmpty) return;

  // The final sentence has no successor, so its range is left open up to
  // this sentinel.
  const openEnd = 1 << 30;
  // Upper bound applied to the sentence-relative word offsets.
  const maxRelative = 1 << 20;

  var sentenceIdx = 0;
  var i = 0;
  while (i < starts.length) {
    final isLast = i == starts.length - 1;
    final upper = isLast ? openEnd : starts[i + 1];
    if (starts[i] <= start && start < upper) {
      sentenceIdx = i;
      break;
    }
    i++;
  }

  final base = starts[sentenceIdx];
  final relativeStart = (start - base).clamp(0, maxRelative);
  final relativeEnd = (end - base).clamp(0, maxRelative);
  state = state.copyWith(
    activeSentenceIndex: sentenceIdx,
    wordStartInSentence: relativeStart,
    wordEndInSentence: relativeEnd,
  );
}
}
final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {

View File

@@ -31,6 +31,8 @@ class TextToSpeechService {
VoidCallback? _onPause;
VoidCallback? _onContinue;
void Function(String message)? _onError;
void Function(int sentenceIndex)? _onSentenceIndex;
void Function(int start, int end)? _onDeviceWordProgress;
bool get isInitialized => _initialized;
bool get isAvailable => _available;
@@ -51,6 +53,8 @@ class TextToSpeechService {
VoidCallback? onPause,
VoidCallback? onContinue,
void Function(String message)? onError,
void Function(int sentenceIndex)? onSentenceIndex,
void Function(int start, int end)? onDeviceWordProgress,
}) {
_onStart = onStart;
_onComplete = onComplete;
@@ -58,6 +62,8 @@ class TextToSpeechService {
_onPause = onPause;
_onContinue = onContinue;
_onError = onError;
_onSentenceIndex = onSentenceIndex;
_onDeviceWordProgress = onDeviceWordProgress;
_tts.setStartHandler(_handleStart);
_tts.setCompletionHandler(_handleComplete);
@@ -65,6 +71,13 @@ class TextToSpeechService {
_tts.setPauseHandler(_handlePause);
_tts.setContinueHandler(_handleContinue);
_tts.setErrorHandler(_handleError);
try {
_tts.setProgressHandler((String text, int start, int end, String word) {
_onDeviceWordProgress?.call(start, end);
});
} catch (_) {
// Some platforms may not support progress handler
}
}
/// Initialize the native TTS engine lazily
@@ -151,6 +164,7 @@ class TextToSpeechService {
if (result is int && result != 1) {
_onError?.call('Text-to-speech engine returned code $result');
}
_onSentenceIndex?.call(0);
}
Future<void> pause() async {
@@ -370,6 +384,7 @@ class TextToSpeechService {
_buffered.add(Uint8List.fromList(firstBytes));
_currentIndex = 0;
await _player.play(BytesSource(_buffered.first));
_onSentenceIndex?.call(0);
// Prefetch the rest in background
unawaited(
@@ -438,6 +453,7 @@ class TextToSpeechService {
_currentIndex = nextIndex;
final bytes = _buffered[nextIndex];
await _player.play(BytesSource(bytes));
_onSentenceIndex?.call(_currentIndex);
}
List<String> _splitForTts(String text) {

View File

@@ -18,6 +18,7 @@ import 'package:conduit/l10n/app_localizations.dart';
import 'enhanced_attachment.dart';
import 'package:conduit/shared/widgets/chat_action_button.dart';
import '../../../shared/widgets/model_avatar.dart';
import '../../../shared/widgets/conduit_components.dart';
import 'package:url_launcher/url_launcher_string.dart';
import '../providers/chat_providers.dart' show sendMessageWithContainer;
import '../../../core/utils/debug_logger.dart';
@@ -457,12 +458,72 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
}
if (children.isEmpty) return const SizedBox.shrink();
// Append TTS karaoke bar if this is the active message
final ttsState = ref.watch(textToSpeechControllerProvider);
final isActive =
ttsState.activeMessageId == _messageId &&
(ttsState.status == TtsPlaybackStatus.speaking ||
ttsState.status == TtsPlaybackStatus.paused ||
ttsState.status == TtsPlaybackStatus.loading);
if (isActive && ttsState.activeSentenceIndex >= 0) {
children.add(const SizedBox(height: Spacing.sm));
children.add(_buildKaraokeBar(ttsState));
}
return Column(
crossAxisAlignment: CrossAxisAlignment.start,
children: children,
);
}
/// Builds the karaoke bar showing the sentence currently being spoken.
///
/// Renders `ttsState.sentences[ttsState.activeSentenceIndex]` inside a
/// [ConduitCard]. When a valid word range is present in [ttsState], that
/// slice of the sentence is drawn with a highlighted background; otherwise
/// the sentence is shown plain. Returns an empty box when the active index
/// is outside the sentence list.
Widget _buildKaraokeBar(TextToSpeechState ttsState) {
  final theme = context.conduitTheme;
  final idx = ttsState.activeSentenceIndex;
  if (idx < 0 || idx >= ttsState.sentences.length) {
    return const SizedBox.shrink();
  }
  final sentence = ttsState.sentences[idx];
  final ws = ttsState.wordStartInSentence;
  final we = ttsState.wordEndInSentence;

  final baseStyle = TextStyle(
    color: theme.textPrimary,
    height: 1.2,
    fontSize: 14,
  );
  final highlightStyle = baseStyle.copyWith(
    backgroundColor: theme.buttonPrimary.withValues(alpha: 0.25),
    color: theme.textPrimary,
    fontWeight: FontWeight.w600,
  );

  // Splits the sentence into before / word / after spans, falling back to a
  // single plain span whenever the word range is missing or unusable.
  InlineSpan buildSpans() {
    if (ws == null ||
        we == null ||
        ws < 0 ||
        we <= ws ||
        ws >= sentence.length) {
      return TextSpan(text: sentence, style: baseStyle);
    }
    // The reported end may run past the sentence; cap it so substring()
    // cannot throw.
    final safeEnd = we.clamp(0, sentence.length);
    final before = sentence.substring(0, ws);
    final word = sentence.substring(ws, safeEnd);
    final after = sentence.substring(safeEnd);
    return TextSpan(
      children: [
        if (before.isNotEmpty) TextSpan(text: before, style: baseStyle),
        TextSpan(text: word, style: highlightStyle),
        if (after.isNotEmpty) TextSpan(text: after, style: baseStyle),
      ],
    );
  }

  return ConduitCard(
    padding: const EdgeInsets.all(Spacing.sm),
    child: RichText(
      text: buildSpans(),
      // RichText defaults to no scaling; pass the ambient scaler so the bar
      // honours the user's system font-size setting.
      textScaler: MediaQuery.textScalerOf(context),
    ),
  );
}
/// Whether the typing indicator should be shown: the widget is streaming
/// and the assistant response is still empty.
bool get _shouldShowTypingIndicator {
  if (!widget.isStreaming) {
    return false;
  }
  return _isAssistantResponseEmpty;
}