From 561e7dd61662b31b778482ac8e9fd7b33a04ea96 Mon Sep 17 00:00:00 2001 From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com> Date: Thu, 23 Oct 2025 16:31:15 +0530 Subject: [PATCH 1/3] feat(tts): server-backed TTS engine selection Introduce server TTS support and engine selection while keeping device TTS as the default. - Add new persistence keys for storing TTS engine and selected server voice (ttsEngine, ttsServerVoiceId, ttsServerVoiceName). - Extend TextToSpeechService to support two engines: TtsEngine.device (FlutterTts) and TtsEngine.server (remote audio). - Wire in an AudioPlayer and optional ApiService to fetch raw audio bytes from the server and play them, with event hooks mapped to existing lifecycle callbacks. - Implement fallback to device TTS on server errors or empty responses, and ensure player lifecycle (pause/stop/dispose) is handled when using server engine. - Allow engine and preferred voice to be configured before initialization and updated at runtime via updateSettings. This enables selecting a server-side voice and using a remote TTS provider while preserving compatibility with the existing device TTS implementation. 
--- ios/Podfile.lock | 6 + lib/core/persistence/persistence_keys.dart | 3 + lib/core/providers/app_providers.dart | 6 +- lib/core/services/api_service.dart | 28 +++- lib/core/services/settings_service.dart | 74 ++++++++++ .../providers/text_to_speech_provider.dart | 15 +- .../chat/services/text_to_speech_service.dart | 117 +++++++++++++-- .../profile/views/app_customization_page.dart | 134 ++++++++++++++++-- pubspec.lock | 56 ++++++++ pubspec.yaml | 1 + 10 files changed, 404 insertions(+), 36 deletions(-) diff --git a/ios/Podfile.lock b/ios/Podfile.lock index c990258..9a90a3c 100644 --- a/ios/Podfile.lock +++ b/ios/Podfile.lock @@ -1,4 +1,6 @@ PODS: + - audioplayers_darwin (0.0.1): + - Flutter - connectivity_plus (0.0.1): - Flutter - DKImagePickerController/Core (4.3.9): @@ -84,6 +86,7 @@ PODS: - FlutterMacOS DEPENDENCIES: + - audioplayers_darwin (from `.symlinks/plugins/audioplayers_darwin/ios`) - connectivity_plus (from `.symlinks/plugins/connectivity_plus/ios`) - file_picker (from `.symlinks/plugins/file_picker/ios`) - Flutter (from `Flutter`) @@ -113,6 +116,8 @@ SPEC REPOS: - SwiftyGif EXTERNAL SOURCES: + audioplayers_darwin: + :path: ".symlinks/plugins/audioplayers_darwin/ios" connectivity_plus: :path: ".symlinks/plugins/connectivity_plus/ios" file_picker: @@ -155,6 +160,7 @@ EXTERNAL SOURCES: :path: ".symlinks/plugins/webview_flutter_wkwebview/darwin" SPEC CHECKSUMS: + audioplayers_darwin: ccf9c770ee768abb07e26d90af093f7bab1c12ab connectivity_plus: cb623214f4e1f6ef8fe7403d580fdad517d2f7dd DKImagePickerController: 946cec48c7873164274ecc4624d19e3da4c1ef3c DKPhotoGallery: b3834fecb755ee09a593d7c9e389d8b5d6deed60 diff --git a/lib/core/persistence/persistence_keys.dart b/lib/core/persistence/persistence_keys.dart index 80e58b7..6afce43 100644 --- a/lib/core/persistence/persistence_keys.dart +++ b/lib/core/persistence/persistence_keys.dart @@ -25,6 +25,9 @@ final class PreferenceKeys { static const String ttsSpeechRate = 'tts_speech_rate'; static const String 
ttsPitch = 'tts_pitch'; static const String ttsVolume = 'tts_volume'; + static const String ttsEngine = 'tts_engine'; // 'device' | 'server' + static const String ttsServerVoiceId = 'tts_server_voice_id'; + static const String ttsServerVoiceName = 'tts_server_voice_name'; } final class LegacyPreferenceKeys { diff --git a/lib/core/providers/app_providers.dart b/lib/core/providers/app_providers.dart index 9257d86..8ba3794 100644 --- a/lib/core/providers/app_providers.dart +++ b/lib/core/providers/app_providers.dart @@ -1830,7 +1830,11 @@ Future> availableVoices(Ref ref) async { if (api == null) return []; try { - return await api.getAvailableVoices(); + final voices = await api.getAvailableServerVoices(); + return voices + .map((v) => (v['name'] ?? v['id'] ?? '').toString()) + .where((s) => s.isNotEmpty) + .toList(); } catch (e) { DebugLogger.error('voices-failed', scope: 'voices', error: e); return []; diff --git a/lib/core/services/api_service.dart b/lib/core/services/api_service.dart index acc7a77..46044aa 100644 --- a/lib/core/services/api_service.dart +++ b/lib/core/services/api_service.dart @@ -2261,12 +2261,24 @@ class ApiService { } // Audio - Future> getAvailableVoices() async { - _traceApi('Fetching available voices'); + Future>> getAvailableServerVoices() async { + _traceApi('Fetching server TTS voices'); final response = await _dio.get('/api/v1/audio/voices'); final data = response.data; + if (data is Map) { + final voices = data['voices']; + if (voices is List) { + return voices + .whereType() + .map((e) => e.cast()) + .toList(); + } + } if (data is List) { - return data.cast(); + // Fallback: plain list of ids + return data + .map((e) => {'id': e.toString(), 'name': e.toString()}) + .toList(); } return []; } @@ -2279,13 +2291,15 @@ class ApiService { _traceApi('Generating speech for text: $textPreview...'); final response = await _dio.post( '/api/v1/audio/speech', - data: {'text': text, if (voice != null) 'voice': voice}, + data: {'input': text, if 
(voice != null) 'voice': voice}, + options: Options(responseType: ResponseType.bytes), ); // Return audio data as bytes - if (response.data is List) { - return (response.data as List).cast(); - } + final data = response.data; + if (data is List) return data; + if (data is Uint8List) return data.toList(); + if (data is List) return (data).cast(); return []; } diff --git a/lib/core/services/settings_service.dart b/lib/core/services/settings_service.dart index b21a168..b40b533 100644 --- a/lib/core/services/settings_service.dart +++ b/lib/core/services/settings_service.dart @@ -8,6 +8,9 @@ import 'animation_service.dart'; part 'settings_service.g.dart'; +/// TTS engine selection +enum TtsEngine { device, server } + /// Service for managing app-wide settings including accessibility preferences class SettingsService { static const String _reduceMotionKey = PreferenceKeys.reduceMotion; @@ -142,6 +145,12 @@ class SettingsService { ttsPitch: (box.get(PreferenceKeys.ttsPitch) as num?)?.toDouble() ?? 1.0, ttsVolume: (box.get(PreferenceKeys.ttsVolume) as num?)?.toDouble() ?? 
1.0, + ttsEngine: _parseTtsEngine( + box.get(PreferenceKeys.ttsEngine) as String?, + ), + ttsServerVoiceId: box.get(PreferenceKeys.ttsServerVoiceId) as String?, + ttsServerVoiceName: + box.get(PreferenceKeys.ttsServerVoiceName) as String?, ), ); } @@ -164,6 +173,7 @@ class SettingsService { PreferenceKeys.ttsSpeechRate: settings.ttsSpeechRate, PreferenceKeys.ttsPitch: settings.ttsPitch, PreferenceKeys.ttsVolume: settings.ttsVolume, + PreferenceKeys.ttsEngine: settings.ttsEngine.name, }; await box.putAll(updates); @@ -185,6 +195,33 @@ class SettingsService { } else { await box.delete(PreferenceKeys.ttsVoice); } + + // Server-specific voice id and friendly name + if (settings.ttsServerVoiceId != null && + settings.ttsServerVoiceId!.isNotEmpty) { + await box.put(PreferenceKeys.ttsServerVoiceId, settings.ttsServerVoiceId); + } else { + await box.delete(PreferenceKeys.ttsServerVoiceId); + } + if (settings.ttsServerVoiceName != null && + settings.ttsServerVoiceName!.isNotEmpty) { + await box.put( + PreferenceKeys.ttsServerVoiceName, + settings.ttsServerVoiceName, + ); + } else { + await box.delete(PreferenceKeys.ttsServerVoiceName); + } + } + + static TtsEngine _parseTtsEngine(String? raw) { + switch ((raw ?? '').toLowerCase()) { + case 'server': + return TtsEngine.server; + case 'device': + default: + return TtsEngine.device; + } } // Voice input specific settings @@ -314,6 +351,9 @@ class AppSettings { final double ttsSpeechRate; final double ttsPitch; final double ttsVolume; + final TtsEngine ttsEngine; + final String? ttsServerVoiceId; + final String? ttsServerVoiceName; const AppSettings({ this.reduceMotion = false, this.animationSpeed = 1.0, @@ -332,6 +372,9 @@ class AppSettings { this.ttsSpeechRate = 0.5, this.ttsPitch = 1.0, this.ttsVolume = 1.0, + this.ttsEngine = TtsEngine.device, + this.ttsServerVoiceId, + this.ttsServerVoiceName, }); AppSettings copyWith({ @@ -352,6 +395,9 @@ class AppSettings { double? ttsSpeechRate, double? ttsPitch, double? 
ttsVolume, + TtsEngine? ttsEngine, + Object? ttsServerVoiceId = const _DefaultValue(), + Object? ttsServerVoiceName = const _DefaultValue(), }) { return AppSettings( reduceMotion: reduceMotion ?? this.reduceMotion, @@ -375,6 +421,13 @@ class AppSettings { ttsSpeechRate: ttsSpeechRate ?? this.ttsSpeechRate, ttsPitch: ttsPitch ?? this.ttsPitch, ttsVolume: ttsVolume ?? this.ttsVolume, + ttsEngine: ttsEngine ?? this.ttsEngine, + ttsServerVoiceId: ttsServerVoiceId is _DefaultValue + ? this.ttsServerVoiceId + : ttsServerVoiceId as String?, + ttsServerVoiceName: ttsServerVoiceName is _DefaultValue + ? this.ttsServerVoiceName + : ttsServerVoiceName as String?, ); } @@ -397,6 +450,9 @@ class AppSettings { other.ttsSpeechRate == ttsSpeechRate && other.ttsPitch == ttsPitch && other.ttsVolume == ttsVolume && + other.ttsEngine == ttsEngine && + other.ttsServerVoiceId == ttsServerVoiceId && + other.ttsServerVoiceName == ttsServerVoiceName && _listEquals(other.quickPills, quickPills); // socketTransportMode intentionally not included in == to avoid frequent rebuilds } @@ -420,6 +476,9 @@ class AppSettings { ttsSpeechRate, ttsPitch, ttsVolume, + ttsEngine, + ttsServerVoiceId, + ttsServerVoiceName, Object.hashAllUnordered(quickPills), ); } @@ -543,6 +602,21 @@ class AppSettingsNotifier extends _$AppSettingsNotifier { await SettingsService.saveSettings(state); } + Future setTtsEngine(TtsEngine engine) async { + state = state.copyWith(ttsEngine: engine); + await SettingsService.saveSettings(state); + } + + Future setTtsServerVoiceName(String? name) async { + state = state.copyWith(ttsServerVoiceName: name); + await SettingsService.saveSettings(state); + } + + Future setTtsServerVoiceId(String? 
id) async { + state = state.copyWith(ttsServerVoiceId: id); + await SettingsService.saveSettings(state); + } + Future resetToDefaults() async { const defaultSettings = AppSettings(); await SettingsService.saveSettings(defaultSettings); diff --git a/lib/features/chat/providers/text_to_speech_provider.dart b/lib/features/chat/providers/text_to_speech_provider.dart index b25e341..dc234d9 100644 --- a/lib/features/chat/providers/text_to_speech_provider.dart +++ b/lib/features/chat/providers/text_to_speech_provider.dart @@ -3,6 +3,7 @@ import 'dart:async'; import 'package:flutter_riverpod/flutter_riverpod.dart'; import '../../../core/services/settings_service.dart'; +import '../../../core/providers/app_providers.dart'; import '../../../core/utils/markdown_to_text.dart'; import '../services/text_to_speech_service.dart'; @@ -79,11 +80,15 @@ class TextToSpeechController extends Notifier { // Listen to settings changes and update TTS when initialized ref.listen(appSettingsProvider, (previous, next) { if (_service.isInitialized && _service.isAvailable) { + final selectedVoice = next.ttsEngine == TtsEngine.server + ? next.ttsServerVoiceId + : next.ttsVoice; _service.updateSettings( - voice: next.ttsVoice, + voice: selectedVoice, speechRate: next.ttsSpeechRate, pitch: next.ttsPitch, volume: next.ttsVolume, + engine: next.ttsEngine, ); } }, fireImmediately: false); @@ -105,10 +110,13 @@ class TextToSpeechController extends Notifier { final settings = ref.read(appSettingsProvider); final future = _service .initialize( - voice: settings.ttsVoice, + voice: settings.ttsEngine == TtsEngine.server + ? 
settings.ttsServerVoiceId + : settings.ttsVoice, speechRate: settings.ttsSpeechRate, pitch: settings.ttsPitch, volume: settings.ttsVolume, + engine: settings.ttsEngine, ) .then((available) { if (!ref.mounted) { @@ -289,7 +297,8 @@ class TextToSpeechController extends Notifier { } final textToSpeechServiceProvider = Provider((ref) { - final service = TextToSpeechService(); + final api = ref.watch(apiServiceProvider); + final service = TextToSpeechService(api: api); ref.onDispose(() { unawaited(service.dispose()); }); diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart index 6591f41..65aaa86 100644 --- a/lib/features/chat/services/text_to_speech_service.dart +++ b/lib/features/chat/services/text_to_speech_service.dart @@ -1,13 +1,21 @@ import 'dart:async'; import 'dart:io' show Platform; +import 'package:audioplayers/audioplayers.dart'; import 'package:flutter/foundation.dart'; import 'package:flutter/widgets.dart'; import 'package:flutter_tts/flutter_tts.dart'; +import '../../../core/services/api_service.dart'; +import '../../../core/services/settings_service.dart'; + /// Lightweight wrapper around FlutterTts to centralize configuration class TextToSpeechService { final FlutterTts _tts = FlutterTts(); + final AudioPlayer _player = AudioPlayer(); + final ApiService? _api; + TtsEngine _engine = TtsEngine.device; + String? _preferredVoice; bool _initialized = false; bool _available = false; bool _voiceConfigured = false; @@ -22,6 +30,14 @@ class TextToSpeechService { bool get isInitialized => _initialized; bool get isAvailable => _available; + TextToSpeechService({ApiService? api}) : _api = api { + // Wire minimal player events to callbacks + _player.onPlayerComplete.listen((_) => _handleComplete()); + _player.onPlayerStateChanged.listen((s) { + if (s == PlayerState.playing) _handleStart(); + }); + } + /// Register callbacks for TTS lifecycle events void bindHandlers({ VoidCallback? 
onStart, @@ -52,12 +68,15 @@ class TextToSpeechService { double speechRate = 0.5, double pitch = 1.0, double volume = 1.0, + TtsEngine engine = TtsEngine.device, }) async { if (_initialized) { return _available; } try { + _engine = engine; + _preferredVoice = voice; await _tts.awaitSpeakCompletion(false); // Set volume @@ -97,34 +116,61 @@ class TextToSpeechService { } if (!_initialized) { - await initialize(); + await initialize(voice: _preferredVoice, engine: _engine); } + if (_engine == TtsEngine.server && _api != null) { + // Server-backed TTS path + try { + final effectiveVoice = + (_preferredVoice == null || _preferredVoice!.trim().isEmpty) + ? 'alloy' + : _preferredVoice!; + + final bytes = await _api.generateSpeech( + text: text, + voice: effectiveVoice, + ); + if (bytes.isEmpty) { + throw Exception('Empty audio response'); + } + await _player.stop(); + final data = Uint8List.fromList(bytes); + await _player.play(BytesSource(data)); + } catch (e) { + _onError?.call(e.toString()); + // Fallback to device TTS on failure + await _speakOnDevice(text); + } + return; + } + + // Device TTS path + await _speakOnDevice(text); + } + + Future _speakOnDevice(String text) async { if (!_available) { throw StateError('Text-to-speech is unavailable on this device'); } - await _tts.stop(); if (!_voiceConfigured) { await _configurePreferredVoice(); } final result = await _tts.speak(text); - if (result == null) { - return; - } - if (result is int && result != 1) { _onError?.call('Text-to-speech engine returned code $result'); } } Future pause() async { - if (!_initialized || !_available) { - return; - } - + if (!_initialized) return; try { - await _tts.pause(); + if (_engine == TtsEngine.server) { + await _player.pause(); + } else if (_available) { + await _tts.pause(); + } } catch (e) { _onError?.call(e.toString()); } @@ -136,7 +182,11 @@ class TextToSpeechService { } try { - await _tts.stop(); + if (_engine == TtsEngine.server) { + await _player.stop(); + } else { + await 
_tts.stop(); + } } catch (e) { _onError?.call(e.toString()); } @@ -144,6 +194,7 @@ class TextToSpeechService { Future dispose() async { await stop(); + await _player.dispose(); } /// Update TTS settings on-the-fly @@ -152,12 +203,22 @@ class TextToSpeechService { double? speechRate, double? pitch, double? volume, + TtsEngine? engine, }) async { if (!_initialized || !_available) { + // Allow engine and voice to update before init + if (engine != null) _engine = engine; + if (voice != null) _preferredVoice = voice; return; } try { + if (engine != null) { + _engine = engine; + } + if (voice != null) { + _preferredVoice = voice; + } if (volume != null) { await _tts.setVolume(volume); } @@ -167,8 +228,10 @@ class TextToSpeechService { if (pitch != null) { await _tts.setPitch(pitch); } - // Set specific voice by name - await _setVoiceByName(voice); + // Set specific voice by name on device engine + if (_engine == TtsEngine.device) { + await _setVoiceByName(_preferredVoice); + } } catch (e) { _onError?.call(e.toString()); } @@ -224,7 +287,31 @@ class TextToSpeechService { /// Get available voices from the TTS engine Future>> getAvailableVoices() async { if (!_initialized) { - await initialize(); + await initialize(voice: _preferredVoice, engine: _engine); + } + + if (_engine == TtsEngine.server && _api != null) { + try { + final serverVoices = await _api.getAvailableServerVoices(); + final mapped = serverVoices + .map( + (v) => { + 'name': (v['name'] ?? v['id'] ?? '').toString(), + 'locale': (v['locale'] ?? 
'').toString(), + }, + ) + .where((e) => (e['name'] as String).isNotEmpty) + .toList(); + if (mapped.isEmpty) { + return [ + {'name': 'alloy', 'locale': ''}, + ]; + } + return mapped; + } catch (e) { + _onError?.call(e.toString()); + // Fall back to device voices + } } if (!_available) { diff --git a/lib/features/profile/views/app_customization_page.dart b/lib/features/profile/views/app_customization_page.dart index ba2c4ac..bc5e01d 100644 --- a/lib/features/profile/views/app_customization_page.dart +++ b/lib/features/profile/views/app_customization_page.dart @@ -441,10 +441,97 @@ class AppCustomizationPage extends ConsumerWidget { TextStyle(color: theme.sidebarForeground, fontSize: 18), ), const SizedBox(height: Spacing.sm), + ConduitCard( + padding: const EdgeInsets.all(Spacing.md), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Row( + children: [ + _buildIconBadge( + context, + UiUtils.platformIcon( + ios: CupertinoIcons.settings, + android: Icons.settings_voice, + ), + color: theme.buttonPrimary, + ), + const SizedBox(width: Spacing.sm), + const Text('Engine'), + const Spacer(), + Wrap( + spacing: Spacing.sm, + children: [ + ChoiceChip( + label: const Text('On Device'), + selected: settings.ttsEngine == TtsEngine.device, + showCheckmark: false, + selectedColor: theme.buttonPrimary, + backgroundColor: theme.cardBackground, + side: BorderSide( + color: settings.ttsEngine == TtsEngine.device + ? theme.buttonPrimary.withValues(alpha: 0.6) + : theme.textPrimary.withValues(alpha: 0.2), + ), + labelStyle: TextStyle( + color: settings.ttsEngine == TtsEngine.device + ? 
theme.buttonPrimaryText + : theme.textPrimary, + fontWeight: FontWeight.w600, + ), + onSelected: (v) { + if (v) { + final notifier = ref.read( + appSettingsProvider.notifier, + ); + notifier.setTtsEngine(TtsEngine.device); + // Keep previous voice (device voices) + } + }, + ), + ChoiceChip( + label: const Text('Server'), + selected: settings.ttsEngine == TtsEngine.server, + showCheckmark: false, + selectedColor: theme.buttonPrimary, + backgroundColor: theme.cardBackground, + side: BorderSide( + color: settings.ttsEngine == TtsEngine.server + ? theme.buttonPrimary.withValues(alpha: 0.6) + : theme.textPrimary.withValues(alpha: 0.2), + ), + labelStyle: TextStyle( + color: settings.ttsEngine == TtsEngine.server + ? theme.buttonPrimaryText + : theme.textPrimary, + fontWeight: FontWeight.w600, + ), + onSelected: (v) { + if (v) { + final notifier = ref.read( + appSettingsProvider.notifier, + ); + // Clear device-specific voice so server can default + notifier.setTtsVoice(null); + notifier.setTtsEngine(TtsEngine.server); + } + }, + ), + ], + ), + ], + ), + ], + ), + ), + const SizedBox(height: Spacing.sm), _ExpandableCard( title: l10n.ttsVoice, subtitle: _getDisplayVoiceName( - settings.ttsVoice, + settings.ttsEngine == TtsEngine.server + ? ((settings.ttsServerVoiceName ?? settings.ttsServerVoiceId) ?? + '') + : (settings.ttsVoice ?? ''), l10n.ttsSystemDefault, ), icon: UiUtils.platformIcon( @@ -466,7 +553,11 @@ class AppCustomizationPage extends ConsumerWidget { ), title: l10n.ttsVoice, subtitle: _getDisplayVoiceName( - settings.ttsVoice, + settings.ttsEngine == TtsEngine.server + ? ((settings.ttsServerVoiceName ?? + settings.ttsServerVoiceId) ?? + '') + : (settings.ttsVoice ?? 
''), l10n.ttsSystemDefault, ), onTap: () => _showVoicePickerSheet(context, ref, settings), @@ -616,7 +707,10 @@ class AppCustomizationPage extends ConsumerWidget { final theme = context.conduitTheme; final ttsService = ref.read(textToSpeechServiceProvider); - // Fetch available voices + // Ensure the service uses the currently selected engine before fetching + await ttsService.updateSettings(engine: settings.ttsEngine); + + // Fetch available voices from the active engine final allVoices = await ttsService.getAvailableVoices(); if (!context.mounted) return; @@ -729,17 +823,29 @@ class AppCustomizationPage extends ConsumerWidget { style: theme.bodyMedium?.copyWith( color: theme.sidebarForeground, - fontWeight: settings.ttsVoice == null + fontWeight: + (settings.ttsEngine == TtsEngine.server + ? settings.ttsServerVoiceId == null + : settings.ttsVoice == null) ? FontWeight.bold : FontWeight.normal, ) ?? TextStyle(color: theme.sidebarForeground), ), - trailing: settings.ttsVoice == null + trailing: + (settings.ttsEngine == TtsEngine.server + ? settings.ttsServerVoiceId == null + : settings.ttsVoice == null) ? Icon(Icons.check, color: theme.buttonPrimary) : null, onTap: () { - ref.read(appSettingsProvider.notifier).setTtsVoice(null); + final notifier = ref.read(appSettingsProvider.notifier); + if (settings.ttsEngine == TtsEngine.server) { + notifier.setTtsServerVoiceId(null); + notifier.setTtsServerVoiceName(null); + } else { + notifier.setTtsVoice(null); + } Navigator.of(sheetContext).pop(); }, ), @@ -823,7 +929,9 @@ class AppCustomizationPage extends ConsumerWidget { final voiceId = _getVoiceIdentifier(voice); final displayName = _formatVoiceName(voice); final subtitle = _getVoiceSubtitle(voice); - final isSelected = settings.ttsVoice == voiceId; + final isSelected = settings.ttsEngine == TtsEngine.server + ? 
settings.ttsServerVoiceId == voiceId + : settings.ttsVoice == voiceId; return ListTile( leading: Icon( @@ -865,9 +973,15 @@ class AppCustomizationPage extends ConsumerWidget { ? Icon(Icons.check, color: theme.buttonPrimary) : null, onTap: () { - ref - .read(appSettingsProvider.notifier) - .setTtsVoice(voiceId); + final notifier = ref.read( + appSettingsProvider.notifier, + ); + if (settings.ttsEngine == TtsEngine.server) { + notifier.setTtsServerVoiceId(voiceId); + notifier.setTtsServerVoiceName(displayName); + } else { + notifier.setTtsVoice(voiceId); + } Navigator.of(sheetContext).pop(); }, ); diff --git a/pubspec.lock b/pubspec.lock index 9b612c1..a57444b 100644 --- a/pubspec.lock +++ b/pubspec.lock @@ -65,6 +65,62 @@ packages: url: "https://pub.dev" source: hosted version: "2.13.0" + audioplayers: + dependency: "direct main" + description: + name: audioplayers + sha256: c05c6147124cd63e725e861335a8b4d57300b80e6e92cea7c145c739223bbaef + url: "https://pub.dev" + source: hosted + version: "5.2.1" + audioplayers_android: + dependency: transitive + description: + name: audioplayers_android + sha256: b00e1a0e11365d88576320ec2d8c192bc21f1afb6c0e5995d1c57ae63156acb5 + url: "https://pub.dev" + source: hosted + version: "4.0.3" + audioplayers_darwin: + dependency: transitive + description: + name: audioplayers_darwin + sha256: "3034e99a6df8d101da0f5082dcca0a2a99db62ab1d4ddb3277bed3f6f81afe08" + url: "https://pub.dev" + source: hosted + version: "5.0.2" + audioplayers_linux: + dependency: transitive + description: + name: audioplayers_linux + sha256: "60787e73fefc4d2e0b9c02c69885402177e818e4e27ef087074cf27c02246c9e" + url: "https://pub.dev" + source: hosted + version: "3.1.0" + audioplayers_platform_interface: + dependency: transitive + description: + name: audioplayers_platform_interface + sha256: "365c547f1bb9e77d94dd1687903a668d8f7ac3409e48e6e6a3668a1ac2982adb" + url: "https://pub.dev" + source: hosted + version: "6.1.0" + audioplayers_web: + dependency: transitive + 
description: + name: audioplayers_web + sha256: "22cd0173e54d92bd9b2c80b1204eb1eb159ece87475ab58c9788a70ec43c2a62" + url: "https://pub.dev" + source: hosted + version: "4.1.0" + audioplayers_windows: + dependency: transitive + description: + name: audioplayers_windows + sha256: "9536812c9103563644ada2ef45ae523806b0745f7a78e89d1b5fb1951de90e1a" + url: "https://pub.dev" + source: hosted + version: "3.1.0" boolean_selector: dependency: transitive description: diff --git a/pubspec.yaml b/pubspec.yaml index 4aa6980..8e2d10a 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -47,6 +47,7 @@ dependencies: record: ^6.1.1 stts: ^1.2.5 flutter_tts: ^4.2.3 + audioplayers: ^5.2.1 image_picker: ^1.2.0 file_picker: ^10.3.3 path_provider: ^2.1.4 From 8ec411d6aacb7b4e2110436dbddb7599d0569e0b Mon Sep 17 00:00:00 2001 From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com> Date: Thu, 23 Oct 2025 16:46:24 +0530 Subject: [PATCH 2/3] feat(tts): server chunked playback queue on server path Refactor server-backed TTS path to perform sentence chunking and queued playback via a dedicated _startServerChunkedPlayback method instead of generating a single monolithic audio blob. This change simplifies the server flow, avoids constructing an entire audio buffer in memory, and enables smoother playback and error recovery. On errors, the code still falls back to device TTS. 
--- .../chat/services/text_to_speech_service.dart | 175 ++++++++++++++++-- 1 file changed, 157 insertions(+), 18 deletions(-) diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart index 65aaa86..5d344e6 100644 --- a/lib/features/chat/services/text_to_speech_service.dart +++ b/lib/features/chat/services/text_to_speech_service.dart @@ -19,6 +19,11 @@ class TextToSpeechService { bool _initialized = false; bool _available = false; bool _voiceConfigured = false; + int _session = 0; // increments to cancel in-flight work + final List _buffered = []; // server chunks + int _expectedChunks = 0; + int _currentIndex = -1; + bool _waitingNext = false; VoidCallback? _onStart; VoidCallback? _onComplete; @@ -32,7 +37,7 @@ class TextToSpeechService { TextToSpeechService({ApiService? api}) : _api = api { // Wire minimal player events to callbacks - _player.onPlayerComplete.listen((_) => _handleComplete()); + _player.onPlayerComplete.listen((_) => _onAudioComplete()); _player.onPlayerStateChanged.listen((s) { if (s == PlayerState.playing) _handleStart(); }); @@ -120,26 +125,11 @@ class TextToSpeechService { } if (_engine == TtsEngine.server && _api != null) { - // Server-backed TTS path + // Server-backed TTS with sentence chunking & queued playback try { - final effectiveVoice = - (_preferredVoice == null || _preferredVoice!.trim().isEmpty) - ? 
'alloy' - : _preferredVoice!; - - final bytes = await _api.generateSpeech( - text: text, - voice: effectiveVoice, - ); - if (bytes.isEmpty) { - throw Exception('Empty audio response'); - } - await _player.stop(); - final data = Uint8List.fromList(bytes); - await _player.play(BytesSource(data)); + await _startServerChunkedPlayback(text); } catch (e) { _onError?.call(e.toString()); - // Fallback to device TTS on failure await _speakOnDevice(text); } return; @@ -182,6 +172,12 @@ class TextToSpeechService { } try { + // Cancel any in-flight server work + _session++; + _buffered.clear(); + _expectedChunks = 0; + _currentIndex = -1; + _waitingNext = false; if (_engine == TtsEngine.server) { await _player.stop(); } else { @@ -341,6 +337,149 @@ class TextToSpeechService { } } + // ===== Server chunked playback ===== + + Future _startServerChunkedPlayback(String text) async { + final effectiveVoice = + (_preferredVoice == null || _preferredVoice!.trim().isEmpty) + ? 'alloy' + : _preferredVoice!; + + // Reset queue and create a new session + _session++; + final session = _session; + _buffered.clear(); + _expectedChunks = 0; + _currentIndex = -1; + _waitingNext = false; + + final chunks = _splitForTts(text); + if (chunks.isEmpty) return; + _expectedChunks = chunks.length; + + // Fetch first chunk to start playback quickly + final firstBytes = await _fetchServerAudio( + chunks.first, + effectiveVoice, + session, + ); + if (session != _session) return; // canceled + if (firstBytes.isEmpty) throw Exception('Empty audio response'); + + await _player.stop(); + _buffered.add(Uint8List.fromList(firstBytes)); + _currentIndex = 0; + await _player.play(BytesSource(_buffered.first)); + + // Prefetch the rest in background + unawaited( + _prefetchRemainingChunks( + chunks.skip(1).toList(), + effectiveVoice, + session, + ), + ); + } + + Future _prefetchRemainingChunks( + List remaining, + String voice, + int session, + ) async { + for (final chunk in remaining) { + if (session != 
_session) return; // canceled + try { + final audio = await _fetchServerAudio(chunk, voice, session); + if (session != _session) return; + if (audio.isNotEmpty) { + _buffered.add(Uint8List.fromList(audio)); + // If the player finished the previous chunk and is waiting, start now + if (_waitingNext && (_currentIndex + 1) < _buffered.length) { + _waitingNext = false; + await _playNextIfBuffered(session); + } + } + } catch (e) { + _onError?.call(e.toString()); + // continue with other chunks + } + } + } + + Future> _fetchServerAudio( + String text, + String voice, + int session, + ) async { + return await _api!.generateSpeech(text: text, voice: voice); + } + + Future _onAudioComplete() async { + final session = _session; + // If there are more expected chunks + if ((_currentIndex + 1) < _expectedChunks) { + // If next chunk is already buffered, play it + if ((_currentIndex + 1) < _buffered.length) { + await _playNextIfBuffered(session); + } else { + // Wait for prefetch to provide it + _waitingNext = true; + } + return; + } + // No more chunks – this is the real completion + _handleComplete(); + } + + Future _playNextIfBuffered(int session) async { + if (session != _session) return; + final nextIndex = _currentIndex + 1; + if (nextIndex < 0 || nextIndex >= _buffered.length) return; + _currentIndex = nextIndex; + final bytes = _buffered[nextIndex]; + await _player.play(BytesSource(bytes)); + } + + List _splitForTts(String text) { + // Normalize whitespace + final normalized = text.replaceAll(RegExp(r"\s+"), ' ').trim(); + if (normalized.isEmpty) return const []; + + // Split on sentence-ending punctuation while keeping the delimiter + final parts = []; + final sentenceRegex = RegExp(r"(.+?[\.!?]+)(\s+|\$)"); + int index = 0; + for (final match in sentenceRegex.allMatches('$normalized ')) { + final s = match.group(1) ?? 
''; + if (s.trim().isNotEmpty) parts.add(s.trim()); + index = match.end; + } + if (index < normalized.length) { + final tail = normalized.substring(index).trim(); + if (tail.isNotEmpty) parts.add(tail); + } + + // Fallback to length-based splits for very long segments + const maxLen = 300; + final chunks = []; + for (final p in parts.isEmpty ? [normalized] : parts) { + if (p.length <= maxLen) { + chunks.add(p); + } else { + // Try splitting on commas/spaces + var remaining = p; + while (remaining.length > maxLen) { + int cut = remaining.lastIndexOf(RegExp(r",\s|\s"), maxLen); + cut = cut <= 0 ? maxLen : cut; + chunks.add(remaining.substring(0, cut).trim()); + remaining = remaining.substring(cut).trim(); + } + if (remaining.isNotEmpty) chunks.add(remaining); + } + } + return chunks; + } + Future _configurePreferredVoice() async { if (_voiceConfigured) { return; From 56246507de5e4e1a95d37de871fd41059961a4c6 Mon Sep 17 00:00:00 2001 From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com> Date: Thu, 23 Oct 2025 17:05:35 +0530 Subject: [PATCH 3/3] feat(tts): add karaoke-style TTS progress bar to assistant UI Add rendering and support for a karaoke-style text-to-speech progress bar in assistant messages so users can see the currently spoken sentence and highlighted word during playback. - Append TTS karaoke bar to AssistantMessageWidget when the message is the active TTS target and playback is speaking/paused/loading. - Implement _buildKaraokeBar to render the active sentence with a highlighted word span, using ConduitCard and theme styles. - Import conduit_components for shared UI primitives. - Extend TextToSpeechState with sentence data: sentences, sentenceOffsets, activeSentenceIndex, and per-word progress (wordStartInSentence, wordEndInSentence). - Add provider callbacks wiring: onSentenceIndex and onDeviceWordProgress handlers (hooked into TTS backend). 
- Prepare sentence splitting and word-progress plumbing in the TTS provider (prepares data used to drive the karaoke display). This change improves UX by visually indicating the spoken sentence and current word during TTS playback, aiding comprehension and accessibility. --- .../providers/text_to_speech_provider.dart | 100 +++++++++++++++++- .../chat/services/text_to_speech_service.dart | 16 +++ .../widgets/assistant_message_widget.dart | 61 +++++++++++ 3 files changed, 176 insertions(+), 1 deletion(-) diff --git a/lib/features/chat/providers/text_to_speech_provider.dart b/lib/features/chat/providers/text_to_speech_provider.dart index dc234d9..a68aff4 100644 --- a/lib/features/chat/providers/text_to_speech_provider.dart +++ b/lib/features/chat/providers/text_to_speech_provider.dart @@ -15,6 +15,11 @@ class TextToSpeechState { final TtsPlaybackStatus status; final String? activeMessageId; final String? errorMessage; + final List sentences; + final List sentenceOffsets; // start indices in full text + final int activeSentenceIndex; // -1 when none + final int? wordStartInSentence; // nullable; only for on-device + final int? wordEndInSentence; // nullable; only for on-device const TextToSpeechState({ this.initialized = false, @@ -22,6 +27,11 @@ class TextToSpeechState { this.status = TtsPlaybackStatus.idle, this.activeMessageId, this.errorMessage, + this.sentences = const [], + this.sentenceOffsets = const [], + this.activeSentenceIndex = -1, + this.wordStartInSentence, + this.wordEndInSentence, }); bool get isSpeaking => status == TtsPlaybackStatus.speaking; @@ -37,6 +47,12 @@ class TextToSpeechState { bool clearActiveMessageId = false, String? errorMessage, bool clearErrorMessage = false, + List? sentences, + List? sentenceOffsets, + int? activeSentenceIndex, + bool clearWord = false, + int? wordStartInSentence, + int? wordEndInSentence, }) { return TextToSpeechState( initialized: initialized ?? 
this.initialized, @@ -48,6 +64,15 @@ class TextToSpeechState { errorMessage: clearErrorMessage ? null : errorMessage ?? this.errorMessage, + sentences: sentences ?? this.sentences, + sentenceOffsets: sentenceOffsets ?? this.sentenceOffsets, + activeSentenceIndex: activeSentenceIndex ?? this.activeSentenceIndex, + wordStartInSentence: clearWord + ? null + : (wordStartInSentence ?? this.wordStartInSentence), + wordEndInSentence: clearWord + ? null + : (wordEndInSentence ?? this.wordEndInSentence), ); } } @@ -70,6 +95,8 @@ class TextToSpeechController extends Notifier { onPause: _handlePause, onContinue: _handleContinue, onError: _handleError, + onSentenceIndex: _handleSentenceIndex, + onDeviceWordProgress: _handleDeviceWordProgress, ); ref.onDispose(() { @@ -184,15 +211,23 @@ class TextToSpeechController extends Notifier { return; } + // Prepare sentence split for highlighting + final cleanText = MarkdownToText.convert(text); + final sentences = _splitForTts(cleanText); + final offsets = _computeOffsets(sentences); + state = state.copyWith( status: TtsPlaybackStatus.loading, activeMessageId: messageId, clearErrorMessage: true, + sentences: sentences, + sentenceOffsets: offsets, + activeSentenceIndex: sentences.isEmpty ? -1 : 0, + clearWord: true, ); try { // Convert markdown to clean text for TTS - final cleanText = MarkdownToText.convert(text); if (cleanText.isEmpty) { // No speakable content if (!ref.mounted) { @@ -224,6 +259,34 @@ class TextToSpeechController extends Notifier { } } + List _splitForTts(String text) { + final normalized = text.replaceAll(RegExp(r"\s+"), ' ').trim(); + if (normalized.isEmpty) return const []; + final parts = []; + final sentenceRegex = RegExp(r"(.+?[\.!?]+)(\s+|\$)"); + int index = 0; + for (final match in sentenceRegex.allMatches('$normalized ')) { + final s = match.group(1) ?? 
''; + if (s.trim().isNotEmpty) parts.add(s.trim()); + index = match.end; + } + if (index < normalized.length) { + final tail = normalized.substring(index).trim(); + if (tail.isNotEmpty) parts.add(tail); + } + return parts; + } + + List _computeOffsets(List sentences) { + final offsets = []; + int acc = 0; + for (final s in sentences) { + offsets.add(acc); + acc += s.length + 1; // assume a space or punctuation between + } + return offsets; + } + Future pause() async { if (!state.initialized || !state.available) { return; @@ -294,6 +357,41 @@ class TextToSpeechController extends Notifier { clearActiveMessageId: true, ); } + + void _handleSentenceIndex(int index) { + if (!ref.mounted) return; + final clamped = index.clamp( + -1, + state.sentences.isEmpty ? -1 : state.sentences.length - 1, + ); + state = state.copyWith( + activeSentenceIndex: clamped, + // clear per-word highlight when sentence switches (server or device) + clearWord: true, + ); + } + + void _handleDeviceWordProgress(int start, int end) { + if (!ref.mounted) return; + // Map global offsets to sentence index + final offsets = state.sentenceOffsets; + if (offsets.isEmpty) return; + int idx = 0; + for (var i = 0; i < offsets.length; i++) { + final sStart = offsets[i]; + final sEnd = i + 1 < offsets.length ? 
offsets[i + 1] : 1 << 30; + if (start >= sStart && start < sEnd) { + idx = i; + break; + } + } + final sentenceStart = offsets[idx]; + state = state.copyWith( + activeSentenceIndex: idx, + wordStartInSentence: (start - sentenceStart).clamp(0, 1 << 20), + wordEndInSentence: (end - sentenceStart).clamp(0, 1 << 20), + ); + } } final textToSpeechServiceProvider = Provider((ref) { diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart index 5d344e6..9f01ebb 100644 --- a/lib/features/chat/services/text_to_speech_service.dart +++ b/lib/features/chat/services/text_to_speech_service.dart @@ -31,6 +31,8 @@ class TextToSpeechService { VoidCallback? _onPause; VoidCallback? _onContinue; void Function(String message)? _onError; + void Function(int sentenceIndex)? _onSentenceIndex; + void Function(int start, int end)? _onDeviceWordProgress; bool get isInitialized => _initialized; bool get isAvailable => _available; @@ -51,6 +53,8 @@ class TextToSpeechService { VoidCallback? onPause, VoidCallback? onContinue, void Function(String message)? onError, + void Function(int sentenceIndex)? onSentenceIndex, + void Function(int start, int end)? 
onDeviceWordProgress, }) { _onStart = onStart; _onComplete = onComplete; @@ -58,6 +62,8 @@ class TextToSpeechService { _onPause = onPause; _onContinue = onContinue; _onError = onError; + _onSentenceIndex = onSentenceIndex; + _onDeviceWordProgress = onDeviceWordProgress; _tts.setStartHandler(_handleStart); _tts.setCompletionHandler(_handleComplete); @@ -65,6 +71,13 @@ class TextToSpeechService { _tts.setPauseHandler(_handlePause); _tts.setContinueHandler(_handleContinue); _tts.setErrorHandler(_handleError); + try { + _tts.setProgressHandler((String text, int start, int end, String word) { + _onDeviceWordProgress?.call(start, end); + }); + } catch (_) { + // Some platforms may not support progress handler + } } /// Initialize the native TTS engine lazily @@ -151,6 +164,7 @@ class TextToSpeechService { if (result is int && result != 1) { _onError?.call('Text-to-speech engine returned code $result'); } + _onSentenceIndex?.call(0); } Future pause() async { @@ -370,6 +384,7 @@ class TextToSpeechService { _buffered.add(Uint8List.fromList(firstBytes)); _currentIndex = 0; await _player.play(BytesSource(_buffered.first)); + _onSentenceIndex?.call(0); // Prefetch the rest in background unawaited( @@ -438,6 +453,7 @@ class TextToSpeechService { _currentIndex = nextIndex; final bytes = _buffered[nextIndex]; await _player.play(BytesSource(bytes)); + _onSentenceIndex?.call(_currentIndex); } List _splitForTts(String text) { diff --git a/lib/features/chat/widgets/assistant_message_widget.dart b/lib/features/chat/widgets/assistant_message_widget.dart index 370ee77..5d6a42b 100644 --- a/lib/features/chat/widgets/assistant_message_widget.dart +++ b/lib/features/chat/widgets/assistant_message_widget.dart @@ -18,6 +18,7 @@ import 'package:conduit/l10n/app_localizations.dart'; import 'enhanced_attachment.dart'; import 'package:conduit/shared/widgets/chat_action_button.dart'; import '../../../shared/widgets/model_avatar.dart'; +import '../../../shared/widgets/conduit_components.dart'; 
import 'package:url_launcher/url_launcher_string.dart'; import '../providers/chat_providers.dart' show sendMessageWithContainer; import '../../../core/utils/debug_logger.dart'; @@ -457,12 +458,72 @@ class _AssistantMessageWidgetState extends ConsumerState } if (children.isEmpty) return const SizedBox.shrink(); + // Append TTS karaoke bar if this is the active message + final ttsState = ref.watch(textToSpeechControllerProvider); + final isActive = + ttsState.activeMessageId == _messageId && + (ttsState.status == TtsPlaybackStatus.speaking || + ttsState.status == TtsPlaybackStatus.paused || + ttsState.status == TtsPlaybackStatus.loading); + if (isActive && ttsState.activeSentenceIndex >= 0) { + children.add(const SizedBox(height: Spacing.sm)); + children.add(_buildKaraokeBar(ttsState)); + } + return Column( crossAxisAlignment: CrossAxisAlignment.start, children: children, ); } + Widget _buildKaraokeBar(TextToSpeechState ttsState) { + final theme = context.conduitTheme; + final idx = ttsState.activeSentenceIndex; + if (idx < 0 || idx >= ttsState.sentences.length) { + return const SizedBox.shrink(); + } + final sentence = ttsState.sentences[idx]; + final ws = ttsState.wordStartInSentence; + final we = ttsState.wordEndInSentence; + + final baseStyle = TextStyle( + color: theme.textPrimary, + height: 1.2, + fontSize: 14, + ); + final highlightStyle = baseStyle.copyWith( + backgroundColor: theme.buttonPrimary.withValues(alpha: 0.25), + color: theme.textPrimary, + fontWeight: FontWeight.w600, + ); + + InlineSpan buildSpans() { + if (ws == null || + we == null || + ws < 0 || + we <= ws || + ws >= sentence.length) { + return TextSpan(text: sentence, style: baseStyle); + } + final safeEnd = we.clamp(0, sentence.length); + final before = sentence.substring(0, ws); + final word = sentence.substring(ws, safeEnd); + final after = sentence.substring(safeEnd); + return TextSpan( + children: [ + if (before.isNotEmpty) TextSpan(text: before, style: baseStyle), + TextSpan(text: word, 
style: highlightStyle), + if (after.isNotEmpty) TextSpan(text: after, style: baseStyle), + ], + ); + } + + return ConduitCard( + padding: const EdgeInsets.all(Spacing.sm), + child: RichText(text: buildSpans()), + ); + } + bool get _shouldShowTypingIndicator => widget.isStreaming && _isAssistantResponseEmpty;