diff --git a/android/app/src/main/AndroidManifest.xml b/android/app/src/main/AndroidManifest.xml index c3b95a5..38a833e 100644 --- a/android/app/src/main/AndroidManifest.xml +++ b/android/app/src/main/AndroidManifest.xml @@ -2,6 +2,7 @@ + diff --git a/ios/Flutter/AppFrameworkInfo.plist b/ios/Flutter/AppFrameworkInfo.plist index 1dc6cf7..b2a56aa 100644 --- a/ios/Flutter/AppFrameworkInfo.plist +++ b/ios/Flutter/AppFrameworkInfo.plist @@ -21,6 +21,6 @@ CFBundleVersion 1.0 MinimumOSVersion - 13.0 + 15.1 diff --git a/ios/Podfile b/ios/Podfile index e3b3517..24026fa 100644 --- a/ios/Podfile +++ b/ios/Podfile @@ -1,5 +1,5 @@ # Uncomment this line to define a global platform for your project -platform :ios, '13.0' +platform :ios, '15.1' # CocoaPods analytics sends network stats synchronously affecting flutter build latency. ENV['COCOAPODS_DISABLE_STATS'] = 'true' diff --git a/ios/Podfile.lock b/ios/Podfile.lock index 42cde87..d56263b 100644 --- a/ios/Podfile.lock +++ b/ios/Podfile.lock @@ -49,13 +49,18 @@ PODS: - Flutter - image_picker_ios (0.0.1): - Flutter - - mic_stream_recorder (0.0.1): - - Flutter + - onnxruntime-c (1.22.0) + - onnxruntime-objc (1.22.0): + - onnxruntime-objc/Core (= 1.22.0) + - onnxruntime-objc/Core (1.22.0): + - onnxruntime-c (= 1.22.0) - package_info_plus (0.4.5): - Flutter - path_provider_foundation (0.0.1): - Flutter - FlutterMacOS + - record_ios (1.1.0): + - Flutter - SDWebImage (5.21.1): - SDWebImage/Core (= 5.21.1) - SDWebImage/Core (5.21.1) @@ -80,6 +85,9 @@ PODS: - SwiftyGif (5.4.5) - url_launcher_ios (0.0.1): - Flutter + - vad (0.0.6): + - Flutter + - onnxruntime-objc (= 1.22.0) - wakelock_plus (0.0.1): - Flutter - webview_flutter_wkwebview (0.0.1): @@ -96,9 +104,9 @@ DEPENDENCIES: - flutter_secure_storage (from `.symlinks/plugins/flutter_secure_storage/ios`) - flutter_tts (from `.symlinks/plugins/flutter_tts/ios`) - image_picker_ios (from `.symlinks/plugins/image_picker_ios/ios`) - - mic_stream_recorder (from `.symlinks/plugins/mic_stream_recorder/ios`) - package_info_plus (from `.symlinks/plugins/package_info_plus/ios`) - path_provider_foundation (from `.symlinks/plugins/path_provider_foundation/darwin`) + - record_ios (from `.symlinks/plugins/record_ios/ios`) - share_handler_ios (from `.symlinks/plugins/share_handler_ios/ios`) - share_handler_ios_models (from `.symlinks/plugins/share_handler_ios/ios/Models`) - share_plus (from `.symlinks/plugins/share_plus/ios`) @@ -106,6 +114,7 @@ DEPENDENCIES: - sqflite_darwin (from `.symlinks/plugins/sqflite_darwin/darwin`) - stts (from `.symlinks/plugins/stts/ios`) - url_launcher_ios (from `.symlinks/plugins/url_launcher_ios/ios`) + - vad (from `.symlinks/plugins/vad/ios`) - wakelock_plus (from `.symlinks/plugins/wakelock_plus/ios`) - webview_flutter_wkwebview (from `.symlinks/plugins/webview_flutter_wkwebview/darwin`) @@ -113,6 +122,8 @@ SPEC REPOS: trunk: - DKImagePickerController - DKPhotoGallery + - onnxruntime-c + - onnxruntime-objc - SDWebImage - SwiftyGif @@ -135,12 +146,12 @@ EXTERNAL SOURCES: :path: ".symlinks/plugins/flutter_tts/ios" image_picker_ios: :path: ".symlinks/plugins/image_picker_ios/ios" - mic_stream_recorder: - :path: ".symlinks/plugins/mic_stream_recorder/ios" package_info_plus: :path: ".symlinks/plugins/package_info_plus/ios" path_provider_foundation: :path: ".symlinks/plugins/path_provider_foundation/darwin" + record_ios: + :path: ".symlinks/plugins/record_ios/ios" share_handler_ios: :path: ".symlinks/plugins/share_handler_ios/ios" share_handler_ios_models: @@ -155,6 +166,8 @@ EXTERNAL SOURCES: :path: ".symlinks/plugins/stts/ios" url_launcher_ios: :path: ".symlinks/plugins/url_launcher_ios/ios" + vad: + :path: ".symlinks/plugins/vad/ios" wakelock_plus: :path: ".symlinks/plugins/wakelock_plus/ios" webview_flutter_wkwebview: @@ -172,9 +185,11 @@ SPEC CHECKSUMS: flutter_secure_storage: 1ed9476fba7e7a782b22888f956cce43e2c62f13 flutter_tts: b88dbc8655d3dc961bc4a796e4e16a4cc1795833 image_picker_ios: 7fe1ff8e34c1790d6fff70a32484959f563a928a - mic_stream_recorder: 27d2d1225563a3a28bf4019fc5cc198cffd7dad1 + onnxruntime-c: 7f778680e96145956c0a31945f260321eed2611a + onnxruntime-objc: 83d28b87525bd971259a66e153ea32b5d023de19 package_info_plus: af8e2ca6888548050f16fa2f1938db7b5a5df499 path_provider_foundation: 080d55be775b7414fd5a5ef3ac137b97b097e564 + record_ios: f75fa1d57f840012775c0e93a38a7f3ceea1a374 SDWebImage: f29024626962457f3470184232766516dee8dfea share_handler_ios: e2244e990f826b2c8eaa291ac3831569438ba0fb share_handler_ios_models: fc638c9b4330dc7f082586c92aee9dfa0b87b871 @@ -184,9 +199,10 @@ SPEC CHECKSUMS: stts: 1a48df645bb516e86e4121d5253b582749a1d3a6 SwiftyGif: 706c60cf65fa2bc5ee0313beece843c8eb8194d4 url_launcher_ios: 694010445543906933d732453a59da0a173ae33d + vad: 7934867589afe53567f492df66fb1615f2185822 wakelock_plus: e29112ab3ef0b318e58cfa5c32326458be66b556 webview_flutter_wkwebview: 8ebf4fded22593026f7dbff1fbff31ea98573c8d -PODFILE CHECKSUM: df88575cf61e98a1a3edf2f8c887dad2c18c2079 +PODFILE CHECKSUM: a6ecbec6401c6461e69650e9ef66360aee70610f COCOAPODS: 1.16.2 diff --git a/ios/Runner.xcodeproj/project.pbxproj b/ios/Runner.xcodeproj/project.pbxproj index 90360a7..7f9d99d 100644 --- a/ios/Runner.xcodeproj/project.pbxproj +++ b/ios/Runner.xcodeproj/project.pbxproj @@ -585,7 +585,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 13.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.1; MTL_ENABLE_DEBUG_INFO = NO; SDKROOT = iphoneos; SUPPORTED_PLATFORMS = iphoneos; @@ -722,7 +722,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 13.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.1; MTL_ENABLE_DEBUG_INFO = YES; ONLY_ACTIVE_ARCH = YES; SDKROOT = iphoneos; @@ -773,7 +773,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 13.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.1; MTL_ENABLE_DEBUG_INFO = NO; SDKROOT = iphoneos; SUPPORTED_PLATFORMS = iphoneos; @@ -865,7 +865,7 @@ INFOPLIST_FILE = ShareExtension/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = ShareExtension; INFOPLIST_KEY_NSHumanReadableCopyright = ""; - IPHONEOS_DEPLOYMENT_TARGET = 14.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.1; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -908,7 +908,7 @@ INFOPLIST_FILE = ShareExtension/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = ShareExtension; INFOPLIST_KEY_NSHumanReadableCopyright = ""; - IPHONEOS_DEPLOYMENT_TARGET = 14.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.1; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -948,7 +948,7 @@ INFOPLIST_FILE = ShareExtension/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = ShareExtension; INFOPLIST_KEY_NSHumanReadableCopyright = ""; - IPHONEOS_DEPLOYMENT_TARGET = 14.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.1; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart index 7c8f4b4..0fe7240 100644 --- a/lib/features/chat/services/text_to_speech_service.dart +++ b/lib/features/chat/services/text_to_speech_service.dart @@ -11,6 +11,13 @@ import '../../../core/services/settings_service.dart'; typedef _SpeechChunk = ({Uint8List bytes, String mimeType}); +class SpeechAudioChunk { + const SpeechAudioChunk({required this.bytes, required this.mimeType}); + + final Uint8List bytes; + final String mimeType; +} + /// Lightweight wrapper around FlutterTts to centralize configuration class TextToSpeechService { final FlutterTts _tts = FlutterTts(); @@ -45,6 +52,7 @@ class TextToSpeechService { bool get isAvailable => _available; bool get deviceEngineAvailable => _deviceEngineAvailable; bool get serverEngineAvailable => _api != null; + bool get prefersServerEngine => _shouldUseServer(); TextToSpeechService({ApiService? api}) : _api = api { // Wire minimal player events to callbacks @@ -277,6 +285,29 @@ class TextToSpeechService { _onSentenceIndex?.call(0); } + Future synthesizeServerSpeechChunk(String text) async { + if (text.trim().isEmpty) { + throw ArgumentError('Cannot synthesize empty text'); + } + if (_api == null) { + throw StateError('Server text-to-speech is unavailable'); + } + if (!_initialized) { + await initialize( + deviceVoice: _preferredVoice, + serverVoice: _serverPreferredVoice, + engine: _engine, + ); + } + final voice = await _resolveServerVoice(); + final chunk = await _api.generateSpeech( + text: text, + voice: voice, + speed: _speechRate, + ); + return SpeechAudioChunk(bytes: chunk.bytes, mimeType: chunk.mimeType); + } + Future pause() async { if (!_initialized) return; try { @@ -572,6 +603,15 @@ class TextToSpeechService { } } + Future preloadServerDefaults() async { + if (_api == null) { + return; + } + try { + await _getServerDefaultVoice(); + } catch (_) {} + } + // ===== Server chunked playback ===== Future _startServerChunkedPlayback(String text) async { diff --git a/lib/features/chat/services/voice_call_service.dart b/lib/features/chat/services/voice_call_service.dart index 1fb4123..fa32b90 100644 --- a/lib/features/chat/services/voice_call_service.dart +++ b/lib/features/chat/services/voice_call_service.dart @@ -1,5 +1,7 @@ import 'dart:async'; +import 'dart:collection'; +import 'package:audioplayers/audioplayers.dart'; import 'package:riverpod_annotation/riverpod_annotation.dart'; import 'package:wakelock_plus/wakelock_plus.dart'; @@ -49,6 +51,18 @@ class VoiceCallService { final Set _pauseReasons = {}; SocketEventSubscription? _socketSubscription; Timer? _keepAliveTimer; + final ListQueue _speechQueue = ListQueue(); + int _enqueuedSentenceCount = 0; + String? _activeAssistantMessageId; + bool _responseCompleted = false; + bool _listeningSuspendedForSpeech = false; + final Map _serverAudioBuffer = {}; + final AudioPlayer _serverAudioPlayer = AudioPlayer(); + int _serverAudioSession = 0; + int _pendingServerAudioFetches = 0; + bool _serverPipelineActive = false; + int _nextServerChunkId = 0; + int _nextServerPlaybackId = 0; final StreamController _stateController = StreamController.broadcast(); @@ -75,6 +89,12 @@ class VoiceCallService { // sentence/word callbacks are not required for call UI, but harmless ); + _serverAudioPlayer.onPlayerComplete.listen((_) { + _handleServerAudioComplete(); + }); + + unawaited(_tts.preloadServerDefaults()); + // Set up notification action handler _notificationService.onActionPressed = _handleNotificationAction; } @@ -197,6 +217,13 @@ class VoiceCallService { if (_isDisposed) return; try { + _speechQueue.clear(); + _enqueuedSentenceCount = 0; + _activeAssistantMessageId = null; + _responseCompleted = false; + _listeningSuspendedForSpeech = false; + _resetServerAudio(stopPlayback: true); + if (_pauseReasons.isNotEmpty) { _listeningPaused = true; if (_state != VoiceCallState.paused) { @@ -276,6 +303,14 @@ class VoiceCallService { String _accumulatedResponse = ''; bool _isSpeaking = false; + bool get _hasPendingSpeech { + if (_serverPipelineActive) { + return _isSpeaking || + _serverAudioBuffer.isNotEmpty || + _pendingServerAudioFetches > 0; + } + return _isSpeaking || _speechQueue.isNotEmpty; + } void _handleSocketEvent( Map event, @@ -284,18 +319,32 @@ class VoiceCallService { if (_isDisposed) return; final outerData = event['data']; + final messageId = event['message_id']?.toString(); if (outerData is Map) { final eventType = outerData['type']?.toString(); final innerData = outerData['data']; if (eventType == 'chat:completion' && innerData is Map) { + final bool doneFlag = innerData['done'] == true; + if (messageId != null && messageId.isNotEmpty) { + _handleAssistantMessageStart(messageId); + } + // Handle full content replacement (used by some models/backends) if (innerData.containsKey('content')) { final content = innerData['content']?.toString() ?? ''; if (content.isNotEmpty) { _accumulatedResponse = content; _responseController.add(content); + _processSpeakableSegments(isFinalChunk: doneFlag); + if (doneFlag) { + _responseCompleted = true; + _maybeResumeListeningAfterSpeech(); + } + } else if (doneFlag) { + _responseCompleted = true; + _maybeResumeListeningAfterSpeech(); } } @@ -313,61 +362,248 @@ class VoiceCallService { if (deltaContent.isNotEmpty) { _accumulatedResponse += deltaContent; _responseController.add(_accumulatedResponse); + _processSpeakableSegments(isFinalChunk: false); } } // Check for completion - if (finishReason == 'stop') { - if (_accumulatedResponse.isNotEmpty && !_isSpeaking) { - _speakResponse(_accumulatedResponse); - _accumulatedResponse = ''; - } else if (_accumulatedResponse.isEmpty) { - // No response, restart listening unless paused - if (_pauseReasons.isEmpty) { - _startListening(); - } else if (_state != VoiceCallState.paused) { - _updateState(VoiceCallState.paused); - } - } + if (finishReason == 'stop' || finishReason == 'length') { + _responseCompleted = true; + _processSpeakableSegments(isFinalChunk: true); + _maybeResumeListeningAfterSpeech(); } } } + + if (doneFlag && !_responseCompleted) { + _responseCompleted = true; + _processSpeakableSegments(isFinalChunk: true); + _maybeResumeListeningAfterSpeech(); + } } } } - Future _speakResponse(String response) async { - if (_isDisposed || _isSpeaking) return; + void _handleAssistantMessageStart(String messageId) { + if (_activeAssistantMessageId == messageId) { + return; + } + _activeAssistantMessageId = messageId; + _accumulatedResponse = ''; + _responseController.add(''); + _speechQueue.clear(); + _enqueuedSentenceCount = 0; + _responseCompleted = false; + _resetServerAudio(stopPlayback: true); + if (_isSpeaking) { + _isSpeaking = false; + unawaited(_tts.stop()); + } + } - try { - _isSpeaking = true; + void _processSpeakableSegments({required bool isFinalChunk}) { + if (_isDisposed) return; + final cleanText = MarkdownToText.convert(_accumulatedResponse).trim(); + if (cleanText.isEmpty) { + return; + } - // Stop listening before speaking - await _voiceInput.stopListening(); - await _transcriptSubscription?.cancel(); - await _intensitySubscription?.cancel(); + final segments = _tts.splitTextForSpeech(cleanText); + if (segments.isEmpty) { + return; + } - _updateState(VoiceCallState.speaking); + var availableCount = segments.length; + if (!isFinalChunk && availableCount > 0) { + availableCount -= 1; + } + if (availableCount < 0) { + availableCount = 0; + } - // Convert markdown to clean text for TTS - final cleanText = MarkdownToText.convert(response); - if (cleanText.isEmpty) { - // No speakable content, restart listening - _isSpeaking = false; - await _startListening(); - return; + if (_enqueuedSentenceCount > availableCount) { + _enqueuedSentenceCount = availableCount; + } + + if (availableCount > _enqueuedSentenceCount) { + final newChunks = segments.sublist( + _enqueuedSentenceCount, + availableCount, + ); + _enqueuedSentenceCount = availableCount; + for (final chunk in newChunks) { + _enqueueSpeechChunk(chunk); } + } - await _tts.speak(cleanText); - // After speaking completes, _handleTtsComplete will restart listening + if (isFinalChunk && _enqueuedSentenceCount < segments.length) { + _enqueuedSentenceCount = segments.length; + _enqueueSpeechChunk(segments.last); + } + } + + void _enqueueSpeechChunk(String chunk) { + if (_isDisposed) return; + final trimmed = chunk.trim(); + if (trimmed.isEmpty) { + return; + } + if (_isMuted) { + return; // Skip playback while muted + } + if (_tts.prefersServerEngine) { + _serverPipelineActive = true; + final chunkId = _nextServerChunkId++; + _prefetchServerAudio(trimmed, chunkId); + return; + } + _speechQueue.add(trimmed); + if (!_isSpeaking) { + unawaited(_startNextSpeechChunk()); + } + } + + Future _startNextSpeechChunk() async { + if (_isDisposed) return; + if (_speechQueue.isEmpty || _isSpeaking || _isMuted) { + return; + } + + final next = _speechQueue.removeFirst(); + try { + await _prepareForSpeechPlayback(); + _isSpeaking = true; + _updateState(VoiceCallState.speaking); + await _tts.speak(next); } catch (e) { _isSpeaking = false; _updateState(VoiceCallState.error); - // Restart listening even if TTS fails - await _startListening(); + unawaited(_startListening()); } } + void _prefetchServerAudio(String chunk, int chunkId) { + if (_isDisposed) { + return; + } + final session = _serverAudioSession; + _pendingServerAudioFetches++; + _tts + .synthesizeServerSpeechChunk(chunk) + .then((audioChunk) { + _pendingServerAudioFetches--; + if (_pendingServerAudioFetches < 0) { + _pendingServerAudioFetches = 0; + } + if (_isDisposed || + !_serverPipelineActive || + session != _serverAudioSession) { + return; + } + _serverAudioBuffer[chunkId] = audioChunk; + _maybeStartServerAudio(); + }) + .catchError((error, _) { + _pendingServerAudioFetches--; + if (_pendingServerAudioFetches < 0) { + _pendingServerAudioFetches = 0; + } + if (_isDisposed) { + return; + } + _handleTtsError(error.toString()); + }); + } + + void _maybeStartServerAudio() { + if (_isDisposed || !_serverPipelineActive) { + return; + } + if (_isSpeaking || _isMuted) { + return; + } + final chunk = _serverAudioBuffer.remove(_nextServerPlaybackId); + if (chunk == null) { + return; + } + _nextServerPlaybackId++; + _playServerAudioChunk(chunk); + } + + Future _playServerAudioChunk(SpeechAudioChunk chunk) async { + try { + await _prepareForSpeechPlayback(); + _isSpeaking = true; + _updateState(VoiceCallState.speaking); + await _serverAudioPlayer.play( + BytesSource(chunk.bytes, mimeType: chunk.mimeType), + ); + } catch (e) { + _isSpeaking = false; + _handleTtsError(e.toString()); + } + } + + void _handleServerAudioComplete() { + if (_isDisposed) { + return; + } + _isSpeaking = false; + if (_serverAudioBuffer.containsKey(_nextServerPlaybackId)) { + _maybeStartServerAudio(); + return; + } + _maybeResumeListeningAfterSpeech(); + } + + void _resetServerAudio({bool stopPlayback = false}) { + _serverAudioBuffer.clear(); + _pendingServerAudioFetches = 0; + _serverAudioSession++; + _nextServerChunkId = 0; + _nextServerPlaybackId = 0; + if (stopPlayback) { + unawaited(_serverAudioPlayer.stop()); + _isSpeaking = false; + } + _serverPipelineActive = false; + } + + Future _prepareForSpeechPlayback() async { + if (_listeningSuspendedForSpeech) { + return; + } + _listeningSuspendedForSpeech = true; + await _voiceInput.stopListening(); + await _transcriptSubscription?.cancel(); + _transcriptSubscription = null; + await _intensitySubscription?.cancel(); + _intensitySubscription = null; + } + + void _maybeResumeListeningAfterSpeech() { + if (!_responseCompleted) { + return; + } + if (_hasPendingSpeech) { + return; + } + + if (_pauseReasons.isNotEmpty) { + _listeningPaused = true; + if (_state != VoiceCallState.paused) { + _updateState(VoiceCallState.paused); + } + return; + } + + if (_serverPipelineActive && _pendingServerAudioFetches > 0) { + return; + } + + unawaited(_startListening()); + } + void _handleTtsStart() { if (_isDisposed) return; _updateState(VoiceCallState.speaking); @@ -376,17 +612,19 @@ class VoiceCallService { void _handleTtsComplete() { if (_isDisposed) return; _isSpeaking = false; - // After assistant finishes speaking, resume only if not paused - if (_pauseReasons.isNotEmpty) { - _listeningPaused = true; - _updateState(VoiceCallState.paused); + if (_speechQueue.isNotEmpty) { + unawaited(_startNextSpeechChunk()); return; } - _startListening(); + _maybeResumeListeningAfterSpeech(); } void _handleTtsError(String error) { if (_isDisposed) return; + _isSpeaking = false; + _speechQueue.clear(); + _resetServerAudio(stopPlayback: true); + _listeningSuspendedForSpeech = false; _updateState(VoiceCallState.error); // Try to recover by restarting listening _startListening(); @@ -405,6 +643,7 @@ class VoiceCallService { await _voiceInput.stopListening(); await _tts.stop(); + await _serverAudioPlayer.stop(); await BackgroundStreamingHandler.instance.stopBackgroundExecution(const [ _voiceCallStreamId, @@ -421,6 +660,13 @@ class VoiceCallService { _isMuted = false; _listeningPaused = false; _pauseReasons.clear(); + _speechQueue.clear(); + _enqueuedSentenceCount = 0; + _responseCompleted = false; + _listeningSuspendedForSpeech = false; + _activeAssistantMessageId = null; + _isSpeaking = false; + _resetServerAudio(stopPlayback: true); _updateState(VoiceCallState.disconnected); } @@ -462,6 +708,11 @@ class VoiceCallService { Future cancelSpeaking() async { if (_isDisposed) return; + _speechQueue.clear(); + _enqueuedSentenceCount = 0; + _responseCompleted = false; + _listeningSuspendedForSpeech = false; + _resetServerAudio(stopPlayback: true); await _tts.stop(); _isSpeaking = false; _accumulatedResponse = ''; @@ -527,6 +778,11 @@ class VoiceCallService { _isSpeaking = false; _accumulatedResponse = ''; } + _speechQueue.clear(); + _enqueuedSentenceCount = 0; + _responseCompleted = false; + _listeningSuspendedForSpeech = false; + _resetServerAudio(stopPlayback: true); pauseListening(reason: VoiceCallPauseReason.mute); } else { resumeListening(reason: VoiceCallPauseReason.mute); @@ -547,6 +803,7 @@ class VoiceCallService { _voiceInput.dispose(); await _tts.dispose(); + await _serverAudioPlayer.dispose(); // Cancel notification await _notificationService.cancelNotification(); diff --git a/lib/features/chat/services/voice_input_service.dart b/lib/features/chat/services/voice_input_service.dart index c1990aa..df7e2fe 100644 --- a/lib/features/chat/services/voice_input_service.dart +++ b/lib/features/chat/services/voice_input_service.dart @@ -1,13 +1,13 @@ import 'dart:async'; -import 'dart:io' show File, Platform; +import 'dart:convert'; +import 'dart:io' show Platform; +import 'dart:typed_data'; import 'package:flutter/widgets.dart'; import 'package:flutter_riverpod/flutter_riverpod.dart'; import 'package:riverpod_annotation/riverpod_annotation.dart'; -import 'package:mic_stream_recorder/mic_stream_recorder.dart'; import 'package:stts/stts.dart'; -import 'package:path/path.dart' as p; -import 'package:path_provider/path_provider.dart'; +import 'package:vad/vad.dart'; import '../../../core/providers/app_providers.dart'; import '../../../core/services/api_service.dart'; @@ -23,7 +23,10 @@ class LocaleName { } class VoiceInputService { - final MicStreamRecorder _recorder = MicStreamRecorder(); + static const int _vadSampleRate = 16000; + static const int _vadFrameSamples = 1536; + + final VadHandler _vadHandler = VadHandler.create(); final Stt _speech = Stt(); final ApiService? _api; final Ref? _ref; @@ -41,17 +44,17 @@ class VoiceInputService { _intensityController?.stream ?? const Stream.empty(); int _lastIntensity = 0; Timer? _intensityDecayTimer; - Timer? _silenceTimer; - bool _hasDetectedSpeech = false; - int _amplitudeCallbackCount = 0; - Timer? _amplitudeFallbackTimer; + List? _vadPendingSamples; Stream get textStream => _textStreamController?.stream ?? const Stream.empty(); Timer? _autoStopTimer; - StreamSubscription? _ampSub; StreamSubscription? _sttResultSub; StreamSubscription? _sttStateSub; + StreamSubscription>? _vadSpeechEndSub; + StreamSubscription<({double isSpeech, double notSpeech, List frame})>? + _vadFrameSub; + StreamSubscription? _vadErrorSub; bool get isSupportedPlatform => Platform.isAndroid || Platform.isIOS; bool get hasServerStt => _api != null; @@ -60,9 +63,7 @@ class VoiceInputService { bool get prefersServerOnly => _preference == SttPreference.serverOnly; bool get prefersDeviceOnly => _preference == SttPreference.deviceOnly; - VoiceInputService({ApiService? api, Ref? ref}) - : _api = api, - _ref = ref; + VoiceInputService({ApiService? api, Ref? ref}) : _api = api, _ref = ref; void updatePreference(SttPreference preference) { _preference = preference; @@ -327,33 +328,27 @@ class VoiceInputService { _autoStopTimer?.cancel(); _autoStopTimer = null; - _silenceTimer?.cancel(); - _silenceTimer = null; - - _amplitudeFallbackTimer?.cancel(); - _amplitudeFallbackTimer = null; - if (_usingServerStt) { - await _finalizeServerRecording(); + await _stopVadRecording(); + final samples = _vadPendingSamples; + _vadPendingSamples = null; + if (samples != null && samples.isNotEmpty) { + await _processVadSamples(samples); + } } else { await _stopLocalStt(); + if (_currentText.isNotEmpty) { + _textStreamController?.add(_currentText); + } } - await _ampSub?.cancel(); - _ampSub = null; - _intensityDecayTimer?.cancel(); _intensityDecayTimer = null; _lastIntensity = 0; - if (!_usingServerStt && _currentText.isNotEmpty) { - _textStreamController?.add(_currentText); - } - await _closeControllers(); _usingServerStt = false; - _hasDetectedSpeech = false; } Future _stopLocalStt() async { @@ -411,82 +406,100 @@ class VoiceInputService { } Future _startServerRecording() async { - final path = await _createRecordingPath(); - _hasDetectedSpeech = false; + await _setupVadStreams(); + final settings = _ref?.read(appSettingsProvider); + final silenceMs = settings?.voiceSilenceDuration ?? 2000; + final redemptionFrames = _silenceDurationToFrames(silenceMs); + final endPadFrames = redemptionFrames > 4 + ? (redemptionFrames / 4).round().clamp(1, redemptionFrames) + : 1; - await _recorder.startRecording(path); + try { + await _vadHandler.startListening( + frameSamples: _vadFrameSamples, + redemptionFrames: redemptionFrames, + endSpeechPadFrames: endPadFrames, + preSpeechPadFrames: 2, + minSpeechFrames: 3, + submitUserSpeechOnPause: true, + recordConfig: const RecordConfig( + encoder: AudioEncoder.pcm16bits, + sampleRate: _vadSampleRate, + numChannels: 1, + bitRate: 16, + echoCancel: true, + autoGain: true, + noiseSuppress: true, + androidConfig: AndroidRecordConfig( + audioSource: AndroidAudioSource.voiceCommunication, + audioManagerMode: AudioManagerMode.modeInCommunication, + speakerphone: true, + manageBluetooth: true, + useLegacy: false, + ), + ), + ); + } catch (error) { + _textStreamController?.addError(error); + rethrow; + } + } - await _ampSub?.cancel(); - _amplitudeFallbackTimer?.cancel(); - _amplitudeCallbackCount = 0; + Future _setupVadStreams() async { + await _vadSpeechEndSub?.cancel(); + _vadSpeechEndSub = _vadHandler.onSpeechEnd.listen((samples) { + if (!_isListening || !_usingServerStt) return; + if (samples.isEmpty) return; + _vadPendingSamples = samples; + if (_isListening) { + unawaited(_stopListening()); + } + }); - _ampSub = _recorder.amplitudeStream.listen((amplitude) { - _amplitudeCallbackCount++; + await _vadFrameSub?.cancel(); + _vadFrameSub = _vadHandler.onFrameProcessed.listen((frameData) { if (!_isListening) return; - - _lastIntensity = _normalizedToIntensity(amplitude); + final intensity = _intensityFromVadFrame(frameData.frame); + _lastIntensity = intensity; try { _intensityController?.add(_lastIntensity); } catch (_) {} - - _handleServerAmplitude(amplitude); }); - _amplitudeFallbackTimer = Timer(const Duration(seconds: 1), () { - if (_amplitudeCallbackCount == 0) { - _silenceTimer = Timer(const Duration(seconds: 15), () { - if (_isListening && _usingServerStt) { - unawaited(_stopListening()); - } - }); + await _vadErrorSub?.cancel(); + _vadErrorSub = _vadHandler.onError.listen((message) { + _textStreamController?.addError(Exception(message)); + if (_isListening) { + unawaited(_stopListening()); } }); } - void _handleServerAmplitude(double amplitude) { - if (!_usingServerStt || !_isListening) return; - - const double speechThreshold = 0.55; - if (amplitude.isNaN || amplitude.isInfinite) return; - - if (amplitude > speechThreshold) { - _hasDetectedSpeech = true; - _silenceTimer?.cancel(); - _silenceTimer = null; - } else if (_hasDetectedSpeech && _silenceTimer == null) { - final silenceDuration = _ref?.read(appSettingsProvider).voiceSilenceDuration ?? 2000; - _silenceTimer = Timer(Duration(milliseconds: silenceDuration), () { - if (_isListening && _usingServerStt) { - unawaited(_stopListening()); - } - }); - } + Future _stopVadRecording() async { + try { + await _vadHandler.stopListening(); + } catch (_) {} + await _vadSpeechEndSub?.cancel(); + _vadSpeechEndSub = null; + await _vadFrameSub?.cancel(); + _vadFrameSub = null; + await _vadErrorSub?.cancel(); + _vadErrorSub = null; } - Future _createRecordingPath() async { - final directory = await getTemporaryDirectory(); - final timestamp = DateTime.now().millisecondsSinceEpoch; - final fileName = 'conduit_voice_$timestamp.m4a'; - return p.join(directory.path, fileName); - } - - Future _finalizeServerRecording() async { + Future _processVadSamples(List samples) async { final api = _api; if (api == null) return; - final path = await _recorder.stopRecording(); - if (path == null || path.isEmpty) return; - - final file = File(path); try { - if (!await file.exists()) return; - final bytes = await file.readAsBytes(); - if (bytes.isEmpty) return; + final wavBytes = _samplesToWav(samples); + final fileName = + 'conduit_voice_${DateTime.now().millisecondsSinceEpoch}.wav'; final response = await api.transcribeSpeech( - audioBytes: bytes, - fileName: p.basename(path), - mimeType: 'audio/mp4', + audioBytes: wavBytes, + fileName: fileName, + mimeType: 'audio/wav', language: _languageForServer(), ); @@ -499,19 +512,72 @@ class VoiceInputService { } } catch (error) { _textStreamController?.addError(error); - } finally { - unawaited(_cleanupRecordingFile(file)); } } - Future _cleanupRecordingFile(File file) async { - try { - if (await file.exists()) { - await file.delete(); - } - } catch (_) {} + int _silenceDurationToFrames(int milliseconds) { + final frameDurationMs = (_vadFrameSamples / _vadSampleRate) * 1000; + final frames = (milliseconds / frameDurationMs).round(); + return frames.clamp(4, 50); } + int _intensityFromVadFrame(List frame) { + if (frame.isEmpty) return 0; + double peak = 0; + for (final sample in frame) { + final value = sample.abs(); + if (value > peak) { + peak = value; + } + } + final scaled = (peak * 12).round(); + return scaled.clamp(0, 10); + } + + Uint8List _samplesToWav(List samples) { + if (samples.isEmpty) { + return Uint8List(0); + } + final Int16List pcm = Int16List(samples.length); + for (var i = 0; i < samples.length; i++) { + final clamped = samples[i].clamp(-1.0, 1.0); + final scaled = (clamped * 32767).round().clamp(-32768, 32767); + pcm[i] = scaled; + } + + final dataLength = pcm.lengthInBytes; + final bytesPerSample = 2; + final numChannels = 1; + final byteRate = _vadSampleRate * numChannels * bytesPerSample; + final blockAlign = numChannels * bytesPerSample; + + final builder = BytesBuilder(); + builder.add(ascii.encode('RIFF')); + builder.add(_int32Le(36 + dataLength)); + builder.add(ascii.encode('WAVE')); + builder.add(ascii.encode('fmt ')); + builder.add(_int32Le(16)); + builder.add(_int16Le(1)); + builder.add(_int16Le(numChannels)); + builder.add(_int32Le(_vadSampleRate)); + builder.add(_int32Le(byteRate)); + builder.add(_int16Le(blockAlign)); + builder.add(_int16Le(16)); + builder.add(ascii.encode('data')); + builder.add(_int32Le(dataLength)); + builder.add(Uint8List.view(pcm.buffer)); + return builder.toBytes(); + } + + List _int16Le(int value) => [value & 0xff, (value >> 8) & 0xff]; + + List _int32Le(int value) => [ + value & 0xff, + (value >> 8) & 0xff, + (value >> 16) & 0xff, + (value >> 24) & 0xff, + ]; + String? _languageForServer() { final locale = _selectedLocaleId; if (locale != null && locale.isNotEmpty) { @@ -611,11 +677,6 @@ class VoiceInputService { return null; } - int _normalizedToIntensity(double value) { - if (value.isNaN || value.isInfinite) return 0; - return (value * 10).round().clamp(0, 10); - } - Future _closeControllers() async { if (_textStreamController != null) { try { @@ -647,7 +708,7 @@ class VoiceInputService { void dispose() { stopListening(); - _silenceTimer?.cancel(); + unawaited(_vadHandler.dispose()); try { _speech.dispose().catchError((_) {}); } catch (_) {} diff --git a/pubspec.lock b/pubspec.lock index b662398..f12e607 100644 --- a/pubspec.lock +++ b/pubspec.lock @@ -965,14 +965,6 @@ packages: url: "https://pub.dev" source: hosted version: "1.16.0" - mic_stream_recorder: - dependency: "direct main" - description: - name: mic_stream_recorder - sha256: "73965991ef5cc93d2b0c1e6d590cbd567a853b9ee7b2d52de43a73f185bb0d9c" - url: "https://pub.dev" - source: hosted - version: "1.1.2" mime: dependency: transitive description: @@ -1173,6 +1165,70 @@ packages: url: "https://pub.dev" source: hosted version: "1.5.0" + record: + dependency: transitive + description: + name: record + sha256: "6bad72fb3ea6708d724cf8b6c97c4e236cf9f43a52259b654efeb6fd9b737f1f" + url: "https://pub.dev" + source: hosted + version: "6.1.2" + record_android: + dependency: transitive + description: + name: record_android + sha256: fb54ee4e28f6829b8c580252a9ef49d9c549cfd263b0660ad7eeac0908658e9f + url: "https://pub.dev" + source: hosted + version: "1.4.4" + record_ios: + dependency: transitive + description: + name: record_ios + sha256: "765b42ac1be019b1674ddd809b811fc721fe5a93f7bb1da7803f0d16772fd6d7" + url: "https://pub.dev" + source: hosted + version: "1.1.4" + record_linux: + dependency: transitive + description: + name: record_linux + sha256: "235b1f1fb84e810f8149cc0c2c731d7d697f8d1c333b32cb820c449bf7bb72d8" + url: "https://pub.dev" + source: hosted + version: "1.2.1" + record_macos: + dependency: transitive + description: + name: record_macos + sha256: "842ea4b7e95f4dd237aacffc686d1b0ff4277e3e5357865f8d28cd28bc18ed95" + url: "https://pub.dev" + source: hosted + version: "1.1.2" + record_platform_interface: + dependency: transitive + description: + name: record_platform_interface + sha256: b0065fdf1ec28f5a634d676724d388a77e43ce7646fb049949f58c69f3fcb4ed + url: "https://pub.dev" + source: hosted + version: "1.4.0" + record_web: + dependency: transitive + description: + name: record_web + sha256: "20ac10d56514cb9f8cecc8f3579383084fdfb43b0d04e05a95244d0d76091d90" + url: "https://pub.dev" + source: hosted + version: "1.2.1" + record_windows: + dependency: transitive + description: + name: record_windows + sha256: "223258060a1d25c62bae18282c16783f28581ec19401d17e56b5205b9f039d78" + url: "https://pub.dev" + source: hosted + version: "1.0.7" riverpod: dependency: transitive description: @@ -1682,6 +1738,14 @@ packages: url: "https://pub.dev" source: hosted version: "4.5.1" + vad: + dependency: "direct main" + description: + name: vad + sha256: ef6c8b12c5af7a6a519ff5684f074b8a2ac00c434705f544af379ea77bccd258 + url: "https://pub.dev" + source: hosted + version: "0.0.7+1" vector_graphics: dependency: transitive description: diff --git a/pubspec.yaml b/pubspec.yaml index f0dfa30..13de85d 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -44,7 +44,7 @@ dependencies: flutter_animate: ^4.5.0 # Platform Features - mic_stream_recorder: ^1.1.2 + vad: ^0.0.7+1 stts: ^1.2.5 flutter_tts: ^4.2.3 audioplayers: ^6.5.1