From 86339715b133ffa19061777c8d4e6092ce3209ef Mon Sep 17 00:00:00 2001 From: cogwheel0 <172976095+cogwheel0@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:02:37 +0530 Subject: [PATCH] feat(sts): add server side speech-to-text --- lib/core/persistence/persistence_keys.dart | 1 + .../persistence/persistence_migrator.dart | 2 + lib/core/services/api_service.dart | 89 ++- lib/core/services/settings_service.dart | 41 +- .../chat/services/voice_call_service.dart | 27 +- .../chat/services/voice_input_service.dart | 537 +++++++++++++++--- lib/features/chat/views/chat_page.dart | 2 +- .../chat/widgets/modern_chat_input.dart | 2 +- .../profile/views/app_customization_page.dart | 237 ++++++++ lib/l10n/app_de.arb | 10 + lib/l10n/app_es.arb | 10 + lib/l10n/app_fr.arb | 10 + lib/l10n/app_it.arb | 10 + lib/l10n/app_nl.arb | 10 + lib/l10n/app_ru.arb | 10 + lib/l10n/app_zh.arb | 10 + 16 files changed, 916 insertions(+), 92 deletions(-) diff --git a/lib/core/persistence/persistence_keys.dart b/lib/core/persistence/persistence_keys.dart index d0078c5..177fedc 100644 --- a/lib/core/persistence/persistence_keys.dart +++ b/lib/core/persistence/persistence_keys.dart @@ -11,6 +11,7 @@ final class PreferenceKeys { static const String voiceLocaleId = 'voice_locale_id'; static const String voiceHoldToTalk = 'voice_hold_to_talk'; static const String voiceAutoSendFinal = 'voice_auto_send_final'; + static const String voiceSttPreference = 'voice_stt_preference'; static const String socketTransportMode = 'socket_transport_mode'; static const String quickPills = 'quick_pills'; static const String sendOnEnterKey = 'send_on_enter'; diff --git a/lib/core/persistence/persistence_migrator.dart b/lib/core/persistence/persistence_migrator.dart index 8a7350f..d6a278c 100644 --- a/lib/core/persistence/persistence_migrator.dart +++ b/lib/core/persistence/persistence_migrator.dart @@ -90,6 +90,7 @@ class PersistenceMigrator { copyString(PreferenceKeys.voiceLocaleId); copyBool(PreferenceKeys.voiceHoldToTalk); copyBool(PreferenceKeys.voiceAutoSendFinal); + copyString(PreferenceKeys.voiceSttPreference); copyString(PreferenceKeys.socketTransportMode); copyStringList(PreferenceKeys.quickPills); copyBool(PreferenceKeys.sendOnEnterKey); @@ -194,6 +195,7 @@ class PersistenceMigrator { PreferenceKeys.voiceLocaleId, PreferenceKeys.voiceHoldToTalk, PreferenceKeys.voiceAutoSendFinal, + PreferenceKeys.voiceSttPreference, PreferenceKeys.socketTransportMode, PreferenceKeys.quickPills, PreferenceKeys.sendOnEnterKey, diff --git a/lib/core/services/api_service.dart b/lib/core/services/api_service.dart index efafb45..0ceeb7a 100644 --- a/lib/core/services/api_service.dart +++ b/lib/core/services/api_service.dart @@ -4,7 +4,7 @@ import 'dart:io'; import 'package:dio/dio.dart'; import 'package:dio/io.dart'; import 'package:flutter/foundation.dart'; -// import 'package:http_parser/http_parser.dart'; +import 'package:http_parser/http_parser.dart'; // Removed legacy websocket/socket.io imports import 'package:uuid/uuid.dart'; import '../models/backend_config.dart'; @@ -1607,6 +1607,55 @@ class ApiService { return []; } + Future> transcribeSpeech({ + required Uint8List audioBytes, + String? fileName, + String? mimeType, + String? language, + }) async { + if (audioBytes.isEmpty) { + throw ArgumentError('audioBytes cannot be empty for transcription'); + } + + final sanitizedFileName = (fileName != null && fileName.trim().isNotEmpty + ? fileName.trim() + : 'audio.m4a'); + final resolvedMimeType = (mimeType != null && mimeType.trim().isNotEmpty) + ? mimeType.trim() + : _inferMimeTypeFromName(sanitizedFileName); + + _traceApi( + 'Uploading $sanitizedFileName (${audioBytes.length} bytes) for transcription', + ); + + final formData = FormData.fromMap({ + 'file': MultipartFile.fromBytes( + audioBytes, + filename: sanitizedFileName, + contentType: _parseMediaType(resolvedMimeType), + ), + if (language != null && language.trim().isNotEmpty) + 'language': language.trim(), + }); + + final response = await _dio.post( + '/api/v1/audio/transcriptions', + data: formData, + options: Options(headers: const {'accept': 'application/json'}), + ); + + final data = response.data; + if (data is Map) { + return data; + } + if (data is String) { + return {'text': data}; + } + throw StateError( + 'Unexpected transcription response type: ${data.runtimeType}', + ); + } + Future<({Uint8List bytes, String mimeType})> generateSpeech({ required String text, String? voice, @@ -1690,7 +1739,43 @@ class ApiService { return bytes.length >= 2 && bytes[0] == 0xFF && (bytes[1] & 0xE0) == 0xE0; } - // Server audio transcription removed; rely on on-device STT in UI layer + String _inferMimeTypeFromName(String name) { + final dotIndex = name.lastIndexOf('.'); + if (dotIndex == -1 || dotIndex == name.length - 1) { + return 'audio/mpeg'; + } + final ext = name.substring(dotIndex + 1).toLowerCase(); + switch (ext) { + case 'wav': + return 'audio/wav'; + case 'ogg': + return 'audio/ogg'; + case 'm4a': + case 'mp4': + return 'audio/mp4'; + case 'aac': + return 'audio/aac'; + case 'webm': + return 'audio/webm'; + case 'flac': + return 'audio/flac'; + case 'mp3': + return 'audio/mpeg'; + default: + return 'audio/mpeg'; + } + } + + MediaType? _parseMediaType(String? value) { + if (value == null || value.isEmpty) { + return null; + } + try { + return MediaType.parse(value); + } catch (_) { + return null; + } + } // Image Generation Future>> getImageModels() async { diff --git a/lib/core/services/settings_service.dart b/lib/core/services/settings_service.dart index 19ba497..89693c0 100644 --- a/lib/core/services/settings_service.dart +++ b/lib/core/services/settings_service.dart @@ -8,6 +8,9 @@ import 'animation_service.dart'; part 'settings_service.g.dart'; +/// Speech-to-text preference selection. +enum SttPreference { auto, deviceOnly, serverOnly } + /// TTS engine selection enum TtsEngine { device, server } @@ -151,6 +154,9 @@ class SettingsService { ttsServerVoiceId: box.get(PreferenceKeys.ttsServerVoiceId) as String?, ttsServerVoiceName: box.get(PreferenceKeys.ttsServerVoiceName) as String?, + sttPreference: _parseSttPreference( + box.get(PreferenceKeys.voiceSttPreference) as String?, + ), ), ); } @@ -174,6 +180,7 @@ class SettingsService { PreferenceKeys.ttsPitch: settings.ttsPitch, PreferenceKeys.ttsVolume: settings.ttsVolume, PreferenceKeys.ttsEngine: settings.ttsEngine.name, + PreferenceKeys.voiceSttPreference: settings.sttPreference.name, }; await box.putAll(updates); @@ -224,6 +231,22 @@ class SettingsService { } } + static SttPreference _parseSttPreference(String? raw) { + switch ((raw ?? '').toLowerCase()) { + case 'deviceonly': + case 'device_only': + case 'device': + return SttPreference.deviceOnly; + case 'serveronly': + case 'server_only': + case 'server': + return SttPreference.serverOnly; + case 'auto': + default: + return SttPreference.auto; + } + } + // Voice input specific settings static Future getVoiceLocaleId() { final value = _preferencesBox().get(_voiceLocaleKey) as String?; @@ -359,6 +382,7 @@ class AppSettings { final String socketTransportMode; // 'polling' or 'ws' final List quickPills; // e.g., ['web','image'] final bool sendOnEnter; + final SttPreference sttPreference; final String? ttsVoice; final double ttsSpeechRate; final double ttsPitch; @@ -380,6 +404,7 @@ class AppSettings { this.socketTransportMode = 'ws', this.quickPills = const [], this.sendOnEnter = false, + this.sttPreference = SttPreference.auto, this.ttsVoice, this.ttsSpeechRate = 0.5, this.ttsPitch = 1.0, @@ -403,6 +428,7 @@ class AppSettings { String? socketTransportMode, List? quickPills, bool? sendOnEnter, + SttPreference? sttPreference, Object? ttsVoice = const _DefaultValue(), double? ttsSpeechRate, double? ttsPitch, @@ -429,6 +455,7 @@ class AppSettings { socketTransportMode: socketTransportMode ?? this.socketTransportMode, quickPills: quickPills ?? this.quickPills, sendOnEnter: sendOnEnter ?? this.sendOnEnter, + sttPreference: sttPreference ?? this.sttPreference, ttsVoice: ttsVoice is _DefaultValue ? this.ttsVoice : ttsVoice as String?, ttsSpeechRate: ttsSpeechRate ?? this.ttsSpeechRate, ttsPitch: ttsPitch ?? this.ttsPitch, @@ -457,6 +484,7 @@ class AppSettings { other.voiceLocaleId == voiceLocaleId && other.voiceHoldToTalk == voiceHoldToTalk && other.voiceAutoSendFinal == voiceAutoSendFinal && + other.sttPreference == sttPreference && other.sendOnEnter == sendOnEnter && other.ttsVoice == ttsVoice && other.ttsSpeechRate == ttsSpeechRate && @@ -471,7 +499,7 @@ class AppSettings { @override int get hashCode { - return Object.hash( + return Object.hashAll([ reduceMotion, animationSpeed, hapticFeedback, @@ -482,6 +510,7 @@ class AppSettings { voiceLocaleId, voiceHoldToTalk, voiceAutoSendFinal, + sttPreference, socketTransportMode, sendOnEnter, ttsVoice, @@ -492,7 +521,7 @@ class AppSettings { ttsServerVoiceId, ttsServerVoiceName, Object.hashAllUnordered(quickPills), - ); + ]); } } @@ -603,6 +632,14 @@ class AppSettingsNotifier extends _$AppSettingsNotifier { await SettingsService.setSendOnEnter(value); } + Future setSttPreference(SttPreference preference) async { + if (state.sttPreference == preference) { + return; + } + state = state.copyWith(sttPreference: preference); + await SettingsService.saveSettings(state); + } + Future setTtsVoice(String? voice) async { state = state.copyWith(ttsVoice: voice); await SettingsService.saveSettings(state); diff --git a/lib/features/chat/services/voice_call_service.dart b/lib/features/chat/services/voice_call_service.dart index 1037b38..f20f356 100644 --- a/lib/features/chat/services/voice_call_service.dart +++ b/lib/features/chat/services/voice_call_service.dart @@ -108,11 +108,18 @@ class VoiceCallService { throw Exception('Voice input initialization failed'); } - // Check if local STT is available + // Check if preferred STT path is available final hasLocalStt = _voiceInput.hasLocalStt; - if (!hasLocalStt) { + final hasServerStt = _voiceInput.hasServerStt; + final ready = switch (_voiceInput.preference) { + SttPreference.deviceOnly => hasLocalStt, + SttPreference.serverOnly => hasServerStt, + SttPreference.auto => hasLocalStt || hasServerStt, + }; + + if (!ready) { _updateState(VoiceCallState.error); - throw Exception('Speech recognition not available on this device'); + throw Exception('Preferred speech recognition engine is unavailable'); } // Check microphone permissions @@ -202,10 +209,18 @@ class VoiceCallService { _listeningPaused = false; _accumulatedTranscript = ''; - // Check if voice input is available - if (!_voiceInput.hasLocalStt) { + final hasLocalStt = _voiceInput.hasLocalStt; + final hasServerStt = _voiceInput.hasServerStt; + final pref = _voiceInput.preference; + final engineAvailable = switch (pref) { + SttPreference.deviceOnly => hasLocalStt, + SttPreference.serverOnly => hasServerStt, + SttPreference.auto => hasLocalStt || hasServerStt, + }; + + if (!engineAvailable) { _updateState(VoiceCallState.error); - throw Exception('Voice input not available on this device'); + throw Exception('Preferred speech recognition engine is unavailable'); } _updateState(VoiceCallState.listening); diff --git a/lib/features/chat/services/voice_input_service.dart b/lib/features/chat/services/voice_input_service.dart index 1c30c63..47b1238 100644 --- a/lib/features/chat/services/voice_input_service.dart +++ b/lib/features/chat/services/voice_input_service.dart @@ -1,14 +1,19 @@ import 'dart:async'; -import 'dart:io' show Platform; +import 'dart:io' show File, Platform; import 'package:flutter/widgets.dart'; import 'package:flutter_riverpod/flutter_riverpod.dart'; import 'package:riverpod_annotation/riverpod_annotation.dart'; import 'package:record/record.dart'; import 'package:stts/stts.dart'; +import 'package:path/path.dart' as p; +import 'package:path_provider/path_provider.dart'; + +import '../../../core/providers/app_providers.dart'; +import '../../../core/services/api_service.dart'; +import '../../../core/services/settings_service.dart'; part 'voice_input_service.g.dart'; -// Removed path imports as server transcription fallback was removed // Lightweight replacement for previous stt.LocaleName used across the UI class LocaleName { @@ -20,9 +25,15 @@ class LocaleName { class VoiceInputService { final AudioRecorder _recorder = AudioRecorder(); final Stt _speech = Stt(); + final ApiService? _api; bool _isInitialized = false; bool _isListening = false; bool _localSttAvailable = false; + SttPreference _preference = SttPreference.auto; + bool _usingServerStt = false; + bool _serverRecorderActive = false; + String? _serverRecordingPath; + String? _serverRecordingMimeType; String? _selectedLocaleId; List _locales = const []; StreamController? _textStreamController; @@ -43,6 +54,17 @@ class VoiceInputService { StreamSubscription? _sttStateSub; bool get isSupportedPlatform => Platform.isAndroid || Platform.isIOS; + bool get hasServerStt => _api != null; + SttPreference get preference => _preference; + bool get allowsServerFallback => _preference != SttPreference.deviceOnly; + bool get prefersServerOnly => _preference == SttPreference.serverOnly; + bool get prefersDeviceOnly => _preference == SttPreference.deviceOnly; + + VoiceInputService({ApiService? api}) : _api = api; + + void updatePreference(SttPreference preference) { + _preference = preference; + } Future initialize() async { if (_isInitialized) return true; @@ -97,7 +119,8 @@ class VoiceInputService { } bool get isListening => _isListening; - bool get isAvailable => _isInitialized; // service usable (local or fallback) + bool get isAvailable => + _isInitialized && (_localSttAvailable || hasServerStt); bool get hasLocalStt => _localSttAvailable; // Add a method to check if on-device STT is properly supported @@ -166,7 +189,7 @@ class VoiceInputService { } if (_isListening) { - stopListening(); + unawaited(stopListening()); } _textStreamController = StreamController.broadcast(); @@ -174,82 +197,112 @@ class VoiceInputService { _isListening = true; _intensityController = StreamController.broadcast(); _lastIntensity = 0; + _usingServerStt = false; + _serverRecorderActive = false; + _serverRecordingPath = null; + _serverRecordingMimeType = null; - // Begin a gentle decay timer so the UI level bars fall when silent - _intensityDecayTimer?.cancel(); - _intensityDecayTimer = Timer.periodic(const Duration(milliseconds: 120), ( - t, - ) { - if (!_isListening) return; - if (_lastIntensity <= 0) return; - _lastIntensity = (_lastIntensity - 1).clamp(0, 10); - try { - _intensityController?.add(_lastIntensity); - } catch (_) {} - }); + _startIntensityDecayTimer(); + + final bool canUseLocal = _localSttAvailable; + final bool serverAvailable = hasServerStt; + final bool shouldUseLocal = + canUseLocal && _preference != SttPreference.serverOnly; + final bool shouldUseServer = + serverAvailable && + (_preference == SttPreference.serverOnly || !shouldUseLocal); + + if (shouldUseLocal) { + _autoStopTimer?.cancel(); + _autoStopTimer = Timer(const Duration(seconds: 60), () { + if (_isListening) { + unawaited(_stopListening()); + } + }); - // Check if speech recognition is available before trying to use it - if (_localSttAvailable) { - // Schedule a check for speech recognition availability Future.microtask(() async { try { final isStillAvailable = await _speech.isSupported(); if (!isStillAvailable && _isListening) { - // Speech recognition no longer available; stop listening _localSttAvailable = false; - _stopListening(); - return; + if (hasServerStt && allowsServerFallback) { + unawaited(_beginServerFallback()); + } else { + unawaited(_stopListening()); + } } - } catch (e) { + } catch (_) { // ignore availability check errors } }); - // Local on-device STT path - _autoStopTimer?.cancel(); - _autoStopTimer = Timer(const Duration(seconds: 60), () { - if (_isListening) { - _stopListening(); - } - }); - - // Listen for results and state changes; keep subscriptions so we can cancel later _sttResultSub = _speech.onResultChanged.listen((SttRecognition result) { if (!_isListening) return; final prevLen = _currentText.length; _currentText = result.text; _textStreamController?.add(_currentText); - // Map number of new characters to a rough 0..10 intensity final delta = (_currentText.length - prevLen).clamp(0, 50); - final mapped = (delta / 5.0).ceil(); // 0 chars -> 0, 1-5 -> 1, ... + final mapped = (delta / 5.0).ceil(); _lastIntensity = mapped.clamp(0, 10); try { _intensityController?.add(_lastIntensity); } catch (_) {} if (result.isFinal) { - _stopListening(); + unawaited(_stopListening()); } }, onError: (_) {}); _sttStateSub = _speech.onStateChanged.listen((_) {}, onError: (_) {}); - try { - if (_selectedLocaleId != null) { - _speech.setLanguage(_selectedLocaleId!).catchError((_) {}); - } - // Start recognition (no await blocking the sync flow) - _speech.start(SttRecognitionOptions(punctuation: true)).catchError((_) { - // On-device STT failed; stop listening entirely as server transcription is removed + Future(() async { + try { + if (_selectedLocaleId != null) { + await _speech.setLanguage(_selectedLocaleId!); + } + await _speech.start(SttRecognitionOptions(punctuation: true)); + } catch (error) { _localSttAvailable = false; - _stopListening(); - }); - } catch (e) { - _localSttAvailable = false; - _stopListening(); - } + if (!_isListening) return; + if (hasServerStt && allowsServerFallback) { + await _beginServerFallback(); + } else { + _textStreamController?.addError(error); + await _stopListening(); + } + } + }); + } else if (shouldUseServer) { + _usingServerStt = true; + _autoStopTimer?.cancel(); + _autoStopTimer = Timer(const Duration(seconds: 90), () { + if (_isListening) { + unawaited(_stopListening()); + } + }); + Future(() async { + try { + await _startServerRecording(); + } catch (error) { + if (!_isListening) return; + _textStreamController?.addError(error); + await _stopListening(); + } + }); } else { - // No local STT available; stop immediately since server transcription is removed - _stopListening(); + final Exception error; + if (prefersDeviceOnly) { + error = Exception( + 'On-device speech recognition required but unavailable', + ); + } else if (prefersServerOnly) { + error = Exception('Server speech-to-text is not configured'); + } else { + error = Exception('Speech recognition not available on this device'); + } + Future.microtask(() { + _textStreamController?.addError(error); + unawaited(_stopListening()); + }); } return _textStreamController!.stream; @@ -258,14 +311,11 @@ class VoiceInputService { /// Centralized entry point to begin voice recognition. /// Ensures initialization and microphone permission before starting. Future> beginListening() async { - // Ensure service is ready await initialize(); - // Ensure microphone permission (triggers OS prompt if needed) final hasMic = await checkPermissions(); if (!hasMic) { throw Exception('Microphone permission not granted'); } - // Start listening and return the transcript stream return startListening(); } @@ -277,37 +327,332 @@ class VoiceInputService { if (!_isListening) return; _isListening = false; - if (_localSttAvailable) { - try { - await _speech.stop(); - } catch (_) {} - // Cancel STT subscriptions - try { - _sttResultSub?.cancel(); - } catch (_) {} - _sttResultSub = null; - try { - _sttStateSub?.cancel(); - } catch (_) {} - _sttStateSub = null; - } _autoStopTimer?.cancel(); _autoStopTimer = null; - _ampSub?.cancel(); + + if (_usingServerStt) { + await _finalizeServerRecording(); + } else { + await _stopLocalStt(); + } + + await _ampSub?.cancel(); _ampSub = null; + _intensityDecayTimer?.cancel(); _intensityDecayTimer = null; _lastIntensity = 0; - if (_currentText.isNotEmpty) { + if (!_usingServerStt && _currentText.isNotEmpty) { _textStreamController?.add(_currentText); } - _textStreamController?.close(); - _textStreamController = null; - _intensityController?.close(); - _intensityController = null; + await _closeControllers(); + + _usingServerStt = false; + _serverRecorderActive = false; + _serverRecordingPath = null; + _serverRecordingMimeType = null; + } + + Future _stopLocalStt() async { + if (_sttResultSub != null) { + try { + await _sttResultSub?.cancel(); + } catch (_) {} + _sttResultSub = null; + } + if (_sttStateSub != null) { + try { + await _sttStateSub?.cancel(); + } catch (_) {} + _sttStateSub = null; + } + + if (_localSttAvailable) { + try { + await _speech.stop(); + } catch (_) {} + } + } + + Future _beginServerFallback() async { + if (!allowsServerFallback) { + _textStreamController?.addError( + Exception('Server speech-to-text disabled in preferences'), + ); + await _stopListening(); + return; + } + await _stopLocalStt(); + if (!hasServerStt) { + _textStreamController?.addError( + Exception('Server speech-to-text unavailable'), + ); + await _stopListening(); + return; + } + + _usingServerStt = true; + _autoStopTimer?.cancel(); + _autoStopTimer = Timer(const Duration(seconds: 90), () { + if (_isListening) { + unawaited(_stopListening()); + } + }); + + try { + await _startServerRecording(); + } catch (error) { + _textStreamController?.addError(error); + await _stopListening(); + } + } + + Future _startServerRecording() async { + final (path, mimeType) = await _createRecordingTarget(); + _serverRecordingPath = path; + _serverRecordingMimeType = mimeType; + + final config = RecordConfig( + encoder: AudioEncoder.aacLc, + sampleRate: 44100, + bitRate: 96000, + numChannels: 1, + noiseSuppress: true, + ); + + await _recorder.start(config, path: path); + _serverRecorderActive = true; + + await _ampSub?.cancel(); + _ampSub = _recorder + .onAmplitudeChanged(const Duration(milliseconds: 140)) + .listen((Amplitude amplitude) { + if (!_isListening) return; + _lastIntensity = _amplitudeToIntensity(amplitude.current); + try { + _intensityController?.add(_lastIntensity); + } catch (_) {} + }, onError: (_) {}); + } + + Future<(String, String)> _createRecordingTarget() async { + final directory = await getTemporaryDirectory(); + final timestamp = DateTime.now().millisecondsSinceEpoch; + const extension = 'm4a'; + final fileName = 'conduit_voice_$timestamp.$extension'; + final path = p.join(directory.path, fileName); + return (path, 'audio/mp4'); + } + + Future _finalizeServerRecording() async { + final api = _api; + if (api == null) { + return; + } + + String? path; + try { + if (_serverRecorderActive && await _recorder.isRecording()) { + path = await _recorder.stop(); + } else { + path = _serverRecordingPath; + } + } catch (_) { + path = _serverRecordingPath; + } finally { + _serverRecorderActive = false; + } + + final resolvedPath = path; + if (resolvedPath == null || resolvedPath.isEmpty) { + return; + } + + final file = File(resolvedPath); + try { + if (!await file.exists()) { + return; + } + final bytes = await file.readAsBytes(); + if (bytes.isEmpty) { + return; + } + + final response = await api.transcribeSpeech( + audioBytes: bytes, + fileName: p.basename(resolvedPath), + mimeType: _serverRecordingMimeType, + language: _languageForServer(), + ); + + final transcript = _extractTranscriptionText(response); + if (transcript != null && transcript.trim().isNotEmpty) { + _currentText = transcript.trim(); + _textStreamController?.add(_currentText); + } else { + throw StateError('Empty transcription result'); + } + } catch (error) { + _textStreamController?.addError(error); + } finally { + unawaited(_cleanupRecordingFile(file)); + } + } + + Future _cleanupRecordingFile(File file) async { + try { + if (await file.exists()) { + await file.delete(); + } + } catch (_) {} + } + + String? _languageForServer() { + final locale = _selectedLocaleId; + if (locale != null && locale.isNotEmpty) { + final primary = locale.split(RegExp('[-_]')).first.toLowerCase(); + if (primary.length >= 2) { + return primary; + } + } + try { + final fallback = WidgetsBinding.instance.platformDispatcher.locale; + final primary = fallback.languageCode.toLowerCase(); + if (primary.isNotEmpty) { + return primary; + } + } catch (_) {} + return null; + } + + String? _extractTranscriptionText(Map data) { + final direct = data['text']; + if (direct is String && direct.trim().isNotEmpty) { + return direct; + } + + final display = data['display_text'] ?? data['DisplayText']; + if (display is String && display.trim().isNotEmpty) { + return display; + } + + final result = data['result']; + if (result is Map) { + final resultText = result['text']; + if (resultText is String && resultText.trim().isNotEmpty) { + return resultText; + } + } + + final combined = data['combinedRecognizedPhrases']; + if (combined is List && combined.isNotEmpty) { + final first = combined.first; + if (first is Map) { + final candidate = + first['display'] ?? + first['Display'] ?? + first['transcript'] ?? + first['text']; + if (candidate is String && candidate.trim().isNotEmpty) { + return candidate; + } + } else if (first is String && first.trim().isNotEmpty) { + return first; + } + } + + final results = data['results']; + if (results is Map) { + final channels = results['channels']; + if (channels is List && channels.isNotEmpty) { + final channel = channels.first; + if (channel is Map) { + final alternatives = channel['alternatives']; + if (alternatives is List && alternatives.isNotEmpty) { + final alternative = alternatives.first; + if (alternative is Map) { + final transcript = + alternative['transcript'] ?? alternative['text']; + if (transcript is String && transcript.trim().isNotEmpty) { + return transcript; + } + } + } + } + } + } + + final segments = data['segments']; + if (segments is List && segments.isNotEmpty) { + final buffer = StringBuffer(); + for (final segment in segments) { + if (segment is Map) { + final text = segment['text']; + if (text is String && text.trim().isNotEmpty) { + buffer.write(text.trim()); + buffer.write(' '); + } + } else if (segment is String && segment.trim().isNotEmpty) { + buffer.write(segment.trim()); + buffer.write(' '); + } + } + final combinedText = buffer.toString().trim(); + if (combinedText.isNotEmpty) { + return combinedText; + } + } + + return null; + } + + int _amplitudeToIntensity(double? value) { + if (value == null || value.isNaN || value.isInfinite) { + return 0; + } + const minDb = -55.0; + const maxDb = 0.0; + final double clamped = value.clamp(minDb, maxDb).toDouble(); + final double normalized = ((clamped - minDb) / (maxDb - minDb)).clamp( + 0.0, + 1.0, + ); + final int scaled = (normalized * 10).round(); + if (scaled <= 0) return 0; + if (scaled >= 10) return 10; + return scaled; + } + + Future _closeControllers() async { + if (_textStreamController != null) { + try { + await _textStreamController?.close(); + } catch (_) {} + _textStreamController = null; + } + if (_intensityController != null) { + try { + await _intensityController?.close(); + } catch (_) {} + _intensityController = null; + } + } + + void _startIntensityDecayTimer() { + _intensityDecayTimer?.cancel(); + _intensityDecayTimer = Timer.periodic(const Duration(milliseconds: 120), ( + _, + ) { + if (!_isListening) return; + if (_lastIntensity <= 0) return; + _lastIntensity = (_lastIntensity - 1).clamp(0, 10); + try { + _intensityController?.add(_lastIntensity); + } catch (_) {} + }); } void dispose() { @@ -315,15 +660,24 @@ class VoiceInputService { try { _speech.dispose().catchError((_) {}); } catch (_) {} + try { + _recorder.dispose().catchError((_) {}); + } catch (_) {} } - - // Recording fallback removed; only on-device STT is supported now - - // Native locales not used in server transcription mode } final voiceInputServiceProvider = Provider((ref) { - return VoiceInputService(); + final api = ref.watch(apiServiceProvider); + final service = VoiceInputService(api: api); + final currentSettings = ref.read(appSettingsProvider); + service.updatePreference(currentSettings.sttPreference); + ref.listen(appSettingsProvider, (previous, next) { + if (previous?.sttPreference != next.sttPreference) { + service.updatePreference(next.sttPreference); + } + }); + ref.onDispose(service.dispose); + return service; }); @Riverpod(keepAlive: true) @@ -332,8 +686,16 @@ Future voiceInputAvailable(Ref ref) async { if (!service.isSupportedPlatform) return false; final initialized = await service.initialize(); if (!initialized) return false; - // If local STT exists, we consider it available; otherwise ensure mic permission for fallback - if (service.hasLocalStt) return true; + switch (service.preference) { + case SttPreference.deviceOnly: + return service.hasLocalStt; + case SttPreference.serverOnly: + return service.hasServerStt; + case SttPreference.auto: + if (service.hasLocalStt) return true; + if (!service.hasServerStt) return false; + break; + } final hasPermission = await service.checkPermissions(); if (!hasPermission) return false; return service.isAvailable; @@ -349,3 +711,18 @@ final voiceIntensityStreamProvider = StreamProvider((ref) { final service = ref.watch(voiceInputServiceProvider); return service.intensityStream; }); + +final localVoiceRecognitionAvailableProvider = FutureProvider(( + ref, +) async { + final service = ref.watch(voiceInputServiceProvider); + final initialized = await service.initialize(); + if (!initialized) return false; + if (service.hasLocalStt) return true; + return service.checkOnDeviceSupport(); +}); + +final serverVoiceRecognitionAvailableProvider = Provider((ref) { + final service = ref.watch(voiceInputServiceProvider); + return service.hasServerStt; +}); diff --git a/lib/features/chat/views/chat_page.dart b/lib/features/chat/views/chat_page.dart index 7a28e67..2e49411 100644 --- a/lib/features/chat/views/chat_page.dart +++ b/lib/features/chat/views/chat_page.dart @@ -2380,7 +2380,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> { } } - // Server transcription removed; only on-device STT is supported + // When on-device STT is unavailable we fall back to server transcription. Future _stopListening() async { _intensitySub?.cancel(); diff --git a/lib/features/chat/widgets/modern_chat_input.dart b/lib/features/chat/widgets/modern_chat_input.dart index 6fb4e39..d3daf19 100644 --- a/lib/features/chat/widgets/modern_chat_input.dart +++ b/lib/features/chat/widgets/modern_chat_input.dart @@ -2460,7 +2460,7 @@ class _ModernChatInputState extends ConsumerState HapticFeedback.selectionClick(); } - // Server transcription removed; only on-device STT updates the input text + // When on-device STT is unavailable we rely on server transcription. void _showVoiceUnavailable(String message) { if (!mounted) return; diff --git a/lib/features/profile/views/app_customization_page.dart b/lib/features/profile/views/app_customization_page.dart index c8adde3..b33b46f 100644 --- a/lib/features/profile/views/app_customization_page.dart +++ b/lib/features/profile/views/app_customization_page.dart @@ -14,6 +14,7 @@ import '../../../shared/utils/ui_utils.dart'; import '../../../core/providers/app_providers.dart'; import '../../../l10n/app_localizations.dart'; import '../../chat/providers/text_to_speech_provider.dart'; +import '../../chat/services/voice_input_service.dart'; class AppCustomizationPage extends ConsumerWidget { const AppCustomizationPage({super.key}); @@ -70,6 +71,8 @@ class AppCustomizationPage extends ConsumerWidget { languageLabel, ), const SizedBox(height: Spacing.xl), + _buildSttSection(context, ref, settings), + const SizedBox(height: Spacing.xl), _buildTtsDropdownSection(context, ref, settings), const SizedBox(height: Spacing.xl), _buildChatSection(context, ref, settings), @@ -468,6 +471,226 @@ class AppCustomizationPage extends ConsumerWidget { ); } + Widget _buildSttSection( + BuildContext context, + WidgetRef ref, + AppSettings settings, + ) { + final theme = context.conduitTheme; + final l10n = AppLocalizations.of(context)!; + final localSupport = ref.watch(localVoiceRecognitionAvailableProvider); + final bool localAvailable = localSupport.maybeWhen( + data: (value) => value, + orElse: () => false, + ); + final bool localLoading = localSupport.isLoading; + final bool serverAvailable = ref.watch( + serverVoiceRecognitionAvailableProvider, + ); + final notifier = ref.read(appSettingsProvider.notifier); + final description = _sttPreferenceDescription(l10n, settings.sttPreference); + + final warnings = []; + if (settings.sttPreference == SttPreference.deviceOnly && + !localAvailable && + !localLoading) { + warnings.add(l10n.sttDeviceUnavailableWarning); + } + if (settings.sttPreference == SttPreference.serverOnly && + !serverAvailable) { + warnings.add(l10n.sttServerUnavailableWarning); + } + + final bool autoSelectable = + localAvailable || serverAvailable || localLoading; + final bool deviceSelectable = localAvailable || localLoading; + final bool serverSelectable = serverAvailable; + + return Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Text( + l10n.sttSettings, + style: + theme.headingSmall?.copyWith(color: theme.sidebarForeground) ?? + TextStyle(color: theme.sidebarForeground, fontSize: 18), + ), + const SizedBox(height: Spacing.sm), + ConduitCard( + padding: const EdgeInsets.all(Spacing.md), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Row( + children: [ + _buildIconBadge( + context, + UiUtils.platformIcon( + ios: CupertinoIcons.mic, + android: Icons.mic, + ), + color: theme.buttonPrimary, + ), + const SizedBox(width: Spacing.md), + Expanded( + child: Text( + l10n.sttEngineLabel, + style: + theme.bodyMedium?.copyWith( + color: theme.sidebarForeground, + fontWeight: FontWeight.w600, + ) ?? + TextStyle( + color: theme.sidebarForeground, + fontSize: 14, + fontWeight: FontWeight.w600, + ), + ), + ), + ], + ), + const SizedBox(height: Spacing.sm), + Wrap( + spacing: Spacing.sm, + runSpacing: Spacing.sm, + children: [ + ChoiceChip( + label: Text(l10n.sttEngineAuto), + selected: settings.sttPreference == SttPreference.auto, + showCheckmark: false, + selectedColor: theme.buttonPrimary, + backgroundColor: theme.cardBackground, + side: BorderSide( + color: settings.sttPreference == SttPreference.auto + ? theme.buttonPrimary.withValues(alpha: 0.6) + : theme.textPrimary.withValues(alpha: 0.2), + ), + labelStyle: TextStyle( + color: settings.sttPreference == SttPreference.auto + ? theme.buttonPrimaryText + : theme.textPrimary, + fontWeight: FontWeight.w600, + ), + onSelected: autoSelectable + ? (value) { + if (value) { + notifier.setSttPreference(SttPreference.auto); + } + } + : null, + ), + ChoiceChip( + label: Text(l10n.sttEngineDevice), + selected: + settings.sttPreference == SttPreference.deviceOnly, + showCheckmark: false, + selectedColor: theme.buttonPrimary, + backgroundColor: theme.cardBackground, + side: BorderSide( + color: settings.sttPreference == SttPreference.deviceOnly + ? theme.buttonPrimary.withValues(alpha: 0.6) + : theme.textPrimary.withValues(alpha: 0.2), + ), + labelStyle: TextStyle( + color: settings.sttPreference == SttPreference.deviceOnly + ? theme.buttonPrimaryText + : theme.textPrimary, + fontWeight: FontWeight.w600, + ), + onSelected: deviceSelectable + ? (value) { + if (value) { + notifier.setSttPreference( + SttPreference.deviceOnly, + ); + } + } + : null, + ), + ChoiceChip( + label: Text(l10n.sttEngineServer), + selected: + settings.sttPreference == SttPreference.serverOnly, + showCheckmark: false, + selectedColor: theme.buttonPrimary, + backgroundColor: theme.cardBackground, + side: BorderSide( + color: settings.sttPreference == SttPreference.serverOnly + ? theme.buttonPrimary.withValues(alpha: 0.6) + : theme.textPrimary.withValues(alpha: 0.2), + ), + labelStyle: TextStyle( + color: settings.sttPreference == SttPreference.serverOnly + ? theme.buttonPrimaryText + : theme.textPrimary, + fontWeight: FontWeight.w600, + ), + onSelected: serverSelectable + ? (value) { + if (value) { + notifier.setSttPreference( + SttPreference.serverOnly, + ); + } + } + : null, + ), + ], + ), + if (localLoading) ...[ + const SizedBox(height: Spacing.sm), + LinearProgressIndicator( + minHeight: 3, + color: theme.buttonPrimary, + backgroundColor: theme.cardBorder.withValues(alpha: 0.4), + ), + ], + const SizedBox(height: Spacing.sm), + AnimatedSwitcher( + duration: const Duration(milliseconds: 200), + child: Text( + description, + key: ValueKey( + 'stt-desc-${settings.sttPreference.name}', + ), + style: + theme.bodyMedium?.copyWith( + color: theme.sidebarForeground.withValues(alpha: 0.9), + ) ?? + TextStyle( + color: theme.sidebarForeground.withValues(alpha: 0.9), + fontSize: 14, + ), + ), + ), + if (warnings.isNotEmpty) ...[ + const SizedBox(height: Spacing.sm), + ...warnings.map( + (warning) => Padding( + padding: const EdgeInsets.only(top: Spacing.xs), + child: Text( + warning, + style: + theme.bodySmall?.copyWith( + color: theme.error, + fontWeight: FontWeight.w600, + ) ?? + TextStyle( + color: theme.error, + fontSize: 12, + fontWeight: FontWeight.w600, + ), + ), + ), + ), + ], + ], + ), + ), + ], + ); + } + Widget _buildTtsDropdownSection( BuildContext context, WidgetRef ref, @@ -691,6 +914,20 @@ class AppCustomizationPage extends ConsumerWidget { ); } + String _sttPreferenceDescription( + AppLocalizations l10n, + SttPreference preference, + ) { + switch (preference) { + case SttPreference.auto: + return l10n.sttEngineAutoDescription; + case SttPreference.deviceOnly: + return l10n.sttEngineDeviceDescription; + case SttPreference.serverOnly: + return l10n.sttEngineServerDescription; + } + } + Widget _buildSliderTile( BuildContext context, WidgetRef ref, { diff --git a/lib/l10n/app_de.arb b/lib/l10n/app_de.arb index c99b5fa..84e1e07 100644 --- a/lib/l10n/app_de.arb +++ b/lib/l10n/app_de.arb @@ -307,6 +307,16 @@ "chatSettings": "Chat", "sendOnEnter": "Mit Enter senden", "sendOnEnterDescription": "Enter sendet (Soft-Tastatur). Cmd/Ctrl+Enter ebenfalls verfügbar", + "sttSettings": "Sprache zu Text", + "sttEngineLabel": "Erkennungs-Engine", + "sttEngineAuto": "Automatisch", + "sttEngineDevice": "Auf dem Gerät", + "sttEngineServer": "Server", + "sttEngineAutoDescription": "Verwendet die Erkennung auf dem Gerät, wenn verfügbar, und greift sonst auf deinen Server zurück.", + "sttEngineDeviceDescription": "Behält Audio auf diesem Gerät. Spracheingabe funktioniert nicht, wenn das Gerät keine Spracherkennung unterstützt.", + "sttEngineServerDescription": "Sendet Aufnahmen immer an deinen Conduit-Server zur Transkription.", + "sttDeviceUnavailableWarning": "Auf diesem Gerät steht keine Spracherkennung zur Verfügung.", + "sttServerUnavailableWarning": "Verbinde dich mit einem Server mit aktivierter Transkription, um diese Option zu nutzen.", "ttsSettings": "Text zu Sprache", "ttsVoice": "Stimme", "ttsSpeechRate": "Sprechgeschwindigkeit", diff --git a/lib/l10n/app_es.arb b/lib/l10n/app_es.arb index a8d8ddc..6fb7e15 100644 --- a/lib/l10n/app_es.arb +++ b/lib/l10n/app_es.arb @@ -307,6 +307,16 @@ "chatSettings": "Conversación", "sendOnEnter": "Enviar con Enter", "sendOnEnterDescription": "Enter envía (teclado virtual). Cmd/Ctrl+Enter también disponible", + "sttSettings": "Voz a texto", + "sttEngineLabel": "Motor de reconocimiento", + "sttEngineAuto": "Automático", + "sttEngineDevice": "En el dispositivo", + "sttEngineServer": "Servidor", + "sttEngineAutoDescription": "Usa el reconocimiento en el dispositivo cuando esté disponible y, si no, recurre a tu servidor.", + "sttEngineDeviceDescription": "Mantiene el audio en este dispositivo. La entrada de voz no funciona si el dispositivo no admite reconocimiento de voz.", + "sttEngineServerDescription": "Envía siempre las grabaciones a tu servidor Conduit para la transcripción.", + "sttDeviceUnavailableWarning": "El reconocimiento de voz en el dispositivo no está disponible en este dispositivo.", + "sttServerUnavailableWarning": "Conéctate a un servidor con transcripción habilitada para usar esta opción.", "ttsSettings": "Texto a voz", "ttsVoice": "Voz", "ttsSpeechRate": "Velocidad de voz", diff --git a/lib/l10n/app_fr.arb b/lib/l10n/app_fr.arb index d50ae5e..a44f277 100644 --- a/lib/l10n/app_fr.arb +++ b/lib/l10n/app_fr.arb @@ -307,6 +307,16 @@ "chatSettings": "Discussion", "sendOnEnter": "Envoyer avec Entrée", "sendOnEnterDescription": "Entrée envoie (clavier logiciel). Cmd/Ctrl+Entrée aussi disponible", + "sttSettings": "Voix vers texte", + "sttEngineLabel": "Moteur de reconnaissance", + "sttEngineAuto": "Auto", + "sttEngineDevice": "Sur l’appareil", + "sttEngineServer": "Serveur", + "sttEngineAutoDescription": "Utilise la reconnaissance sur l’appareil quand c’est possible, sinon bascule vers votre serveur.", + "sttEngineDeviceDescription": "Conserve l’audio sur cet appareil. L’entrée vocale cesse de fonctionner si la reconnaissance vocale n’est pas prise en charge.", + "sttEngineServerDescription": "Envoie toujours les enregistrements à votre serveur Conduit pour transcription.", + "sttDeviceUnavailableWarning": "La reconnaissance vocale sur l’appareil n’est pas disponible sur cet appareil.", + "sttServerUnavailableWarning": "Connectez-vous à un serveur avec la transcription activée pour utiliser cette option.", "ttsSettings": "Synthèse vocale", "ttsVoice": "Voix", "ttsSpeechRate": "Vitesse de parole", diff --git a/lib/l10n/app_it.arb b/lib/l10n/app_it.arb index cc2e0f0..9caea28 100644 --- a/lib/l10n/app_it.arb +++ b/lib/l10n/app_it.arb @@ -307,6 +307,16 @@ "chatSettings": "Chat", "sendOnEnter": "Invia con Invio", "sendOnEnterDescription": "Invio invia (tastiera software). Cmd/Ctrl+Invio disponibile", + "sttSettings": "Voce in testo", + "sttEngineLabel": "Motore di riconoscimento", + "sttEngineAuto": "Automatico", + "sttEngineDevice": "Sul dispositivo", + "sttEngineServer": "Server", + "sttEngineAutoDescription": "Usa il riconoscimento sul dispositivo quando disponibile e altrimenti passa al tuo server.", + "sttEngineDeviceDescription": "Mantiene l’audio su questo dispositivo. L’input vocale non funziona se il dispositivo non supporta il riconoscimento vocale.", + "sttEngineServerDescription": "Invia sempre le registrazioni al tuo server Conduit per la trascrizione.", + "sttDeviceUnavailableWarning": "Il riconoscimento vocale sul dispositivo non è disponibile su questo dispositivo.", + "sttServerUnavailableWarning": "Collegati a un server con la trascrizione abilitata per usare questa opzione.", "ttsSettings": "Sintesi vocale", "ttsVoice": "Voce", "ttsSpeechRate": "Velocità di sintesi vocale", diff --git a/lib/l10n/app_nl.arb b/lib/l10n/app_nl.arb index d6133d2..548c254 100644 --- a/lib/l10n/app_nl.arb +++ b/lib/l10n/app_nl.arb @@ -307,6 +307,16 @@ "chatSettings": "Chat", "sendOnEnter": "Verzenden met Enter", "sendOnEnterDescription": "Enter verzendt (softtoetsenbord). Cmd/Ctrl+Enter ook beschikbaar", + "sttSettings": "Spraak naar tekst", + "sttEngineLabel": "Herkenningsengine", + "sttEngineAuto": "Automatisch", + "sttEngineDevice": "Op het apparaat", + "sttEngineServer": "Server", + "sttEngineAutoDescription": "Gebruikt spraakherkenning op het apparaat wanneer beschikbaar en valt anders terug op je server.", + "sttEngineDeviceDescription": "Houdt audio op dit apparaat. Spraakinput werkt niet als het apparaat geen spraakherkenning ondersteunt.", + "sttEngineServerDescription": "Stuurt opnames altijd naar je Conduit-server voor transcriptie.", + "sttDeviceUnavailableWarning": "Spraakherkenning op het apparaat is niet beschikbaar op dit apparaat.", + "sttServerUnavailableWarning": "Verbind met een server met transcriptie ingeschakeld om deze optie te gebruiken.", "ttsSettings": "Tekst naar spraak", "ttsVoice": "Stem", "ttsSpeechRate": "Spraaksnelheid", diff --git a/lib/l10n/app_ru.arb b/lib/l10n/app_ru.arb index d438815..332036f 100644 --- a/lib/l10n/app_ru.arb +++ b/lib/l10n/app_ru.arb @@ -307,6 +307,16 @@ "chatSettings": "Чат", "sendOnEnter": "Отправка по Enter", "sendOnEnterDescription": "Enter отправляет (программная клавиатура). Также доступно Cmd/Ctrl+Enter", + "sttSettings": "Речь в текст", + "sttEngineLabel": "Движок распознавания", + "sttEngineAuto": "Авто", + "sttEngineDevice": "На устройстве", + "sttEngineServer": "Сервер", + "sttEngineAutoDescription": "Использует распознавание на устройстве, когда это возможно, иначе переключается на ваш сервер.", + "sttEngineDeviceDescription": "Оставляет звук на этом устройстве. Голосовой ввод не работает, если устройство не поддерживает распознавание речи.", + "sttEngineServerDescription": "Всегда отправляет записи на сервер Conduit для транскрибации.", + "sttDeviceUnavailableWarning": "Распознавание речи на устройстве недоступно на этом устройстве.", + "sttServerUnavailableWarning": "Подключитесь к серверу с включённой транскрибацией, чтобы использовать эту опцию.", "ttsSettings": "Преобразование текста в речь", "ttsVoice": "Голос", "ttsSpeechRate": "Скорость речи", diff --git a/lib/l10n/app_zh.arb b/lib/l10n/app_zh.arb index b8b41f9..40e53f4 100644 --- a/lib/l10n/app_zh.arb +++ b/lib/l10n/app_zh.arb @@ -307,6 +307,16 @@ "chatSettings": "对话", "sendOnEnter": "回车发送", "sendOnEnterDescription": "回车发送(软键盘)。Cmd/Ctrl+Enter 也可用", + "sttSettings": "语音转文字", + "sttEngineLabel": "识别引擎", + "sttEngineAuto": "自动", + "sttEngineDevice": "本机", + "sttEngineServer": "服务器", + "sttEngineAutoDescription": "在可用时使用本机识别,否则切换到你的服务器。", + "sttEngineDeviceDescription": "音频会保留在此设备上。如果设备不支持语音识别,语音输入将不可用。", + "sttEngineServerDescription": "始终将录音发送到你的 Conduit 服务器进行转写。", + "sttDeviceUnavailableWarning": "此设备不支持本机语音识别。", + "sttServerUnavailableWarning": "连接到启用转写功能的服务器后才能使用此选项。", "ttsSettings": "文本转语音", "ttsVoice": "语音", "ttsSpeechRate": "语速",