feat(stt): add server-side speech-to-text

2025-11-02 19:02:37 +05:30
parent 99f867bf54
commit 86339715b1
16 changed files with 916 additions and 92 deletions
@@ -11,6 +11,7 @@ final class PreferenceKeys {
  static const String voiceLocaleId = 'voice_locale_id';
  static const String voiceHoldToTalk = 'voice_hold_to_talk';
  static const String voiceAutoSendFinal = 'voice_auto_send_final';
+  static const String voiceSttPreference = 'voice_stt_preference';
  static const String socketTransportMode = 'socket_transport_mode';
  static const String quickPills = 'quick_pills';
  static const String sendOnEnterKey = 'send_on_enter';
@@ -90,6 +90,7 @@ class PersistenceMigrator {
    copyString(PreferenceKeys.voiceLocaleId);
    copyBool(PreferenceKeys.voiceHoldToTalk);
    copyBool(PreferenceKeys.voiceAutoSendFinal);
+    copyString(PreferenceKeys.voiceSttPreference);
    copyString(PreferenceKeys.socketTransportMode);
    copyStringList(PreferenceKeys.quickPills);
    copyBool(PreferenceKeys.sendOnEnterKey);
@@ -194,6 +195,7 @@ class PersistenceMigrator {
      PreferenceKeys.voiceLocaleId,
      PreferenceKeys.voiceHoldToTalk,
      PreferenceKeys.voiceAutoSendFinal,
+      PreferenceKeys.voiceSttPreference,
      PreferenceKeys.socketTransportMode,
      PreferenceKeys.quickPills,
      PreferenceKeys.sendOnEnterKey,
@@ -4,7 +4,7 @@ import 'dart:io';
 import 'package:dio/dio.dart';
 import 'package:dio/io.dart';
 import 'package:flutter/foundation.dart';
-// import 'package:http_parser/http_parser.dart';
+import 'package:http_parser/http_parser.dart';
 // Removed legacy websocket/socket.io imports
 import 'package:uuid/uuid.dart';
 import '../models/backend_config.dart';
@@ -1607,6 +1607,55 @@ class ApiService {
    return [];
  }

+  /// Uploads [audioBytes] to the server transcription endpoint and returns the
+  /// decoded response.
+  ///
+  /// [fileName] falls back to `audio.m4a` when null or blank. [mimeType] falls
+  /// back to a type inferred from the file name. [language] is sent only when
+  /// it is non-blank.
+  ///
+  /// A plain string response is wrapped as `{'text': <string>}` so callers
+  /// always receive a map.
+  ///
+  /// Throws [ArgumentError] when [audioBytes] is empty and [StateError] when
+  /// the response body is neither a JSON map nor a string.
+  Future<Map<String, dynamic>> transcribeSpeech({
+    required Uint8List audioBytes,
+    String? fileName,
+    String? mimeType,
+    String? language,
+  }) async {
+    if (audioBytes.isEmpty) {
+      throw ArgumentError('audioBytes cannot be empty for transcription');
+    }
+
+    final trimmedName = fileName?.trim() ?? '';
+    final uploadName = trimmedName.isEmpty ? 'audio.m4a' : trimmedName;
+
+    final trimmedMime = mimeType?.trim() ?? '';
+    final uploadMime = trimmedMime.isEmpty
+        ? _inferMimeTypeFromName(uploadName)
+        : trimmedMime;
+
+    final trimmedLanguage = language?.trim() ?? '';
+
+    _traceApi(
+      'Uploading $uploadName (${audioBytes.length} bytes) for transcription',
+    );
+
+    final fields = <String, dynamic>{
+      'file': MultipartFile.fromBytes(
+        audioBytes,
+        filename: uploadName,
+        contentType: _parseMediaType(uploadMime),
+      ),
+    };
+    if (trimmedLanguage.isNotEmpty) {
+      fields['language'] = trimmedLanguage;
+    }
+
+    final response = await _dio.post(
+      '/api/v1/audio/transcriptions',
+      data: FormData.fromMap(fields),
+      options: Options(headers: const {'accept': 'application/json'}),
+    );
+
+    final body = response.data;
+    return switch (body) {
+      Map<String, dynamic> json => json,
+      String text => {'text': text},
+      _ => throw StateError(
+        'Unexpected transcription response type: ${body.runtimeType}',
+      ),
+    };
+  }
+
  Future<({Uint8List bytes, String mimeType})> generateSpeech({
    required String text,
    String? voice,
@@ -1690,7 +1739,43 @@ class ApiService {
    return bytes.length >= 2 && bytes[0] == 0xFF && (bytes[1] & 0xE0) == 0xE0;
  }

-  // Server audio transcription removed; rely on on-device STT in UI layer
+  /// Maps the extension of [name] to an audio MIME type.
+  ///
+  /// Returns `audio/mpeg` when [name] has no extension, ends with a dot, or
+  /// uses an extension that is not listed below.
+  String _inferMimeTypeFromName(String name) {
+    const fallback = 'audio/mpeg';
+    final lastDot = name.lastIndexOf('.');
+    final hasExtension = lastDot >= 0 && lastDot < name.length - 1;
+    if (!hasExtension) {
+      return fallback;
+    }
+    return switch (name.substring(lastDot + 1).toLowerCase()) {
+      'wav' => 'audio/wav',
+      'ogg' => 'audio/ogg',
+      'm4a' || 'mp4' => 'audio/mp4',
+      'aac' => 'audio/aac',
+      'webm' => 'audio/webm',
+      'flac' => 'audio/flac',
+      // 'mp3' and anything unrecognised share the same fallback.
+      _ => fallback,
+    };
+  }
+
+  /// Parses [value] into a [MediaType].
+  ///
+  /// Returns null when [value] is null, empty, or cannot be parsed.
+  MediaType? _parseMediaType(String? value) {
+    final raw = value ?? '';
+    if (raw.isEmpty) {
+      return null;
+    }
+    try {
+      return MediaType.parse(raw);
+    } catch (_) {
+      // An unparsable type is treated the same as a missing one.
+      return null;
+    }
+  }

  // Image Generation
  Future<List<Map<String, dynamic>>> getImageModels() async {
@@ -8,6 +8,9 @@ import 'animation_service.dart';

 part 'settings_service.g.dart';

+/// Which speech-to-text engine voice input is allowed to use.
+///
+/// * `auto`: use on-device recognition when it is available, otherwise the
+///   server.
+/// * `deviceOnly`: use on-device recognition only; never fall back to the
+///   server.
+/// * `serverOnly`: use server transcription only.
+///
+/// Persisted by its enum `name` under `PreferenceKeys.voiceSttPreference`.
+enum SttPreference { auto, deviceOnly, serverOnly }
+
 /// TTS engine selection
 enum TtsEngine { device, server }

@@ -151,6 +154,9 @@ class SettingsService {
        ttsServerVoiceId: box.get(PreferenceKeys.ttsServerVoiceId) as String?,
        ttsServerVoiceName:
            box.get(PreferenceKeys.ttsServerVoiceName) as String?,
+        sttPreference: _parseSttPreference(
+          box.get(PreferenceKeys.voiceSttPreference) as String?,
+        ),
      ),
    );
  }
@@ -174,6 +180,7 @@ class SettingsService {
      PreferenceKeys.ttsPitch: settings.ttsPitch,
      PreferenceKeys.ttsVolume: settings.ttsVolume,
      PreferenceKeys.ttsEngine: settings.ttsEngine.name,
+      PreferenceKeys.voiceSttPreference: settings.sttPreference.name,
    };

    await box.putAll(updates);
@@ -224,6 +231,22 @@ class SettingsService {
    }
  }

+  /// Decodes a stored speech-to-text preference string.
+  ///
+  /// Matching is case-insensitive and accepts the enum name plus the
+  /// `device_only` / `device` and `server_only` / `server` spellings. Null
+  /// and unrecognised values resolve to [SttPreference.auto].
+  static SttPreference _parseSttPreference(String? raw) {
+    final normalized = raw?.toLowerCase() ?? '';
+    return switch (normalized) {
+      'deviceonly' || 'device_only' || 'device' => SttPreference.deviceOnly,
+      'serveronly' || 'server_only' || 'server' => SttPreference.serverOnly,
+      _ => SttPreference.auto,
+    };
+  }
+
  // Voice input specific settings
  static Future<String?> getVoiceLocaleId() {
    final value = _preferencesBox().get(_voiceLocaleKey) as String?;
@@ -359,6 +382,7 @@ class AppSettings {
  final String socketTransportMode; // 'polling' or 'ws'
  final List<String> quickPills; // e.g., ['web','image']
  final bool sendOnEnter;
+  final SttPreference sttPreference;
  final String? ttsVoice;
  final double ttsSpeechRate;
  final double ttsPitch;
@@ -380,6 +404,7 @@ class AppSettings {
    this.socketTransportMode = 'ws',
    this.quickPills = const [],
    this.sendOnEnter = false,
+    this.sttPreference = SttPreference.auto,
    this.ttsVoice,
    this.ttsSpeechRate = 0.5,
    this.ttsPitch = 1.0,
@@ -403,6 +428,7 @@ class AppSettings {
    String? socketTransportMode,
    List<String>? quickPills,
    bool? sendOnEnter,
+    SttPreference? sttPreference,
    Object? ttsVoice = const _DefaultValue(),
    double? ttsSpeechRate,
    double? ttsPitch,
@@ -429,6 +455,7 @@ class AppSettings {
      socketTransportMode: socketTransportMode ?? this.socketTransportMode,
      quickPills: quickPills ?? this.quickPills,
      sendOnEnter: sendOnEnter ?? this.sendOnEnter,
+      sttPreference: sttPreference ?? this.sttPreference,
      ttsVoice: ttsVoice is _DefaultValue ? this.ttsVoice : ttsVoice as String?,
      ttsSpeechRate: ttsSpeechRate ?? this.ttsSpeechRate,
      ttsPitch: ttsPitch ?? this.ttsPitch,
@@ -457,6 +484,7 @@ class AppSettings {
        other.voiceLocaleId == voiceLocaleId &&
        other.voiceHoldToTalk == voiceHoldToTalk &&
        other.voiceAutoSendFinal == voiceAutoSendFinal &&
+        other.sttPreference == sttPreference &&
        other.sendOnEnter == sendOnEnter &&
        other.ttsVoice == ttsVoice &&
        other.ttsSpeechRate == ttsSpeechRate &&
@@ -471,7 +499,7 @@ class AppSettings {

  @override
  int get hashCode {
-    return Object.hash(
+    return Object.hashAll([
      reduceMotion,
      animationSpeed,
      hapticFeedback,
@@ -482,6 +510,7 @@ class AppSettings {
      voiceLocaleId,
      voiceHoldToTalk,
      voiceAutoSendFinal,
+      sttPreference,
      socketTransportMode,
      sendOnEnter,
      ttsVoice,
@@ -492,7 +521,7 @@ class AppSettings {
      ttsServerVoiceId,
      ttsServerVoiceName,
      Object.hashAllUnordered(quickPills),
-    );
+    ]);
  }
 }

@@ -603,6 +632,14 @@ class AppSettingsNotifier extends _$AppSettingsNotifier {
    await SettingsService.setSendOnEnter(value);
  }

+  /// Updates and persists the speech-to-text preference.
+  ///
+  /// Does nothing when [preference] already matches the current state.
+  Future<void> setSttPreference(SttPreference preference) async {
+    if (state.sttPreference != preference) {
+      state = state.copyWith(sttPreference: preference);
+      await SettingsService.saveSettings(state);
+    }
+  }
+
  Future<void> setTtsVoice(String? voice) async {
    state = state.copyWith(ttsVoice: voice);
    await SettingsService.saveSettings(state);
@@ -108,11 +108,18 @@ class VoiceCallService {
      throw Exception('Voice input initialization failed');
    }

-    // Check if local STT is available
+    // Check if preferred STT path is available
    final hasLocalStt = _voiceInput.hasLocalStt;
-    if (!hasLocalStt) {
+    final hasServerStt = _voiceInput.hasServerStt;
+    final ready = switch (_voiceInput.preference) {
+      SttPreference.deviceOnly => hasLocalStt,
+      SttPreference.serverOnly => hasServerStt,
+      SttPreference.auto => hasLocalStt || hasServerStt,
+    };
+
+    if (!ready) {
      _updateState(VoiceCallState.error);
-      throw Exception('Speech recognition not available on this device');
+      throw Exception('Preferred speech recognition engine is unavailable');
    }

    // Check microphone permissions
@@ -202,10 +209,18 @@ class VoiceCallService {
      _listeningPaused = false;
      _accumulatedTranscript = '';

-      // Check if voice input is available
-      if (!_voiceInput.hasLocalStt) {
+      final hasLocalStt = _voiceInput.hasLocalStt;
+      final hasServerStt = _voiceInput.hasServerStt;
+      final pref = _voiceInput.preference;
+      final engineAvailable = switch (pref) {
+        SttPreference.deviceOnly => hasLocalStt,
+        SttPreference.serverOnly => hasServerStt,
+        SttPreference.auto => hasLocalStt || hasServerStt,
+      };
+
+      if (!engineAvailable) {
        _updateState(VoiceCallState.error);
-        throw Exception('Voice input not available on this device');
+        throw Exception('Preferred speech recognition engine is unavailable');
      }

      _updateState(VoiceCallState.listening);
@@ -1,14 +1,19 @@
 import 'dart:async';
-import 'dart:io' show Platform;
+import 'dart:io' show File, Platform;

 import 'package:flutter/widgets.dart';
 import 'package:flutter_riverpod/flutter_riverpod.dart';
 import 'package:riverpod_annotation/riverpod_annotation.dart';
 import 'package:record/record.dart';
 import 'package:stts/stts.dart';
+import 'package:path/path.dart' as p;
+import 'package:path_provider/path_provider.dart';
+
+import '../../../core/providers/app_providers.dart';
+import '../../../core/services/api_service.dart';
+import '../../../core/services/settings_service.dart';

 part 'voice_input_service.g.dart';
-// Removed path imports as server transcription fallback was removed

 // Lightweight replacement for previous stt.LocaleName used across the UI
 class LocaleName {
@@ -20,9 +25,15 @@ class LocaleName {
 class VoiceInputService {
  final AudioRecorder _recorder = AudioRecorder();
  final Stt _speech = Stt();
+  final ApiService? _api;
  bool _isInitialized = false;
  bool _isListening = false;
  bool _localSttAvailable = false;
+  SttPreference _preference = SttPreference.auto;
+  bool _usingServerStt = false;
+  bool _serverRecorderActive = false;
+  String? _serverRecordingPath;
+  String? _serverRecordingMimeType;
  String? _selectedLocaleId;
  List<LocaleName> _locales = const [];
  StreamController<String>? _textStreamController;
@@ -43,6 +54,17 @@ class VoiceInputService {
  StreamSubscription<SttState>? _sttStateSub;

  bool get isSupportedPlatform => Platform.isAndroid || Platform.isIOS;
+  bool get hasServerStt => _api != null;
+  SttPreference get preference => _preference;
+  bool get allowsServerFallback => _preference != SttPreference.deviceOnly;
+  bool get prefersServerOnly => _preference == SttPreference.serverOnly;
+  bool get prefersDeviceOnly => _preference == SttPreference.deviceOnly;
+
+  VoiceInputService({ApiService? api}) : _api = api;
+
+  void updatePreference(SttPreference preference) {
+    _preference = preference;
+  }

  Future<bool> initialize() async {
    if (_isInitialized) return true;
@@ -97,7 +119,8 @@ class VoiceInputService {
  }

  bool get isListening => _isListening;
-  bool get isAvailable => _isInitialized; // service usable (local or fallback)
+  bool get isAvailable =>
+      _isInitialized && (_localSttAvailable || hasServerStt);
  bool get hasLocalStt => _localSttAvailable;

  // Add a method to check if on-device STT is properly supported
@@ -166,7 +189,7 @@ class VoiceInputService {
    }

    if (_isListening) {
-      stopListening();
+      unawaited(stopListening());
    }

    _textStreamController = StreamController<String>.broadcast();
@@ -174,82 +197,112 @@ class VoiceInputService {
    _isListening = true;
    _intensityController = StreamController<int>.broadcast();
    _lastIntensity = 0;
+    _usingServerStt = false;
+    _serverRecorderActive = false;
+    _serverRecordingPath = null;
+    _serverRecordingMimeType = null;

-    // Begin a gentle decay timer so the UI level bars fall when silent
-    _intensityDecayTimer?.cancel();
-    _intensityDecayTimer = Timer.periodic(const Duration(milliseconds: 120), (
-      t,
-    ) {
-      if (!_isListening) return;
-      if (_lastIntensity <= 0) return;
-      _lastIntensity = (_lastIntensity - 1).clamp(0, 10);
-      try {
-        _intensityController?.add(_lastIntensity);
-      } catch (_) {}
-    });
+    _startIntensityDecayTimer();
+
+    final bool canUseLocal = _localSttAvailable;
+    final bool serverAvailable = hasServerStt;
+    final bool shouldUseLocal =
+        canUseLocal && _preference != SttPreference.serverOnly;
+    final bool shouldUseServer =
+        serverAvailable &&
+        (_preference == SttPreference.serverOnly || !shouldUseLocal);
+
+    if (shouldUseLocal) {
+      _autoStopTimer?.cancel();
+      _autoStopTimer = Timer(const Duration(seconds: 60), () {
+        if (_isListening) {
+          unawaited(_stopListening());
+        }
+      });

-    // Check if speech recognition is available before trying to use it
-    if (_localSttAvailable) {
-      // Schedule a check for speech recognition availability
      Future.microtask(() async {
        try {
          final isStillAvailable = await _speech.isSupported();
          if (!isStillAvailable && _isListening) {
-            // Speech recognition no longer available; stop listening
            _localSttAvailable = false;
-            _stopListening();
-            return;
+            if (hasServerStt && allowsServerFallback) {
+              unawaited(_beginServerFallback());
+            } else {
+              unawaited(_stopListening());
+            }
          }
-        } catch (e) {
+        } catch (_) {
          // ignore availability check errors
        }
      });

-      // Local on-device STT path
-      _autoStopTimer?.cancel();
-      _autoStopTimer = Timer(const Duration(seconds: 60), () {
-        if (_isListening) {
-          _stopListening();
-        }
-      });
-
-      // Listen for results and state changes; keep subscriptions so we can cancel later
      _sttResultSub = _speech.onResultChanged.listen((SttRecognition result) {
        if (!_isListening) return;
        final prevLen = _currentText.length;
        _currentText = result.text;
        _textStreamController?.add(_currentText);
-        // Map number of new characters to a rough 0..10 intensity
        final delta = (_currentText.length - prevLen).clamp(0, 50);
-        final mapped = (delta / 5.0).ceil(); // 0 chars -> 0, 1-5 -> 1, ...
+        final mapped = (delta / 5.0).ceil();
        _lastIntensity = mapped.clamp(0, 10);
        try {
          _intensityController?.add(_lastIntensity);
        } catch (_) {}
        if (result.isFinal) {
-          _stopListening();
+          unawaited(_stopListening());
        }
      }, onError: (_) {});

      _sttStateSub = _speech.onStateChanged.listen((_) {}, onError: (_) {});

-      try {
-        if (_selectedLocaleId != null) {
-          _speech.setLanguage(_selectedLocaleId!).catchError((_) {});
-        }
-        // Start recognition (no await blocking the sync flow)
-        _speech.start(SttRecognitionOptions(punctuation: true)).catchError((_) {
-          // On-device STT failed; stop listening entirely as server transcription is removed
+      Future(() async {
+        try {
+          if (_selectedLocaleId != null) {
+            await _speech.setLanguage(_selectedLocaleId!);
+          }
+          await _speech.start(SttRecognitionOptions(punctuation: true));
+        } catch (error) {
          _localSttAvailable = false;
-          _stopListening();
-        });
-      } catch (e) {
-        _localSttAvailable = false;
-        _stopListening();
-      }
+          if (!_isListening) return;
+          if (hasServerStt && allowsServerFallback) {
+            await _beginServerFallback();
+          } else {
+            _textStreamController?.addError(error);
+            await _stopListening();
+          }
+        }
+      });
+    } else if (shouldUseServer) {
+      _usingServerStt = true;
+      _autoStopTimer?.cancel();
+      _autoStopTimer = Timer(const Duration(seconds: 90), () {
+        if (_isListening) {
+          unawaited(_stopListening());
+        }
+      });
+      Future(() async {
+        try {
+          await _startServerRecording();
+        } catch (error) {
+          if (!_isListening) return;
+          _textStreamController?.addError(error);
+          await _stopListening();
+        }
+      });
    } else {
-      // No local STT available; stop immediately since server transcription is removed
-      _stopListening();
+      final Exception error;
+      if (prefersDeviceOnly) {
+        error = Exception(
+          'On-device speech recognition required but unavailable',
+        );
+      } else if (prefersServerOnly) {
+        error = Exception('Server speech-to-text is not configured');
+      } else {
+        error = Exception('Speech recognition not available on this device');
+      }
+      Future.microtask(() {
+        _textStreamController?.addError(error);
+        unawaited(_stopListening());
+      });
    }

    return _textStreamController!.stream;
@@ -258,14 +311,11 @@ class VoiceInputService {
  /// Centralized entry point to begin voice recognition.
  /// Ensures initialization and microphone permission before starting.
  Future<Stream<String>> beginListening() async {
-    // Ensure service is ready
    await initialize();
-    // Ensure microphone permission (triggers OS prompt if needed)
    final hasMic = await checkPermissions();
    if (!hasMic) {
      throw Exception('Microphone permission not granted');
    }
-    // Start listening and return the transcript stream
    return startListening();
  }

@@ -277,37 +327,332 @@ class VoiceInputService {
    if (!_isListening) return;

    _isListening = false;
-    if (_localSttAvailable) {
-      try {
-        await _speech.stop();
-      } catch (_) {}
-      // Cancel STT subscriptions
-      try {
-        _sttResultSub?.cancel();
-      } catch (_) {}
-      _sttResultSub = null;
-      try {
-        _sttStateSub?.cancel();
-      } catch (_) {}
-      _sttStateSub = null;
-    }

    _autoStopTimer?.cancel();
    _autoStopTimer = null;
-    _ampSub?.cancel();
+
+    if (_usingServerStt) {
+      await _finalizeServerRecording();
+    } else {
+      await _stopLocalStt();
+    }
+
+    await _ampSub?.cancel();
    _ampSub = null;
+
    _intensityDecayTimer?.cancel();
    _intensityDecayTimer = null;
    _lastIntensity = 0;

-    if (_currentText.isNotEmpty) {
+    if (!_usingServerStt && _currentText.isNotEmpty) {
      _textStreamController?.add(_currentText);
    }

-    _textStreamController?.close();
-    _textStreamController = null;
-    _intensityController?.close();
-    _intensityController = null;
+    await _closeControllers();
+
+    _usingServerStt = false;
+    _serverRecorderActive = false;
+    _serverRecordingPath = null;
+    _serverRecordingMimeType = null;
+  }
+
+  /// Tears down the on-device recognition session.
+  ///
+  /// Cancels the result and state subscriptions, then asks the recognizer to
+  /// stop. All failures are swallowed because this is best-effort cleanup.
+  Future<void> _stopLocalStt() async {
+    // Live subscriptions mean a local session was started. The fallback
+    // paths clear `_localSttAvailable` before calling this method, so that
+    // flag alone cannot decide whether the recognizer needs stopping.
+    final hadLocalSession = _sttResultSub != null || _sttStateSub != null;
+
+    if (_sttResultSub != null) {
+      try {
+        await _sttResultSub?.cancel();
+      } catch (_) {}
+      _sttResultSub = null;
+    }
+    if (_sttStateSub != null) {
+      try {
+        await _sttStateSub?.cancel();
+      } catch (_) {}
+      _sttStateSub = null;
+    }
+
+    if (hadLocalSession || _localSttAvailable) {
+      try {
+        await _speech.stop();
+      } catch (_) {}
+    }
+  }
+
+  /// Switches the active listening session from on-device recognition to
+  /// server transcription.
+  ///
+  /// Emits an error on the transcript stream and stops listening when the
+  /// preference forbids server use, when no API service is configured, or
+  /// when the recording cannot be started. Returns without doing anything if
+  /// the session has already ended or a fallback is already in progress.
+  Future<void> _beginServerFallback() async {
+    // Both the availability probe and the start() failure handler can get
+    // here for the same session; only the first call may proceed.
+    if (!_isListening || _usingServerStt) {
+      return;
+    }
+    if (!allowsServerFallback) {
+      _textStreamController?.addError(
+        Exception('Server speech-to-text disabled in preferences'),
+      );
+      await _stopListening();
+      return;
+    }
+    await _stopLocalStt();
+    // The session may have been stopped, or a concurrent fallback may have
+    // taken over, while local recognition was shutting down.
+    if (!_isListening || _usingServerStt) {
+      return;
+    }
+    if (!hasServerStt) {
+      _textStreamController?.addError(
+        Exception('Server speech-to-text unavailable'),
+      );
+      await _stopListening();
+      return;
+    }
+
+    _usingServerStt = true;
+    _autoStopTimer?.cancel();
+    _autoStopTimer = Timer(const Duration(seconds: 90), () {
+      if (_isListening) {
+        unawaited(_stopListening());
+      }
+    });
+
+    try {
+      await _startServerRecording();
+    } catch (error) {
+      _textStreamController?.addError(error);
+      await _stopListening();
+    }
+  }
+
+  /// Starts recording microphone audio to a temporary file for later upload.
+  ///
+  /// Stores the target path and MIME type, starts the recorder, and forwards
+  /// amplitude updates to the intensity stream. If the listening session
+  /// ended while the recorder was starting, the recorder is stopped and the
+  /// file discarded, so the microphone is not left open.
+  Future<void> _startServerRecording() async {
+    final (path, mimeType) = await _createRecordingTarget();
+    _serverRecordingPath = path;
+    _serverRecordingMimeType = mimeType;
+
+    final config = RecordConfig(
+      encoder: AudioEncoder.aacLc,
+      sampleRate: 44100,
+      bitRate: 96000,
+      numChannels: 1,
+      noiseSuppress: true,
+    );
+
+    await _recorder.start(config, path: path);
+
+    if (!_isListening) {
+      // The stop path already ran and saw no active recorder, so nothing
+      // else will stop this recording.
+      try {
+        await _recorder.stop();
+      } catch (_) {}
+      unawaited(_cleanupRecordingFile(File(path)));
+      return;
+    }
+    _serverRecorderActive = true;
+
+    await _ampSub?.cancel();
+    _ampSub = _recorder
+        .onAmplitudeChanged(const Duration(milliseconds: 140))
+        .listen((Amplitude amplitude) {
+          if (!_isListening) return;
+          _lastIntensity = _amplitudeToIntensity(amplitude.current);
+          try {
+            _intensityController?.add(_lastIntensity);
+          } catch (_) {}
+        }, onError: (_) {});
+  }
+
+  /// Builds the temporary file path and MIME type for a new recording.
+  ///
+  /// The file is named `conduit_voice_<epoch-millis>.m4a` inside the
+  /// platform temporary directory; the MIME type is always `audio/mp4`.
+  Future<(String, String)> _createRecordingTarget() async {
+    final tempDir = await getTemporaryDirectory();
+    final stamp = DateTime.now().millisecondsSinceEpoch;
+    final target = p.join(tempDir.path, 'conduit_voice_$stamp.m4a');
+    return (target, 'audio/mp4');
+  }
+
+  /// Stops the server-mode recording, uploads it, and publishes the result.
+  ///
+  /// On success the trimmed transcript is stored in `_currentText` and added
+  /// to the transcript stream. Upload failures and empty transcripts are
+  /// reported as stream errors. A missing path, missing file, or empty file
+  /// ends the method silently. The recording file is deleted afterwards.
+  Future<void> _finalizeServerRecording() async {
+    final api = _api;
+    if (api == null) {
+      return;
+    }
+
+    // Prefer the path reported by the recorder; fall back to the path
+    // remembered at start when the recorder is not running or stop() fails.
+    String? path;
+    try {
+      if (_serverRecorderActive && await _recorder.isRecording()) {
+        path = await _recorder.stop();
+      } else {
+        path = _serverRecordingPath;
+      }
+    } catch (_) {
+      path = _serverRecordingPath;
+    } finally {
+      _serverRecorderActive = false;
+    }
+
+    final resolvedPath = path;
+    if (resolvedPath == null || resolvedPath.isEmpty) {
+      return;
+    }
+
+    final file = File(resolvedPath);
+    try {
+      if (!await file.exists()) {
+        return;
+      }
+      final bytes = await file.readAsBytes();
+      if (bytes.isEmpty) {
+        return;
+      }
+
+      final response = await api.transcribeSpeech(
+        audioBytes: bytes,
+        fileName: p.basename(resolvedPath),
+        mimeType: _serverRecordingMimeType,
+        language: _languageForServer(),
+      );
+
+      final transcript = _extractTranscriptionText(response);
+      if (transcript != null && transcript.trim().isNotEmpty) {
+        _currentText = transcript.trim();
+        _textStreamController?.add(_currentText);
+      } else {
+        // Routed through the catch below so listeners see it as an error.
+        throw StateError('Empty transcription result');
+      }
+    } catch (error) {
+      _textStreamController?.addError(error);
+    } finally {
+      // Runs on every exit from the try, including the early returns.
+      unawaited(_cleanupRecordingFile(file));
+    }
+  }
+
+  /// Deletes [file] if it exists, ignoring any file-system error.
+  Future<void> _cleanupRecordingFile(File file) async {
+    try {
+      final present = await file.exists();
+      if (!present) return;
+      await file.delete();
+    } catch (_) {
+      // Best-effort cleanup of a temporary file.
+    }
+  }
+
+  /// Resolves the language code to send with a transcription request.
+  ///
+  /// Uses the lower-cased primary subtag of the selected locale id (the part
+  /// before the first `-` or `_`) when it has at least two characters.
+  /// Otherwise uses the platform locale's language code. Returns null when
+  /// neither yields a value.
+  String? _languageForServer() {
+    final selected = _selectedLocaleId;
+    if (selected != null && selected.isNotEmpty) {
+      final code = selected.split(RegExp('[-_]')).first.toLowerCase();
+      if (code.length >= 2) {
+        return code;
+      }
+    }
+    try {
+      final deviceLocale = WidgetsBinding.instance.platformDispatcher.locale;
+      final deviceCode = deviceLocale.languageCode.toLowerCase();
+      return deviceCode.isEmpty ? null : deviceCode;
+    } catch (_) {
+      return null;
+    }
+  }
+
+  /// Pulls the transcript out of a transcription response.
+  ///
+  /// Several response shapes are tried in order and the first non-blank
+  /// string wins:
+  /// 1. `text`
+  /// 2. `display_text` / `DisplayText`
+  /// 3. `result.text`
+  /// 4. first entry of `combinedRecognizedPhrases`
+  /// 5. `results.channels[0].alternatives[0]` (`transcript` or `text`)
+  /// 6. all `segments` texts, trimmed and joined with single spaces
+  ///
+  /// Values from shapes 1-5 are returned untrimmed. Returns null when nothing
+  /// matches.
+  String? _extractTranscriptionText(Map<String, dynamic> data) {
+    // Returns [value] when it is a string with non-whitespace content.
+    String? nonBlank(Object? value) =>
+        value is String && value.trim().isNotEmpty ? value : null;
+
+    final direct = nonBlank(data['text']);
+    if (direct != null) return direct;
+
+    final display = nonBlank(data['display_text'] ?? data['DisplayText']);
+    if (display != null) return display;
+
+    final result = data['result'];
+    if (result is Map<String, dynamic>) {
+      final nested = nonBlank(result['text']);
+      if (nested != null) return nested;
+    }
+
+    final phrases = data['combinedRecognizedPhrases'];
+    if (phrases is List && phrases.isNotEmpty) {
+      final head = phrases.first;
+      final candidate = head is Map<String, dynamic>
+          ? nonBlank(
+              head['display'] ??
+                  head['Display'] ??
+                  head['transcript'] ??
+                  head['text'],
+            )
+          : nonBlank(head);
+      if (candidate != null) return candidate;
+    }
+
+    final results = data['results'];
+    final channels = results is Map<String, dynamic>
+        ? results['channels']
+        : null;
+    final channel = channels is List && channels.isNotEmpty
+        ? channels.first
+        : null;
+    final alternatives = channel is Map<String, dynamic>
+        ? channel['alternatives']
+        : null;
+    final alternative = alternatives is List && alternatives.isNotEmpty
+        ? alternatives.first
+        : null;
+    if (alternative is Map<String, dynamic>) {
+      final spoken = nonBlank(alternative['transcript'] ?? alternative['text']);
+      if (spoken != null) return spoken;
+    }
+
+    final segments = data['segments'];
+    if (segments is List && segments.isNotEmpty) {
+      final pieces = <String>[];
+      for (final segment in segments) {
+        final piece = nonBlank(
+          segment is Map<String, dynamic> ? segment['text'] : segment,
+        );
+        if (piece != null) {
+          pieces.add(piece.trim());
+        }
+      }
+      if (pieces.isNotEmpty) {
+        return pieces.join(' ');
+      }
+    }
+
+    return null;
+  }
+
+  /// Converts a recorder amplitude reading into a 0..10 intensity level.
+  ///
+  /// [value] is clamped to the range -55.0..0.0 and scaled linearly, so
+  /// -55.0 and below give 0 and 0.0 and above give 10. Null, NaN, and
+  /// infinite readings give 0.
+  int _amplitudeToIntensity(double? value) {
+    if (value == null || !value.isFinite) {
+      return 0;
+    }
+    const floorDb = -55.0;
+    const ceilingDb = 0.0;
+    final double bounded = value < floorDb
+        ? floorDb
+        : (value > ceilingDb ? ceilingDb : value);
+    final double fraction = (bounded - floorDb) / (ceilingDb - floorDb);
+    final int level = (fraction * 10).round();
+    return level < 0 ? 0 : (level > 10 ? 10 : level);
+  }
+
+  /// Closes and clears the transcript and intensity stream controllers.
+  ///
+  /// Errors raised while closing are ignored; each field is set to null once
+  /// its close attempt has finished.
+  Future<void> _closeControllers() async {
+    final textController = _textStreamController;
+    if (textController != null) {
+      try {
+        await textController.close();
+      } catch (_) {}
+      _textStreamController = null;
+    }
+
+    final levelController = _intensityController;
+    if (levelController != null) {
+      try {
+        await levelController.close();
+      } catch (_) {}
+      _intensityController = null;
+    }
+  }
+
+  /// Restarts the periodic timer that lowers the reported intensity.
+  ///
+  /// Every 120 ms while listening, a positive intensity is reduced by one
+  /// and the new value is pushed to the intensity stream.
+  void _startIntensityDecayTimer() {
+    void decay(Timer _) {
+      if (!_isListening || _lastIntensity <= 0) return;
+      _lastIntensity = (_lastIntensity - 1).clamp(0, 10);
+      try {
+        _intensityController?.add(_lastIntensity);
+      } catch (_) {}
+    }
+
+    _intensityDecayTimer?.cancel();
+    _intensityDecayTimer = Timer.periodic(
+      const Duration(milliseconds: 120),
+      decay,
+    );
+  }

  void dispose() {
@@ -315,15 +660,24 @@ class VoiceInputService {
    try {
      _speech.dispose().catchError((_) {});
    } catch (_) {}
+    try {
+      _recorder.dispose().catchError((_) {});
+    } catch (_) {}
  }
-
-  // Recording fallback removed; only on-device STT is supported now
-
-  // Native locales not used in server transcription mode
 }

 final voiceInputServiceProvider = Provider<VoiceInputService>((ref) {
-  return VoiceInputService();
+  // Watching the API provider means a new service is built when it changes.
+  final api = ref.watch(apiServiceProvider);
+  final service = VoiceInputService(api: api);
+  // Seed the preference once with read(), then track later changes through
+  // the listener below rather than rebuilding the service on every change.
+  final currentSettings = ref.read(appSettingsProvider);
+  service.updatePreference(currentSettings.sttPreference);
+  ref.listen<AppSettings>(appSettingsProvider, (previous, next) {
+    if (previous?.sttPreference != next.sttPreference) {
+      service.updatePreference(next.sttPreference);
+    }
+  });
+  // Dispose the service when this provider is disposed.
+  ref.onDispose(service.dispose);
+  return service;
 });

@Riverpod(keepAlive: true)
@@ -332,8 +686,16 @@ Future<bool> voiceInputAvailable(Ref ref) async {
  if (!service.isSupportedPlatform) return false;
  final initialized = await service.initialize();
  if (!initialized) return false;
-  // If local STT exists, we consider it available; otherwise ensure mic permission for fallback
-  if (service.hasLocalStt) return true;
+  switch (service.preference) {
+    case SttPreference.deviceOnly:
+      return service.hasLocalStt;
+    case SttPreference.serverOnly:
+      return service.hasServerStt;
+    case SttPreference.auto:
+      if (service.hasLocalStt) return true;
+      if (!service.hasServerStt) return false;
+      break;
+  }
  final hasPermission = await service.checkPermissions();
  if (!hasPermission) return false;
  return service.isAvailable;
@@ -349,3 +711,18 @@ final voiceIntensityStreamProvider = StreamProvider<int>((ref) {
  final service = ref.watch(voiceInputServiceProvider);
  return service.intensityStream;
 });
+
+/// Whether on-device speech recognition can be used.
+///
+/// Resolves to false when the voice service fails to initialize. Otherwise
+/// it is true if the service already reports local STT, or else whatever
+/// `checkOnDeviceSupport` returns.
+final localVoiceRecognitionAvailableProvider = FutureProvider<bool>((
+  ref,
+) async {
+  final voice = ref.watch(voiceInputServiceProvider);
+  if (!await voice.initialize()) return false;
+  return voice.hasLocalStt || await voice.checkOnDeviceSupport();
+});
+
+/// Whether the voice service has a server transcription backend configured.
+final serverVoiceRecognitionAvailableProvider = Provider<bool>(
+  (ref) => ref.watch(voiceInputServiceProvider).hasServerStt,
+);
@@ -2380,7 +2380,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
    }
  }

-  // Server transcription removed; only on-device STT is supported
+  // When on-device STT is unavailable we fall back to server transcription.

  Future<void> _stopListening() async {
    _intensitySub?.cancel();
@@ -2460,7 +2460,7 @@ class _ModernChatInputState extends ConsumerState<ModernChatInput>
    HapticFeedback.selectionClick();
  }

-  // Server transcription removed; only on-device STT updates the input text
+  // When on-device STT is unavailable we rely on server transcription.

  void _showVoiceUnavailable(String message) {
    if (!mounted) return;
@@ -14,6 +14,7 @@ import '../../../shared/utils/ui_utils.dart';
 import '../../../core/providers/app_providers.dart';
 import '../../../l10n/app_localizations.dart';
 import '../../chat/providers/text_to_speech_provider.dart';
+import '../../chat/services/voice_input_service.dart';

 class AppCustomizationPage extends ConsumerWidget {
  const AppCustomizationPage({super.key});
@@ -70,6 +71,8 @@ class AppCustomizationPage extends ConsumerWidget {
              languageLabel,
            ),
            const SizedBox(height: Spacing.xl),
+            _buildSttSection(context, ref, settings),
+            const SizedBox(height: Spacing.xl),
            _buildTtsDropdownSection(context, ref, settings),
            const SizedBox(height: Spacing.xl),
            _buildChatSection(context, ref, settings),
@@ -468,6 +471,226 @@ class AppCustomizationPage extends ConsumerWidget {
    );
  }

+  /// Builds the speech-to-text settings card.
+  ///
+  /// The card shows one choice chip per [SttPreference], a progress bar
+  /// while the on-device availability check is still running, a description
+  /// of the current preference, and a warning when the selected engine is
+  /// not available.
+  ///
+  /// A chip can be selected only while its engine is usable: the device chip
+  /// needs on-device recognition (or a check that is still pending), the
+  /// server chip needs server transcription, and the automatic chip needs
+  /// either of the two.
+  ///
+  /// [settings] supplies the currently stored preference; a chip tap writes
+  /// the new preference through the `appSettingsProvider` notifier.
+  Widget _buildSttSection(
+    BuildContext context,
+    WidgetRef ref,
+    AppSettings settings,
+  ) {
+    final theme = context.conduitTheme;
+    final l10n = AppLocalizations.of(context)!;
+    final selectedPreference = settings.sttPreference;
+
+    // Availability of the two engines. The on-device check is asynchronous,
+    // so anything other than resolved data counts as "not available".
+    final localSupport = ref.watch(localVoiceRecognitionAvailableProvider);
+    final bool localAvailable = localSupport.maybeWhen(
+      data: (value) => value,
+      orElse: () => false,
+    );
+    final bool localLoading = localSupport.isLoading;
+    final bool serverAvailable = ref.watch(
+      serverVoiceRecognitionAvailableProvider,
+    );
+
+    // `read` is enough here: the notifier is only used inside tap callbacks.
+    final notifier = ref.read(appSettingsProvider.notifier);
+    final description = _sttPreferenceDescription(l10n, selectedPreference);
+
+    // While the on-device check is pending, the device-backed chips stay
+    // selectable so the user is not locked out before the result arrives.
+    final bool deviceSelectable = localAvailable || localLoading;
+    final bool autoSelectable = deviceSelectable || serverAvailable;
+    final bool serverSelectable = serverAvailable;
+
+    // Warn only about the engine the user actually picked; the device
+    // warning is held back until the availability check has finished.
+    final warnings = <String>[
+      if (selectedPreference == SttPreference.deviceOnly && !deviceSelectable)
+        l10n.sttDeviceUnavailableWarning,
+      if (selectedPreference == SttPreference.serverOnly && !serverAvailable)
+        l10n.sttServerUnavailableWarning,
+    ];
+
+    // Builds one engine chip. The three chips share all styling and differ
+    // only in label, target preference, and whether they can be selected.
+    Widget buildEngineChip({
+      required String label,
+      required SttPreference preference,
+      required bool selectable,
+    }) {
+      final isSelected = selectedPreference == preference;
+      return ChoiceChip(
+        label: Text(label),
+        selected: isSelected,
+        showCheckmark: false,
+        selectedColor: theme.buttonPrimary,
+        backgroundColor: theme.cardBackground,
+        side: BorderSide(
+          color: isSelected
+              ? theme.buttonPrimary.withValues(alpha: 0.6)
+              : theme.textPrimary.withValues(alpha: 0.2),
+        ),
+        labelStyle: TextStyle(
+          color: isSelected ? theme.buttonPrimaryText : theme.textPrimary,
+          fontWeight: FontWeight.w600,
+        ),
+        // A chip that is not selectable gets no callback at all.
+        onSelected: selectable
+            ? (value) {
+                // Only react to the chip being turned on.
+                if (value) notifier.setSttPreference(preference);
+              }
+            : null,
+      );
+    }
+
+    // Section heading shown above the card.
+    final heading = Text(
+      l10n.sttSettings,
+      style:
+          theme.headingSmall?.copyWith(color: theme.sidebarForeground) ??
+          TextStyle(color: theme.sidebarForeground, fontSize: 18),
+    );
+
+    // Card header: microphone badge next to the engine label.
+    final engineHeader = Row(
+      children: [
+        _buildIconBadge(
+          context,
+          UiUtils.platformIcon(
+            ios: CupertinoIcons.mic,
+            android: Icons.mic,
+          ),
+          color: theme.buttonPrimary,
+        ),
+        const SizedBox(width: Spacing.md),
+        Expanded(
+          child: Text(
+            l10n.sttEngineLabel,
+            style:
+                theme.bodyMedium?.copyWith(
+                  color: theme.sidebarForeground,
+                  fontWeight: FontWeight.w600,
+                ) ??
+                TextStyle(
+                  color: theme.sidebarForeground,
+                  fontSize: 14,
+                  fontWeight: FontWeight.w600,
+                ),
+          ),
+        ),
+      ],
+    );
+
+    // One chip per preference, wrapping onto extra rows when needed.
+    final engineChips = Wrap(
+      spacing: Spacing.sm,
+      runSpacing: Spacing.sm,
+      children: [
+        buildEngineChip(
+          label: l10n.sttEngineAuto,
+          preference: SttPreference.auto,
+          selectable: autoSelectable,
+        ),
+        buildEngineChip(
+          label: l10n.sttEngineDevice,
+          preference: SttPreference.deviceOnly,
+          selectable: deviceSelectable,
+        ),
+        buildEngineChip(
+          label: l10n.sttEngineServer,
+          preference: SttPreference.serverOnly,
+          selectable: serverSelectable,
+        ),
+      ],
+    );
+
+    // Description of the current preference. The key changes with the
+    // preference so the AnimatedSwitcher treats each text as a new child.
+    final preferenceDescription = AnimatedSwitcher(
+      duration: const Duration(milliseconds: 200),
+      child: Text(
+        description,
+        key: ValueKey<String>('stt-desc-${selectedPreference.name}'),
+        style:
+            theme.bodyMedium?.copyWith(
+              color: theme.sidebarForeground.withValues(alpha: 0.9),
+            ) ??
+            TextStyle(
+              color: theme.sidebarForeground.withValues(alpha: 0.9),
+              fontSize: 14,
+            ),
+      ),
+    );
+
+    // Style shared by every warning line.
+    final warningStyle =
+        theme.bodySmall?.copyWith(
+          color: theme.error,
+          fontWeight: FontWeight.w600,
+        ) ??
+        TextStyle(
+          color: theme.error,
+          fontSize: 12,
+          fontWeight: FontWeight.w600,
+        );
+
+    // Assemble the section: the heading sits above a card that stacks the
+    // header, chips, optional progress bar, description, and warnings.
+    return Column(
+      crossAxisAlignment: CrossAxisAlignment.start,
+      children: [
+        heading,
+        const SizedBox(height: Spacing.sm),
+        ConduitCard(
+          padding: const EdgeInsets.all(Spacing.md),
+          child: Column(
+            crossAxisAlignment: CrossAxisAlignment.start,
+            children: [
+              engineHeader,
+              const SizedBox(height: Spacing.sm),
+              engineChips,
+              // Thin progress bar while the on-device check is running.
+              if (localLoading) ...[
+                const SizedBox(height: Spacing.sm),
+                LinearProgressIndicator(
+                  minHeight: 3,
+                  color: theme.buttonPrimary,
+                  backgroundColor: theme.cardBorder.withValues(alpha: 0.4),
+                ),
+              ],
+              const SizedBox(height: Spacing.sm),
+              preferenceDescription,
+              // Warnings, when present, are listed below the description.
+              if (warnings.isNotEmpty) ...[
+                const SizedBox(height: Spacing.sm),
+                for (final warning in warnings)
+                  Padding(
+                    padding: const EdgeInsets.only(top: Spacing.xs),
+                    child: Text(warning, style: warningStyle),
+                  ),
+              ],
+            ],
+          ),
+        ),
+      ],
+    );
+  }
+
  Widget _buildTtsDropdownSection(
    BuildContext context,
    WidgetRef ref,
@@ -691,6 +914,20 @@ class AppCustomizationPage extends ConsumerWidget {
    );
  }

+  /// Returns the localized description for [preference].
+  ///
+  /// The switch is exhaustive over [SttPreference], so no default is needed.
+  String _sttPreferenceDescription(
+    AppLocalizations l10n,
+    SttPreference preference,
+  ) {
+    return switch (preference) {
+      SttPreference.auto => l10n.sttEngineAutoDescription,
+      SttPreference.deviceOnly => l10n.sttEngineDeviceDescription,
+      SttPreference.serverOnly => l10n.sttEngineServerDescription,
+    };
+  }
+
  Widget _buildSliderTile(
    BuildContext context,
    WidgetRef ref, {
@@ -307,6 +307,16 @@
  "chatSettings": "Chat",
  "sendOnEnter": "Mit Enter senden",
  "sendOnEnterDescription": "Enter sendet (Soft-Tastatur). Cmd/Ctrl+Enter ebenfalls verfügbar",
+  "sttSettings": "Sprache zu Text",
+  "sttEngineLabel": "Erkennungs-Engine",
+  "sttEngineAuto": "Automatisch",
+  "sttEngineDevice": "Auf dem Gerät",
+  "sttEngineServer": "Server",
+  "sttEngineAutoDescription": "Verwendet die Erkennung auf dem Gerät, wenn verfügbar, und greift sonst auf deinen Server zurück.",
+  "sttEngineDeviceDescription": "Behält Audio auf diesem Gerät. Spracheingabe funktioniert nicht, wenn das Gerät keine Spracherkennung unterstützt.",
+  "sttEngineServerDescription": "Sendet Aufnahmen immer an deinen Conduit-Server zur Transkription.",
+  "sttDeviceUnavailableWarning": "Auf diesem Gerät steht keine Spracherkennung zur Verfügung.",
+  "sttServerUnavailableWarning": "Verbinde dich mit einem Server mit aktivierter Transkription, um diese Option zu nutzen.",
  "ttsSettings": "Text zu Sprache",
  "ttsVoice": "Stimme",
  "ttsSpeechRate": "Sprechgeschwindigkeit",
@@ -307,6 +307,16 @@
  "chatSettings": "Conversación",
  "sendOnEnter": "Enviar con Enter",
  "sendOnEnterDescription": "Enter envía (teclado virtual). Cmd/Ctrl+Enter también disponible",
+  "sttSettings": "Voz a texto",
+  "sttEngineLabel": "Motor de reconocimiento",
+  "sttEngineAuto": "Automático",
+  "sttEngineDevice": "En el dispositivo",
+  "sttEngineServer": "Servidor",
+  "sttEngineAutoDescription": "Usa el reconocimiento en el dispositivo cuando esté disponible y, si no, recurre a tu servidor.",
+  "sttEngineDeviceDescription": "Mantiene el audio en este dispositivo. La entrada de voz no funciona si el dispositivo no admite reconocimiento de voz.",
+  "sttEngineServerDescription": "Envía siempre las grabaciones a tu servidor Conduit para la transcripción.",
+  "sttDeviceUnavailableWarning": "El reconocimiento de voz en el dispositivo no está disponible en este dispositivo.",
+  "sttServerUnavailableWarning": "Conéctate a un servidor con transcripción habilitada para usar esta opción.",
  "ttsSettings": "Texto a voz",
  "ttsVoice": "Voz",
  "ttsSpeechRate": "Velocidad de voz",
@@ -307,6 +307,16 @@
  "chatSettings": "Discussion",
  "sendOnEnter": "Envoyer avec Entrée",
  "sendOnEnterDescription": "Entrée envoie (clavier logiciel). Cmd/Ctrl+Entrée aussi disponible",
+  "sttSettings": "Voix vers texte",
+  "sttEngineLabel": "Moteur de reconnaissance",
+  "sttEngineAuto": "Auto",
+  "sttEngineDevice": "Sur l’appareil",
+  "sttEngineServer": "Serveur",
+  "sttEngineAutoDescription": "Utilise la reconnaissance sur l’appareil quand c’est possible, sinon bascule vers votre serveur.",
+  "sttEngineDeviceDescription": "Conserve l’audio sur cet appareil. L’entrée vocale cesse de fonctionner si la reconnaissance vocale n’est pas prise en charge.",
+  "sttEngineServerDescription": "Envoie toujours les enregistrements à votre serveur Conduit pour transcription.",
+  "sttDeviceUnavailableWarning": "La reconnaissance vocale sur l’appareil n’est pas disponible sur cet appareil.",
+  "sttServerUnavailableWarning": "Connectez-vous à un serveur avec la transcription activée pour utiliser cette option.",
  "ttsSettings": "Synthèse vocale",
  "ttsVoice": "Voix",
  "ttsSpeechRate": "Vitesse de parole",
@@ -307,6 +307,16 @@
  "chatSettings": "Chat",
  "sendOnEnter": "Invia con Invio",
  "sendOnEnterDescription": "Invio invia (tastiera software). Cmd/Ctrl+Invio disponibile",
+  "sttSettings": "Voce in testo",
+  "sttEngineLabel": "Motore di riconoscimento",
+  "sttEngineAuto": "Automatico",
+  "sttEngineDevice": "Sul dispositivo",
+  "sttEngineServer": "Server",
+  "sttEngineAutoDescription": "Usa il riconoscimento sul dispositivo quando disponibile e altrimenti passa al tuo server.",
+  "sttEngineDeviceDescription": "Mantiene l’audio su questo dispositivo. L’input vocale non funziona se il dispositivo non supporta il riconoscimento vocale.",
+  "sttEngineServerDescription": "Invia sempre le registrazioni al tuo server Conduit per la trascrizione.",
+  "sttDeviceUnavailableWarning": "Il riconoscimento vocale sul dispositivo non è disponibile su questo dispositivo.",
+  "sttServerUnavailableWarning": "Collegati a un server con la trascrizione abilitata per usare questa opzione.",
  "ttsSettings": "Sintesi vocale",
  "ttsVoice": "Voce",
  "ttsSpeechRate": "Velocità di sintesi vocale",
@@ -307,6 +307,16 @@
  "chatSettings": "Chat",
  "sendOnEnter": "Verzenden met Enter",
  "sendOnEnterDescription": "Enter verzendt (softtoetsenbord). Cmd/Ctrl+Enter ook beschikbaar",
+  "sttSettings": "Spraak naar tekst",
+  "sttEngineLabel": "Herkenningsengine",
+  "sttEngineAuto": "Automatisch",
+  "sttEngineDevice": "Op het apparaat",
+  "sttEngineServer": "Server",
+  "sttEngineAutoDescription": "Gebruikt spraakherkenning op het apparaat wanneer beschikbaar en valt anders terug op je server.",
+  "sttEngineDeviceDescription": "Houdt audio op dit apparaat. Spraakinput werkt niet als het apparaat geen spraakherkenning ondersteunt.",
+  "sttEngineServerDescription": "Stuurt opnames altijd naar je Conduit-server voor transcriptie.",
+  "sttDeviceUnavailableWarning": "Spraakherkenning op het apparaat is niet beschikbaar op dit apparaat.",
+  "sttServerUnavailableWarning": "Verbind met een server met transcriptie ingeschakeld om deze optie te gebruiken.",
  "ttsSettings": "Tekst naar spraak",
  "ttsVoice": "Stem",
  "ttsSpeechRate": "Spraaksnelheid",
@@ -307,6 +307,16 @@
  "chatSettings": "Чат",
  "sendOnEnter": "Отправка по Enter",
  "sendOnEnterDescription": "Enter отправляет (программная клавиатура). Также доступно Cmd/Ctrl+Enter",
+  "sttSettings": "Речь в текст",
+  "sttEngineLabel": "Движок распознавания",
+  "sttEngineAuto": "Авто",
+  "sttEngineDevice": "На устройстве",
+  "sttEngineServer": "Сервер",
+  "sttEngineAutoDescription": "Использует распознавание на устройстве, когда это возможно, иначе переключается на ваш сервер.",
+  "sttEngineDeviceDescription": "Оставляет звук на этом устройстве. Голосовой ввод не работает, если устройство не поддерживает распознавание речи.",
+  "sttEngineServerDescription": "Всегда отправляет записи на сервер Conduit для транскрибации.",
+  "sttDeviceUnavailableWarning": "Распознавание речи на устройстве недоступно на этом устройстве.",
+  "sttServerUnavailableWarning": "Подключитесь к серверу с включённой транскрибацией, чтобы использовать эту опцию.",
  "ttsSettings": "Преобразование текста в речь",
  "ttsVoice": "Голос",
  "ttsSpeechRate": "Скорость речи",
@@ -307,6 +307,16 @@
  "chatSettings": "对话",
  "sendOnEnter": "回车发送",
  "sendOnEnterDescription": "回车发送（软键盘）。Cmd/Ctrl+Enter 也可用",
+  "sttSettings": "语音转文字",
+  "sttEngineLabel": "识别引擎",
+  "sttEngineAuto": "自动",
+  "sttEngineDevice": "本机",
+  "sttEngineServer": "服务器",
+  "sttEngineAutoDescription": "在可用时使用本机识别，否则切换到你的服务器。",
+  "sttEngineDeviceDescription": "音频会保留在此设备上。如果设备不支持语音识别，语音输入将不可用。",
+  "sttEngineServerDescription": "始终将录音发送到你的 Conduit 服务器进行转写。",
+  "sttDeviceUnavailableWarning": "此设备不支持本机语音识别。",
+  "sttServerUnavailableWarning": "连接到启用转写功能的服务器后才能使用此选项。",
  "ttsSettings": "文本转语音",
  "ttsVoice": "语音",
  "ttsSpeechRate": "语速",