feat: on-device speech-to-text

2025-08-22 13:54:58 +05:30
parent 2cdbbbc1d3
commit 7f30b728ab
7 changed files with 697 additions and 236 deletions
@@ -1,14 +1,22 @@
 import 'package:flutter_riverpod/flutter_riverpod.dart';
 import 'package:record/record.dart';
+import 'package:flutter/widgets.dart';
 import 'dart:async';
 import 'dart:io' show Platform;
 import 'package:path_provider/path_provider.dart';
 import 'package:path/path.dart' as p;
+import 'package:speech_to_text/speech_recognition_error.dart';
+import 'package:speech_to_text/speech_recognition_result.dart';
+import 'package:speech_to_text/speech_to_text.dart' as stt;

 class VoiceInputService {
  final AudioRecorder _recorder = AudioRecorder();
+  stt.SpeechToText? _speech;
  bool _isInitialized = false;
  bool _isListening = false;
+  bool _localSttAvailable = false;
+  String? _selectedLocaleId;
+  List<stt.LocaleName> _locales = const [];
  StreamController<String>? _textStreamController;
  String _currentText = '';
  // Public stream for UI waveform visualization (emits partial text length as proxy)
@@ -23,16 +31,46 @@ class VoiceInputService {
  Future<bool> initialize() async {
    if (_isInitialized) return true;
    if (!isSupportedPlatform) return false;
-    // Log platform for diagnostics
-    // ignore: avoid_print
-    print(
-      'DEBUG: VoiceInputService initialize on platform: '
-      '${Platform.isAndroid
-          ? 'Android'
-          : Platform.isIOS
-          ? 'iOS'
-          : 'Other'}',
-    );
+    // Prepare the on-device recognizer; on failure the fallback path is used.
+    try {
+      final speech = stt.SpeechToText();
+      _speech = speech;
+      _localSttAvailable = await speech.initialize(
+        // Status changes need no handling here: the UI drives stopping and
+        // the final transcript arrives through the onResult callback.
+        onStatus: (status) {},
+        // Errors are deliberately not rethrown so the fallback stays usable.
+        onError: (SpeechRecognitionError error) {},
+      );
+      if (_localSttAvailable) {
+        try {
+          // Locale ids may use '_' ('en_US', as in the default below) while
+          // the device language tag uses '-'; normalise both before comparing.
+          String norm(String id) => id.replaceAll('_', '-').toLowerCase();
+          _locales = await speech.locales();
+          final deviceTag = norm(
+            WidgetsBinding.instance.platformDispatcher.locale.toLanguageTag(),
+          );
+          final primary = deviceTag.split('-').first;
+          final match = _locales.firstWhere(
+            (l) => norm(l.localeId) == deviceTag,
+            orElse: () => _locales.firstWhere(
+              // Same language, different region (or a bare language id).
+              (l) => norm(l.localeId).split('-').first == primary,
+              orElse: () => _locales.isNotEmpty
+                  ? _locales.first
+                  : stt.LocaleName('en_US', 'English (US)'),
+            ),
+          );
+          _selectedLocaleId = match.localeId;
+        } catch (_) {
+          // Locale discovery is best-effort; leave the locale unset on failure.
+          _selectedLocaleId = null;
+        }
+      }
+    } catch (_) {
+      _localSttAvailable = false;
+    }
    _isInitialized = true;
    return true;
  }
@@ -46,10 +84,16 @@ class VoiceInputService {
  }

  bool get isListening => _isListening;
-  bool get isAvailable => _isInitialized;
+  bool get isAvailable => _isInitialized; // service usable (local or fallback)
+  bool get hasLocalStt => _localSttAvailable; // on-device recognizer initialised
+  String? get selectedLocaleId => _selectedLocaleId; // locale id given to listen()
+  List<stt.LocaleName> get locales => _locales; // filled from the recognizer
+
+  void setLocale(String? localeId) {
+    _selectedLocaleId = localeId; // used by the next listen(); null is allowed
+  }

  Stream<String> startListening() {
-    // Ensure initialized; we allow initialize to pass even if native STT unavailable
    if (!_isInitialized) {
      throw Exception('Voice input not initialized');
    }
@@ -61,21 +105,52 @@ class VoiceInputService {
    _textStreamController = StreamController<String>.broadcast();
    _currentText = '';
    _isListening = true;
-
    _intensityController = StreamController<int>.broadcast();

-    // Start recording raw audio; UI or auto-timer will stop and trigger transcription via API
-    // ignore: avoid_print
-    print('DEBUG: VoiceInputService startListening');
-    _startRecordingProxyIntensity();
+    if (_localSttAvailable && _speech != null) {
+      // On-device path; the timer is a hard 60s cap on top of end-of-speech.
+      _autoStopTimer?.cancel();
+      _autoStopTimer = Timer(const Duration(seconds: 60), () {
+        if (_isListening) _stopListening();
+      });

-    // Auto-stop after 30 seconds similar to native STT behavior
-    _autoStopTimer?.cancel();
-    _autoStopTimer = Timer(const Duration(seconds: 30), () {
-      if (_isListening) {
-        _stopListening();
-      }
-    });
+      _speech!
+          .listen(
+            localeId: _selectedLocaleId,
+            listenFor: const Duration(seconds: 60),
+            pauseFor: const Duration(seconds: 5),
+            onResult: (SpeechRecognitionResult result) {
+              if (!_isListening) return; // drop results after a stop
+              _currentText = result.recognizedWords;
+              _textStreamController?.add(_currentText);
+              // Close proactively instead of waiting for the status change.
+              if (result.finalResult) _stopListening();
+            },
+            onSoundLevelChange: (level) {
+              // Scale by 10 and clamp to 0..10 for the intensity stream.
+              final scaled = (level * 10).clamp(0, 10).round();
+              _intensityController?.add(scaled);
+            },
+            listenOptions: stt.SpeechListenOptions(
+              partialResults: true,
+              cancelOnError: true,
+              listenMode: stt.ListenMode.confirmation,
+            ),
+          )
+          .catchError((Object e) {
+            // listen() is async: without this a start failure is unhandled
+            // and the session would stay "listening" until the 60s cap.
+            _textStreamController?.addError('Speech recognition failed: $e');
+            _stopListening();
+          });
+    } else {
+      // Fallback: record audio and signal file path for server transcription
+      _startRecordingProxyIntensity();
+      _autoStopTimer?.cancel();
+      _autoStopTimer = Timer(const Duration(seconds: 30), () {
+        if (_isListening) _stopListening();
+      });
+    }

    return _textStreamController!.stream;
  }
@@ -88,10 +163,14 @@ class VoiceInputService {
    if (!_isListening) return;

    _isListening = false;
-    // Also stop recorder if active
-    await _stopRecording();
-    // ignore: avoid_print
-    print('DEBUG: VoiceInputService stopped listening');
+    if (_localSttAvailable && _speech != null) {
+      try {
+        await _speech!.stop();
+      } catch (_) {} // best-effort: a failed stop must not block the cleanup below
+    } else {
+      // Fallback path: stop the audio recorder if it is active
+      await _stopRecording();
+    }

    _autoStopTimer?.cancel();
    _autoStopTimer = null;
@@ -111,6 +190,9 @@ class VoiceInputService {
  void dispose() {
    stopListening();
    _stopRecording(force: true);
+    try {
+      _speech?.cancel().catchError((Object _) {}); // async errors too
+    } catch (_) {}
  }

  // --- Recording and intensity proxy for server transcription path ---
@@ -138,8 +220,7 @@ class VoiceInputService {
        ),
        path: filePath,
      );
-      // ignore: avoid_print
-      print('DEBUG: VoiceInputService recording started at: $filePath');
+      // recording started at filePath

      // Drive intensity from amplitude stream and detect silence
      // Consider amplitude less than threshold as silence; stop after ~3s of continuous silence
@@ -167,8 +248,6 @@ class VoiceInputService {
            }
          });
    } catch (e) {
-      // ignore: avoid_print
-      print('DEBUG: VoiceInputService recording failed: $e');
      _textStreamController?.addError('Audio recording failed: $e');
      _stopListening();
    }
@@ -182,8 +261,6 @@ class VoiceInputService {
        _textStreamController?.addError('Recording failed: no file path');
        return;
      }
-      // ignore: avoid_print
-      print('DEBUG: VoiceInputService recording saved: $path');
      // Hand off recorded file path to listeners as a special token; UI layer will upload for transcription
      _textStreamController?.add('[[AUDIO_FILE_PATH]]:$path');
    } catch (e) {
@@ -203,6 +280,8 @@ final voiceInputAvailableProvider = FutureProvider<bool>((ref) async {
  if (!service.isSupportedPlatform) return false;
  final initialized = await service.initialize();
  if (!initialized) return false;
+  // If local STT exists, we consider it available; otherwise ensure mic permission for fallback
+  if (service.hasLocalStt) return true;
  final hasPermission = await service.checkPermissions();
  if (!hasPermission) return false;
  return service.isAvailable;