// File: iiEsaywebUIapp/lib/features/chat/services/voice_input_service.dart
// (Repository-viewer residue such as size and blame metadata is not part of the Dart source.)
import 'dart:async';
import 'dart:io' show File, Platform;
import 'package:flutter/widgets.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'package:riverpod_annotation/riverpod_annotation.dart';
import 'package:record/record.dart';
import 'package:stts/stts.dart';
import 'package:path/path.dart' as p;
import 'package:path_provider/path_provider.dart';
import '../../../core/providers/app_providers.dart';
import '../../../core/services/api_service.dart';
import '../../../core/services/settings_service.dart';
part 'voice_input_service.g.dart';
/// Minimal locale descriptor used across the UI.
///
/// Stands in for the `stt.LocaleName` type that the UI relied on previously.
class LocaleName {
  /// Creates a descriptor from a locale identifier and its label.
  const LocaleName(this.localeId, this.name);

  /// Identifier of the locale, exactly as passed to the constructor.
  final String localeId;

  /// Label of the locale, exactly as passed to the constructor.
  final String name;
}
class VoiceInputService {
// ---------------------------------------------------------------------------
// Platform handles
// ---------------------------------------------------------------------------

/// Recorder that captures audio for the server transcription path.
final AudioRecorder _recorder = AudioRecorder();

/// On-device speech recognizer.
final Stt _speech = Stt();

/// Backend used for server transcription; `null` means no server STT.
final ApiService? _api;

// ---------------------------------------------------------------------------
// Lifecycle and mode flags
// ---------------------------------------------------------------------------

/// Set once [initialize] has completed on a supported platform.
bool _isInitialized = false;

/// True from [startListening] until the session begins to stop.
bool _isListening = false;

/// Last known availability of the on-device recognizer; cleared when the
/// recognizer reports itself unsupported or fails to start.
bool _localSttAvailable = false;

/// User preference deciding between device and server recognition.
SttPreference _preference = SttPreference.auto;

/// Whether the current session records audio for server transcription.
bool _usingServerStt = false;

/// Set after the recorder has been started for the current session.
bool _serverRecorderActive = false;

/// Path of the file the recorder writes to during a server session.
String? _serverRecordingPath;

/// MIME type sent along with the recording when it is transcribed.
String? _serverRecordingMimeType;

// ---------------------------------------------------------------------------
// Locale selection
// ---------------------------------------------------------------------------

/// Locale handed to the recognizer before it starts; `null` leaves the
/// recognizer's language untouched.
String? _selectedLocaleId;

/// Languages reported by the on-device recognizer during [initialize].
List<LocaleName> _locales = const [];

// ---------------------------------------------------------------------------
// Session output
// ---------------------------------------------------------------------------

/// Broadcast controller carrying transcript text for the current session.
StreamController<String>? _textStreamController;

/// Most recent transcript text of the current session.
String _currentText = '';

/// Broadcast controller carrying intensity levels for waveform visuals.
StreamController<int>? _intensityController;

/// Intensity levels (0-10) for UI waveform visualization.
///
/// On-device sessions derive the level from how much the transcript grew;
/// server sessions derive it from the recorder amplitude. Empty when no
/// session controller exists at the time the getter is read.
Stream<int> get intensityStream =>
    _intensityController?.stream ?? const Stream<int>.empty();

/// Last intensity value emitted, decayed over time by a periodic timer.
int _lastIntensity = 0;

/// Periodic timer that lowers [_lastIntensity] towards zero.
Timer? _intensityDecayTimer;

/// Pending "silence after speech" timer for server sessions.
Timer? _silenceTimer;

/// Whether the current server session has seen amplitude above the speech
/// threshold at least once.
bool _hasDetectedSpeech = false;

/// Transcript text of the session whose controller exists when the getter is
/// read; an empty stream otherwise.
Stream<String> get textStream =>
    _textStreamController?.stream ?? const Stream<String>.empty();

// ---------------------------------------------------------------------------
// Timers and subscriptions owned by the current session
// ---------------------------------------------------------------------------

/// Hard limit on session length.
Timer? _autoStopTimer;

/// Recorder amplitude subscription (server sessions).
StreamSubscription<Amplitude>? _ampSub;

/// Recognizer result subscription (on-device sessions).
StreamSubscription<SttRecognition>? _sttResultSub;

/// Recognizer state subscription (on-device sessions).
StreamSubscription<SttState>? _sttStateSub;
/// Whether the host platform is Android or iOS.
bool get isSupportedPlatform => Platform.isIOS || Platform.isAndroid;

/// Whether an [ApiService] was supplied, enabling server transcription.
bool get hasServerStt => _api != null;

/// The recognition preference currently in effect.
SttPreference get preference => _preference;

/// Whether the preference demands server recognition exclusively.
bool get prefersServerOnly => _preference == SttPreference.serverOnly;

/// Whether the preference demands on-device recognition exclusively.
bool get prefersDeviceOnly => _preference == SttPreference.deviceOnly;

/// Whether falling back to the server is permitted by the preference.
bool get allowsServerFallback => !prefersDeviceOnly;

/// Creates the service; pass [api] to make server transcription available.
VoiceInputService({ApiService? api}) : _api = api;

/// Replaces the recognition preference used by subsequent decisions.
void updatePreference(SttPreference preference) {
  _preference = preference;
}
/// Prepares the service and probes the on-device recognizer.
///
/// Returns `false` only on unsupported platforms. On Android/iOS it returns
/// `true` even when the on-device recognizer is unavailable; check
/// [hasLocalStt] / [hasServerStt] for actual capabilities. Repeated calls
/// after a successful run return `true` immediately.
Future<bool> initialize() async {
  if (_isInitialized) return true;
  if (!isSupportedPlatform) return false;

  try {
    _localSttAvailable = await _speech.isSupported();
    if (_localSttAvailable) {
      try {
        final languages = await _speech.getLanguages();
        _locales = languages.map((id) => LocaleName(id, id)).toList();
        final deviceTag =
            WidgetsBinding.instance.platformDispatcher.locale.toLanguageTag();
        _selectedLocaleId = _pickLocaleFor(deviceTag).localeId;
      } catch (_) {
        // Locale discovery is best effort; leave the language unset.
        _selectedLocaleId = null;
      }
    }
  } catch (_) {
    _localSttAvailable = false;
  }

  _isInitialized = true;
  return true;
}

/// Chooses the entry of [_locales] that best matches [deviceTag].
///
/// Matching ignores case and treats `_` and `-` as the same separator, so a
/// recognizer reporting `en_US` still matches a device tag of `en-US`.
/// Preference order: exact tag, then same primary language (either the bare
/// language code or a regional variant of it), then the first reported
/// locale, and finally a synthetic `en_US` entry when nothing was reported.
LocaleName _pickLocaleFor(String deviceTag) {
  String normalize(String tag) => tag.replaceAll('_', '-').toLowerCase();

  final wanted = normalize(deviceTag);
  for (final locale in _locales) {
    if (normalize(locale.localeId) == wanted) return locale;
  }

  final primary = wanted.split('-').first;
  for (final locale in _locales) {
    final id = normalize(locale.localeId);
    if (id == primary || id.startsWith('$primary-')) return locale;
  }

  return _locales.isNotEmpty
      ? _locales.first
      : const LocaleName('en_US', 'en_US');
}
/// Reports whether microphone access is available.
///
/// Asks the speech recognizer first (per the original author's note that
/// check also requests the permission) and only consults the recorder when
/// the recognizer says no. Any error is reported as `false`.
Future<bool> checkPermissions() async {
  try {
    if (await _speech.hasPermission()) return true;
    return await _recorder.hasPermission();
  } catch (_) {
    return false;
  }
}
/// Whether a listening session is currently active.
bool get isListening => _isListening;

/// Whether the service is initialized and has at least one recognition
/// backend (on-device or server).
bool get isAvailable =>
    _isInitialized && (_localSttAvailable || hasServerStt);

/// Whether the on-device recognizer is currently considered usable.
bool get hasLocalStt => _localSttAvailable;
/// Asks the recognizer directly whether on-device STT is supported.
///
/// Returns `false` without querying when the platform is unsupported or the
/// service has not been initialized, and also when the query throws. The
/// cached [hasLocalStt] flag is not updated by this call.
Future<bool> checkOnDeviceSupport() async {
  if (!isSupportedPlatform || !_isInitialized) return false;
  try {
    return await _speech.isSupported();
  } catch (_) {
    // Treat a failing probe as "not supported".
    return false;
  }
}
/// Smoke-tests the on-device recognizer and returns a human-readable verdict.
///
/// The probe initializes the service, checks permission and support, applies
/// the selected locale (best effort), then starts the recognizer and stops it
/// again after 100 ms. Failures are reported in the returned string rather
/// than thrown.
///
/// The probe drives the same recognizer instance as live sessions, so it is
/// skipped while a session is listening; otherwise its `stop()` would end the
/// user's recognition.
Future<String> testOnDeviceStt() async {
  if (_isListening) {
    return 'On-device STT test skipped: a listening session is active';
  }
  try {
    await initialize();
    if (!_localSttAvailable) {
      return 'Local STT not available. Available: $_localSttAvailable';
    }

    final hasMic = await checkPermissions();
    if (!hasMic) {
      return 'Microphone permission not granted';
    }

    final supported = await _speech.isSupported();
    if (!supported) {
      return 'Speech recognition service is not available on this device';
    }

    final localeId = _selectedLocaleId;
    if (localeId != null) {
      try {
        await _speech.setLanguage(localeId);
      } catch (_) {
        // A rejected locale should not fail the probe.
      }
    }

    await _speech.start(SttRecognitionOptions(punctuation: true));
    await Future.delayed(const Duration(milliseconds: 100));
    await _speech.stop();

    return 'On-device STT test completed successfully. Local STT available: $_localSttAvailable, Selected locale: $_selectedLocaleId';
  } catch (e) {
    return 'On-device STT test failed: $e';
  }
}
/// Locale that will be applied to the recognizer, or `null` when unset.
String? get selectedLocaleId => _selectedLocaleId;

/// Locales reported by the on-device recognizer during initialization.
List<LocaleName> get locales => _locales;

/// Selects the locale used by later sessions; `null` clears the selection.
void setLocale(String? localeId) {
  _selectedLocaleId = localeId;
}
/// Starts a listening session and returns its transcript stream.
///
/// Backend choice: on-device when it is available and the preference is not
/// server-only; otherwise the server when an API is configured; otherwise an
/// error is delivered on the returned stream and the session is stopped.
/// Backend start-up happens asynchronously after this method returns, so
/// start failures arrive as stream errors.
///
/// Throws an [Exception] when called before [initialize] has succeeded.
Stream<String> startListening() {
  if (!_isInitialized) {
    throw Exception('Voice input not initialized');
  }
  if (_isListening) {
    // NOTE(review): this stop is not awaited, so its asynchronous remainder
    // runs after the new session state below has been installed. Confirm
    // restarting while listening behaves as intended.
    unawaited(stopListening());
  }

  final controller = StreamController<String>.broadcast();
  _textStreamController = controller;
  _currentText = '';
  _isListening = true;
  _intensityController = StreamController<int>.broadcast();
  _lastIntensity = 0;
  _usingServerStt = false;
  _serverRecorderActive = false;
  _serverRecordingPath = null;
  _serverRecordingMimeType = null;
  _startIntensityDecayTimer();

  final useLocal =
      _localSttAvailable && _preference != SttPreference.serverOnly;
  final useServer =
      hasServerStt && (_preference == SttPreference.serverOnly || !useLocal);

  if (useLocal) {
    _startLocalSession();
  } else if (useServer) {
    _startServerSession();
  } else {
    _reportNoBackend();
  }

  return controller.stream;
}

/// (Re)arms the timer that force-stops a session after [limit].
void _armAutoStop(Duration limit) {
  _autoStopTimer?.cancel();
  _autoStopTimer = Timer(limit, () {
    if (_isListening) {
      unawaited(_stopListening());
    }
  });
}

/// Wires up and starts an on-device recognition session (60 s limit).
void _startLocalSession() {
  _armAutoStop(const Duration(seconds: 60));

  // Re-check availability in the background; if the recognizer vanished,
  // switch to the server when allowed, otherwise end the session.
  Future.microtask(() async {
    try {
      final stillSupported = await _speech.isSupported();
      if (stillSupported || !_isListening) return;
      _localSttAvailable = false;
      if (hasServerStt && allowsServerFallback) {
        unawaited(_beginServerFallback());
      } else {
        unawaited(_stopListening());
      }
    } catch (_) {
      // A failing probe is ignored; the start attempt below decides.
    }
  });

  _sttResultSub = _speech.onResultChanged.listen(
    _onLocalResult,
    onError: (_) {},
  );
  _sttStateSub = _speech.onStateChanged.listen((_) {}, onError: (_) {});

  Future(() async {
    try {
      final localeId = _selectedLocaleId;
      if (localeId != null) {
        await _speech.setLanguage(localeId);
      }
      await _speech.start(SttRecognitionOptions(punctuation: true));
    } catch (error) {
      // Any failure here marks the recognizer unusable for later sessions.
      _localSttAvailable = false;
      if (!_isListening) return;
      if (hasServerStt && allowsServerFallback) {
        await _beginServerFallback();
      } else {
        _textStreamController?.addError(error);
        await _stopListening();
      }
    }
  });
}

/// Publishes a recognizer result and derives an intensity level from how
/// many characters the transcript grew by (5 characters per level, max 10).
void _onLocalResult(SttRecognition result) {
  if (!_isListening) return;
  final previousLength = _currentText.length;
  _currentText = result.text;
  _textStreamController?.add(_currentText);

  final growth = (_currentText.length - previousLength).clamp(0, 50);
  _lastIntensity = (growth / 5.0).ceil().clamp(0, 10);
  try {
    _intensityController?.add(_lastIntensity);
  } catch (_) {}

  if (result.isFinal) {
    unawaited(_stopListening());
  }
}

/// Starts a server transcription session (90 s limit).
void _startServerSession() {
  _usingServerStt = true;
  _armAutoStop(const Duration(seconds: 90));
  Future(() async {
    try {
      await _startServerRecording();
    } catch (error) {
      if (!_isListening) return;
      _textStreamController?.addError(error);
      await _stopListening();
    }
  });
}

/// Delivers a "no backend" error matching the preference, then stops.
void _reportNoBackend() {
  final Exception error;
  if (prefersDeviceOnly) {
    error = Exception(
      'On-device speech recognition required but unavailable',
    );
  } else if (prefersServerOnly) {
    error = Exception('Server speech-to-text is not configured');
  } else {
    error = Exception('Speech recognition not available on this device');
  }
  // Deferred so the caller can subscribe before the error is emitted.
  Future.microtask(() {
    _textStreamController?.addError(error);
    unawaited(_stopListening());
  });
}
/// One-stop entry point for starting voice recognition.
///
/// Runs [initialize], verifies microphone permission, and then delegates to
/// [startListening]. Throws an [Exception] when permission is missing; any
/// exception from [startListening] propagates as well.
Future<Stream<String>> beginListening() async {
  await initialize();
  final micGranted = await checkPermissions();
  if (micGranted) {
    return startListening();
  }
  throw Exception('Microphone permission not granted');
}
/// Ends the active session, if any, and completes once teardown is done.
Future<void> stopListening() async {
  final teardown = _stopListening();
  await teardown;
}
/// Tears down the active session; does nothing when none is active.
///
/// Server sessions finalize (and transcribe) the recording; on-device
/// sessions stop the recognizer and re-emit the last transcript. Both stream
/// controllers of the session are closed at the end.
///
/// Everything that identifies the session is captured before the first
/// `await`. [startListening] can install a new session while this method is
/// suspended; in that case only the captured controllers are closed and the
/// shared fields, which now belong to the new session, are left untouched.
Future<void> _stopListening() async {
  if (!_isListening) return;
  _isListening = false;

  // Snapshot and detach per-session resources synchronously.
  final sessionText = _textStreamController;
  final sessionIntensity = _intensityController;
  final amplitudeSub = _ampSub;
  _ampSub = null;

  _autoStopTimer?.cancel();
  _autoStopTimer = null;
  _silenceTimer?.cancel();
  _silenceTimer = null;
  _intensityDecayTimer?.cancel();
  _intensityDecayTimer = null;

  if (_usingServerStt) {
    await _finalizeServerRecording();
  } else {
    await _stopLocalStt();
  }
  await amplitudeSub?.cancel();

  final superseded = !identical(_textStreamController, sessionText);
  if (superseded) {
    // A newer session owns the fields now; just finish the old streams.
    try {
      await sessionText?.close();
    } catch (_) {}
    try {
      await sessionIntensity?.close();
    } catch (_) {}
    return;
  }

  _lastIntensity = 0;
  if (!_usingServerStt && _currentText.isNotEmpty) {
    _textStreamController?.add(_currentText);
  }
  await _closeControllers();
  _usingServerStt = false;
  _serverRecorderActive = false;
  _serverRecordingPath = null;
  _serverRecordingMimeType = null;
  _hasDetectedSpeech = false;
}
/// Cancels the recognizer subscriptions and stops on-device recognition.
///
/// The subscription fields are cleared before any `await`, so subscriptions
/// installed by a session that starts while the cancellation is pending are
/// neither cancelled nor forgotten. All failures are ignored; the recognizer
/// is only asked to stop while it is still considered available.
Future<void> _stopLocalStt() async {
  final resultSub = _sttResultSub;
  final stateSub = _sttStateSub;
  _sttResultSub = null;
  _sttStateSub = null;

  if (resultSub != null) {
    try {
      await resultSub.cancel();
    } catch (_) {}
  }
  if (stateSub != null) {
    try {
      await stateSub.cancel();
    } catch (_) {}
  }

  if (_localSttAvailable) {
    try {
      await _speech.stop();
    } catch (_) {}
  }
}
/// Switches the running session from on-device to server transcription.
///
/// Emits an error and stops the session when the preference forbids the
/// server or no API is configured. Otherwise the local recognizer is torn
/// down, the 90 s auto-stop is armed, and recording starts.
///
/// The switch is claimed synchronously through [_usingServerStt]. Both the
/// availability probe and a failed recognizer start can request a fallback
/// for the same session, and only the first request may start the recorder.
/// After the local teardown the session is re-checked, so a session that was
/// stopped in the meantime does not start recording.
Future<void> _beginServerFallback() async {
  if (!_isListening || _usingServerStt) return;

  if (!allowsServerFallback) {
    _textStreamController?.addError(
      Exception('Server speech-to-text disabled in preferences'),
    );
    await _stopListening();
    return;
  }

  if (!hasServerStt) {
    await _stopLocalStt();
    _textStreamController?.addError(
      Exception('Server speech-to-text unavailable'),
    );
    await _stopListening();
    return;
  }

  _usingServerStt = true;
  await _stopLocalStt();
  // The session may have ended (or been replaced) during the teardown.
  if (!_isListening || !_usingServerStt) return;

  _autoStopTimer?.cancel();
  _autoStopTimer = Timer(const Duration(seconds: 90), () {
    if (_isListening) {
      unawaited(_stopListening());
    }
  });

  try {
    await _startServerRecording();
  } catch (error) {
    _textStreamController?.addError(error);
    await _stopListening();
  }
}
/// Starts recording to a temporary file and begins amplitude monitoring.
///
/// Records mono AAC-LC at 44.1 kHz / 96 kbit/s with noise suppression.
/// Amplitude samples (every 140 ms) drive the intensity stream and the
/// silence detection in [_handleServerAmplitude].
///
/// If the session is stopped while the recorder is still starting, the
/// recorder is stopped again and the temporary file removed; nothing else
/// would ever stop it, because the session teardown has already run.
Future<void> _startServerRecording() async {
  final (path, mimeType) = await _createRecordingTarget();
  if (!_isListening) return;
  _serverRecordingPath = path;
  _serverRecordingMimeType = mimeType;

  final config = RecordConfig(
    encoder: AudioEncoder.aacLc,
    sampleRate: 44100,
    bitRate: 96000,
    numChannels: 1,
    noiseSuppress: true,
  );
  await _recorder.start(config, path: path);

  if (!_isListening) {
    try {
      await _recorder.stop();
    } catch (_) {}
    unawaited(_cleanupRecordingFile(File(path)));
    return;
  }

  _serverRecorderActive = true;
  _hasDetectedSpeech = false;

  await _ampSub?.cancel();
  _ampSub = _recorder
      .onAmplitudeChanged(const Duration(milliseconds: 140))
      .listen((Amplitude amplitude) {
        if (!_isListening) return;
        _lastIntensity = _amplitudeToIntensity(amplitude.current);
        try {
          _intensityController?.add(_lastIntensity);
        } catch (_) {}
        // Detect trailing silence and auto-stop the server session.
        _handleServerAmplitude(amplitude.current);
      }, onError: (_) {});
}
/// Ends a server session after two seconds of silence following speech.
///
/// A sample louder than -45 dB counts as speech and cancels any pending
/// silence timer. A quieter (or missing) sample arms the timer, but only once
/// speech has been heard and no timer is already pending. Ignored unless a
/// server session is listening.
void _handleServerAmplitude(double? amplitude) {
  final serverSessionLive = _usingServerStt && _isListening;
  if (!serverSessionLive) return;

  const speechFloorDb = -45.0;
  final levelDb = amplitude ?? -100.0;

  if (levelDb > speechFloorDb) {
    _hasDetectedSpeech = true;
    _silenceTimer?.cancel();
    _silenceTimer = null;
    return;
  }

  final shouldArm = _hasDetectedSpeech && _silenceTimer == null;
  if (!shouldArm) return;

  _silenceTimer = Timer(const Duration(seconds: 2), () {
    if (_isListening && _usingServerStt) {
      unawaited(_stopListening());
    }
  });
}
/// Builds the destination for a new recording.
///
/// Returns the file path (inside the temporary directory, named
/// `conduit_voice_<epoch-millis>.m4a`) together with its MIME type.
Future<(String, String)> _createRecordingTarget() async {
  final tempDir = await getTemporaryDirectory();
  final stamp = DateTime.now().millisecondsSinceEpoch;
  final target = p.join(tempDir.path, 'conduit_voice_$stamp.m4a');
  return (target, 'audio/mp4');
}
/// Stops the recorder, sends the audio for transcription, and publishes the
/// result on the text stream.
///
/// Returns silently when there is no API, no recording path, no file, or an
/// empty file. Transcription failures, including an empty transcript, are
/// delivered as stream errors. The recording file is deleted afterwards in
/// every case where it was opened.
Future<void> _finalizeServerRecording() async {
  final api = _api;
  if (api == null) return;

  // Prefer the path reported by the recorder; fall back to the remembered one.
  String? stoppedPath;
  try {
    stoppedPath = (_serverRecorderActive && await _recorder.isRecording())
        ? await _recorder.stop()
        : _serverRecordingPath;
  } catch (_) {
    stoppedPath = _serverRecordingPath;
  } finally {
    _serverRecorderActive = false;
  }

  final recordingPath = stoppedPath;
  if (recordingPath == null || recordingPath.isEmpty) return;

  final recording = File(recordingPath);
  try {
    final present = await recording.exists();
    if (!present) return;

    final audio = await recording.readAsBytes();
    if (audio.isEmpty) return;

    final response = await api.transcribeSpeech(
      audioBytes: audio,
      fileName: p.basename(recordingPath),
      mimeType: _serverRecordingMimeType,
      language: _languageForServer(),
    );

    final transcript = _extractTranscriptionText(response)?.trim();
    if (transcript == null || transcript.isEmpty) {
      throw StateError('Empty transcription result');
    }
    _currentText = transcript;
    _textStreamController?.add(_currentText);
  } catch (error) {
    _textStreamController?.addError(error);
  } finally {
    unawaited(_cleanupRecordingFile(recording));
  }
}
/// Deletes [file] when it exists; every failure is ignored (best effort).
Future<void> _cleanupRecordingFile(File file) async {
  try {
    final present = await file.exists();
    if (!present) return;
    await file.delete();
  } catch (_) {
    // Leftover temp files are harmless; nothing to report.
  }
}
/// Language code sent with server transcription requests.
///
/// Uses the lower-cased primary subtag of the selected locale when it has at
/// least two characters; otherwise the platform locale's language code.
/// Returns `null` when neither yields a value.
String? _languageForServer() {
  final selected = _selectedLocaleId;
  if (selected != null && selected.isNotEmpty) {
    final subtag = selected.split(RegExp('[-_]')).first.toLowerCase();
    if (subtag.length >= 2) return subtag;
  }

  try {
    final platformLocale = WidgetsBinding.instance.platformDispatcher.locale;
    final code = platformLocale.languageCode.toLowerCase();
    return code.isEmpty ? null : code;
  } catch (_) {
    return null;
  }
}
/// Pulls the transcript out of a transcription response.
///
/// Response shapes are tried in this order, and the first non-blank string
/// wins:
///  1. `text`
///  2. `display_text` / `DisplayText`
///  3. `result.text`
///  4. first entry of `combinedRecognizedPhrases` (a string, or a map with
///     `display` / `Display` / `transcript` / `text`)
///  5. `results.channels[0].alternatives[0]` (`transcript` / `text`)
///  6. `segments` (strings or maps with `text`), trimmed and joined by spaces
///
/// Shapes 1-5 return the string untrimmed. Returns `null` when nothing fits.
String? _extractTranscriptionText(Map<String, dynamic> data) {
  // Returns [value] unchanged when it is a string with visible characters.
  String? nonBlank(Object? value) =>
      value is String && value.trim().isNotEmpty ? value : null;

  // Returns the first element of a non-empty list, otherwise null.
  Object? firstOf(Object? value) =>
      value is List && value.isNotEmpty ? value.first : null;

  final topLevel = nonBlank(data['text']) ??
      nonBlank(data['display_text'] ?? data['DisplayText']);
  if (topLevel != null) return topLevel;

  final nested = data['result'];
  if (nested is Map<String, dynamic>) {
    final nestedText = nonBlank(nested['text']);
    if (nestedText != null) return nestedText;
  }

  final phrase = firstOf(data['combinedRecognizedPhrases']);
  final phraseText = phrase is Map<String, dynamic>
      ? nonBlank(
          phrase['display'] ??
              phrase['Display'] ??
              phrase['transcript'] ??
              phrase['text'],
        )
      : nonBlank(phrase);
  if (phraseText != null) return phraseText;

  final results = data['results'];
  final channel =
      results is Map<String, dynamic> ? firstOf(results['channels']) : null;
  final alternative =
      channel is Map<String, dynamic> ? firstOf(channel['alternatives']) : null;
  if (alternative is Map<String, dynamic>) {
    final alternativeText =
        nonBlank(alternative['transcript'] ?? alternative['text']);
    if (alternativeText != null) return alternativeText;
  }

  final segments = data['segments'];
  if (segments is List && segments.isNotEmpty) {
    final joined = segments
        .map((segment) =>
            segment is Map<String, dynamic> ? segment['text'] : segment)
        .whereType<String>()
        .map((piece) => piece.trim())
        .where((piece) => piece.isNotEmpty)
        .join(' ');
    if (joined.isNotEmpty) return joined;
  }

  return null;
}
/// Maps a recorder amplitude in dB onto the 0-10 intensity scale.
///
/// -55 dB and below map to 0, 0 dB and above map to 10, values in between
/// scale linearly and are rounded. Missing or non-finite input maps to 0.
int _amplitudeToIntensity(double? value) {
  if (value == null || !value.isFinite) return 0;

  const floorDb = -55.0;
  const ceilingDb = 0.0;
  final bounded = value.clamp(floorDb, ceilingDb).toDouble();
  final double fraction =
      ((bounded - floorDb) / (ceilingDb - floorDb)).clamp(0.0, 1.0);

  final level = (fraction * 10).round();
  return level < 0 ? 0 : (level > 10 ? 10 : level);
}
/// Closes and clears the text and intensity controllers, text first.
///
/// Errors raised while closing are ignored; each field is cleared once its
/// close attempt has finished.
Future<void> _closeControllers() async {
  final textController = _textStreamController;
  if (textController != null) {
    try {
      await textController.close();
    } catch (_) {}
    _textStreamController = null;
  }

  final intensityController = _intensityController;
  if (intensityController != null) {
    try {
      await intensityController.close();
    } catch (_) {}
    _intensityController = null;
  }
}
/// (Re)starts the timer that fades the intensity level back to zero.
///
/// Every 120 ms, while a session is listening and the level is above zero,
/// the level drops by one and the new value is emitted.
void _startIntensityDecayTimer() {
  _intensityDecayTimer?.cancel();
  _intensityDecayTimer = Timer.periodic(
    const Duration(milliseconds: 120),
    (_) {
      if (!_isListening || _lastIntensity <= 0) return;
      _lastIntensity = (_lastIntensity - 1).clamp(0, 10);
      try {
        _intensityController?.add(_lastIntensity);
      } catch (_) {}
    },
  );
}
/// Releases the service: ends any session, then disposes the plugins.
///
/// Returns immediately; teardown continues in the background. The
/// recognizer and recorder are disposed only after the session has finished
/// stopping, so the stop logic never talks to an already-disposed plugin.
/// All teardown errors are swallowed.
void dispose() {
  _silenceTimer?.cancel();
  unawaited(_releaseResources());
}

/// Ordered, error-tolerant teardown used by [dispose].
///
/// NOTE: for a server session the stop step includes transcribing the
/// recording, so plugin disposal waits for that request to finish.
Future<void> _releaseResources() async {
  try {
    await _stopListening();
  } catch (_) {}
  try {
    await _speech.dispose();
  } catch (_) {}
  try {
    await _recorder.dispose();
  } catch (_) {}
}
}
/// Provides the app-wide [VoiceInputService].
///
/// The service is rebuilt whenever the API service changes. The STT
/// preference is seeded from the current settings and then kept in sync
/// through a listener, so preference changes do not recreate the service.
/// The service is disposed together with the provider.
final voiceInputServiceProvider = Provider<VoiceInputService>((ref) {
  final api = ref.watch(apiServiceProvider);
  final service = VoiceInputService(api: api);

  // `read`, not `watch`: settings changes are handled by the listener below.
  final currentSettings = ref.read(appSettingsProvider);
  service.updatePreference(currentSettings.sttPreference);

  ref.listen<AppSettings>(appSettingsProvider, (previous, next) {
    if (previous?.sttPreference != next.sttPreference) {
      service.updatePreference(next.sttPreference);
    }
  });

  ref.onDispose(service.dispose);
  return service;
});
/// Whether voice input can be offered under the current STT preference.
///
/// * device-only: the on-device recognizer must be available.
/// * server-only: a server API must be configured.
/// * auto: the on-device recognizer suffices; otherwise a server API plus
///   microphone permission is required.
///
/// The preference is watched here rather than read from the service. This
/// provider is kept alive, and the service's preference is mutated in place
/// by a listener, so without the watch a preference change would leave the
/// cached answer stale.
@Riverpod(keepAlive: true)
Future<bool> voiceInputAvailable(Ref ref) async {
  final service = ref.watch(voiceInputServiceProvider);
  final preference = ref.watch(
    appSettingsProvider.select((settings) => settings.sttPreference),
  );

  if (!service.isSupportedPlatform) return false;
  final initialized = await service.initialize();
  if (!initialized) return false;

  switch (preference) {
    case SttPreference.deviceOnly:
      return service.hasLocalStt;
    case SttPreference.serverOnly:
      return service.hasServerStt;
    case SttPreference.auto:
      if (service.hasLocalStt) return true;
      if (!service.hasServerStt) return false;
      break;
  }

  // Only the auto + server-backed path reaches the permission check.
  final hasPermission = await service.checkPermissions();
  if (!hasPermission) return false;
  return service.isAvailable;
}
/// Exposes the service's transcript stream.
///
/// NOTE(review): `textStream` is read once, when this provider is built. The
/// service creates a fresh controller for every listening session, so a
/// provider built while no session is running appears to hold an empty
/// stream until it is rebuilt. Confirm against the consumers.
final voiceInputStreamProvider = StreamProvider<String>((ref) {
  final service = ref.watch(voiceInputServiceProvider);
  return service.textStream;
});
/// Exposes the service's coarse voice-intensity stream for waveform visuals.
///
/// NOTE(review): `intensityStream` is read once, when this provider is built.
/// The service creates a fresh controller for every listening session, so a
/// provider built while no session is running appears to hold an empty
/// stream until it is rebuilt. Confirm against the consumers.
final voiceIntensityStreamProvider = StreamProvider<int>((ref) {
  final service = ref.watch(voiceInputServiceProvider);
  return service.intensityStream;
});
/// Whether on-device speech recognition can be used.
///
/// Initializes the service first. The cached availability flag is trusted
/// when set; otherwise the recognizer is asked again.
final localVoiceRecognitionAvailableProvider =
    FutureProvider<bool>((ref) async {
  final service = ref.watch(voiceInputServiceProvider);
  if (!await service.initialize()) return false;
  return service.hasLocalStt || await service.checkOnDeviceSupport();
});
/// Whether the voice service was created with a server transcription API.
final serverVoiceRecognitionAvailableProvider = Provider<bool>(
  (ref) => ref.watch(voiceInputServiceProvider).hasServerStt,
);