refactor: remove server audio transcription and related fallback logic, retaining only on-device speech-to-text functionality

This commit is contained in:
cogwheel0
2025-08-25 20:56:33 +05:30
parent fa9fa8dd1b
commit ac21ec6493
4 changed files with 56 additions and 355 deletions

View File

@@ -3,7 +3,7 @@ import 'dart:convert';
import 'dart:io';
import 'package:flutter/foundation.dart';
import 'package:dio/dio.dart';
import 'package:http_parser/http_parser.dart';
// import 'package:http_parser/http_parser.dart';
// Removed legacy websocket/socket.io imports
import 'package:uuid/uuid.dart';
import '../models/server_config.dart';
@@ -1651,96 +1651,7 @@ class ApiService {
return [];
}
/// Uploads [audioData] to the server transcription endpoint and returns the
/// recognized text.
///
/// The bytes are sent as a multipart `file` part named `audio.wav` with the
/// `audio/wav` content type. A non-empty [language] is reduced to its primary
/// subtag (for example `en-US` becomes `en`) and sent as the `language` field.
///
/// If the request fails with a [DioException] whose response body mentions
/// "not a valid language code", the upload is retried once without the
/// language field. A failed retry is only logged; the original error is then
/// rethrown, as is any other failure.
///
/// Returns an empty string when the response holds no recognizable text.
Future<String> transcribeAudio(
  List<int> audioData, {
  String? language,
}) async {
  const endpoint = '/api/v1/audio/transcriptions';

  // Normalize language to primary ISO 639-1 (e.g., en-US -> en) per server accepted list
  String? normalizedLang;
  if (language != null && language.isNotEmpty) {
    normalizedLang = language.split(RegExp('[-_]')).first.toLowerCase();
  }
  debugPrint(
    'DEBUG: Transcribing audio data: bytes=${audioData.length}, language=${normalizedLang ?? 'null'}',
  );

  // Builds the multipart body; the language field is left out when absent.
  FormData buildForm(String? lang) {
    final Map<String, dynamic> formMap = {
      'file': MultipartFile.fromBytes(
        audioData,
        filename: 'audio.wav',
        contentType: MediaType.parse('audio/wav'),
      ),
    };
    if (lang != null && lang.isNotEmpty) {
      formMap['language'] = lang;
    }
    return FormData.fromMap(formMap);
  }

  // Pulls the transcript out of the response shapes handled here: a bare
  // string, a map with `text` / `transcription` / `result`, or a nested
  // `data.text`. Anything else yields an empty string.
  String extractText(Object? payload) {
    if (payload is String) return payload;
    if (payload is Map<String, dynamic>) {
      final text =
          payload['text'] ?? payload['transcription'] ?? payload['result'];
      if (text is String) return text;
      final nested = payload['data'];
      if (nested is Map && nested['text'] is String) {
        return nested['text'] as String;
      }
    }
    return '';
  }

  final formData = buildForm(normalizedLang);
  try {
    final response = await _dio.post(
      endpoint,
      data: formData,
      options: Options(headers: {'Accept': 'application/json'}),
    );
    debugPrint(
      'DEBUG: Transcription response status: ${response.statusCode}',
    );
    DebugLogger.log('Transcription response received successfully');
    return extractText(response.data);
  } catch (e) {
    debugPrint('DEBUG: Transcription API error: $e');
    // If server complains about invalid language code, retry without language
    try {
      if (e is DioException) {
        final msg = e.response?.data?.toString() ?? '';
        if (msg.contains("not a valid language code")) {
          debugPrint('DEBUG: Retrying transcription without language');
          final retryResponse = await _dio.post(
            endpoint,
            data: buildForm(null),
            options: Options(headers: {'Accept': 'application/json'}),
          );
          debugPrint(
            'DEBUG: Transcription retry status: ${retryResponse.statusCode}',
          );
          DebugLogger.log(
            'Transcription retry response received successfully',
          );
          return extractText(retryResponse.data);
        }
      }
    } catch (e2) {
      // The retry is best effort: report it, then surface the original error.
      debugPrint('DEBUG: Transcription retry error: $e2');
    }
    rethrow;
  }
}
// Server audio transcription removed; rely on on-device STT in UI layer
// Image Generation
Future<List<Map<String, dynamic>>> getImageModels() async {

View File

@@ -3,8 +3,7 @@ import 'package:record/record.dart';
import 'package:flutter/widgets.dart';
import 'dart:async';
import 'dart:io' show Platform;
import 'package:path_provider/path_provider.dart';
import 'package:path/path.dart' as p;
// Removed path imports as server transcription fallback was removed
import 'package:stts/stts.dart';
// Lightweight replacement for previous stt.LocaleName used across the UI
@@ -175,16 +174,9 @@ class VoiceInputService {
try {
final isStillAvailable = await _speech.isSupported();
if (!isStillAvailable && _isListening) {
// speech recognition no longer available, fallback to recording
// Speech recognition no longer available; stop listening
_localSttAvailable = false;
// Restart with fallback method
_startRecordingProxyIntensity();
_autoStopTimer?.cancel();
_autoStopTimer = Timer(const Duration(seconds: 30), () {
if (_isListening) {
_stopListening();
}
});
return;
}
} catch (e) {
@@ -218,24 +210,17 @@ class VoiceInputService {
}
// Start recognition (no await blocking the sync flow)
_speech.start(SttRecognitionOptions(punctuation: true)).catchError((_) {
// fallback to recording
// On-device STT failed; stop listening entirely as server transcription is removed
_localSttAvailable = false;
_startRecordingProxyIntensity();
_stopListening();
});
} catch (e) {
_localSttAvailable = false;
_startRecordingProxyIntensity();
}
} else {
// Fallback: record audio and signal file path for server transcription
// Local STT not available, falling back to recording
_startRecordingProxyIntensity();
_autoStopTimer?.cancel();
_autoStopTimer = Timer(const Duration(seconds: 30), () {
if (_isListening) {
_stopListening();
}
});
} else {
// No local STT available; stop immediately since server transcription is removed
_stopListening();
}
return _textStreamController!.stream;
@@ -262,9 +247,6 @@ class VoiceInputService {
_sttStateSub?.cancel();
} catch (_) {}
_sttStateSub = null;
} else {
// Also stop recorder if active
await _stopRecording();
}
_autoStopTimer?.cancel();
@@ -284,84 +266,12 @@ class VoiceInputService {
/// Shuts the service down: stops listening, force-stops the recorder, and
/// disposes `_speech`.
///
/// None of the calls are awaited, so this returns before any asynchronous
/// teardown has finished.
void dispose() {
stopListening();
// `force: true` skips the "is recording" early return in `_stopRecording`.
_stopRecording(force: true);
try {
// Best effort: synchronous throws and asynchronous errors are both ignored.
_speech.dispose().catchError((_) {});
} catch (_) {}
}
// --- Recording and intensity proxy for server transcription path ---
/// Starts a mono 16 kHz WAV recording in the temporary directory and drives
/// the intensity stream from the recorder's amplitude updates.
///
/// When microphone permission is missing, or anything in the setup throws,
/// an error is added to the text stream and listening is stopped. While
/// recording, listening is stopped once the amplitude has stayed at or below
/// the silence threshold for three seconds.
Future<void> _startRecordingProxyIntensity() async {
  try {
    if (!await _recorder.hasPermission()) {
      _textStreamController?.addError('Microphone permission not granted');
      _stopListening();
      return;
    }

    // WAV/PCM is used as a portable format for the server side.
    final tempDirectory = await getTemporaryDirectory();
    final outputPath = p.join(
      tempDirectory.path,
      'conduit_voice_${DateTime.now().millisecondsSinceEpoch}.wav',
    );
    const wavConfig = RecordConfig(
      encoder: AudioEncoder.wav,
      numChannels: 1,
      sampleRate: 16000,
      bitRate: 128000,
    );
    await _recorder.start(wavConfig, path: outputPath);

    // Readings at or below this dBFS level count as silence; a continuous
    // run of them lasting `maxQuiet` ends the session.
    const quietBelowDb = -45.0;
    const maxQuiet = Duration(seconds: 3);
    var lastLoudAt = DateTime.now();

    final amplitudes = _recorder.onAmplitudeChanged(
      const Duration(milliseconds: 125),
    );
    _ampSub = amplitudes.listen((sample) {
      if (!_isListening) return;

      // Map dB [-60..0] onto the 0-10 bar scale.
      final level = sample.current;
      final bounded = level.clamp(-60.0, 0.0);
      final bars = ((bounded + 60.0) / 60.0) * 10.0;
      _intensityController?.add(bars.round().clamp(0, 10));

      if (level > quietBelowDb) {
        lastLoudAt = DateTime.now();
        return;
      }
      if (DateTime.now().difference(lastLoudAt) >= maxQuiet) {
        _stopListening();
      }
    });
  } catch (e) {
    _textStreamController?.addError('Audio recording failed: $e');
    _stopListening();
  }
}
/// Stops the recorder and publishes the recorded file's path on the text
/// stream as a `[[AUDIO_FILE_PATH]]:` token.
///
/// Does nothing when the recorder is not recording, unless [force] is true.
/// A missing path or any thrown error is reported as a text-stream error.
Future<void> _stopRecording({bool force = false}) async {
  try {
    final active = await _recorder.isRecording();
    if (!(active || force)) return;

    final path = await _recorder.stop();
    if (path != null) {
      // Listeners recognise this prefix and upload the file for transcription.
      _textStreamController?.add('[[AUDIO_FILE_PATH]]:$path');
      return;
    }
    _textStreamController?.addError('Recording failed: no file path');
  } catch (e) {
    _textStreamController?.addError('Stop recording error: $e');
  }
}
// Recording fallback removed; only on-device STT is supported now
// Native locales not used in server transcription mode
}

View File

@@ -7,7 +7,7 @@ import 'package:flutter/services.dart';
import 'package:flutter/cupertino.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'package:flutter_animate/flutter_animate.dart';
import 'dart:io' show Platform, File;
import 'dart:io' show Platform;
import 'dart:async';
import '../../../core/providers/app_providers.dart';
import '../providers/chat_providers.dart';
@@ -1927,7 +1927,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
StreamSubscription<String>? _textSub;
int _elapsedSeconds = 0;
Timer? _elapsedTimer;
bool _isTranscribing = false;
// Removed server transcription; keep only on-device listening state
String _languageTag = 'en';
bool _holdToTalk = false;
bool _autoSendFinal = false;
@@ -2005,18 +2005,9 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
});
_textSub = stream.listen(
(text) {
// If we receive a special token with recorded audio path, transcribe it via API (fallback)
if (text.startsWith('[[AUDIO_FILE_PATH]]:')) {
final filePath = text.split(':').skip(1).join(':');
debugPrint(
'DEBUG: VoiceInputSheet received audio file path: $filePath',
);
_transcribeRecordedFile(filePath);
} else {
setState(() {
_recognizedText = text;
});
}
},
onDone: () {
debugPrint('DEBUG: VoiceInputSheet stream done');
@@ -2052,44 +2043,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
}
}
/// Transcribes the recorded audio file at [filePath] through the API service
/// and shows the result as the recognized text.
///
/// The device locale's language tag is sent as the language hint (`en-US` if
/// reading the locale throws). `_isTranscribing` is set for the duration of
/// the call. On success the listening state is cleared and, when auto-send is
/// enabled and the text is not blank, the text is sent. Failures are logged
/// and clear the listening state; nothing is rethrown.
Future<void> _transcribeRecordedFile(String filePath) async {
  try {
    setState(() => _isTranscribing = true);
    final api = ref.read(apiServiceProvider);
    if (api == null) throw Exception('API service unavailable');
    final bytes = await File(filePath).readAsBytes();
    // Try to use device locale; fall back to en-US
    String? language;
    try {
      language = WidgetsBinding.instance.platformDispatcher.locale
          .toLanguageTag();
    } catch (_) {
      language = 'en-US';
    }
    // The bytes are already a List<int>, so no defensive copy is needed.
    final text = await api.transcribeAudio(bytes, language: language);
    debugPrint(
      'DEBUG: Transcription received: ${text.isEmpty ? '[empty]' : text}',
    );
    // The sheet may have been closed while the upload was in flight.
    if (!mounted) return;
    // Publish the result and leave the listening state in a single rebuild.
    setState(() {
      _recognizedText = text;
      _isListening = false;
    });
    if (_autoSendFinal && _recognizedText.trim().isNotEmpty) {
      _sendText();
    }
  } catch (e) {
    // Surface the failure in logs instead of dropping it silently.
    debugPrint('DEBUG: Transcription failed: $e');
    if (!mounted) return;
    setState(() => _isListening = false);
  } finally {
    if (mounted) setState(() => _isTranscribing = false);
  }
}
// Server transcription removed; only on-device STT is supported
Future<void> _stopListening() async {
_intensitySub?.cancel();
@@ -2279,9 +2233,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
mainAxisAlignment: MainAxisAlignment.spaceBetween,
children: [
Text(
_isTranscribing
? 'Transcribing…'
: _isListening
_isListening
? (_voiceService.hasLocalStt
? 'Listening…'
: 'Recording…')
@@ -2601,9 +2553,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
tooltip: AppLocalizations.of(
context,
)!.clear,
onPressed:
_recognizedText.isNotEmpty &&
!_isTranscribing
onPressed: _recognizedText.isNotEmpty
? () {
setState(
() => _recognizedText = '',
@@ -2614,38 +2564,6 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
],
),
const SizedBox(height: Spacing.xs),
if (_isTranscribing)
Center(
child: Row(
mainAxisAlignment:
MainAxisAlignment.center,
children: [
ConduitLoadingIndicator(
size: isUltra
? 14
: (isCompact ? 16 : 18),
isCompact: true,
),
const SizedBox(width: Spacing.xs),
Text(
'Transcribing…',
style: TextStyle(
fontSize: isUltra
? AppTypography.bodySmall
: (isCompact
? AppTypography
.bodyMedium
: AppTypography
.bodyLarge),
color: context
.conduitTheme
.textSecondary,
),
),
],
),
)
else
Flexible(
child: SingleChildScrollView(
child: Text(
@@ -2661,8 +2579,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
? AppTypography.bodySmall
: (isCompact
? AppTypography.bodyMedium
: AppTypography
.bodyLarge),
: AppTypography.bodyLarge),
color: _recognizedText.isEmpty
? context
.conduitTheme

View File

@@ -6,7 +6,7 @@ import '../../../shared/widgets/sheet_handle.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'dart:io' show Platform, File;
import 'dart:io' show Platform;
import 'dart:async';
import '../providers/chat_providers.dart';
import '../../tools/widgets/unified_tools_modal.dart';
@@ -991,10 +991,6 @@ class _ModernChatInputState extends ConsumerState<ModernChatInput>
_textSub?.cancel();
_textSub = stream.listen(
(text) async {
if (text.startsWith('[[AUDIO_FILE_PATH]]:')) {
final path = text.split(':').skip(1).join(':');
await _transcribeRecordedFile(path);
} else {
final updated =
(_baseTextAtStart.isEmpty
? ''
@@ -1004,7 +1000,6 @@ class _ModernChatInputState extends ConsumerState<ModernChatInput>
text: updated,
selection: TextSelection.collapsed(offset: updated.length),
);
}
},
onDone: () {
if (!mounted) return;
@@ -1039,39 +1034,7 @@ class _ModernChatInputState extends ConsumerState<ModernChatInput>
HapticFeedback.selectionClick();
}
/// Transcribes the recorded audio file at [filePath] through the API service
/// and appends the result to the text that was in the input when recording
/// started.
///
/// Returns without transcribing when no API service is available. The device
/// locale's language tag is sent as the language hint (`en-US` if reading the
/// locale throws). Failures are logged and not rethrown. The recording flag is
/// cleared afterwards whenever the widget is still mounted.
Future<void> _transcribeRecordedFile(String filePath) async {
  try {
    final api = ref.read(apiServiceProvider);
    if (api == null) return;
    final bytes = await File(filePath).readAsBytes();
    String? language;
    try {
      language = WidgetsBinding.instance.platformDispatcher.locale
          .toLanguageTag();
    } catch (_) {
      language = 'en-US';
    }
    // The bytes are already a List<int>, so no defensive copy is needed.
    final text = await api.transcribeAudio(bytes, language: language);
    // Keep the pre-existing input, separated from the transcript by one space.
    final prefix = _baseTextAtStart.isEmpty
        ? ''
        : '${_baseTextAtStart.trimRight()} ';
    final updated = '$prefix$text';
    // The widget may have been disposed while the upload was in flight.
    if (!mounted) return;
    _controller.value = TextEditingValue(
      text: updated,
      selection: TextSelection.collapsed(offset: updated.length),
    );
  } catch (e) {
    // Surface the failure in logs instead of dropping it silently.
    debugPrint('DEBUG: Voice transcription failed: $e');
  } finally {
    // Guard instead of `return`: control flow inside `finally` would override
    // any pending return or exception.
    if (mounted) setState(() => _isRecording = false);
  }
}
// Server transcription removed; only on-device STT updates the input text
void _showVoiceUnavailable(String message) {
if (!mounted) return;