refactor: remove server audio transcription and related fallback logic, retaining only on-device speech-to-text functionality

2025-08-25 20:56:33 +05:30
parent fa9fa8dd1b
commit ac21ec6493
4 changed files with 56 additions and 355 deletions
@@ -3,7 +3,7 @@ import 'dart:convert';
 import 'dart:io';
 import 'package:flutter/foundation.dart';
 import 'package:dio/dio.dart';
-import 'package:http_parser/http_parser.dart';
+// Removed http_parser import (MediaType was only used by the server transcription upload)
 // Removed legacy websocket/socket.io imports
 import 'package:uuid/uuid.dart';
 import '../models/server_config.dart';
@@ -1651,96 +1651,7 @@ class ApiService {
    return [];
  }

-  Future<String> transcribeAudio(
-    List<int> audioData, {
-    String? language,
-  }) async {
-    // Normalize language to primary ISO 639-1 (e.g., en-US -> en) per server accepted list
-    String? normalizedLang;
-    if (language != null && language.isNotEmpty) {
-      normalizedLang = language.split(RegExp('[-_]')).first.toLowerCase();
-    }
-
-    debugPrint(
-      'DEBUG: Transcribing audio data: bytes=${audioData.length}, language=${normalizedLang ?? 'null'}',
-    );
-
-    FormData buildForm(String? lang) {
-      final Map<String, dynamic> formMap = {
-        'file': MultipartFile.fromBytes(
-          audioData,
-          filename: 'audio.wav',
-          contentType: MediaType.parse('audio/wav'),
-        ),
-      };
-      if (lang != null && lang.isNotEmpty) {
-        formMap['language'] = lang;
-      }
-      return FormData.fromMap(formMap);
-    }
-
-    var formData = buildForm(normalizedLang);
-    try {
-      final response = await _dio.post(
-        '/api/v1/audio/transcriptions',
-        data: formData,
-        options: Options(headers: {'Accept': 'application/json'}),
-      );
-      final data = response.data;
-      debugPrint(
-        'DEBUG: Transcription response status: ${response.statusCode}',
-      );
-      DebugLogger.log('Transcription response received successfully');
-      if (data is String) return data;
-      if (data is Map<String, dynamic>) {
-        final text = data['text'] ?? data['transcription'] ?? data['result'];
-        if (text is String) return text;
-        if (data['data'] is Map && (data['data']['text'] is String)) {
-          return data['data']['text'] as String;
-        }
-      }
-      return '';
-    } catch (e) {
-      debugPrint('DEBUG: Transcription API error: $e');
-      // If server complains about invalid language code, retry without language
-      try {
-        if (e is DioException) {
-          final data = e.response?.data;
-          final msg = data is Map<String, dynamic>
-              ? data.toString()
-              : data?.toString() ?? '';
-          if (msg.contains("not a valid language code")) {
-            debugPrint('DEBUG: Retrying transcription without language');
-            final retryResponse = await _dio.post(
-              '/api/v1/audio/transcriptions',
-              data: buildForm(null),
-              options: Options(headers: {'Accept': 'application/json'}),
-            );
-            final rdata = retryResponse.data;
-            debugPrint(
-              'DEBUG: Transcription retry status: ${retryResponse.statusCode}',
-            );
-            DebugLogger.log(
-              'Transcription retry response received successfully',
-            );
-            if (rdata is String) return rdata;
-            if (rdata is Map<String, dynamic>) {
-              final text =
-                  rdata['text'] ?? rdata['transcription'] ?? rdata['result'];
-              if (text is String) return text;
-              if (rdata['data'] is Map && (rdata['data']['text'] is String)) {
-                return rdata['data']['text'] as String;
-              }
-            }
-            return '';
-          }
-        }
-      } catch (e2) {
-        debugPrint('DEBUG: Transcription retry error: $e2');
-      }
-      rethrow;
-    }
-  }
+  // Server audio transcription removed; rely on on-device STT in UI layer

  // Image Generation
  Future<List<Map<String, dynamic>>> getImageModels() async {
@@ -3,8 +3,7 @@ import 'package:record/record.dart';
 import 'package:flutter/widgets.dart';
 import 'dart:async';
 import 'dart:io' show Platform;
-import 'package:path_provider/path_provider.dart';
-import 'package:path/path.dart' as p;
+// Removed path_provider/path imports along with the server transcription fallback
 import 'package:stts/stts.dart';

 // Lightweight replacement for previous stt.LocaleName used across the UI
@@ -175,16 +174,9 @@ class VoiceInputService {
        try {
          final isStillAvailable = await _speech.isSupported();
          if (!isStillAvailable && _isListening) {
-            // speech recognition no longer available, fallback to recording
+            // Speech recognition no longer available; stop listening
            _localSttAvailable = false;
-            // Restart with fallback method
-            _startRecordingProxyIntensity();
-            _autoStopTimer?.cancel();
-            _autoStopTimer = Timer(const Duration(seconds: 30), () {
-              if (_isListening) {
            _stopListening();
-              }
-            });
            return;
          }
        } catch (e) {
@@ -218,24 +210,17 @@ class VoiceInputService {
        }
        // Start recognition (no await blocking the sync flow)
        _speech.start(SttRecognitionOptions(punctuation: true)).catchError((_) {
-          // fallback to recording
+          // On-device STT failed; stop listening entirely as server transcription is removed
          _localSttAvailable = false;
-          _startRecordingProxyIntensity();
+          _stopListening();
        });
      } catch (e) {
        _localSttAvailable = false;
-        _startRecordingProxyIntensity();
-      }
-    } else {
-      // Fallback: record audio and signal file path for server transcription
-      // Local STT not available, falling back to recording
-      _startRecordingProxyIntensity();
-      _autoStopTimer?.cancel();
-      _autoStopTimer = Timer(const Duration(seconds: 30), () {
-        if (_isListening) {
        _stopListening();
      }
-      });
+    } else {
+      // No local STT available; stop immediately since server transcription is removed
+      _stopListening();
    }

    return _textStreamController!.stream;
@@ -262,9 +247,6 @@ class VoiceInputService {
        _sttStateSub?.cancel();
      } catch (_) {}
      _sttStateSub = null;
-    } else {
-      // Also stop recorder if active
-      await _stopRecording();
    }

    _autoStopTimer?.cancel();
@@ -284,84 +266,12 @@ class VoiceInputService {

  void dispose() {
    stopListening();
-    _stopRecording(force: true);
    try {
      _speech.dispose().catchError((_) {});
    } catch (_) {}
  }

-  // --- Recording and intensity proxy for server transcription path ---
-  Future<void> _startRecordingProxyIntensity() async {
-    try {
-      final hasMic = await _recorder.hasPermission();
-      if (!hasMic) {
-        _textStreamController?.addError('Microphone permission not granted');
-        _stopListening();
-        return;
-      }
-
-      // Start recording in a portable format (WAV/PCM) for best compatibility with server
-      final tmpDir = await getTemporaryDirectory();
-      final filePath = p.join(
-        tmpDir.path,
-        'conduit_voice_${DateTime.now().millisecondsSinceEpoch}.wav',
-      );
-      await _recorder.start(
-        const RecordConfig(
-          encoder: AudioEncoder.wav,
-          numChannels: 1,
-          sampleRate: 16000,
-          bitRate: 128000,
-        ),
-        path: filePath,
-      );
-      // recording started at filePath
-
-      // Drive intensity from amplitude stream and detect silence
-      // Consider amplitude less than threshold as silence; stop after ~3s of continuous silence
-      const silenceThresholdDb = -45.0; // dBFS threshold
-      const silenceWindow = Duration(seconds: 3);
-      DateTime lastNonSilent = DateTime.now();
-
-      _ampSub = _recorder
-          .onAmplitudeChanged(const Duration(milliseconds: 125))
-          .listen((amp) {
-            if (!_isListening) return;
-            // Normalize peak power (dBFS) into 0-10 bar scale
-            final db = amp.current;
-            // Map dB [-60..0] -> [0..10]
-            final clamped = db.clamp(-60.0, 0.0);
-            final norm = ((clamped + 60.0) / 60.0) * 10.0;
-            _intensityController?.add(norm.round().clamp(0, 10));
-
-            if (db > silenceThresholdDb) {
-              lastNonSilent = DateTime.now();
-            } else {
-              if (DateTime.now().difference(lastNonSilent) >= silenceWindow) {
-                _stopListening();
-              }
-            }
-          });
-    } catch (e) {
-      _textStreamController?.addError('Audio recording failed: $e');
-      _stopListening();
-    }
-  }
-
-  Future<void> _stopRecording({bool force = false}) async {
-    try {
-      if (!await _recorder.isRecording() && !force) return;
-      final path = await _recorder.stop();
-      if (path == null) {
-        _textStreamController?.addError('Recording failed: no file path');
-        return;
-      }
-      // Hand off recorded file path to listeners as a special token; UI layer will upload for transcription
-      _textStreamController?.add('[[AUDIO_FILE_PATH]]:$path');
-    } catch (e) {
-      _textStreamController?.addError('Stop recording error: $e');
-    }
-  }
+  // Recording fallback removed; only on-device STT is supported now

  // Native locales not used in server transcription mode
 }
@@ -7,7 +7,7 @@ import 'package:flutter/services.dart';
 import 'package:flutter/cupertino.dart';
 import 'package:flutter_riverpod/flutter_riverpod.dart';
 import 'package:flutter_animate/flutter_animate.dart';
-import 'dart:io' show Platform, File;
+import 'dart:io' show Platform;
 import 'dart:async';
 import '../../../core/providers/app_providers.dart';
 import '../providers/chat_providers.dart';
@@ -1927,7 +1927,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
  StreamSubscription<String>? _textSub;
  int _elapsedSeconds = 0;
  Timer? _elapsedTimer;
-  bool _isTranscribing = false;
+  // Removed server transcription; keep only on-device listening state
  String _languageTag = 'en';
  bool _holdToTalk = false;
  bool _autoSendFinal = false;
@@ -2005,18 +2005,9 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
      });
      _textSub = stream.listen(
        (text) {
-          // If we receive a special token with recorded audio path, transcribe it via API (fallback)
-          if (text.startsWith('[[AUDIO_FILE_PATH]]:')) {
-            final filePath = text.split(':').skip(1).join(':');
-            debugPrint(
-              'DEBUG: VoiceInputSheet received audio file path: $filePath',
-            );
-            _transcribeRecordedFile(filePath);
-          } else {
          setState(() {
            _recognizedText = text;
          });
-          }
        },
        onDone: () {
          debugPrint('DEBUG: VoiceInputSheet stream done');
@@ -2052,44 +2043,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
    }
  }

-  Future<void> _transcribeRecordedFile(String filePath) async {
-    try {
-      setState(() => _isTranscribing = true);
-      final api = ref.read(apiServiceProvider);
-      if (api == null) throw Exception('API service unavailable');
-      final file = File(filePath);
-      final bytes = await file.readAsBytes();
-      // Try to use device locale; fall back to en-US
-      String? language;
-      try {
-        language = WidgetsBinding.instance.platformDispatcher.locale
-            .toLanguageTag();
-      } catch (_) {
-        language = 'en-US';
-      }
-      final text = await api.transcribeAudio(
-        bytes.toList(),
-        language: language,
-      );
-      debugPrint(
-        'DEBUG: Transcription received: ${text.isEmpty ? '[empty]' : text}',
-      );
-      if (!mounted) return;
-      setState(() {
-        _recognizedText = text;
-      });
-      // Stop listening state if we have a result
-      setState(() => _isListening = false);
-      if (_autoSendFinal && _recognizedText.trim().isNotEmpty) {
-        _sendText();
-      }
-    } catch (e) {
-      if (!mounted) return;
-      setState(() => _isListening = false);
-    } finally {
-      if (mounted) setState(() => _isTranscribing = false);
-    }
-  }
+  // Server transcription removed; only on-device STT is supported

  Future<void> _stopListening() async {
    _intensitySub?.cancel();
@@ -2279,9 +2233,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
                  mainAxisAlignment: MainAxisAlignment.spaceBetween,
                  children: [
                    Text(
-                      _isTranscribing
-                          ? 'Transcribing…'
-                          : _isListening
+                      _isListening
                          ? (_voiceService.hasLocalStt
                                ? 'Listening…'
                                : 'Recording…')
@@ -2601,9 +2553,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
                                        tooltip: AppLocalizations.of(
                                          context,
                                        )!.clear,
-                                        onPressed:
-                                            _recognizedText.isNotEmpty &&
-                                                !_isTranscribing
+                                        onPressed: _recognizedText.isNotEmpty
                                            ? () {
                                                setState(
                                                  () => _recognizedText = '',
@@ -2614,38 +2564,6 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
                                    ],
                                  ),
                                  const SizedBox(height: Spacing.xs),
-                                  if (_isTranscribing)
-                                    Center(
-                                      child: Row(
-                                        mainAxisAlignment:
-                                            MainAxisAlignment.center,
-                                        children: [
-                                          ConduitLoadingIndicator(
-                                            size: isUltra
-                                                ? 14
-                                                : (isCompact ? 16 : 18),
-                                            isCompact: true,
-                                          ),
-                                          const SizedBox(width: Spacing.xs),
-                                          Text(
-                                            'Transcribing…',
-                                            style: TextStyle(
-                                              fontSize: isUltra
-                                                  ? AppTypography.bodySmall
-                                                  : (isCompact
-                                                        ? AppTypography
-                                                              .bodyMedium
-                                                        : AppTypography
-                                                              .bodyLarge),
-                                              color: context
-                                                  .conduitTheme
-                                                  .textSecondary,
-                                            ),
-                                          ),
-                                        ],
-                                      ),
-                                    )
-                                  else
                                  Flexible(
                                    child: SingleChildScrollView(
                                      child: Text(
@@ -2661,8 +2579,7 @@ class _VoiceInputSheetState extends ConsumerState<_VoiceInputSheet> {
                                              ? AppTypography.bodySmall
                                              : (isCompact
                                                    ? AppTypography.bodyMedium
-                                                      : AppTypography
-                                                            .bodyLarge),
+                                                    : AppTypography.bodyLarge),
                                          color: _recognizedText.isEmpty
                                              ? context
                                                    .conduitTheme
@@ -6,7 +6,7 @@ import '../../../shared/widgets/sheet_handle.dart';

 import 'package:flutter_riverpod/flutter_riverpod.dart';

-import 'dart:io' show Platform, File;
+import 'dart:io' show Platform;
 import 'dart:async';
 import '../providers/chat_providers.dart';
 import '../../tools/widgets/unified_tools_modal.dart';
@@ -991,10 +991,6 @@ class _ModernChatInputState extends ConsumerState<ModernChatInput>
      _textSub?.cancel();
      _textSub = stream.listen(
        (text) async {
-          if (text.startsWith('[[AUDIO_FILE_PATH]]:')) {
-            final path = text.split(':').skip(1).join(':');
-            await _transcribeRecordedFile(path);
-          } else {
          final updated =
              (_baseTextAtStart.isEmpty
                  ? ''
@@ -1004,7 +1000,6 @@ class _ModernChatInputState extends ConsumerState<ModernChatInput>
            text: updated,
            selection: TextSelection.collapsed(offset: updated.length),
          );
-          }
        },
        onDone: () {
          if (!mounted) return;
@@ -1039,39 +1034,7 @@ class _ModernChatInputState extends ConsumerState<ModernChatInput>
    HapticFeedback.selectionClick();
  }

-  Future<void> _transcribeRecordedFile(String filePath) async {
-    try {
-      final api = ref.read(apiServiceProvider);
-      if (api == null) return;
-      final file = File(filePath);
-      final bytes = await file.readAsBytes();
-      String? language;
-      try {
-        language = WidgetsBinding.instance.platformDispatcher.locale
-            .toLanguageTag();
-      } catch (_) {
-        language = 'en-US';
-      }
-      final text = await api.transcribeAudio(
-        bytes.toList(),
-        language: language,
-      );
-      final updated =
-          (_baseTextAtStart.isEmpty
-              ? ''
-              : (_baseTextAtStart.trimRight() + ' ')) +
-          text;
-      if (!mounted) return;
-      _controller.value = TextEditingValue(
-        text: updated,
-        selection: TextSelection.collapsed(offset: updated.length),
-      );
-    } catch (_) {
-    } finally {
-      if (!mounted) return;
-      setState(() => _isRecording = false);
-    }
-  }
+  // Server transcription removed; only on-device STT updates the input text

  void _showVoiceUnavailable(String message) {
    if (!mounted) return;