feat(socket): Add connectivity and health tracking to socket service

This commit is contained in:
cogwheel0
2025-12-04 15:05:12 +05:30
parent cbbcdd8305
commit 5704c5cf8a
6 changed files with 744 additions and 23 deletions

View File

@@ -4,6 +4,7 @@ import 'package:flutter/widgets.dart';
import 'package:socket_io_client/socket_io_client.dart' as io;
import '../models/server_config.dart';
import '../models/socket_health.dart';
import '../utils/debug_logger.dart';
import 'socket_tls_override.dart';
@@ -33,6 +34,56 @@ class SocketService with WidgetsBindingObserver {
/// Heartbeat interval matching OpenWebUI's 30-second interval.
static const Duration _heartbeatInterval = Duration(seconds: 30);
/// Latency estimate, in milliseconds, recorded by the most recent heartbeat.
///
/// Holds `-1` until a heartbeat has been recorded and is reset to `-1` when
/// the socket disconnects.
int _lastHeartbeatLatencyMs = -1;
/// Timestamp of the last heartbeat that was considered successful, or `null`
/// if none has been recorded yet.
DateTime? _lastSuccessfulHeartbeat;
/// Number of reconnections since service creation.
///
/// Incremented by `_handleReconnect`, i.e. once per reconnection reported by
/// the socket rather than once per attempt.
int _reconnectCount = 0;
/// Completer shared by callers of `ensureConnected` that are waiting for a
/// connection.
///
/// Completed by the connect/reconnect handlers, completed with an error on
/// disconnect or dispose, and reset to `null` afterwards.
Completer<void>? _connectionCompleter;
/// Broadcast stream controller backing [healthStream].
final _healthController = StreamController<SocketHealth>.broadcast();
/// Stream that emits a [SocketHealth] snapshot on every health update.
Stream<SocketHealth> get healthStream => _healthController.stream;
/// Current heartbeat latency in milliseconds (-1 if unknown).
int get lastHeartbeatLatencyMs => _lastHeartbeatLatencyMs;
/// Last successful heartbeat timestamp, or `null` if there has been none.
DateTime? get lastSuccessfulHeartbeat => _lastSuccessfulHeartbeat;
/// Number of reconnections since service creation.
int get reconnectCount => _reconnectCount;
/// Name of the transport the underlying engine is currently using
/// (e.g. `'websocket'` or `'polling'`).
///
/// Yields `'unknown'` when there is no engine, no transport, no transport
/// name, or when reading the transport throws.
String get currentTransport {
  const fallback = 'unknown';
  final engine = _socket?.io.engine;
  if (engine != null) {
    try {
      // socket_io_client exposes the transport name through the engine.
      final name = engine.transport?.name;
      if (name != null) return name;
    } catch (_) {
      // Best-effort lookup: any failure is reported as the fallback below.
    }
  }
  return fallback;
}
/// Builds a [SocketHealth] snapshot from the service's current state.
///
/// A new instance is created on every read from the latest latency,
/// connection flag, transport name, reconnect count and heartbeat timestamp.
SocketHealth get currentHealth {
  return SocketHealth(
    latencyMs: _lastHeartbeatLatencyMs,
    isConnected: isConnected,
    transport: currentTransport,
    reconnectCount: _reconnectCount,
    lastHeartbeat: _lastSuccessfulHeartbeat,
  );
}
/// Registered chat event handlers, keyed by a string id.
// NOTE(review): presumably the id is derived from [_handlerSeed] — confirm
// against the registration code, which is not visible in this hunk.
final Map<String, _ChatEventRegistration> _chatEventHandlers = {};
/// Registered channel event handlers, keyed by a string id.
final Map<String, _ChannelEventRegistration> _channelEventHandlers = {};
/// Integer seed associated with handler registration; starts at zero.
int _handlerSeed = 0;
@@ -324,22 +375,44 @@ class SocketService with WidgetsBindingObserver {
_chatEventHandlers.clear();
_channelEventHandlers.clear();
_reconnectController.close();
_healthController.close();
_connectionCompleter?.completeError(StateError('Service disposed'));
_connectionCompleter = null;
}
/// Ensures there is an active connection, waiting up to [timeout] for one.
///
/// Best-effort: failures from `connect()` are swallowed and reflected only in
/// the return value. Waiting is event-based: the connect/reconnect handlers
/// complete [_connectionCompleter], and the disconnect/dispose paths fail it.
/// No polling loop is used.
///
/// Returns `true` if the socket is connected when the wait ends, whether the
/// wait ended by a connection event, an error, or the timeout.
Future<bool> ensureConnected({
  Duration timeout = const Duration(seconds: 2),
}) async {
  if (isConnected) return true;

  // Reuse the completer of any caller that is already waiting. Keep a local
  // reference: the socket handlers reset the field to null when they fire,
  // which may happen while `connect()` is still being awaited below.
  final completer = _connectionCompleter ??= Completer<void>();

  // The completer can be failed (disconnect/dispose) before anyone listens
  // to it; without a listener that would surface as an unhandled async error.
  completer.future.ignore();

  try {
    await connect();
  } catch (_) {
    // Best-effort: the outcome is decided by `isConnected` below.
  }

  try {
    // Already connected once `connect()` returned: no need to wait.
    if (isConnected) return true;

    await completer.future.timeout(timeout);
  } catch (_) {
    // Timed out, disconnected, or disposed while waiting; fall through and
    // report the current connection state.
  } finally {
    // Only clear the field if it still refers to this call's completer, so a
    // completer installed by a later caller is never discarded.
    if (identical(_connectionCompleter, completer)) {
      _connectionCompleter = null;
    }
  }
  return isConnected;
}
void _bindCoreSocketHandlers() {
@@ -377,10 +450,15 @@ class SocketService with WidgetsBindingObserver {
void _handleConnect(dynamic _) {
_isConnecting = false;
// Reset polling fallback on successful connection - allows retrying
// WebSocket-only mode after conditions improve (fixes permanent fallback)
_forcePollingFallback = false;
DebugLogger.log(
'Socket connected',
scope: 'socket',
data: {'sessionId': _socket?.id},
data: {'sessionId': _socket?.id, 'transport': currentTransport},
);
if (_authToken != null && _authToken!.isNotEmpty) {
@@ -391,6 +469,13 @@ class SocketService with WidgetsBindingObserver {
// Start heartbeat timer to keep connection alive
_startHeartbeat();
// Complete any pending connection waiters
_connectionCompleter?.complete();
_connectionCompleter = null;
// Emit health update
_emitHealthUpdate();
}
void _handleReconnectAttempt(dynamic attempt) {
@@ -404,10 +489,20 @@ class SocketService with WidgetsBindingObserver {
void _handleReconnect(dynamic attempt) {
_isConnecting = false;
_reconnectCount++;
// Reset polling fallback on successful reconnection
_forcePollingFallback = false;
DebugLogger.log(
'Socket reconnected',
scope: 'socket',
data: {'attempt': attempt, 'sessionId': _socket?.id},
data: {
'attempt': attempt,
'sessionId': _socket?.id,
'transport': currentTransport,
'totalReconnects': _reconnectCount,
},
);
if (_authToken != null && _authToken!.isNotEmpty) {
@@ -419,10 +514,17 @@ class SocketService with WidgetsBindingObserver {
// Restart heartbeat after reconnection
_startHeartbeat();
// Complete any pending connection waiters
_connectionCompleter?.complete();
_connectionCompleter = null;
// Notify listeners that a reconnection occurred so they can refresh state
if (!_reconnectController.isClosed) {
_reconnectController.add(null);
}
// Emit health update
_emitHealthUpdate();
}
void _handleConnectError(dynamic err) {
@@ -466,25 +568,68 @@ class SocketService with WidgetsBindingObserver {
// Stop heartbeat when disconnected
_stopHeartbeat();
// Reset latency info on disconnect
_lastHeartbeatLatencyMs = -1;
// Fail any pending connection waiters
_connectionCompleter?.completeError(
StateError('Socket disconnected: $reason'),
);
_connectionCompleter = null;
// Emit health update
_emitHealthUpdate();
}
/// Starts the periodic heartbeat that keeps the connection alive.
///
/// Any previously running heartbeat timer is stopped first. On every
/// [_heartbeatInterval] tick, and only while the socket reports itself as
/// connected, a single `heartbeat` event is emitted.
///
/// The server is not expected to acknowledge the event, so health is
/// approximated: if the socket is still connected 100 ms after the emit, the
/// heartbeat is considered successful and the elapsed time since the emit is
/// stored as the latency. The stored value therefore always includes that
/// 100 ms delay and is not a true round-trip time.
void _startHeartbeat() {
  _stopHeartbeat();
  _heartbeatTimer = Timer.periodic(_heartbeatInterval, (_) {
    if (_socket?.connected != true) return;

    // Remember which heartbeat is in flight so that a delayed check belonging
    // to an older heartbeat cannot overwrite a newer measurement.
    final start = DateTime.now();
    _pendingHeartbeatStart = start;

    // Emit exactly one heartbeat per tick.
    _socket?.emit('heartbeat', <String, dynamic>{});

    Future.delayed(const Duration(milliseconds: 100), () {
      final superseded = _pendingHeartbeatStart != start;
      if (superseded || _socket?.connected != true) return;

      // Read the clock once so latency and timestamp describe the same
      // instant.
      final now = DateTime.now();
      _lastHeartbeatLatencyMs = now.difference(start).inMilliseconds;
      _lastSuccessfulHeartbeat = now;
      _pendingHeartbeatStart = null;
      _emitHealthUpdate();
    });
  });
}
DateTime? _pendingHeartbeatStart;
/// Cancels the heartbeat timer, if one is running, and forgets it.
void _stopHeartbeat() {
  final timer = _heartbeatTimer;
  _heartbeatTimer = null;
  timer?.cancel();
}
/// Publishes the current health snapshot to [healthStream] listeners.
///
/// Does nothing once the health controller has been closed.
void _emitHealthUpdate() {
  if (_healthController.isClosed) return;
  _healthController.add(currentHealth);
}
void _handleChatEvent(dynamic data, [dynamic ack]) {
final map = _coerceToMap(data);
if (map == null) return;

View File

@@ -210,6 +210,12 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
void Function()? onChatTagsUpdated,
required void Function() finishStreaming,
required List<ChatMessage> Function() getMessages,
/// Whether the model uses reasoning/thinking (needs longer watchdog window).
bool modelUsesReasoning = false,
/// Whether tools are enabled (needs longer watchdog window).
bool toolsEnabled = false,
}) {
// Track if streaming has been finished to avoid duplicate cleanup
bool hasFinished = false;
@@ -257,11 +263,11 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
// Must not be `late` to avoid LateInitializationError if callbacks fire early.
void Function() syncImages = () {};
// Shared helper to poll server for message content.
// Shared helper to poll server for message content, retrying with linear backoff.
// Used by watchdog timeout and reconnection handler to recover from missed events.
// Returns (content, followUps, isDone) or null if fetch fails or message not found.
Future<({String content, List<String> followUps, bool isDone})?>
pollServerForMessage() async {
pollServerForMessage({int attempt = 0, int maxAttempts = 3}) async {
try {
final chatId = activeConversationId;
if (chatId == null || chatId.isEmpty) return null;
@@ -307,7 +313,21 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
return (content: content, followUps: followUps, isDone: isDone);
} catch (e) {
DebugLogger.log('Server poll failed: $e', scope: 'streaming/helper');
DebugLogger.log(
'Server poll failed (attempt ${attempt + 1}/$maxAttempts): $e',
scope: 'streaming/helper',
);
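// Linear backoff before each retry: wait (attempt + 1) seconds, i.e. 1s then
// 2s with the default maxAttempts of 3 (the final attempt is not retried).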
if (attempt < maxAttempts - 1) {
final backoffMs = (attempt + 1) * 1000;
await Future.delayed(Duration(milliseconds: backoffMs));
return pollServerForMessage(
attempt: attempt + 1,
maxAttempts: maxAttempts,
);
}
return null;
}
}
@@ -344,11 +364,33 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
}
if (hasSocketSignals) {
// Inactivity timeout - if no data arrives for 10 seconds, poll server
// Adaptive inactivity timeout based on model capabilities.
// Reasoning models and tool-enabled flows need longer windows as they may
// have longer gaps between tokens during processing.
final watchdogWindow = modelUsesReasoning || toolsEnabled
? const Duration(seconds: 30) // Longer for reasoning/tools
: const Duration(seconds: 15); // Standard for regular models
final watchdogCap = modelUsesReasoning || toolsEnabled
? const Duration(minutes: 10) // Longer cap for complex operations
: const Duration(minutes: 5);
DebugLogger.log(
'Initializing watchdog',
scope: 'streaming/helper',
data: {
'windowSeconds': watchdogWindow.inSeconds,
'capMinutes': watchdogCap.inMinutes,
'modelUsesReasoning': modelUsesReasoning,
'toolsEnabled': toolsEnabled,
},
);
// Inactivity timeout - if no data arrives within window, poll server
// and finish streaming. This handles stuck connections (issue #172).
socketWatchdog = InactivityWatchdog(
window: const Duration(seconds: 10),
absoluteCap: const Duration(minutes: 5),
window: watchdogWindow,
absoluteCap: watchdogCap,
onTimeout: () async {
DebugLogger.log(
'Socket watchdog timeout - polling server',
@@ -1463,14 +1505,26 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
if (isRecoverable && socketService != null) {
// Try to recover via socket connection if available
try {
await socketService.ensureConnected(
final connected = await socketService.ensureConnected(
timeout: const Duration(seconds: 5),
);
// Don't finish streaming immediately - let socket recovery handle it
socketWatchdog?.stop();
return;
} catch (_) {
// Socket recovery failed, fall through to cleanup
if (connected) {
DebugLogger.log(
'Socket recovery successful - restarting watchdog',
scope: 'streaming/helper',
);
// Restart watchdog instead of stopping it - this ensures we
// still have a timeout mechanism if socket recovery succeeds
// but events don't resume (fixes premature watchdog stop bug)
socketWatchdog?.ping();
return;
}
} catch (e) {
DebugLogger.log(
'Socket recovery failed: $e',
scope: 'streaming/helper',
);
}
}