From 5fd68f86fe2a83f92fd303b13a28df0020cde6a4 Mon Sep 17 00:00:00 2001 From: cogwheel <172976095+cogwheel0@users.noreply.github.com> Date: Mon, 22 Dec 2025 14:07:04 +0530 Subject: [PATCH] refactor(markdown): remove deprecated stream formatter and enhance preprocessor --- lib/core/utils/html_utils.dart | 18 -- lib/core/utils/markdown_stream_formatter.dart | 71 ----- lib/core/utils/markdown_to_text.dart | 160 ---------- lib/core/utils/reasoning_parser.dart | 100 ++++-- lib/core/utils/tool_calls_parser.dart | 16 +- .../chat/providers/chat_providers.dart | 112 +------ .../providers/text_to_speech_provider.dart | 4 +- .../chat/services/voice_call_service.dart | 4 +- lib/features/chat/views/chat_page.dart | 33 +- lib/features/chat/views/voice_call_page.dart | 4 +- .../widgets/assistant_message_widget.dart | 42 ++- .../markdown/markdown_preprocessor.dart | 288 +++++++++++++----- 12 files changed, 347 insertions(+), 505 deletions(-) delete mode 100644 lib/core/utils/html_utils.dart delete mode 100644 lib/core/utils/markdown_stream_formatter.dart delete mode 100644 lib/core/utils/markdown_to_text.dart diff --git a/lib/core/utils/html_utils.dart b/lib/core/utils/html_utils.dart deleted file mode 100644 index 6b4659e..0000000 --- a/lib/core/utils/html_utils.dart +++ /dev/null @@ -1,18 +0,0 @@ -/// HTML entity utilities for parsing content. -/// -/// Reference: openwebui-src/src/lib/utils/index.ts (unescapeHtml) -library; - -import 'package:html_unescape/html_unescape.dart'; - -/// Utility class for HTML entity handling. -class HtmlUtils { - /// HTML entity unescaper instance. - static final _unescape = HtmlUnescape(); - - /// Unescape HTML entities in a string. - /// - /// Handles all Named, Decimal, and Hexadecimal Character References. - static String unescapeHtml(String s) => _unescape.convert(s); -} - diff --git a/lib/core/utils/markdown_stream_formatter.dart b/lib/core/utils/markdown_stream_formatter.dart deleted file mode 100644 index 45d26b6..0000000 --- a/lib/core/utils/markdown_stream_formatter.dart +++ /dev/null @@ -1,71 +0,0 @@ -// Pre-compiled regex patterns for markdown syntax detection (performance optimization) -final _boldPattern = RegExp(r'\*\*'); -final _italicPattern = RegExp(r'(? _raw.toString(); - - String _syntheticClosures(String content) { - final buffer = StringBuffer(); - - final fenceCount = '```'.allMatches(content).length; - if (fenceCount.isOdd) { - buffer.writeln('```'); - } - - final boldCount = _boldPattern.allMatches(content).length; - if (boldCount.isOdd) { - buffer.write('**'); - } - - final italicCount = _italicPattern.allMatches(content).length; - if (italicCount.isOdd) { - buffer.write('*'); - } - - final openBrackets = '['.allMatches(content).length; - final closeBrackets = ']'.allMatches(content).length; - if (openBrackets > closeBrackets) { - buffer.write(List.filled(openBrackets - closeBrackets, ']').join()); - } - - final openParens = '('.allMatches(content).length; - final closeParens = ')'.allMatches(content).length; - if (openParens > closeParens) { - buffer.write(List.filled(openParens - closeParens, ')').join()); - } - - return buffer.toString(); - } -} diff --git a/lib/core/utils/markdown_to_text.dart b/lib/core/utils/markdown_to_text.dart deleted file mode 100644 index 687720f..0000000 --- a/lib/core/utils/markdown_to_text.dart +++ /dev/null @@ -1,160 +0,0 @@ -/// Converts markdown text to plain text suitable for text-to-speech. -/// -/// Strips formatting while preserving the semantic meaning and readability -/// of the content for audio consumption. -class MarkdownToText { - const MarkdownToText._(); - - static final _thinkingBlockRegex = RegExp( - r']*>.*?', - multiLine: true, - dotAll: true, - ); - static final _thinkTagRegex = RegExp( - r'.*?', - multiLine: true, - dotAll: true, - ); - static final _reasoningTagRegex = RegExp( - r'.*?', - multiLine: true, - dotAll: true, - ); - static final _emojiRegex = RegExp( - r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]', - unicode: true, - ); - static final _codeBlockRegex = RegExp( - r'```[^\n]*\n(.*?)```', - multiLine: true, - dotAll: true, - ); - static final _inlineCodeRegex = RegExp(r'`([^`]+)`'); - static final _boldItalicRegex = RegExp(r'\*\*\*([^*]+)\*\*\*'); - static final _boldRegex = RegExp(r'\*\*([^*]+)\*\*'); - static final _italicRegex = RegExp(r'\*([^*]+)\*|_([^_]+)_'); - static final _strikethroughRegex = RegExp(r'~~([^~]+)~~'); - static final _linkRegex = RegExp(r'\[([^\]]+)\]\([^)]+\)'); - static final _imageRegex = RegExp(r'!\[([^\]]*)\]\([^)]+\)'); - static final _headingRegex = RegExp(r'^#{1,6}\s+(.+)$', multiLine: true); - static final _listItemRegex = RegExp(r'^[\s]*[-*+]\s+(.+)$', multiLine: true); - static final _orderedListRegex = RegExp( - r'^[\s]*\d+\.\s+(.+)$', - multiLine: true, - ); - static final _blockquoteRegex = RegExp(r'^>\s*(.+)$', multiLine: true); - static final _horizontalRuleRegex = RegExp( - r'^[\s]*[-*_]{3,}[\s]*$', - multiLine: true, - ); - static final _htmlTagRegex = RegExp(r'<[^>]+>'); - static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;'); - static final _multipleNewlinesRegex = RegExp(r'\n{3,}'); - static final _multipleSpacesRegex = RegExp(r' {2,}'); - - /// Converts markdown text to plain text suitable for TTS. - /// - /// - Removes thinking/reasoning blocks - /// - Removes emojis - /// - Removes code blocks (replaces with descriptive text) - /// - Strips all formatting (bold, italic, strikethrough) - /// - Converts links to just their text - /// - Removes images (or converts to alt text) - /// - Simplifies headings - /// - Preserves list structure with natural pauses - /// - Removes HTML tags and entities - /// - Normalizes whitespace - static String convert(String markdown) { - if (markdown.trim().isEmpty) { - return ''; - } - - var text = markdown; - - // Remove thinking/reasoning blocks (must be done before general HTML tag removal) - text = text.replaceAll(_thinkingBlockRegex, ''); - text = text.replaceAll(_thinkTagRegex, ''); - text = text.replaceAll(_reasoningTagRegex, ''); - - // Remove emojis - text = text.replaceAll(_emojiRegex, ''); - - // Remove or replace code blocks with descriptive text - text = text.replaceAllMapped(_codeBlockRegex, (match) { - final code = match[1]?.trim() ?? ''; - if (code.isEmpty) { - return ''; - } - return ' (code block) '; - }); - - // Remove inline code backticks but keep the content - text = text.replaceAllMapped(_inlineCodeRegex, (match) => match[1] ?? ''); - - // Strip bold/italic/strikethrough formatting - text = text.replaceAllMapped(_boldItalicRegex, (match) => match[1] ?? ''); - text = text.replaceAllMapped(_boldRegex, (match) => match[1] ?? ''); - text = text.replaceAllMapped( - _italicRegex, - (match) => match[1] ?? match[2] ?? '', - ); - text = text.replaceAllMapped( - _strikethroughRegex, - (match) => match[1] ?? '', - ); - - // Convert links to just their text - text = text.replaceAllMapped(_linkRegex, (match) => match[1] ?? ''); - - // Remove images (or use alt text if available) - text = text.replaceAllMapped(_imageRegex, (match) { - final alt = match[1]?.trim() ?? ''; - return alt.isNotEmpty ? ' ($alt image) ' : ''; - }); - - // Simplify headings (remove # symbols) - text = text.replaceAllMapped(_headingRegex, (match) { - final heading = match[1] ?? ''; - return '$heading.\n'; - }); - - // Preserve list items with natural pauses - text = text.replaceAllMapped(_listItemRegex, (match) => '${match[1]}. '); - text = text.replaceAllMapped(_orderedListRegex, (match) => '${match[1]}. '); - - // Remove blockquote markers - text = text.replaceAllMapped(_blockquoteRegex, (match) => match[1] ?? ''); - - // Remove horizontal rules - text = text.replaceAll(_horizontalRuleRegex, ''); - - // Remove HTML tags - text = text.replaceAll(_htmlTagRegex, ''); - - // Decode HTML entities - text = text.replaceAllMapped(_htmlEntityRegex, (match) { - final entity = match[0] ?? ''; - return switch (entity) { - ' ' => ' ', - '&' => '&', - '<' => '<', - '>' => '>', - '"' => '"', - ''' => "'", - _ => entity, - }; - }); - - // Normalize whitespace - text = text.replaceAll(_multipleNewlinesRegex, '\n\n'); - text = text.replaceAll(_multipleSpacesRegex, ' '); - - // Convert newlines to spaces for natural speech flow - text = text.replaceAll('\n', ' '); - - // Final cleanup - text = text.trim(); - - return text; - } -} diff --git a/lib/core/utils/reasoning_parser.dart b/lib/core/utils/reasoning_parser.dart index 18483d6..a8602dc 100644 --- a/lib/core/utils/reasoning_parser.dart +++ b/lib/core/utils/reasoning_parser.dart @@ -7,7 +7,12 @@ /// Reference: openwebui-src/backend/open_webui/utils/middleware.py DEFAULT_REASONING_TAGS library; -import 'html_utils.dart'; +import 'package:html_unescape/html_unescape.dart'; + +final _htmlUnescape = HtmlUnescape(); + +/// Unescape HTML entities in reasoning content. +String _unescapeHtml(String s) => _htmlUnescape.convert(s); /// All reasoning tag pairs supported by Open WebUI. /// Reference: DEFAULT_REASONING_TAGS in middleware.py @@ -181,9 +186,25 @@ class ReasoningParser { } // Check for raw tag pairs + // Supports tags with optional attributes like + // Reference: openwebui-src/backend/open_webui/utils/middleware.py for (final pair in tagPairs) { final startTag = pair.$1; - final idx = content.indexOf(startTag, index); + int idx = -1; + + // For XML-like tags (e.g., ), match with optional attributes + if (startTag.startsWith('<') && startTag.endsWith('>')) { + final tagName = startTag.substring(1, startTag.length - 1); + final pattern = RegExp('<${RegExp.escape(tagName)}(\\s[^>]*)?>'); + final match = pattern.firstMatch(content.substring(index)); + if (match != null) { + idx = index + match.start; + } + } else { + // For non-XML tags (e.g., ◁think▷), use exact matching + idx = content.indexOf(startTag, index); + } + if (idx != -1 && (nextRawIdx == -1 || idx < nextRawIdx)) { nextRawIdx = idx; matchedRawPair = pair; @@ -336,8 +357,8 @@ class ReasoningParser { return _DetailsResult( entry: ReasoningEntry( - reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining), - summary: HtmlUtils.unescapeHtml(summaryResult.summary), + reasoning: _unescapeHtml(summaryResult.remaining), + summary: _unescapeHtml(summaryResult.summary), duration: effectiveDuration, isDone: false, blockType: blockType, @@ -368,8 +389,8 @@ class ReasoningParser { return _DetailsResult( entry: ReasoningEntry( - reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining), - summary: HtmlUtils.unescapeHtml(summaryResult.summary), + reasoning: _unescapeHtml(summaryResult.remaining), + summary: _unescapeHtml(summaryResult.summary), duration: effectiveDuration, isDone: isDone, blockType: blockType, @@ -381,20 +402,47 @@ class ReasoningParser { } /// Parse a raw reasoning tag pair (e.g., `...`). + /// Supports tags with optional attributes like ``. + /// + /// Reference: openwebui-src/backend/open_webui/utils/middleware.py static _ReasoningResult _parseRawReasoning( String content, int startIdx, String startTag, String endTag, ) { - final endIdx = content.indexOf(endTag, startIdx + startTag.length); + // Find the actual end of the opening tag (handles attributes) + int contentStartIdx; + if (startTag.startsWith('<') && startTag.endsWith('>')) { + // For XML-like tags, find the closing '>' to skip any attributes + final tagCloseIdx = content.indexOf('>', startIdx); + if (tagCloseIdx == -1) { + // Incomplete opening tag + return _ReasoningResult( + entry: ReasoningEntry( + reasoning: '', + summary: '', + duration: 0, + isDone: false, + ), + endIndex: content.length, + isComplete: false, + ); + } + contentStartIdx = tagCloseIdx + 1; + } else { + // For non-XML tags, use exact tag length + contentStartIdx = startIdx + startTag.length; + } + + final endIdx = content.indexOf(endTag, contentStartIdx); if (endIdx == -1) { // Incomplete block (streaming) - final innerContent = content.substring(startIdx + startTag.length); + final innerContent = content.substring(contentStartIdx); return _ReasoningResult( entry: ReasoningEntry( - reasoning: HtmlUtils.unescapeHtml(innerContent.trim()), + reasoning: _unescapeHtml(innerContent.trim()), summary: '', duration: 0, isDone: false, @@ -405,10 +453,10 @@ class ReasoningParser { } // Complete block - final innerContent = content.substring(startIdx + startTag.length, endIdx); + final innerContent = content.substring(contentStartIdx, endIdx); return _ReasoningResult( entry: ReasoningEntry( - reasoning: HtmlUtils.unescapeHtml(innerContent.trim()), + reasoning: _unescapeHtml(innerContent.trim()), summary: '', duration: 0, isDone: true, @@ -533,23 +581,33 @@ class ReasoningParser { } /// Formats the duration for display. - /// Mirrors Open WebUI's formatting: + /// Mirrors Open WebUI's dayjs.duration(seconds, 'seconds').humanize(): /// - < 1: "less than a second" /// - < 60: "X seconds" - /// - >= 60: humanized (e.g., "2 minutes") + /// - >= 60: humanized (e.g., "a minute", "2 minutes", "about an hour") + /// + /// Reference: openwebui-src/src/lib/components/common/Collapsible.svelte static String formatDuration(int seconds) { if (seconds < 1) return 'less than a second'; if (seconds < 60) return '$seconds second${seconds == 1 ? '' : 's'}'; - final minutes = seconds ~/ 60; - final remainingSeconds = seconds % 60; - - if (remainingSeconds == 0) { - return '$minutes minute${minutes == 1 ? '' : 's'}'; + // Match dayjs.duration().humanize() behavior + // Reference: https://day.js.org/docs/en/durations/humanize + if (seconds < 90) return 'a minute'; + if (seconds < 2700) { + // 45 minutes + final minutes = (seconds / 60).round(); + return '$minutes minutes'; } - - // For mixed minutes and seconds, use abbreviated format - return '$minutes min ${remainingSeconds}s'; + if (seconds < 5400) return 'about an hour'; // 90 minutes + if (seconds < 79200) { + // 22 hours + final hours = (seconds / 3600).round(); + return '$hours hours'; + } + if (seconds < 129600) return 'a day'; // 36 hours + final days = (seconds / 86400).round(); + return '$days days'; } } diff --git a/lib/core/utils/tool_calls_parser.dart b/lib/core/utils/tool_calls_parser.dart index 6ecc904..7cb6b6a 100644 --- a/lib/core/utils/tool_calls_parser.dart +++ b/lib/core/utils/tool_calls_parser.dart @@ -1,5 +1,7 @@ import 'dart:convert'; +import '../../shared/widgets/markdown/markdown_preprocessor.dart'; + /// Parsed representation of one tool call emitted as a `
` block class ToolCallEntry { final String id; @@ -255,18 +257,8 @@ class ToolCallsParser { static String sanitizeForApi(String content) { if (content.isEmpty) return content; - // Remove blocks we never want to include in conversation context - final removeTypes = ['reasoning', 'code_interpreter']; - for (final t in removeTypes) { - content = content.replaceAll( - RegExp( - ']*>[\\s\\S]*?
', - multiLine: true, - dotAll: true, - ), - '', - ); - } + // Remove annotations and reasoning blocks + content = ConduitMarkdownPreprocessor.sanitize(content); if (!content.contains('> { bool _taskStatusCheckInFlight = false; bool _observedRemoteTask = false; - MarkdownStreamFormatter? _markdownFormatter; - String? _activeStreamingMessageId; - bool _initialized = false; @override @@ -180,7 +176,6 @@ class ChatMessagesNotifier extends Notifier> { // Cancel any existing message stream when switching conversations _cancelMessageStream(); - _clearStreamingFormatter(); // Explicitly clear formatter on conversation switch _stopRemoteTaskMonitor(); if (next != null) { @@ -222,16 +217,10 @@ class ChatMessagesNotifier extends Notifier> { if (controller != null && controller.isActive) { unawaited(controller.cancel()); } - _clearStreamingFormatter(); cancelSocketSubscriptions(); _stopRemoteTaskMonitor(); } - void _clearStreamingFormatter() { - _markdownFormatter = null; - _activeStreamingMessageId = null; - } - /// Checks if streaming cleanup is needed when adopting server messages. /// Must be called BEFORE updating state, as it compares current local state /// with incoming server state. @@ -397,39 +386,6 @@ class ChatMessagesNotifier extends Notifier> { } } - void _ensureFormatterForMessage(ChatMessage message) { - // If we're switching to a different message, clear the old formatter first - if (_markdownFormatter != null && _activeStreamingMessageId != message.id) { - DebugLogger.log( - 'Clearing formatter for message switch: $_activeStreamingMessageId -> ${message.id}', - scope: 'chat/providers', - ); - _clearStreamingFormatter(); - } - - // If formatter already exists for this message, reuse it - if (_markdownFormatter != null && _activeStreamingMessageId == message.id) { - return; - } - - // Create new formatter - final formatter = MarkdownStreamFormatter(); - - // Only seed with existing content if this is a resume scenario - // For new messages (empty content), start fresh to avoid duplication - final seed = _stripStreamingPlaceholders(message.content); - if (seed.isNotEmpty && message.content.isNotEmpty) { - DebugLogger.log( - 'Seeding formatter with existing content (${seed.length} chars) for message ${message.id}', - scope: 'chat/providers', - ); - formatter.seed(seed); - } - - _markdownFormatter = formatter; - _activeStreamingMessageId = message.id; - } - String _stripStreamingPlaceholders(String content) { var result = content; const ti = '[TYPING_INDICATOR]'; @@ -443,15 +399,6 @@ class ChatMessagesNotifier extends Notifier> { return result; } - String _finalizeFormatter(String messageId, String fallback) { - if (_markdownFormatter != null && _activeStreamingMessageId == messageId) { - final output = _markdownFormatter!.finalize(); - _clearStreamingFormatter(); - return output; - } - return fallback; - } - void _touchStreamingActivity() { _lastStreamingActivity = DateTime.now(); if (_hasStreamingAssistant) { @@ -728,16 +675,11 @@ class ChatMessagesNotifier extends Notifier> { } void appendToLastMessage(String content) { - if (state.isEmpty) { - return; - } + if (state.isEmpty) return; final lastMessage = state.last; - if (lastMessage.role != 'assistant') { - return; - } + if (lastMessage.role != 'assistant') return; if (!lastMessage.isStreaming) { - // Ignore late chunks when streaming already finished DebugLogger.log( 'Ignoring late chunk for finished message: ${lastMessage.id}', scope: 'chat/providers', @@ -745,52 +687,21 @@ class ChatMessagesNotifier extends Notifier> { return; } - _ensureFormatterForMessage(lastMessage); - - // Defensive check: ensure the formatter is for the correct message - // This prevents cross-message pollution when messages change rapidly - if (_activeStreamingMessageId != lastMessage.id) { - DebugLogger.warning( - 'Formatter message ID mismatch: active=$_activeStreamingMessageId, last=${lastMessage.id}. Resetting formatter.', - ); - _clearStreamingFormatter(); - _ensureFormatterForMessage(lastMessage); - } - - final formatter = _markdownFormatter!; - final preview = formatter.ingest(content); - + // Append content directly - the widget's normalize() handles incomplete markdown state = [ ...state.sublist(0, state.length - 1), - lastMessage.copyWith(content: preview), + lastMessage.copyWith(content: lastMessage.content + content), ]; _touchStreamingActivity(); } void replaceLastMessageContent(String content) { - if (state.isEmpty) { - return; - } + if (state.isEmpty) return; final lastMessage = state.last; - if (lastMessage.role != 'assistant') { - return; - } - - _ensureFormatterForMessage(lastMessage); - - // Defensive check: ensure the formatter is for the correct message - if (_activeStreamingMessageId != lastMessage.id) { - DebugLogger.warning( - 'Formatter message ID mismatch in replace: active=$_activeStreamingMessageId, last=${lastMessage.id}. Resetting formatter.', - ); - _clearStreamingFormatter(); - _ensureFormatterForMessage(lastMessage); - } - - final formatter = _markdownFormatter!; - final sanitized = formatter.replace(_stripStreamingPlaceholders(content)); + if (lastMessage.role != 'assistant') return; + final sanitized = _stripStreamingPlaceholders(content); state = [ ...state.sublist(0, state.length - 1), lastMessage.copyWith(content: sanitized), @@ -804,8 +715,7 @@ class ChatMessagesNotifier extends Notifier> { final lastMessage = state.last; if (lastMessage.role != 'assistant' || !lastMessage.isStreaming) return; - final finalized = _finalizeFormatter(lastMessage.id, lastMessage.content); - final cleaned = _stripStreamingPlaceholders(finalized); + final cleaned = _stripStreamingPlaceholders(lastMessage.content); var updatedLast = lastMessage.copyWith( isStreaming: false, @@ -1005,11 +915,7 @@ Future restoreDefaultModel(dynamic ref) async { try { await ref.read(defaultModelProvider.future); } catch (e) { - DebugLogger.error( - 'restore-default-failed', - scope: 'chat/model', - error: e, - ); + DebugLogger.error('restore-default-failed', scope: 'chat/model', error: e); } } diff --git a/lib/features/chat/providers/text_to_speech_provider.dart b/lib/features/chat/providers/text_to_speech_provider.dart index 759a19f..3d4253e 100644 --- a/lib/features/chat/providers/text_to_speech_provider.dart +++ b/lib/features/chat/providers/text_to_speech_provider.dart @@ -4,7 +4,7 @@ import 'package:flutter_riverpod/flutter_riverpod.dart'; import '../../../core/services/settings_service.dart'; import '../../../core/providers/app_providers.dart'; -import '../../../core/utils/markdown_to_text.dart'; +import '../../../shared/widgets/markdown/markdown_preprocessor.dart'; import '../services/text_to_speech_service.dart'; enum TtsPlaybackStatus { idle, initializing, loading, speaking, paused, error } @@ -218,7 +218,7 @@ class TextToSpeechController extends Notifier { } // Prepare sentence split for highlighting - final cleanText = MarkdownToText.convert(text); + final cleanText = ConduitMarkdownPreprocessor.toPlainText(text); final sentences = _service.splitTextForSpeech(cleanText); final offsets = _computeOffsets(cleanText, sentences); diff --git a/lib/features/chat/services/voice_call_service.dart b/lib/features/chat/services/voice_call_service.dart index bb786d4..d98fbf1 100644 --- a/lib/features/chat/services/voice_call_service.dart +++ b/lib/features/chat/services/voice_call_service.dart @@ -12,7 +12,7 @@ import '../../../core/providers/app_providers.dart'; import '../../../core/services/background_streaming_handler.dart'; import '../../../core/services/callkit_service.dart'; import '../../../core/services/socket_service.dart'; -import '../../../core/utils/markdown_to_text.dart'; +import '../../../shared/widgets/markdown/markdown_preprocessor.dart'; import '../providers/chat_providers.dart'; import 'text_to_speech_service.dart'; import '../../../core/services/settings_service.dart'; @@ -589,7 +589,7 @@ class VoiceCallService { void _processSpeakableSegments({required bool isFinalChunk}) { if (_isDisposed) return; - final cleanText = MarkdownToText.convert(_accumulatedResponse).trim(); + final cleanText = ConduitMarkdownPreprocessor.toPlainText(_accumulatedResponse).trim(); if (cleanText.isEmpty) { return; } diff --git a/lib/features/chat/views/chat_page.dart b/lib/features/chat/views/chat_page.dart index 7ba7699..3b2797e 100644 --- a/lib/features/chat/views/chat_page.dart +++ b/lib/features/chat/views/chat_page.dart @@ -18,6 +18,7 @@ import '../providers/chat_providers.dart'; import '../../../core/utils/debug_logger.dart'; import '../../../core/utils/user_display_name.dart'; import '../../../core/utils/model_icon_utils.dart'; +import '../../../shared/widgets/markdown/markdown_preprocessor.dart'; import '../../../core/utils/android_assistant_handler.dart'; import '../widgets/modern_chat_input.dart'; import '../widgets/user_message_bubble.dart'; @@ -1205,36 +1206,8 @@ class _ChatPageState extends ConsumerState { } void _copyMessage(String content) { - // Strip reasoning details from the copied content - String cleanedContent = content; - - // Remove
blocks - cleanedContent = cleanedContent.replaceAll( - RegExp( - r']*>[\s\S]*?<\/details>', - multiLine: true, - dotAll: true, - ), - '', - ); - - // Remove raw reasoning tags - cleanedContent = cleanedContent.replaceAll( - RegExp(r'[\s\S]*?<\/think>', multiLine: true, dotAll: true), - '', - ); - cleanedContent = cleanedContent.replaceAll( - RegExp( - r'[\s\S]*?<\/reasoning>', - multiLine: true, - dotAll: true, - ), - '', - ); - - // Clean up any extra whitespace - cleanedContent = cleanedContent.trim(); - + // Strip reasoning blocks and annotations from copied content + final cleanedContent = ConduitMarkdownPreprocessor.sanitize(content); Clipboard.setData(ClipboardData(text: cleanedContent)); } diff --git a/lib/features/chat/views/voice_call_page.dart b/lib/features/chat/views/voice_call_page.dart index 743f28a..a5863d7 100644 --- a/lib/features/chat/views/voice_call_page.dart +++ b/lib/features/chat/views/voice_call_page.dart @@ -7,7 +7,7 @@ import 'package:flutter/cupertino.dart'; import 'package:flutter_riverpod/flutter_riverpod.dart'; import '../../../core/providers/app_providers.dart'; -import '../../../core/utils/markdown_to_text.dart'; +import '../../../shared/widgets/markdown/markdown_preprocessor.dart'; import '../../../l10n/app_localizations.dart'; import '../../../shared/widgets/conduit_components.dart'; import '../providers/chat_providers.dart'; @@ -335,7 +335,7 @@ class _VoiceCallPageState extends ConsumerState } else if (_currentState == VoiceCallState.speaking && _currentResponse.isNotEmpty) { // Convert markdown to clean text for display - displayText = MarkdownToText.convert(_currentResponse); + displayText = ConduitMarkdownPreprocessor.toPlainText(_currentResponse); } if (displayText.isEmpty) { diff --git a/lib/features/chat/widgets/assistant_message_widget.dart b/lib/features/chat/widgets/assistant_message_widget.dart index a79000c..4270fa4 100644 --- a/lib/features/chat/widgets/assistant_message_widget.dart +++ b/lib/features/chat/widgets/assistant_message_widget.dart @@ -11,7 +11,7 @@ import '../../../core/utils/reasoning_parser.dart'; import '../../../core/utils/message_segments.dart'; import '../../../core/utils/tool_calls_parser.dart'; import '../../../core/models/chat_message.dart'; -import '../../../core/utils/markdown_to_text.dart'; +import '../../../shared/widgets/markdown/markdown_preprocessor.dart'; import '../providers/text_to_speech_provider.dart'; import 'enhanced_image_attachment.dart'; import 'package:conduit/l10n/app_localizations.dart'; @@ -166,6 +166,10 @@ class _AssistantMessageWidgetState extends ConsumerState raw = raw.substring(searchBanner.length); } + // Note: Link reference definitions (including OpenAI annotations like + // [openai_responses:v2:reasoning:ID]: #) are stripped by the markdown + // preprocessor using the `markdown` package for proper CommonMark handling. + // Do not truncate content during streaming; segmented parser skips // incomplete details blocks and tiles will render once complete. final rSegs = ReasoningParser.segments(raw); @@ -263,12 +267,12 @@ class _AssistantMessageWidgetState extends ConsumerState String _buildTtsPlainTextFallback(List segments, String fallback) { if (segments.isEmpty) { - return MarkdownToText.convert(fallback); + return ConduitMarkdownPreprocessor.toPlainText(fallback); } final buffer = StringBuffer(); for (final segment in segments) { - final sanitized = MarkdownToText.convert(segment); + final sanitized = ConduitMarkdownPreprocessor.toPlainText(segment); if (sanitized.isEmpty) { continue; } @@ -281,7 +285,7 @@ class _AssistantMessageWidgetState extends ConsumerState final result = buffer.toString().trim(); if (result.isEmpty) { - return MarkdownToText.convert(fallback); + return ConduitMarkdownPreprocessor.toPlainText(fallback); } return result; } @@ -1738,24 +1742,32 @@ class _AssistantMessageWidgetState extends ConsumerState summaryLower == 'thinking...' || summaryLower.startsWith('thinking'); + // Check if summary contains server-formatted duration (e.g., "(0s)", "for 0 secs") + final hasDurationInSummary = RegExp( + r'\(\d+s\)|\bfor \d+ secs?\b', + caseSensitive: false, + ).hasMatch(rc.summary); + // - If not done (streaming): show "Thinking..." - // - If done with duration: show "Thought for X seconds" - // - If done without duration: show "Thoughts" or custom summary + // - If done: show humanized "Thought for X" (uses our formatDuration) + // - If done without duration and has custom summary: show summary if (!rc.isDone) { // Still thinking - use summary if available, else default return hasSummary && !isThinkingSummary ? rc.summary : l10n.thinking; } - // Done thinking - check duration - if (rc.duration > 0) { + // Done thinking - always use humanized duration format + // This ensures "less than a second" instead of "0 secs" from server + if (rc.duration >= 0 && (rc.duration > 0 || hasDurationInSummary || isThinkingSummary)) { return l10n.thoughtForDuration(rc.formattedDuration); } - // No duration - use custom summary if meaningful, else default - if (!hasSummary || isThinkingSummary) { - return l10n.thoughts; + // Has custom summary that's not a duration - show it + if (hasSummary && !isThinkingSummary) { + return rc.summary; } - return rc.summary; + + return l10n.thoughts; } Widget buildHeader() { @@ -1863,13 +1875,13 @@ String _buildTtsPlainTextWorker(Map payload) { final segments = rawSegments is List ? rawSegments.cast() : const []; if (segments.isEmpty) { - return MarkdownToText.convert(fallback); + return ConduitMarkdownPreprocessor.toPlainText(fallback); } final buffer = StringBuffer(); for (final segment in segments) { if (segment is! String || segment.isEmpty) continue; - final sanitized = MarkdownToText.convert(segment); + final sanitized = ConduitMarkdownPreprocessor.toPlainText(segment); if (sanitized.isEmpty) continue; if (buffer.isNotEmpty) { buffer.writeln(); @@ -1880,7 +1892,7 @@ String _buildTtsPlainTextWorker(Map payload) { final result = buffer.toString().trim(); if (result.isEmpty) { - return MarkdownToText.convert(fallback); + return ConduitMarkdownPreprocessor.toPlainText(fallback); } return result; } diff --git a/lib/shared/widgets/markdown/markdown_preprocessor.dart b/lib/shared/widgets/markdown/markdown_preprocessor.dart index 4fee307..8db779c 100644 --- a/lib/shared/widgets/markdown/markdown_preprocessor.dart +++ b/lib/shared/widgets/markdown/markdown_preprocessor.dart @@ -1,10 +1,22 @@ -/// Utility helpers for normalising markdown content before handing it to -/// [ConduitMarkdown]. The goal is to keep streaming responsive while smoothing -/// out troublesome edge-cases (e.g. nested fences inside lists). +import 'package:html_unescape/html_unescape.dart'; +import 'package:markdown/markdown.dart' as md; + +/// Content preprocessing, sanitization, and transformation for Markdown. +/// +/// Provides: +/// - [normalize] - Prepares content for display (keeps reasoning blocks) +/// - [sanitize] - Cleans content for copy/API (removes reasoning blocks) +/// - [toPlainText] - Converts to plain text for TTS +/// - [softenInlineCode] - Breaks long inline code spans class ConduitMarkdownPreprocessor { const ConduitMarkdownPreprocessor._(); - // Pre-compile regex patterns for better performance during streaming + static final _htmlUnescape = HtmlUnescape(); + + // ============================================================ + // Pre-compiled Patterns - Display/Sanitization + // ============================================================ + static final _bulletFenceRegex = RegExp( r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$', multiLine: true, @@ -14,7 +26,8 @@ class ConduitMarkdownPreprocessor { multiLine: true, ); static final _dedentCloseRegex = RegExp(r'^[ \t]+```\s*$', multiLine: true); - static final _inlineClosingRegex = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))'); + static final _inlineClosingRegex = + RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))'); static final _labelThenDashRegex = RegExp( r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)', multiLine: true, @@ -24,92 +37,143 @@ class ConduitMarkdownPreprocessor { multiLine: true, ); static final _fenceAtBolRegex = RegExp(r'^\s*```', multiLine: true); + static final _linkWithTrailingSpaces = + RegExp(r'\[[^\]]+\]\([^\)]+\)\s{2,}$'); + static final _multipleNewlines = RegExp(r'\n{3,}'); - /// Normalises common fence and hard-break issues produced by LLMs. + /// Combined pattern for all reasoning/thinking blocks. + static final _reasoningBlocks = RegExp( + r']*>[\s\S]*?
|' + r'<(?:think|thinking|reasoning)(?:\s[^>]*)?>[\s\S]*?', + multiLine: true, + dotAll: true, + ); + + // ============================================================ + // Pre-compiled Patterns - Plain Text (TTS) + // ============================================================ + + static final _codeBlock = RegExp(r'```[^\n]*\n[\s\S]*?```'); + static final _inlineCode = RegExp(r'`([^`]+)`'); + static final _image = RegExp(r'!\[[^\]]*\]\([^)]+\)'); + static final _link = RegExp(r'\[([^\]]+)\]\([^)]+\)'); + // Paired markdown formatting - only unambiguous markers for TTS + // Single * and _ are skipped as they're ambiguous (math, variable names) + static final _boldItalic = RegExp(r'\*\*\*([^*]+)\*\*\*'); + static final _bold = RegExp(r'\*\*([^*]+)\*\*'); + static final _strikethrough = RegExp(r'~~([^~]+)~~'); + // Single asterisk italic: only at word boundaries (space or line start/end) + static final _italicAsterisk = RegExp(r'(?:^|\s)\*([^*\s]+)\*(?=\s|$)'); + // Single underscore italic: only when surrounded by spaces (not in identifiers) + static final _italicUnderscore = RegExp(r'(?:^|\s)_([^_\s]+)_(?=\s|$)'); + static final _heading = RegExp(r'^#{1,6}\s+', multiLine: true); + static final _listMarker = RegExp(r'^[\s]*(?:[-*+]|\d+\.)\s+', multiLine: true); + static final _blockquote = RegExp(r'^>\s*', multiLine: true); + static final _horizontalRule = RegExp(r'^[\s]*[-*_]{3,}[\s]*$', multiLine: true); + static final _htmlTag = RegExp(r'<[^>]+>'); + /// Comprehensive emoji pattern for TTS cleanup. + static final _emoji = RegExp( + r'[\u{1F600}-\u{1F64F}]|' // Emoticons + r'[\u{1F300}-\u{1F5FF}]|' // Misc Symbols and Pictographs + r'[\u{1F680}-\u{1F6FF}]|' // Transport and Map + r'[\u{1F1E0}-\u{1F1FF}]|' // Flags + r'[\u{2600}-\u{26FF}]|' // Misc symbols + r'[\u{2700}-\u{27BF}]|' // Dingbats + r'[\u{1F900}-\u{1F9FF}]|' // Supplemental Symbols + r'[\u{1FA00}-\u{1FA6F}]|' // Chess, cards + r'[\u{1FA70}-\u{1FAFF}]|' // Symbols Extended-A + r'[\u{FE00}-\u{FE0F}]|' // Variation Selectors + r'[\u{1F018}-\u{1F270}]|' // Various + r'[\u{238C}-\u{2454}]|' // Misc Technical + r'[\u{20D0}-\u{20FF}]', // Combining Diacritical Marks + unicode: true, + ); + static final _whitespace = RegExp(r'\s+'); + + // ============================================================ + // Public API + // ============================================================ + + /// Normalizes content for Markdown display. + /// + /// - Strips link reference definitions (including OpenAI annotations) + /// - Fixes common LLM fence issues + /// - Preserves reasoning blocks for collapsible UI rendering static String normalize(String input) { - if (input.isEmpty) { - return input; - } + if (input.isEmpty) return input; var output = input.replaceAll('\r\n', '\n'); - // Move fenced code blocks that start on the same line as a list item onto - // their own line so the parser does not treat them as list text. - output = output.replaceAllMapped( - _bulletFenceRegex, - (match) => '${match[1]}\n```${match[2]}', - ); + // Strip link reference definitions using markdown package + output = _stripLinkReferenceDefinitions(output); - // Dedent opening fences to avoid partial code-block detection when the - // model indents fences by accident. - output = output.replaceAllMapped( - _dedentOpenRegex, - (match) => '```${match[1]}', - ); + // Fix fence issues + output = _normalizeFences(output); - // Dedent closing fences for the same reason as the opening fences. - output = output.replaceAllMapped(_dedentCloseRegex, (_) => '```'); - - // Ensure closing fences stand alone. Prevents situations like `}\n```foo` - // from keeping trailing braces inside the code block. - output = output.replaceAllMapped( - _inlineClosingRegex, - (match) => '${match[1]}\n```', - ); - - // Insert a blank line when a "label: value" line is followed by a - // horizontal rule so it is not treated as a Setext heading underline. + // Fix Setext heading false positives output = output.replaceAllMapped( _labelThenDashRegex, (match) => '${match[1]}\n\n${match[2]}', ); - // Allow headings like "## 1. Summary" without triggering ordered-list - // parsing by inserting a zero-width joiner after the numeric marker. + // Fix numeric heading parsing output = output.replaceAllMapped( _atxEnumRegex, (match) => '${match[1]}.\u200C${match[2]}${match[3]}', ); - // Auto-close an unmatched opening fence at EOF to avoid the entire tail - // of the message rendering as code. - final fenceCount = _fenceAtBolRegex.allMatches(output).length; - if (fenceCount.isOdd) { - if (!output.endsWith('\n')) { - output += '\n'; - } - output += '```'; - } - - // Convert Markdown links followed by two trailing spaces into separate - // paragraphs so that consecutive links do not collapse into a single - // paragraph at render time. - final linkWithTrailingSpaces = RegExp(r'\[[^\]]+\]\([^\)]+\)\s{2,}$'); - final lines = output.split('\n'); - if (lines.length > 1) { - final buffer = StringBuffer(); - for (var i = 0; i < lines.length; i++) { - final line = lines[i]; - buffer.write(line); - if (i < lines.length - 1) { - buffer.write('\n'); - } - if (linkWithTrailingSpaces.hasMatch(line)) { - buffer.write('\n'); - } - } - output = buffer.toString(); - } + // Separate consecutive links + output = _separateConsecutiveLinks(output); return output; } - /// Inserts zero-width break characters into long inline code spans so they - /// remain readable and do not overflow narrow layouts. + /// Sanitizes content for clipboard copy or API submission. + /// + /// - Strips link reference definitions (including OpenAI annotations) + /// - Strips reasoning/thinking blocks + /// - Normalizes whitespace + static String sanitize(String input) { + if (input.isEmpty) return input; + + return input + .replaceAll('\r\n', '\n') + .transform(_stripLinkReferenceDefinitions) + .replaceAll(_reasoningBlocks, '') + .replaceAll(_multipleNewlines, '\n\n') + .trim(); + } + + /// Converts markdown to plain text for text-to-speech. + static String toPlainText(String input) { + if (input.trim().isEmpty) return ''; + + return sanitize(input) + .replaceAll(_codeBlock, '') // Remove code blocks + .replaceAllMapped(_inlineCode, (m) => m[1] ?? '') // Keep code text + .replaceAll(_image, '') // Remove images + .replaceAllMapped(_link, (m) => m[1] ?? '') // Keep link text + // Strip paired markdown formatting (preserves lone * and _ in text) + .replaceAllMapped(_boldItalic, (m) => m[1] ?? '') + .replaceAllMapped(_bold, (m) => m[1] ?? '') + .replaceAllMapped(_strikethrough, (m) => m[1] ?? '') + .replaceAllMapped(_italicAsterisk, (m) => ' ${m[1] ?? ''}') + .replaceAllMapped(_italicUnderscore, (m) => ' ${m[1] ?? ''}') + .replaceAll(_heading, '') // Strip # markers + .replaceAll(_listMarker, '') // Strip list markers + .replaceAll(_blockquote, '') // Strip > markers + .replaceAll(_horizontalRule, '') // Remove --- + .replaceAll(_htmlTag, '') // Remove HTML + .transform(_htmlUnescape.convert) // Decode entities + .replaceAll(_emoji, '') // Remove emojis + .replaceAll(_whitespace, ' ') // Normalize whitespace + .trim(); + } + + /// Breaks long inline code spans for better wrapping. static String softenInlineCode(String input, {int chunkSize = 24}) { - if (input.length <= chunkSize) { - return input; - } + if (input.length <= chunkSize) return input; + final buffer = StringBuffer(); for (var i = 0; i < input.length; i++) { buffer.write(input[i]); @@ -119,4 +183,90 @@ class ConduitMarkdownPreprocessor { } return buffer.toString(); } + + // ============================================================ + // Private Helpers + // ============================================================ + + static String _normalizeFences(String input) { + var output = input; + + // Move fences after list markers to new line + output = output.replaceAllMapped( + _bulletFenceRegex, + (match) => '${match[1]}\n```${match[2]}', + ); + + // Dedent opening fences + output = output.replaceAllMapped( + _dedentOpenRegex, + (match) => '```${match[1]}', + ); + + // Dedent closing fences + output = output.replaceAllMapped(_dedentCloseRegex, (_) => '```'); + + // Ensure closing fences stand alone + output = output.replaceAllMapped( + _inlineClosingRegex, + (match) => '${match[1]}\n```', + ); + + // Auto-close unmatched fence + final fenceCount = _fenceAtBolRegex.allMatches(output).length; + if (fenceCount.isOdd) { + if (!output.endsWith('\n')) output += '\n'; + output += '```'; + } + + return output; + } + + static String _separateConsecutiveLinks(String input) { + final lines = input.split('\n'); + if (lines.length <= 1) return input; + + final buffer = StringBuffer(); + for (var i = 0; i < lines.length; i++) { + final line = lines[i]; + buffer.write(line); + if (i < lines.length - 1) buffer.write('\n'); + if (_linkWithTrailingSpaces.hasMatch(line)) buffer.write('\n'); + } + return buffer.toString(); + } + + /// Strips link reference definitions using the `markdown` package. + static String _stripLinkReferenceDefinitions(String input) { + if (!input.contains('[')) return input; + + final document = md.Document(); + document.parseLines(input.split('\n')); + + final refLabels = document.linkReferences.keys.toSet(); + if (refLabels.isEmpty) return input; + + final labelPatterns = + refLabels.map((label) => RegExp.escape(label)).join('|'); + + final refDefRegex = RegExp( + r'^[ ]{0,3}\[(?:' + + labelPatterns + + r')\]:[ \t]*(?:<[^>]*>|[^\s]*)(?:[ \t]+(?:"[^"]*"|' + + r"'[^']*'" + + r'|\([^)]*\)))?[ \t]*$', + multiLine: true, + caseSensitive: false, + ); + + return input + .replaceAll(refDefRegex, '') + .replaceAll(_multipleNewlines, '\n\n') + .trim(); + } +} + +/// Extension for chaining string transformations. +extension _StringTransform on String { + String transform(String Function(String) fn) => fn(this); }