diff --git a/lib/core/utils/markdown_to_text.dart b/lib/core/utils/markdown_to_text.dart index 0debd27..687720f 100644 --- a/lib/core/utils/markdown_to_text.dart +++ b/lib/core/utils/markdown_to_text.dart @@ -5,6 +5,25 @@ class MarkdownToText { const MarkdownToText._(); + static final _thinkingBlockRegex = RegExp( + r']*>.*?', + multiLine: true, + dotAll: true, + ); + static final _thinkTagRegex = RegExp( + r'.*?', + multiLine: true, + dotAll: true, + ); + static final _reasoningTagRegex = RegExp( + r'.*?', + multiLine: true, + dotAll: true, + ); + static final _emojiRegex = RegExp( + r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]', + unicode: true, + ); static final _codeBlockRegex = RegExp( r'```[^\n]*\n(.*?)```', multiLine: true, @@ -29,18 +48,21 @@ class MarkdownToText { multiLine: true, ); static final _htmlTagRegex = RegExp(r'<[^>]+>'); + static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;'); static final _multipleNewlinesRegex = RegExp(r'\n{3,}'); static final _multipleSpacesRegex = RegExp(r' {2,}'); /// Converts markdown text to plain text suitable for TTS. /// + /// - Removes thinking/reasoning blocks + /// - Removes emojis /// - Removes code blocks (replaces with descriptive text) /// - Strips all formatting (bold, italic, strikethrough) /// - Converts links to just their text /// - Removes images (or converts to alt text) /// - Simplifies headings /// - Preserves list structure with natural pauses - /// - Removes HTML tags + /// - Removes HTML tags and entities /// - Normalizes whitespace static String convert(String markdown) { if (markdown.trim().isEmpty) { @@ -49,13 +71,20 @@ class MarkdownToText { var text = markdown; + // Remove thinking/reasoning blocks (must be done before general HTML tag removal) + text = text.replaceAll(_thinkingBlockRegex, ''); + text = text.replaceAll(_thinkTagRegex, ''); + text = text.replaceAll(_reasoningTagRegex, ''); + + // Remove emojis + text = text.replaceAll(_emojiRegex, ''); + // Remove or replace code blocks with descriptive text text = text.replaceAllMapped(_codeBlockRegex, (match) { final code = match[1]?.trim() ?? ''; if (code.isEmpty) { return ''; } - // For TTS, skip code blocks or use a brief description return ' (code block) '; }); @@ -86,7 +115,6 @@ class MarkdownToText { // Simplify headings (remove # symbols) text = text.replaceAllMapped(_headingRegex, (match) { final heading = match[1] ?? ''; - // Add a pause after headings for natural speech flow return '$heading.\n'; }); @@ -103,6 +131,20 @@ class MarkdownToText { // Remove HTML tags text = text.replaceAll(_htmlTagRegex, ''); + // Decode HTML entities + text = text.replaceAllMapped(_htmlEntityRegex, (match) { + final entity = match[0] ?? ''; + return switch (entity) { + ' ' => ' ', + '&' => '&', + '<' => '<', + '>' => '>', + '"' => '"', + ''' => "'", + _ => entity, + }; + }); + // Normalize whitespace text = text.replaceAll(_multipleNewlinesRegex, '\n\n'); text = text.replaceAll(_multipleSpacesRegex, ' '); diff --git a/lib/features/chat/widgets/assistant_message_widget.dart b/lib/features/chat/widgets/assistant_message_widget.dart index ee3b775..022c332 100644 --- a/lib/features/chat/widgets/assistant_message_widget.dart +++ b/lib/features/chat/widgets/assistant_message_widget.dart @@ -11,6 +11,7 @@ import '../../../core/utils/reasoning_parser.dart'; import '../../../core/utils/message_segments.dart'; import '../../../core/utils/tool_calls_parser.dart'; import '../../../core/models/chat_message.dart'; +import '../../../core/utils/markdown_to_text.dart'; import '../providers/text_to_speech_provider.dart'; import 'enhanced_image_attachment.dart'; import 'package:conduit/l10n/app_localizations.dart'; @@ -23,21 +24,6 @@ import '../../../core/utils/debug_logger.dart'; import 'sources/openwebui_sources.dart'; import '../providers/assistant_response_builder_provider.dart'; -// Pre-compiled regex patterns for TTS sanitization (performance optimization) -final _ttsCodeBlockPattern = RegExp(r'```'); -final _ttsInlineCodePattern = RegExp(r'`'); -final _ttsImagePattern = RegExp(r'!\[(.*?)\]\((.*?)\)'); -final _ttsLinkPattern = RegExp(r'\[(.*?)\]\((.*?)\)'); -final _ttsBoldPattern1 = RegExp(r'\*\*'); -final _ttsBoldPattern2 = RegExp(r'__'); -final _ttsItalicPattern1 = RegExp(r'\*'); -final _ttsItalicPattern2 = RegExp(r'_'); -final _ttsStrikePattern = RegExp(r'~'); -final _ttsListPattern = RegExp(r'^[-*+]\s+', multiLine: true); -final _ttsQuotePattern = RegExp(r'^>\s?', multiLine: true); -final _ttsMultiSpacePattern = RegExp(r'[ \t]{2,}'); -final _ttsMultiNewlinePattern = RegExp(r'\n{3,}'); - // Pre-compiled regex patterns for image processing (performance optimization) final _base64ImagePattern = RegExp(r'data:image/[^;]+;base64,[A-Za-z0-9+/]+=*'); final _fileIdPattern = RegExp(r'/api/v1/files/([^/]+)/content'); @@ -258,7 +244,7 @@ class _AssistantMessageWidgetState extends ConsumerState String _buildTtsPlainText(List segments, String fallback) { if (segments.isEmpty) { - return _sanitizeForSpeech(fallback); + return MarkdownToText.convert(fallback); } final buffer = StringBuffer(); @@ -267,7 +253,7 @@ class _AssistantMessageWidgetState extends ConsumerState continue; } final text = segment.text ?? ''; - final sanitized = _sanitizeForSpeech(text); + final sanitized = MarkdownToText.convert(text); if (sanitized.isEmpty) { continue; } @@ -280,38 +266,11 @@ class _AssistantMessageWidgetState extends ConsumerState final result = buffer.toString().trim(); if (result.isEmpty) { - return _sanitizeForSpeech(fallback); + return MarkdownToText.convert(fallback); } return result; } - String _sanitizeForSpeech(String input) { - if (input.isEmpty) { - return ''; - } - - var text = input; - // Use pre-compiled regex patterns for better performance - text = text.replaceAll(_ttsCodeBlockPattern, ' '); - text = text.replaceAll(_ttsInlineCodePattern, ''); - text = text.replaceAll(_ttsImagePattern, r'$1'); - text = text.replaceAll(_ttsLinkPattern, r'$1'); - text = text.replaceAll(_ttsBoldPattern1, ''); - text = text.replaceAll(_ttsBoldPattern2, ''); - text = text.replaceAll(_ttsItalicPattern1, ''); - text = text.replaceAll(_ttsItalicPattern2, ''); - text = text.replaceAll(_ttsStrikePattern, ''); - text = text.replaceAll(_ttsListPattern, ''); - text = text.replaceAll(_ttsQuotePattern, ''); - text = text.replaceAll(' ', ' '); - text = text.replaceAll('&', '&'); - text = text.replaceAll('<', '<'); - text = text.replaceAll('>', '>'); - text = text.replaceAll(_ttsMultiSpacePattern, ' '); - text = text.replaceAll(_ttsMultiNewlinePattern, '\n\n'); - return text.trim(); - } - // No streaming-specific markdown fixes needed here; handled by Markdown widget Widget _buildToolCallTile(ToolCallEntry tc) {