refactor: Enhance markdown processing for text-to-speech conversion

- Introduced new regex patterns to remove thinking and reasoning blocks (`<details type="reasoning">`, `<think>`, and `<reasoning>`) from markdown input.
- Added functionality to strip emojis from the text, improving clarity for TTS.
- Implemented decoding of common named HTML entities (`&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) to ensure proper text representation.
- Replaced the widget-local `_sanitizeForSpeech` method with the more comprehensive `MarkdownToText.convert` markdown-to-text conversion, enhancing performance and maintainability.
2025-10-20 23:53:07 +05:30
parent 6ea7b3231c
commit e05a560966
2 changed files with 49 additions and 48 deletions
@@ -5,6 +5,25 @@
 class MarkdownToText {
  const MarkdownToText._();

+  static final _thinkingBlockRegex = RegExp(
+    r'<details\s+type="reasoning"[^>]*>.*?</details>',
+    multiLine: true,
+    dotAll: true,
+  );
+  static final _thinkTagRegex = RegExp(
+    r'<think>.*?</think>',
+    multiLine: true,
+    dotAll: true,
+  );
+  static final _reasoningTagRegex = RegExp(
+    r'<reasoning>.*?</reasoning>',
+    multiLine: true,
+    dotAll: true,
+  );
+  static final _emojiRegex = RegExp(
+    r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]',
+    unicode: true,
+  );
  static final _codeBlockRegex = RegExp(
    r'```[^\n]*\n(.*?)```',
    multiLine: true,
@@ -29,18 +48,21 @@ class MarkdownToText {
    multiLine: true,
  );
  static final _htmlTagRegex = RegExp(r'<[^>]+>');
+  static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;');
  static final _multipleNewlinesRegex = RegExp(r'\n{3,}');
  static final _multipleSpacesRegex = RegExp(r' {2,}');

  /// Converts markdown text to plain text suitable for TTS.
  ///
+  /// - Removes thinking/reasoning blocks
+  /// - Removes emojis
  /// - Removes code blocks (replaces with descriptive text)
  /// - Strips all formatting (bold, italic, strikethrough)
  /// - Converts links to just their text
  /// - Removes images (or converts to alt text)
  /// - Simplifies headings
  /// - Preserves list structure with natural pauses
-  /// - Removes HTML tags
+  /// - Removes HTML tags and entities
  /// - Normalizes whitespace
  static String convert(String markdown) {
    if (markdown.trim().isEmpty) {
@@ -49,13 +71,20 @@ class MarkdownToText {

    var text = markdown;

+    // Remove thinking/reasoning blocks (must be done before general HTML tag removal)
+    text = text.replaceAll(_thinkingBlockRegex, '');
+    text = text.replaceAll(_thinkTagRegex, '');
+    text = text.replaceAll(_reasoningTagRegex, '');
+
+    // Remove emojis
+    text = text.replaceAll(_emojiRegex, '');
+
    // Remove or replace code blocks with descriptive text
    text = text.replaceAllMapped(_codeBlockRegex, (match) {
      final code = match[1]?.trim() ?? '';
      if (code.isEmpty) {
        return '';
      }
-      // For TTS, skip code blocks or use a brief description
      return ' (code block) ';
    });

@@ -86,7 +115,6 @@ class MarkdownToText {
    // Simplify headings (remove # symbols)
    text = text.replaceAllMapped(_headingRegex, (match) {
      final heading = match[1] ?? '';
-      // Add a pause after headings for natural speech flow
      return '$heading.\n';
    });

@@ -103,6 +131,20 @@ class MarkdownToText {
    // Remove HTML tags
    text = text.replaceAll(_htmlTagRegex, '');

+    // Decode HTML entities
+    text = text.replaceAllMapped(_htmlEntityRegex, (match) {
+      final entity = match[0] ?? '';
+      return switch (entity) {
+        '&nbsp;' => ' ',
+        '&amp;' => '&',
+        '&lt;' => '<',
+        '&gt;' => '>',
+        '&quot;' => '"',
+        '&apos;' => "'",
+        _ => entity,
+      };
+    });
+
    // Normalize whitespace
    text = text.replaceAll(_multipleNewlinesRegex, '\n\n');
    text = text.replaceAll(_multipleSpacesRegex, ' ');
@@ -11,6 +11,7 @@ import '../../../core/utils/reasoning_parser.dart';
 import '../../../core/utils/message_segments.dart';
 import '../../../core/utils/tool_calls_parser.dart';
 import '../../../core/models/chat_message.dart';
+import '../../../core/utils/markdown_to_text.dart';
 import '../providers/text_to_speech_provider.dart';
 import 'enhanced_image_attachment.dart';
 import 'package:conduit/l10n/app_localizations.dart';
@@ -23,21 +24,6 @@ import '../../../core/utils/debug_logger.dart';
 import 'sources/openwebui_sources.dart';
 import '../providers/assistant_response_builder_provider.dart';

-// Pre-compiled regex patterns for TTS sanitization (performance optimization)
-final _ttsCodeBlockPattern = RegExp(r'```');
-final _ttsInlineCodePattern = RegExp(r'`');
-final _ttsImagePattern = RegExp(r'!\[(.*?)\]\((.*?)\)');
-final _ttsLinkPattern = RegExp(r'\[(.*?)\]\((.*?)\)');
-final _ttsBoldPattern1 = RegExp(r'\*\*');
-final _ttsBoldPattern2 = RegExp(r'__');
-final _ttsItalicPattern1 = RegExp(r'\*');
-final _ttsItalicPattern2 = RegExp(r'_');
-final _ttsStrikePattern = RegExp(r'~');
-final _ttsListPattern = RegExp(r'^[-*+]\s+', multiLine: true);
-final _ttsQuotePattern = RegExp(r'^>\s?', multiLine: true);
-final _ttsMultiSpacePattern = RegExp(r'[ \t]{2,}');
-final _ttsMultiNewlinePattern = RegExp(r'\n{3,}');
-
 // Pre-compiled regex patterns for image processing (performance optimization)
 final _base64ImagePattern = RegExp(r'data:image/[^;]+;base64,[A-Za-z0-9+/]+=*');
 final _fileIdPattern = RegExp(r'/api/v1/files/([^/]+)/content');
@@ -258,7 +244,7 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>

  String _buildTtsPlainText(List<MessageSegment> segments, String fallback) {
    if (segments.isEmpty) {
-      return _sanitizeForSpeech(fallback);
+      return MarkdownToText.convert(fallback);
    }

    final buffer = StringBuffer();
@@ -267,7 +253,7 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
        continue;
      }
      final text = segment.text ?? '';
-      final sanitized = _sanitizeForSpeech(text);
+      final sanitized = MarkdownToText.convert(text);
      if (sanitized.isEmpty) {
        continue;
      }
@@ -280,38 +266,11 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>

    final result = buffer.toString().trim();
    if (result.isEmpty) {
-      return _sanitizeForSpeech(fallback);
+      return MarkdownToText.convert(fallback);
    }
    return result;
  }

-  String _sanitizeForSpeech(String input) {
-    if (input.isEmpty) {
-      return '';
-    }
-
-    var text = input;
-    // Use pre-compiled regex patterns for better performance
-    text = text.replaceAll(_ttsCodeBlockPattern, ' ');
-    text = text.replaceAll(_ttsInlineCodePattern, '');
-    text = text.replaceAll(_ttsImagePattern, r'$1');
-    text = text.replaceAll(_ttsLinkPattern, r'$1');
-    text = text.replaceAll(_ttsBoldPattern1, '');
-    text = text.replaceAll(_ttsBoldPattern2, '');
-    text = text.replaceAll(_ttsItalicPattern1, '');
-    text = text.replaceAll(_ttsItalicPattern2, '');
-    text = text.replaceAll(_ttsStrikePattern, '');
-    text = text.replaceAll(_ttsListPattern, '');
-    text = text.replaceAll(_ttsQuotePattern, '');
-    text = text.replaceAll('&nbsp;', ' ');
-    text = text.replaceAll('&amp;', '&');
-    text = text.replaceAll('&lt;', '<');
-    text = text.replaceAll('&gt;', '>');
-    text = text.replaceAll(_ttsMultiSpacePattern, ' ');
-    text = text.replaceAll(_ttsMultiNewlinePattern, '\n\n');
-    return text.trim();
-  }
-
  // No streaming-specific markdown fixes needed here; handled by Markdown widget

  Widget _buildToolCallTile(ToolCallEntry tc) {