refactor(markdown): remove deprecated stream formatter and enhance preprocessor

2025-12-22 14:07:04 +05:30
parent 653162cb76
commit 5fd68f86fe
12 changed files with 347 additions and 505 deletions
@@ -1,18 +0,0 @@
-/// HTML entity utilities for parsing content.
-///
-/// Reference: openwebui-src/src/lib/utils/index.ts (unescapeHtml)
-library;
-
-import 'package:html_unescape/html_unescape.dart';
-
-/// Utility class for HTML entity handling.
-class HtmlUtils {
-  /// HTML entity unescaper instance.
-  static final _unescape = HtmlUnescape();
-
-  /// Unescape HTML entities in a string.
-  ///
-  /// Handles all Named, Decimal, and Hexadecimal Character References.
-  static String unescapeHtml(String s) => _unescape.convert(s);
-}
-
@@ -1,71 +0,0 @@
-// Pre-compiled regex patterns for markdown syntax detection (performance optimization)
-final _boldPattern = RegExp(r'\*\*');
-final _italicPattern = RegExp(r'(?<!\*)\*(?!\*)');
-
-/// Maintains a raw markdown buffer for streaming content and generates
-/// preview-safe output by appending synthetic closing tokens when necessary.
-class MarkdownStreamFormatter {
-  StringBuffer _raw = StringBuffer();
-
-  /// Seeds the formatter with existing markdown content.
-  void seed(String content) {
-    _raw = StringBuffer(content);
-  }
-
-  /// Adds a streaming chunk to the internal buffer and returns a preview-ready
-  /// string with any required synthetic closing markers.
-  String ingest(String chunk) {
-    if (chunk.isNotEmpty) {
-      _raw.write(chunk);
-    }
-    return preview();
-  }
-
-  /// Replaces the current buffer with the provided [content].
-  String replace(String content) {
-    seed(content);
-    return preview();
-  }
-
-  /// Returns the preview-safe markdown string.
-  String preview() {
-    final raw = _raw.toString();
-    return raw + _syntheticClosures(raw);
-  }
-
-  /// Returns the raw markdown accumulated so far.
-  String finalize() => _raw.toString();
-
-  String _syntheticClosures(String content) {
-    final buffer = StringBuffer();
-
-    final fenceCount = '```'.allMatches(content).length;
-    if (fenceCount.isOdd) {
-      buffer.writeln('```');
-    }
-
-    final boldCount = _boldPattern.allMatches(content).length;
-    if (boldCount.isOdd) {
-      buffer.write('**');
-    }
-
-    final italicCount = _italicPattern.allMatches(content).length;
-    if (italicCount.isOdd) {
-      buffer.write('*');
-    }
-
-    final openBrackets = '['.allMatches(content).length;
-    final closeBrackets = ']'.allMatches(content).length;
-    if (openBrackets > closeBrackets) {
-      buffer.write(List.filled(openBrackets - closeBrackets, ']').join());
-    }
-
-    final openParens = '('.allMatches(content).length;
-    final closeParens = ')'.allMatches(content).length;
-    if (openParens > closeParens) {
-      buffer.write(List.filled(openParens - closeParens, ')').join());
-    }
-
-    return buffer.toString();
-  }
-}
@@ -1,160 +0,0 @@
-/// Converts markdown text to plain text suitable for text-to-speech.
-///
-/// Strips formatting while preserving the semantic meaning and readability
-/// of the content for audio consumption.
-class MarkdownToText {
-  const MarkdownToText._();
-
-  static final _thinkingBlockRegex = RegExp(
-    r'<details\s+type="reasoning"[^>]*>.*?</details>',
-    multiLine: true,
-    dotAll: true,
-  );
-  static final _thinkTagRegex = RegExp(
-    r'<think>.*?</think>',
-    multiLine: true,
-    dotAll: true,
-  );
-  static final _reasoningTagRegex = RegExp(
-    r'<reasoning>.*?</reasoning>',
-    multiLine: true,
-    dotAll: true,
-  );
-  static final _emojiRegex = RegExp(
-    r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]',
-    unicode: true,
-  );
-  static final _codeBlockRegex = RegExp(
-    r'```[^\n]*\n(.*?)```',
-    multiLine: true,
-    dotAll: true,
-  );
-  static final _inlineCodeRegex = RegExp(r'`([^`]+)`');
-  static final _boldItalicRegex = RegExp(r'\*\*\*([^*]+)\*\*\*');
-  static final _boldRegex = RegExp(r'\*\*([^*]+)\*\*');
-  static final _italicRegex = RegExp(r'\*([^*]+)\*|_([^_]+)_');
-  static final _strikethroughRegex = RegExp(r'~~([^~]+)~~');
-  static final _linkRegex = RegExp(r'\[([^\]]+)\]\([^)]+\)');
-  static final _imageRegex = RegExp(r'!\[([^\]]*)\]\([^)]+\)');
-  static final _headingRegex = RegExp(r'^#{1,6}\s+(.+)$', multiLine: true);
-  static final _listItemRegex = RegExp(r'^[\s]*[-*+]\s+(.+)$', multiLine: true);
-  static final _orderedListRegex = RegExp(
-    r'^[\s]*\d+\.\s+(.+)$',
-    multiLine: true,
-  );
-  static final _blockquoteRegex = RegExp(r'^>\s*(.+)$', multiLine: true);
-  static final _horizontalRuleRegex = RegExp(
-    r'^[\s]*[-*_]{3,}[\s]*$',
-    multiLine: true,
-  );
-  static final _htmlTagRegex = RegExp(r'<[^>]+>');
-  static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;');
-  static final _multipleNewlinesRegex = RegExp(r'\n{3,}');
-  static final _multipleSpacesRegex = RegExp(r' {2,}');
-
-  /// Converts markdown text to plain text suitable for TTS.
-  ///
-  /// - Removes thinking/reasoning blocks
-  /// - Removes emojis
-  /// - Removes code blocks (replaces with descriptive text)
-  /// - Strips all formatting (bold, italic, strikethrough)
-  /// - Converts links to just their text
-  /// - Removes images (or converts to alt text)
-  /// - Simplifies headings
-  /// - Preserves list structure with natural pauses
-  /// - Removes HTML tags and entities
-  /// - Normalizes whitespace
-  static String convert(String markdown) {
-    if (markdown.trim().isEmpty) {
-      return '';
-    }
-
-    var text = markdown;
-
-    // Remove thinking/reasoning blocks (must be done before general HTML tag removal)
-    text = text.replaceAll(_thinkingBlockRegex, '');
-    text = text.replaceAll(_thinkTagRegex, '');
-    text = text.replaceAll(_reasoningTagRegex, '');
-
-    // Remove emojis
-    text = text.replaceAll(_emojiRegex, '');
-
-    // Remove or replace code blocks with descriptive text
-    text = text.replaceAllMapped(_codeBlockRegex, (match) {
-      final code = match[1]?.trim() ?? '';
-      if (code.isEmpty) {
-        return '';
-      }
-      return ' (code block) ';
-    });
-
-    // Remove inline code backticks but keep the content
-    text = text.replaceAllMapped(_inlineCodeRegex, (match) => match[1] ?? '');
-
-    // Strip bold/italic/strikethrough formatting
-    text = text.replaceAllMapped(_boldItalicRegex, (match) => match[1] ?? '');
-    text = text.replaceAllMapped(_boldRegex, (match) => match[1] ?? '');
-    text = text.replaceAllMapped(
-      _italicRegex,
-      (match) => match[1] ?? match[2] ?? '',
-    );
-    text = text.replaceAllMapped(
-      _strikethroughRegex,
-      (match) => match[1] ?? '',
-    );
-
-    // Convert links to just their text
-    text = text.replaceAllMapped(_linkRegex, (match) => match[1] ?? '');
-
-    // Remove images (or use alt text if available)
-    text = text.replaceAllMapped(_imageRegex, (match) {
-      final alt = match[1]?.trim() ?? '';
-      return alt.isNotEmpty ? ' ($alt image) ' : '';
-    });
-
-    // Simplify headings (remove # symbols)
-    text = text.replaceAllMapped(_headingRegex, (match) {
-      final heading = match[1] ?? '';
-      return '$heading.\n';
-    });
-
-    // Preserve list items with natural pauses
-    text = text.replaceAllMapped(_listItemRegex, (match) => '${match[1]}. ');
-    text = text.replaceAllMapped(_orderedListRegex, (match) => '${match[1]}. ');
-
-    // Remove blockquote markers
-    text = text.replaceAllMapped(_blockquoteRegex, (match) => match[1] ?? '');
-
-    // Remove horizontal rules
-    text = text.replaceAll(_horizontalRuleRegex, '');
-
-    // Remove HTML tags
-    text = text.replaceAll(_htmlTagRegex, '');
-
-    // Decode HTML entities
-    text = text.replaceAllMapped(_htmlEntityRegex, (match) {
-      final entity = match[0] ?? '';
-      return switch (entity) {
-        '&nbsp;' => ' ',
-        '&amp;' => '&',
-        '&lt;' => '<',
-        '&gt;' => '>',
-        '&quot;' => '"',
-        '&apos;' => "'",
-        _ => entity,
-      };
-    });
-
-    // Normalize whitespace
-    text = text.replaceAll(_multipleNewlinesRegex, '\n\n');
-    text = text.replaceAll(_multipleSpacesRegex, ' ');
-
-    // Convert newlines to spaces for natural speech flow
-    text = text.replaceAll('\n', ' ');
-
-    // Final cleanup
-    text = text.trim();
-
-    return text;
-  }
-}
@@ -7,7 +7,12 @@
 /// Reference: openwebui-src/backend/open_webui/utils/middleware.py DEFAULT_REASONING_TAGS
 library;

-import 'html_utils.dart';
+import 'package:html_unescape/html_unescape.dart';
+
+final _htmlUnescape = HtmlUnescape();
+
+/// Unescapes HTML entities in [s] using the shared [_htmlUnescape] converter.
+String _unescapeHtml(String s) => _htmlUnescape.convert(s);

 /// All reasoning tag pairs supported by Open WebUI.
 /// Reference: DEFAULT_REASONING_TAGS in middleware.py
@@ -181,9 +186,25 @@ class ReasoningParser {
      }

      // Check for raw tag pairs
+      // Supports tags with optional attributes like <think foo="bar">
+      // Reference: openwebui-src/backend/open_webui/utils/middleware.py
      for (final pair in tagPairs) {
        final startTag = pair.$1;
-        final idx = content.indexOf(startTag, index);
+        int idx = -1;
+
+        // For XML-like tags (e.g., <think>), match with optional attributes
+        if (startTag.startsWith('<') && startTag.endsWith('>')) {
+          final tagName = startTag.substring(1, startTag.length - 1);
+          final pattern = RegExp('<${RegExp.escape(tagName)}(\\s[^>]*)?>');
+          final match = pattern.firstMatch(content.substring(index));
+          if (match != null) {
+            idx = index + match.start;
+          }
+        } else {
+          // For non-XML tags (e.g., ◁think▷), use exact matching
+          idx = content.indexOf(startTag, index);
+        }
+
        if (idx != -1 && (nextRawIdx == -1 || idx < nextRawIdx)) {
          nextRawIdx = idx;
          matchedRawPair = pair;
@@ -336,8 +357,8 @@ class ReasoningParser {

      return _DetailsResult(
        entry: ReasoningEntry(
-          reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining),
-          summary: HtmlUtils.unescapeHtml(summaryResult.summary),
+          reasoning: _unescapeHtml(summaryResult.remaining),
+          summary: _unescapeHtml(summaryResult.summary),
          duration: effectiveDuration,
          isDone: false,
          blockType: blockType,
@@ -368,8 +389,8 @@ class ReasoningParser {

    return _DetailsResult(
      entry: ReasoningEntry(
-        reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining),
-        summary: HtmlUtils.unescapeHtml(summaryResult.summary),
+        reasoning: _unescapeHtml(summaryResult.remaining),
+        summary: _unescapeHtml(summaryResult.summary),
        duration: effectiveDuration,
        isDone: isDone,
        blockType: blockType,
@@ -381,20 +402,47 @@ class ReasoningParser {
  }

  /// Parse a raw reasoning tag pair (e.g., `<think>...</think>`).
+  /// Supports tags with optional attributes like `<think foo="bar">`.
+  ///
+  /// Reference: openwebui-src/backend/open_webui/utils/middleware.py
  static _ReasoningResult _parseRawReasoning(
    String content,
    int startIdx,
    String startTag,
    String endTag,
  ) {
-    final endIdx = content.indexOf(endTag, startIdx + startTag.length);
+    // Find the actual end of the opening tag (handles attributes)
+    int contentStartIdx;
+    if (startTag.startsWith('<') && startTag.endsWith('>')) {
+      // For XML-like tags, find the closing '>' to skip any attributes
+      final tagCloseIdx = content.indexOf('>', startIdx);
+      if (tagCloseIdx == -1) {
+        // Incomplete opening tag
+        return _ReasoningResult(
+          entry: ReasoningEntry(
+            reasoning: '',
+            summary: '',
+            duration: 0,
+            isDone: false,
+          ),
+          endIndex: content.length,
+          isComplete: false,
+        );
+      }
+      contentStartIdx = tagCloseIdx + 1;
+    } else {
+      // For non-XML tags, use exact tag length
+      contentStartIdx = startIdx + startTag.length;
+    }
+
+    final endIdx = content.indexOf(endTag, contentStartIdx);

    if (endIdx == -1) {
      // Incomplete block (streaming)
-      final innerContent = content.substring(startIdx + startTag.length);
+      final innerContent = content.substring(contentStartIdx);
      return _ReasoningResult(
        entry: ReasoningEntry(
-          reasoning: HtmlUtils.unescapeHtml(innerContent.trim()),
+          reasoning: _unescapeHtml(innerContent.trim()),
          summary: '',
          duration: 0,
          isDone: false,
@@ -405,10 +453,10 @@ class ReasoningParser {
    }

    // Complete block
-    final innerContent = content.substring(startIdx + startTag.length, endIdx);
+    final innerContent = content.substring(contentStartIdx, endIdx);
    return _ReasoningResult(
      entry: ReasoningEntry(
-        reasoning: HtmlUtils.unescapeHtml(innerContent.trim()),
+        reasoning: _unescapeHtml(innerContent.trim()),
        summary: '',
        duration: 0,
        isDone: true,
@@ -533,23 +581,33 @@ class ReasoningParser {
  }

  /// Formats the duration for display.
-  /// Mirrors Open WebUI's formatting:
+  /// Mirrors Open WebUI's dayjs.duration(seconds, 'seconds').humanize():
  /// - < 1: "less than a second"
  /// - < 60: "X seconds"
-  /// - >= 60: humanized (e.g., "2 minutes")
+  /// - >= 60: humanized (e.g., "a minute", "2 minutes", "about an hour")
+  ///
+  /// Reference: openwebui-src/src/lib/components/common/Collapsible.svelte
  static String formatDuration(int seconds) {
    if (seconds < 1) return 'less than a second';
    if (seconds < 60) return '$seconds second${seconds == 1 ? '' : 's'}';

-    final minutes = seconds ~/ 60;
-    final remainingSeconds = seconds % 60;
-
-    if (remainingSeconds == 0) {
-      return '$minutes minute${minutes == 1 ? '' : 's'}';
+    // Match dayjs.duration().humanize() behavior
+    // Reference: https://day.js.org/docs/en/durations/humanize
+    if (seconds < 90) return 'a minute';
+    if (seconds < 2700) {
+      // 45 minutes
+      final minutes = (seconds / 60).round();
+      return '$minutes minutes';
    }
-
-    // For mixed minutes and seconds, use abbreviated format
-    return '$minutes min ${remainingSeconds}s';
+    if (seconds < 5400) return 'about an hour'; // 90 minutes
+    if (seconds < 79200) {
+      // 22 hours
+      final hours = (seconds / 3600).round();
+      return '$hours hours';
+    }
+    if (seconds < 129600) return 'a day'; // 36 hours
+    final days = (seconds / 86400).round();
+    return '$days days';
  }
 }

@@ -1,5 +1,7 @@
 import 'dart:convert';

+import '../../shared/widgets/markdown/markdown_preprocessor.dart';
+
 /// Parsed representation of one tool call emitted as a `<details type="tool_calls" ...>` block
 class ToolCallEntry {
  final String id;
@@ -255,18 +257,8 @@ class ToolCallsParser {
  static String sanitizeForApi(String content) {
    if (content.isEmpty) return content;

-    // Remove blocks we never want to include in conversation context
-    final removeTypes = ['reasoning', 'code_interpreter'];
-    for (final t in removeTypes) {
-      content = content.replaceAll(
-        RegExp(
-          '<details\\s+type="$t"[^>]*>[\\s\\S]*?</details>',
-          multiLine: true,
-          dotAll: true,
-        ),
-        '',
-      );
-    }
+    // Remove annotations and reasoning blocks
+    content = ConduitMarkdownPreprocessor.sanitize(content);

    if (!content.contains('<details')) return content.trim();