refactor(markdown): remove deprecated stream formatter and enhance preprocessor

This commit is contained in:
cogwheel
2025-12-22 14:07:04 +05:30
parent 653162cb76
commit 5fd68f86fe
12 changed files with 347 additions and 505 deletions

View File

@@ -1,18 +0,0 @@
/// HTML entity utilities for parsing content.
///
/// Reference: openwebui-src/src/lib/utils/index.ts (unescapeHtml)
library;
import 'package:html_unescape/html_unescape.dart';
/// Static helpers for working with HTML entities.
class HtmlUtils {
  /// Decoder created once and shared by every [unescapeHtml] call.
  static final _decoder = HtmlUnescape();

  /// Returns [s] with its HTML entities decoded.
  ///
  /// Delegates to `HtmlUnescape.convert`, which handles named, decimal, and
  /// hexadecimal character references.
  static String unescapeHtml(String s) {
    return _decoder.convert(s);
  }
}

View File

@@ -1,71 +0,0 @@
// Pre-compiled regex patterns for markdown syntax detection. They live at the
// top level so they are built once instead of on every preview.
final _boldPattern = RegExp(r'\*\*');
final _italicPattern = RegExp(r'(?<!\*)\*(?!\*)');

/// Maintains a raw markdown buffer for streaming content and generates
/// preview-safe output by appending synthetic closing tokens when necessary.
///
/// The raw buffer is never modified by the synthetic closers; they only appear
/// in the strings returned by [preview], [ingest] and [replace].
class MarkdownStreamFormatter {
  final StringBuffer _raw = StringBuffer();

  /// Seeds the formatter with existing markdown [content], discarding
  /// anything accumulated so far.
  void seed(String content) {
    _raw
      ..clear()
      ..write(content);
  }

  /// Adds a streaming [chunk] to the internal buffer and returns a
  /// preview-ready string with any required synthetic closing markers.
  ///
  /// An empty chunk leaves the buffer untouched and just returns the preview.
  String ingest(String chunk) {
    if (chunk.isNotEmpty) {
      _raw.write(chunk);
    }
    return preview();
  }

  /// Replaces the current buffer with the provided [content] and returns its
  /// preview.
  String replace(String content) {
    seed(content);
    return preview();
  }

  /// Returns the preview-safe markdown string: the raw buffer followed by the
  /// synthetic closers it currently needs.
  String preview() {
    final raw = _raw.toString();
    return '$raw${_syntheticClosures(raw)}';
  }

  /// Returns the raw markdown accumulated so far, without synthetic closers.
  String finalize() => _raw.toString();

  /// Builds the closing tokens needed to make [content] render sensibly while
  /// it is still incomplete.
  String _syntheticClosures(String content) {
    final closers = StringBuffer();

    // Splitting on the fence marker yields (fenceCount + 1) segments that
    // alternate prose / code / prose / ... An even segment count therefore
    // means an odd number of fences, i.e. the content ends inside a fence.
    final segments = content.split('```');
    if (segments.length.isEven) {
      // A closing fence is only recognised at the start of its own line, so
      // make sure it does not get glued onto trailing code (or onto the
      // opening fence itself, which would produce a six-backtick opener).
      if (!content.endsWith('\n')) {
        closers.write('\n');
      }
      closers.writeln('```');
    }

    // Inline markers inside fenced code are literal text, so only the prose
    // segments (even indices) are inspected. Joining with a newline prevents
    // markers from two separate segments from fusing into a different token.
    final prose = [
      for (var i = 0; i < segments.length; i += 2) segments[i],
    ].join('\n');

    if (_boldPattern.allMatches(prose).length.isOdd) {
      closers.write('**');
    }
    if (_italicPattern.allMatches(prose).length.isOdd) {
      closers.write('*');
    }

    final missingBrackets =
        '['.allMatches(prose).length - ']'.allMatches(prose).length;
    if (missingBrackets > 0) {
      closers.write(']' * missingBrackets);
    }

    final missingParens =
        '('.allMatches(prose).length - ')'.allMatches(prose).length;
    if (missingParens > 0) {
      closers.write(')' * missingParens);
    }

    return closers.toString();
  }
}

View File

@@ -1,160 +0,0 @@
/// Converts markdown text to plain text suitable for text-to-speech.
///
/// Strips formatting while preserving the semantic meaning and readability
/// of the content for audio consumption.
class MarkdownToText {
  const MarkdownToText._();

  static final _thinkingBlockRegex = RegExp(
    r'<details\s+type="reasoning"[^>]*>.*?</details>',
    multiLine: true,
    dotAll: true,
  );
  static final _thinkTagRegex = RegExp(
    r'<think>.*?</think>',
    multiLine: true,
    dotAll: true,
  );
  static final _reasoningTagRegex = RegExp(
    r'<reasoning>.*?</reasoning>',
    multiLine: true,
    dotAll: true,
  );
  static final _emojiRegex = RegExp(
    r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]',
    unicode: true,
  );
  static final _codeBlockRegex = RegExp(
    r'```[^\n]*\n(.*?)```',
    multiLine: true,
    dotAll: true,
  );
  static final _inlineCodeRegex = RegExp(r'`([^`]+)`');
  static final _boldItalicRegex = RegExp(r'\*\*\*([^*]+)\*\*\*');
  static final _boldRegex = RegExp(r'\*\*([^*]+)\*\*');
  static final _italicRegex = RegExp(r'\*([^*]+)\*|_([^_]+)_');
  static final _strikethroughRegex = RegExp(r'~~([^~]+)~~');
  static final _linkRegex = RegExp(r'\[([^\]]+)\]\([^)]+\)');
  static final _imageRegex = RegExp(r'!\[([^\]]*)\]\([^)]+\)');
  static final _headingRegex = RegExp(r'^#{1,6}\s+(.+)$', multiLine: true);
  static final _listItemRegex = RegExp(r'^[\s]*[-*+]\s+(.+)$', multiLine: true);
  static final _orderedListRegex = RegExp(
    r'^[\s]*\d+\.\s+(.+)$',
    multiLine: true,
  );
  static final _blockquoteRegex = RegExp(r'^>\s*(.+)$', multiLine: true);
  static final _horizontalRuleRegex = RegExp(
    r'^[\s]*[-*_]{3,}[\s]*$',
    multiLine: true,
  );
  static final _htmlTagRegex = RegExp(r'<[^>]+>');
  static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;');
  static final _multipleNewlinesRegex = RegExp(r'\n{3,}');
  static final _multipleSpacesRegex = RegExp(r' {2,}');

  /// Named entities that are decoded; any other named entity is left as-is.
  static const _namedEntities = {
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&apos;': "'",
  };

  /// Converts markdown text to plain text suitable for TTS.
  ///
  /// - Removes thinking/reasoning blocks
  /// - Removes emojis
  /// - Replaces non-empty fenced code blocks with the words "(code block)"
  /// - Strips all formatting (bold, italic, strikethrough)
  /// - Replaces images with "(alt image)", or drops them when alt is empty
  /// - Converts links to just their text
  /// - Simplifies headings and list items, ending each with a period
  /// - Removes HTML tags and decodes the supported HTML entities
  /// - Flattens the result to a single line with single spaces
  ///
  /// Returns an empty string when [markdown] is empty or only whitespace.
  static String convert(String markdown) {
    if (markdown.trim().isEmpty) {
      return '';
    }

    var text = markdown;

    // Remove thinking/reasoning blocks (must be done before general HTML tag
    // removal, otherwise only the tags would go and the reasoning would stay).
    text = text.replaceAll(_thinkingBlockRegex, '');
    text = text.replaceAll(_thinkTagRegex, '');
    text = text.replaceAll(_reasoningTagRegex, '');

    // Remove emojis
    text = text.replaceAll(_emojiRegex, '');

    // Replace fenced code blocks with descriptive text; empty ones vanish.
    text = text.replaceAllMapped(_codeBlockRegex, (match) {
      final code = match[1]?.trim() ?? '';
      return code.isEmpty ? '' : ' (code block) ';
    });

    // Remove inline code backticks but keep the content
    text = text.replaceAllMapped(_inlineCodeRegex, (match) => match[1] ?? '');

    // Strip bold/italic/strikethrough formatting, longest marker first.
    text = text.replaceAllMapped(_boldItalicRegex, (match) => match[1] ?? '');
    text = text.replaceAllMapped(_boldRegex, (match) => match[1] ?? '');
    text = text.replaceAllMapped(
      _italicRegex,
      (match) => match[1] ?? match[2] ?? '',
    );
    text = text.replaceAllMapped(
      _strikethroughRegex,
      (match) => match[1] ?? '',
    );

    // Images must be handled before links: `![alt](url)` contains a valid
    // `[alt](url)` link, so stripping links first would leave "!alt" behind
    // and the image rule would never see it.
    text = text.replaceAllMapped(_imageRegex, (match) {
      final alt = match[1]?.trim() ?? '';
      return alt.isNotEmpty ? ' ($alt image) ' : '';
    });

    // Convert links to just their text
    text = text.replaceAllMapped(_linkRegex, (match) => match[1] ?? '');

    // Simplify headings (remove # symbols) and end them like a sentence.
    text = text.replaceAllMapped(_headingRegex, (match) {
      final heading = match[1] ?? '';
      return '$heading.\n';
    });

    // Preserve list items with natural pauses
    text = text.replaceAllMapped(_listItemRegex, (match) => '${match[1]}. ');
    text = text.replaceAllMapped(_orderedListRegex, (match) => '${match[1]}. ');

    // Remove blockquote markers
    text = text.replaceAllMapped(_blockquoteRegex, (match) => match[1] ?? '');

    // Remove horizontal rules
    text = text.replaceAll(_horizontalRuleRegex, '');

    // Remove HTML tags
    text = text.replaceAll(_htmlTagRegex, '');

    // Decode HTML entities (after tag removal, so a decoded "<b>" survives
    // as literal text instead of being stripped as a tag).
    text = text.replaceAllMapped(
      _htmlEntityRegex,
      (match) => _decodeEntity(match[0] ?? ''),
    );

    // Normalize whitespace: collapse blank-line runs, flatten newlines to
    // spaces for natural speech flow, and squeeze spaces last so that the
    // flattening cannot reintroduce double spaces.
    text = text.replaceAll(_multipleNewlinesRegex, '\n\n');
    text = text.replaceAll('\n', ' ');
    text = text.replaceAll(_multipleSpacesRegex, ' ');

    return text.trim();
  }

  /// Decodes a single [entity] matched by [_htmlEntityRegex].
  ///
  /// Known named entities and valid numeric references are decoded; anything
  /// else (unknown names, out-of-range or surrogate code points) is returned
  /// unchanged.
  static String _decodeEntity(String entity) {
    final named = _namedEntities[entity];
    if (named != null) return named;
    if (!entity.startsWith('&#')) return entity;

    final isHex = entity.startsWith('&#x');
    final digits = entity.substring(isHex ? 3 : 2, entity.length - 1);
    final codePoint = int.tryParse(digits, radix: isHex ? 16 : 10);
    if (codePoint == null || codePoint <= 0 || codePoint > 0x10FFFF) {
      return entity;
    }
    // Lone surrogates are not valid characters on their own.
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) return entity;
    return String.fromCharCode(codePoint);
  }
}

View File

@@ -7,7 +7,12 @@
/// Reference: openwebui-src/backend/open_webui/utils/middleware.py DEFAULT_REASONING_TAGS
library;
import 'html_utils.dart';
import 'package:html_unescape/html_unescape.dart';
/// Decoder instance shared by every [_unescapeHtml] call in this library.
final _htmlUnescape = HtmlUnescape();

/// Returns [s] with its HTML entities decoded; used on reasoning content.
String _unescapeHtml(String s) {
  return _htmlUnescape.convert(s);
}
/// All reasoning tag pairs supported by Open WebUI.
/// Reference: DEFAULT_REASONING_TAGS in middleware.py
@@ -181,9 +186,25 @@ class ReasoningParser {
}
// Check for raw tag pairs
// Supports tags with optional attributes like <think foo="bar">
// Reference: openwebui-src/backend/open_webui/utils/middleware.py
for (final pair in tagPairs) {
final startTag = pair.$1;
final idx = content.indexOf(startTag, index);
int idx = -1;
// For XML-like tags (e.g., <think>), match with optional attributes
if (startTag.startsWith('<') && startTag.endsWith('>')) {
final tagName = startTag.substring(1, startTag.length - 1);
final pattern = RegExp('<${RegExp.escape(tagName)}(\\s[^>]*)?>');
final match = pattern.firstMatch(content.substring(index));
if (match != null) {
idx = index + match.start;
}
} else {
// For non-XML tags (e.g., ◁think▷), use exact matching
idx = content.indexOf(startTag, index);
}
if (idx != -1 && (nextRawIdx == -1 || idx < nextRawIdx)) {
nextRawIdx = idx;
matchedRawPair = pair;
@@ -336,8 +357,8 @@ class ReasoningParser {
return _DetailsResult(
entry: ReasoningEntry(
reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining),
summary: HtmlUtils.unescapeHtml(summaryResult.summary),
reasoning: _unescapeHtml(summaryResult.remaining),
summary: _unescapeHtml(summaryResult.summary),
duration: effectiveDuration,
isDone: false,
blockType: blockType,
@@ -368,8 +389,8 @@ class ReasoningParser {
return _DetailsResult(
entry: ReasoningEntry(
reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining),
summary: HtmlUtils.unescapeHtml(summaryResult.summary),
reasoning: _unescapeHtml(summaryResult.remaining),
summary: _unescapeHtml(summaryResult.summary),
duration: effectiveDuration,
isDone: isDone,
blockType: blockType,
@@ -381,20 +402,47 @@ class ReasoningParser {
}
/// Parse a raw reasoning tag pair (e.g., `<think>...</think>`).
/// Supports tags with optional attributes like `<think foo="bar">`.
///
/// Reference: openwebui-src/backend/open_webui/utils/middleware.py
static _ReasoningResult _parseRawReasoning(
String content,
int startIdx,
String startTag,
String endTag,
) {
final endIdx = content.indexOf(endTag, startIdx + startTag.length);
// Find the actual end of the opening tag (handles attributes)
int contentStartIdx;
if (startTag.startsWith('<') && startTag.endsWith('>')) {
// For XML-like tags, find the closing '>' to skip any attributes
final tagCloseIdx = content.indexOf('>', startIdx);
if (tagCloseIdx == -1) {
// Incomplete opening tag
return _ReasoningResult(
entry: ReasoningEntry(
reasoning: '',
summary: '',
duration: 0,
isDone: false,
),
endIndex: content.length,
isComplete: false,
);
}
contentStartIdx = tagCloseIdx + 1;
} else {
// For non-XML tags, use exact tag length
contentStartIdx = startIdx + startTag.length;
}
final endIdx = content.indexOf(endTag, contentStartIdx);
if (endIdx == -1) {
// Incomplete block (streaming)
final innerContent = content.substring(startIdx + startTag.length);
final innerContent = content.substring(contentStartIdx);
return _ReasoningResult(
entry: ReasoningEntry(
reasoning: HtmlUtils.unescapeHtml(innerContent.trim()),
reasoning: _unescapeHtml(innerContent.trim()),
summary: '',
duration: 0,
isDone: false,
@@ -405,10 +453,10 @@ class ReasoningParser {
}
// Complete block
final innerContent = content.substring(startIdx + startTag.length, endIdx);
final innerContent = content.substring(contentStartIdx, endIdx);
return _ReasoningResult(
entry: ReasoningEntry(
reasoning: HtmlUtils.unescapeHtml(innerContent.trim()),
reasoning: _unescapeHtml(innerContent.trim()),
summary: '',
duration: 0,
isDone: true,
@@ -533,23 +581,33 @@ class ReasoningParser {
}
/// Formats the duration for display.
/// Mirrors Open WebUI's formatting:
/// Mirrors Open WebUI's dayjs.duration(seconds, 'seconds').humanize():
/// - < 1: "less than a second"
/// - < 60: "X seconds"
/// - >= 60: humanized (e.g., "2 minutes")
/// - >= 60: humanized (e.g., "a minute", "2 minutes", "about an hour")
///
/// Reference: openwebui-src/src/lib/components/common/Collapsible.svelte
static String formatDuration(int seconds) {
if (seconds < 1) return 'less than a second';
if (seconds < 60) return '$seconds second${seconds == 1 ? '' : 's'}';
final minutes = seconds ~/ 60;
final remainingSeconds = seconds % 60;
if (remainingSeconds == 0) {
return '$minutes minute${minutes == 1 ? '' : 's'}';
// Match dayjs.duration().humanize() behavior
// Reference: https://day.js.org/docs/en/durations/humanize
if (seconds < 90) return 'a minute';
if (seconds < 2700) {
// 45 minutes
final minutes = (seconds / 60).round();
return '$minutes minutes';
}
// For mixed minutes and seconds, use abbreviated format
return '$minutes min ${remainingSeconds}s';
if (seconds < 5400) return 'about an hour'; // 90 minutes
if (seconds < 79200) {
// 22 hours
final hours = (seconds / 3600).round();
return '$hours hours';
}
if (seconds < 129600) return 'a day'; // 36 hours
final days = (seconds / 86400).round();
return '$days days';
}
}

View File

@@ -1,5 +1,7 @@
import 'dart:convert';
import '../../shared/widgets/markdown/markdown_preprocessor.dart';
/// Parsed representation of one tool call emitted as a `<details type="tool_calls" ...>` block
class ToolCallEntry {
final String id;
@@ -255,18 +257,8 @@ class ToolCallsParser {
static String sanitizeForApi(String content) {
if (content.isEmpty) return content;
// Remove blocks we never want to include in conversation context
final removeTypes = ['reasoning', 'code_interpreter'];
for (final t in removeTypes) {
content = content.replaceAll(
RegExp(
'<details\\s+type="$t"[^>]*>[\\s\\S]*?</details>',
multiLine: true,
dotAll: true,
),
'',
);
}
// Remove annotations and reasoning blocks
content = ConduitMarkdownPreprocessor.sanitize(content);
if (!content.contains('<details')) return content.trim();