refactor(markdown): remove deprecated stream formatter and enhance preprocessor

2025-12-22 14:07:04 +05:30
parent 653162cb76
commit 5fd68f86fe
12 changed files with 347 additions and 505 deletions
@@ -1,10 +1,22 @@
-/// Utility helpers for normalising markdown content before handing it to
-/// [ConduitMarkdown]. The goal is to keep streaming responsive while smoothing
-/// out troublesome edge-cases (e.g. nested fences inside lists).
+import 'package:html_unescape/html_unescape.dart';
+import 'package:markdown/markdown.dart' as md;
+
+/// Content preprocessing, sanitization, and transformation for Markdown.
+///
+/// Provides:
+/// - [normalize] - Prepares content for display (keeps reasoning blocks)
+/// - [sanitize] - Cleans content for copy/API (removes reasoning blocks)
+/// - [toPlainText] - Converts to plain text for TTS
+/// - [softenInlineCode] - Breaks long inline code spans
 class ConduitMarkdownPreprocessor {
  const ConduitMarkdownPreprocessor._();

-  // Pre-compile regex patterns for better performance during streaming
+  static final _htmlUnescape = HtmlUnescape();
+
+  // ============================================================
+  // Pre-compiled Patterns - Display/Sanitization
+  // ============================================================
+
  static final _bulletFenceRegex = RegExp(
    r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$',
    multiLine: true,
@@ -14,7 +26,8 @@ class ConduitMarkdownPreprocessor {
    multiLine: true,
  );
  static final _dedentCloseRegex = RegExp(r'^[ \t]+```\s*$', multiLine: true);
-  static final _inlineClosingRegex = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
+  static final _inlineClosingRegex =
+      RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
  static final _labelThenDashRegex = RegExp(
    r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)',
    multiLine: true,
@@ -24,92 +37,143 @@ class ConduitMarkdownPreprocessor {
    multiLine: true,
  );
  static final _fenceAtBolRegex = RegExp(r'^\s*```', multiLine: true);
+  static final _linkWithTrailingSpaces =
+      RegExp(r'\[[^\]]+\]\([^\)]+\)\s{2,}$');
+  static final _multipleNewlines = RegExp(r'\n{3,}');

-  /// Normalises common fence and hard-break issues produced by LLMs.
+  /// Combined pattern for all reasoning/thinking blocks.
+  static final _reasoningBlocks = RegExp(
+    r'<details\s+type="(?:reasoning|code_interpreter)"[^>]*>[\s\S]*?</details>|'
+    r'<(?:think|thinking|reasoning)(?:\s[^>]*)?>[\s\S]*?</(?:think|thinking|reasoning)>',
+    multiLine: true,
+    dotAll: true,
+  );
+
+  // ============================================================
+  // Pre-compiled Patterns - Plain Text (TTS)
+  // ============================================================
+
+  static final _codeBlock = RegExp(r'```[^\n]*\n[\s\S]*?```');
+  static final _inlineCode = RegExp(r'`([^`]+)`');
+  static final _image = RegExp(r'!\[[^\]]*\]\([^)]+\)');
+  static final _link = RegExp(r'\[([^\]]+)\]\([^)]+\)');
+  // Paired markdown formatting - only unambiguous markers are stripped for TTS.
+  // Single * and _ count only around one whitespace-delimited word (see below).
+  static final _boldItalic = RegExp(r'\*\*\*([^*]+)\*\*\*');
+  static final _bold = RegExp(r'\*\*([^*]+)\*\*');
+  static final _strikethrough = RegExp(r'~~([^~]+)~~');
+  // Single asterisk italic: only at word boundaries (space or line start/end)
+  static final _italicAsterisk = RegExp(r'(?:^|\s)\*([^*\s]+)\*(?=\s|$)');
+  // Single underscore italic: only when surrounded by spaces (not in identifiers)
+  static final _italicUnderscore = RegExp(r'(?:^|\s)_([^_\s]+)_(?=\s|$)');
+  static final _heading = RegExp(r'^#{1,6}\s+', multiLine: true);
+  static final _listMarker = RegExp(r'^[\s]*(?:[-*+]|\d+\.)\s+', multiLine: true);
+  static final _blockquote = RegExp(r'^>\s*', multiLine: true);
+  static final _horizontalRule = RegExp(r'^[\s]*[-*_]{3,}[\s]*$', multiLine: true);
+  static final _htmlTag = RegExp(r'<[^>]+>');
+  /// Emoji, pictograph, and related symbol ranges removed for TTS cleanup.
+  static final _emoji = RegExp(
+    r'[\u{1F600}-\u{1F64F}]|'  // Emoticons
+    r'[\u{1F300}-\u{1F5FF}]|'  // Misc Symbols and Pictographs
+    r'[\u{1F680}-\u{1F6FF}]|'  // Transport and Map Symbols
+    r'[\u{1F1E0}-\u{1F1FF}]|'  // Regional indicators (flag pairs)
+    r'[\u{2600}-\u{26FF}]|'    // Misc Symbols
+    r'[\u{2700}-\u{27BF}]|'    // Dingbats
+    r'[\u{1F900}-\u{1F9FF}]|'  // Supplemental Symbols and Pictographs
+    r'[\u{1FA00}-\u{1FA6F}]|'  // Chess Symbols
+    r'[\u{1FA70}-\u{1FAFF}]|'  // Symbols and Pictographs Extended-A
+    r'[\u{FE00}-\u{FE0F}]|'    // Variation Selectors
+    r'[\u{1F018}-\u{1F270}]|'  // Mahjong, domino, cards, enclosed characters
+    r'[\u{238C}-\u{2454}]|'    // Misc Technical, Control Pictures, OCR
+    r'[\u{20D0}-\u{20FF}]',    // Combining Diacritical Marks for Symbols
+    unicode: true,
+  );
+  static final _whitespace = RegExp(r'\s+');
+
+  // ============================================================
+  // Public API
+  // ============================================================
+
+  /// Normalizes content for Markdown display.
+  ///
+  /// - Strips link reference definitions (`[label]: url` lines)
+  /// - Repairs common LLM code-fence issues via [_normalizeFences]
+  /// - Does not remove reasoning blocks (only [sanitize] does)
  static String normalize(String input) {
-    if (input.isEmpty) {
-      return input;
-    }
+    if (input.isEmpty) return input;

    var output = input.replaceAll('\r\n', '\n');

-    // Move fenced code blocks that start on the same line as a list item onto
-    // their own line so the parser does not treat them as list text.
-    output = output.replaceAllMapped(
-      _bulletFenceRegex,
-      (match) => '${match[1]}\n```${match[2]}',
-    );
+    // Drop link reference definitions found by package:markdown's parser
+    output = _stripLinkReferenceDefinitions(output);

-    // Dedent opening fences to avoid partial code-block detection when the
-    // model indents fences by accident.
-    output = output.replaceAllMapped(
-      _dedentOpenRegex,
-      (match) => '```${match[1]}',
-    );
+    // Repair list-attached, indented, inline-closed, and unclosed fences
+    output = _normalizeFences(output);

-    // Dedent closing fences for the same reason as the opening fences.
-    output = output.replaceAllMapped(_dedentCloseRegex, (_) => '```');
-
-    // Ensure closing fences stand alone. Prevents situations like `}\n```foo`
-    // from keeping trailing braces inside the code block.
-    output = output.replaceAllMapped(
-      _inlineClosingRegex,
-      (match) => '${match[1]}\n```',
-    );
-
-    // Insert a blank line when a "label: value" line is followed by a
-    // horizontal rule so it is not treated as a Setext heading underline.
+    // Blank line between a bold label line and "---" (avoids Setext heading)
    output = output.replaceAllMapped(
      _labelThenDashRegex,
      (match) => '${match[1]}\n\n${match[2]}',
    );

-    // Allow headings like "## 1. Summary" without triggering ordered-list
-    // parsing by inserting a zero-width joiner after the numeric marker.
+    // Insert U+200C (zero-width non-joiner) after the numeric marker's dot
    output = output.replaceAllMapped(
      _atxEnumRegex,
      (match) => '${match[1]}.\u200C${match[2]}${match[3]}',
    );

-    // Auto-close an unmatched opening fence at EOF to avoid the entire tail
-    // of the message rendering as code.
-    final fenceCount = _fenceAtBolRegex.allMatches(output).length;
-    if (fenceCount.isOdd) {
-      if (!output.endsWith('\n')) {
-        output += '\n';
-      }
-      output += '```';
-    }
-
-    // Convert Markdown links followed by two trailing spaces into separate
-    // paragraphs so that consecutive links do not collapse into a single
-    // paragraph at render time.
-    final linkWithTrailingSpaces = RegExp(r'\[[^\]]+\]\([^\)]+\)\s{2,}$');
-    final lines = output.split('\n');
-    if (lines.length > 1) {
-      final buffer = StringBuffer();
-      for (var i = 0; i < lines.length; i++) {
-        final line = lines[i];
-        buffer.write(line);
-        if (i < lines.length - 1) {
-          buffer.write('\n');
-        }
-        if (linkWithTrailingSpaces.hasMatch(line)) {
-          buffer.write('\n');
-        }
-      }
-      output = buffer.toString();
-    }
+    // Add a blank line after lines ending in a link plus 2+ spaces
+    output = _separateConsecutiveLinks(output);

    return output;
  }

-  /// Inserts zero-width break characters into long inline code spans so they
-  /// remain readable and do not overflow narrow layouts.
+  /// Sanitizes content for clipboard copy or API submission.
+  ///
+  /// - Strips link reference definitions (`[label]: url` lines)
+  /// - Removes every match of [_reasoningBlocks]
+  /// - Collapses runs of 3+ newlines to 2 and trims both ends
+  static String sanitize(String input) {
+    if (input.isEmpty) return input;
+
+    return input
+        .replaceAll('\r\n', '\n') // CRLF -> LF before any line-based matching
+        .transform(_stripLinkReferenceDefinitions)
+        .replaceAll(_reasoningBlocks, '')
+        .replaceAll(_multipleNewlines, '\n\n')
+        .trim();
+  }
+
+  /// Converts markdown to plain text for text-to-speech ('' if input is blank).
+  static String toPlainText(String input) {
+    if (input.trim().isEmpty) return '';
+
+    return sanitize(input)
+        .replaceAll(_codeBlock, '') // Drop fenced code blocks entirely
+        .replaceAllMapped(_inlineCode, (m) => m[1] ?? '') // Keep code text
+        .replaceAll(_image, '') // Remove images (alt text included)
+        .replaceAllMapped(_link, (m) => m[1] ?? '') // Keep link text
+        // Strip paired markdown formatting (preserves lone * and _ in text)
+        .replaceAllMapped(_boldItalic, (m) => m[1] ?? '') // Keep inner text
+        .replaceAllMapped(_bold, (m) => m[1] ?? '') // Keep inner text
+        .replaceAllMapped(_strikethrough, (m) => m[1] ?? '') // Keep inner text
+        .replaceAllMapped(_italicAsterisk, (m) => ' ${m[1] ?? ''}') // Space replaces the consumed leading whitespace
+        .replaceAllMapped(_italicUnderscore, (m) => ' ${m[1] ?? ''}') // Same as the asterisk form
+        .replaceAll(_heading, '') // Strip leading # markers
+        .replaceAll(_listMarker, '') // Strip list markers
+        .replaceAll(_blockquote, '') // Strip > markers
+        .replaceAll(_horizontalRule, '') // Remove rule lines made of -, * or _
+        .replaceAll(_htmlTag, '') // Remove tags only; text between tags stays
+        .transform(_htmlUnescape.convert) // Decode entities
+        .replaceAll(_emoji, '') // Remove emojis
+        .replaceAll(_whitespace, ' ') // Collapse whitespace runs to one space
+        .trim();
+  }
+
+  /// Breaks long inline code spans for better wrapping.
  static String softenInlineCode(String input, {int chunkSize = 24}) {
-    if (input.length <= chunkSize) {
-      return input;
-    }
+    if (input.length <= chunkSize) return input;
+
    final buffer = StringBuffer();
    for (var i = 0; i < input.length; i++) {
      buffer.write(input[i]);
@@ -119,4 +183,90 @@ class ConduitMarkdownPreprocessor {
    }
    return buffer.toString();
  }
+
+  // ============================================================
+  // Private Helpers
+  // ============================================================
+
+  static String _normalizeFences(String input) { // Repairs malformed code fences.
+    var output = input;
+
+    // Put a fence that directly follows a list marker on its own line
+    output = output.replaceAllMapped(
+      _bulletFenceRegex,
+      (match) => '${match[1]}\n```${match[2]}',
+    );
+
+    // Rewrite opening-fence matches as an unindented fence plus match[1]
+    output = output.replaceAllMapped(
+      _dedentOpenRegex,
+      (match) => '```${match[1]}',
+    );
+
+    // Strip leading spaces/tabs (and trailing whitespace) from bare fences
+    output = output.replaceAllMapped(_dedentCloseRegex, (_) => '```');
+
+    // Break the line before a fence that trails other text on the same line
+    output = output.replaceAllMapped(
+      _inlineClosingRegex,
+      (match) => '${match[1]}\n```',
+    );
+
+    // Odd count of line-leading fences: append a closing fence at the end
+    final fenceCount = _fenceAtBolRegex.allMatches(output).length;
+    if (fenceCount.isOdd) {
+      if (!output.endsWith('\n')) output += '\n';
+      output += '```';
+    }
+
+    return output;
+  }
+
+  static String _separateConsecutiveLinks(String input) { // Adds a blank line after link lines.
+    final lines = input.split('\n');
+    if (lines.length <= 1) return input; // Single line: returned unchanged
+
+    final buffer = StringBuffer();
+    for (var i = 0; i < lines.length; i++) {
+      final line = lines[i];
+      buffer.write(line);
+      if (i < lines.length - 1) buffer.write('\n'); // Restore the split newline
+      if (_linkWithTrailingSpaces.hasMatch(line)) buffer.write('\n'); // Extra newline after a link + 2 or more spaces
+    }
+    return buffer.toString();
+  }
+
+  /// Removes `[label]: destination` lines for labels the `markdown` parser found.
+  static String _stripLinkReferenceDefinitions(String input) {
+    if (!input.contains('[')) return input; // No '[': skip parsing altogether
+
+    final document = md.Document();
+    document.parseLines(input.split('\n')); // Return value unused; refs are read below
+
+    final refLabels = document.linkReferences.keys.toSet(); // NOTE(review): keys may be normalized by the package - confirm
+    if (refLabels.isEmpty) return input; // Parser reported no definitions
+
+    final labelPatterns =
+        refLabels.map((label) => RegExp.escape(label)).join('|'); // Alternation of escaped labels
+
+    final refDefRegex = RegExp(
+      r'^[ ]{0,3}\[(?:' +
+          labelPatterns +
+          r')\]:[ \t]*(?:<[^>]*>|[^\s]*)(?:[ \t]+(?:"[^"]*"|' +
+          r"'[^']*'" +
+          r'|\([^)]*\)))?[ \t]*$',
+      multiLine: true,
+      caseSensitive: false,
+    );
+
+    return input
+        .replaceAll(refDefRegex, '')
+        .replaceAll(_multipleNewlines, '\n\n') // NOTE(review): applies to the whole input, not only removed lines
+        .trim(); // NOTE(review): also trims the whole input
+  }
+}
+
+/// Adds [transform] so a `String -> String` function can sit in a method chain.
+extension _StringTransform on String {
+  String transform(String Function(String) fn) => fn(this);
 }