// */
// The marker above ends the block comment that is already open at this point in the file.
/**
 * Tail of the pipeline overview whose opening lines sit above this point:
 *
 *    / / hints from really old legacy content
 * 3. Parse subset of Markdown (bold, italic, code, links, line breaks)
 * 4. Sanitize the rendered HTML: whitelist-only tags, strip attributes
 * 5. Return safe HTML ready for storage or display
 */
class ContentSanitizer
{
    /** Maximum number of emoji allowed before triggering a flood error. */
    public const EMOJI_COUNT_MAX = 50;

    /**
     * Maximum ratio of emoji-to-total-characters before content is considered
     * an emoji flood (applies only when emoji count > 5 to avoid false positives
     * on very short strings like a single reaction comment).
     */
    public const EMOJI_DENSITY_MAX = 0.40;

    // HTML tags we allow in the final rendered output
    private const ALLOWED_TAGS = [
        'p', 'br', 'strong', 'em', 'code', 'pre',
        'a', 'ul', 'ol', 'li', 'blockquote', 'del',
    ];

    // Allowed attributes per tag
    private const ALLOWED_ATTRS = [
        'a' => ['href', 'title', 'rel', 'target'],
    ];

    // Elements removed together with their content. Their payload is code or
    // CSS, not prose, so hoisting it into the parent is never wanted.
    private const DROPPED_TAGS = ['script', 'style'];

    // Wrapper put around a fragment before parsing: it guarantees a <body>
    // element to work on and tells the parser the bytes are UTF-8.
    private const DOCUMENT_PREFIX = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . '</head><body>';

    private const DOCUMENT_SUFFIX = '</body></html>';

    private static ?MarkdownConverter $converter = null;

    // ─────────────────────────────────────────────────────────────────────────
    // Public API
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert raw user input (legacy or new) to sanitized HTML.
     *
     * @param string|null $raw
     * @return string Safe HTML ('' for null or whitespace-only input)
     */
    public static function render(?string $raw): string
    {
        if ($raw === null || trim($raw) === '') {
            return '';
        }

        // 1. Convert legacy HTML fragments to Markdown-friendly text
        $text = self::legacyHtmlToMarkdown($raw);

        // 2. Parse Markdown → HTML
        $html = self::parseMarkdown($text);

        // 3. Sanitize HTML (strip disallowed tags / attrs)
        return self::sanitizeHtml($html);
    }

    /**
     * Normalize previously rendered HTML for display-time policy changes.
     * This is useful when stored HTML predates current link attributes or
     * when display rules depend on the author rather than the raw content.
     *
     * @param string|null $html       Stored HTML; null or blank yields ''.
     * @param bool        $allowLinks When false, every <a> is replaced by its content.
     */
    public static function sanitizeRenderedHtml(?string $html, bool $allowLinks = true): string
    {
        if ($html === null || trim($html) === '') {
            return '';
        }

        return self::sanitizeHtml($html, $allowLinks);
    }

    /**
     * Strip ALL HTML from input, returning plain text with newlines preserved.
     */
    public static function stripToPlain(?string $html): string
    {
        if ($html === null) {
            return '';
        }

        // Convert <br> and </p> to line breaks before stripping
        $text = preg_replace(['/<br\s*\/?>/i', '/<\/p>/i'], "\n", $html);
        $text = strip_tags($text ?? '');
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        return trim($text);
    }

    /**
     * Validate that a Markdown-lite string does not contain disallowed patterns.
     * Returns an array of validation errors (empty = OK).
     *
     * @return list<string>
     */
    public static function validate(string $raw): array
    {
        $errors = [];
        $totalChars = mb_strlen($raw);

        if ($totalChars > 10_000) {
            $errors[] = 'Content exceeds maximum length of 10,000 characters.';
        }

        // Detect raw HTML tags (we forbid them)
        if (preg_match('/<[a-z][^>]*>/i', $raw)) {
            $errors[] = 'HTML tags are not allowed. Use Markdown formatting instead.';
        }

        // Count emoji to prevent absolute spam
        $emojiCount = self::countEmoji($raw);
        if ($emojiCount > self::EMOJI_COUNT_MAX) {
            $errors[] = 'Too many emoji. Please limit emoji usage.';
        }

        // Reject emoji-flood content: density guard catches e.g. 15 emoji in a
        // 20-char string even when the absolute count is below EMOJI_COUNT_MAX.
        if ($emojiCount > 5 && $totalChars > 0 && ($emojiCount / $totalChars) > self::EMOJI_DENSITY_MAX) {
            $errors[] = 'Content is mostly emoji. Please add some text.';
        }

        return $errors;
    }

    /**
     * Collapse consecutive runs of the same emoji in $text.
     *
     * Delegates to LegacySmileyMapper::collapseFlood() so the behaviour is
     * consistent between new submissions and migrated legacy content.
     *
     * Example: "🍺 🍺 🍺 🍺 🍺 🍺 🍺" (7×) → "🍺 🍺 🍺 🍺 🍺 ×7"
     *
     * @param int $maxRun Keep at most this many consecutive identical emoji.
     */
    public static function collapseFlood(string $text, int $maxRun = 5): string
    {
        return LegacySmileyMapper::collapseFlood($text, $maxRun);
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Private helpers
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Convert legacy HTML-style formatting to Markdown equivalents.
     * This runs BEFORE Markdown parsing to handle old content gracefully.
     *
     * The rules run in the order listed; the catch-all tag stripper must stay
     * last so the formatting tags are translated before they are removed.
     */
    private static function legacyHtmlToMarkdown(string $html): string
    {
        $replacements = [
            // Bold
            '/<b\b[^>]*>(.*?)<\/b>/is' => '**$1**',
            '/<strong\b[^>]*>(.*?)<\/strong>/is' => '**$1**',
            // Italic
            '/<i\b[^>]*>(.*?)<\/i>/is' => '*$1*',
            '/<em\b[^>]*>(.*?)<\/em>/is' => '*$1*',
            // Line breaks → actual newlines
            '/<br\s*\/?>/i' => "\n",
            // Paragraphs
            '/<p\b[^>]*>(.*?)<\/p>/is' => "$1\n\n",
            // Strip remaining tags
            '/<[^>]+>/' => '',
        ];

        $result = $html;
        foreach ($replacements as $pattern => $replacement) {
            // preg_replace() yields null on a PCRE error; keep the previous text then.
            $result = preg_replace($pattern, $replacement, $result) ?? $result;
        }

        // Decode HTML entities (e.g. &amp; → &)
        return html_entity_decode($result, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Parse Markdown-lite subset to HTML.
     */
    private static function parseMarkdown(string $text): string
    {
        $rendered = self::getConverter()->convert($text);

        return (string) $rendered->getContent();
    }

    /**
     * Whitelist-based HTML sanitizer.
     * Removes all tags not in ALLOWED_TAGS, and strips disallowed attributes.
     *
     * Returns '' when the fragment cannot be parsed into a document body, so a
     * parser failure never lets unsanitized markup through.
     */
    private static function sanitizeHtml(string $html, bool $allowLinks = true): string
    {
        $doc = new \DOMDocument('1.0', 'UTF-8');

        // Suppress warnings from malformed fragments, then put the process-wide
        // libxml error mode back the way the caller had it.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $loaded = $doc->loadHTML(
                self::DOCUMENT_PREFIX . $html . self::DOCUMENT_SUFFIX,
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        $body = $loaded ? $doc->getElementsByTagName('body')->item(0) : null;
        if (! $body instanceof \DOMElement) {
            return '';
        }

        self::cleanNode($body, $allowLinks);

        // Serialize the children only, which leaves out the wrapping html/body
        $inner = '';
        foreach ($body->childNodes as $child) {
            $inner .= $doc->saveHTML($child);
        }

        return trim($inner);
    }

    /**
     * Recursively clean a DOMNode — strip forbidden tags/attributes.
     *
     * Every element's subtree is cleaned BEFORE the element itself is judged,
     * so the children that get hoisted out of an unwrapped element have already
     * been through the whitelist.
     */
    private static function cleanNode(\DOMNode $node, bool $allowLinks = true): void
    {
        $toRemove = [];
        $toUnwrap = [];

        // Work on a snapshot: the child list is live and is modified below.
        foreach (iterator_to_array($node->childNodes, false) as $child) {
            if ($child->nodeType === XML_TEXT_NODE) {
                continue;
            }

            // Comments, CDATA sections, processing instructions and anything
            // else that is neither text nor an element has no place in the output.
            if (! $child instanceof \DOMElement) {
                $toRemove[] = $child;
                continue;
            }

            $tag = strtolower($child->nodeName);

            if (in_array($tag, self::DROPPED_TAGS, true)) {
                $toRemove[] = $child;
                continue;
            }

            self::cleanNode($child, $allowLinks);

            if (! in_array($tag, self::ALLOWED_TAGS, true)) {
                // Replace element with its (already cleaned) content
                $toUnwrap[] = $child;
                continue;
            }

            // Strip disallowed attributes
            $allowedAttrs = self::ALLOWED_ATTRS[$tag] ?? [];
            foreach (iterator_to_array($child->attributes, false) as $attr) {
                if (! in_array($attr->nodeName, $allowedAttrs, true)) {
                    $child->removeAttributeNode($attr);
                }
            }

            if ($tag === 'a' && ! self::secureLink($child, $allowLinks)) {
                $toUnwrap[] = $child;
            }
        }

        foreach ($toRemove as $unwanted) {
            $node->removeChild($unwanted);
        }

        // Unwrap forbidden elements (replace with their children)
        foreach ($toUnwrap as $el) {
            while ($el->firstChild !== null) {
                $node->insertBefore($el->firstChild, $el);
            }
            $node->removeChild($el);
        }
    }

    /**
     * Apply the link policy to an <a> element.
     *
     * @return bool True when the link may stay (rel/target are then forced),
     *              false when the caller must unwrap it.
     */
    private static function secureLink(\DOMElement $link, bool $allowLinks): bool
    {
        if (! $allowLinks) {
            return false;
        }

        $href = $link->getAttribute('href');
        if ($href !== '' && ! self::isSafeUrl($href)) {
            return false;
        }

        // Force external links to be safe
        $link->setAttribute('rel', 'noopener noreferrer nofollow');
        $link->setAttribute('target', '_blank');

        return true;
    }

    /**
     * Very conservative URL whitelist.
     *
     * Accepts root-relative paths, fragment anchors and http(s) URLs; everything
     * else (javascript:, data:, mailto:, bare relative paths, …) is rejected.
     * Surrounding whitespace is ignored for every check.
     */
    private static function isSafeUrl(string $url): bool
    {
        $lower = strtolower(trim($url));

        // Allow relative paths and anchors
        if (str_starts_with($lower, '/') || str_starts_with($lower, '#')) {
            return true;
        }

        // Only allow http(s)
        return str_starts_with($lower, 'http://') || str_starts_with($lower, 'https://');
    }

    /**
     * Count Unicode emoji in a string (basic heuristic).
     *
     * Returns 0 when the text cannot be scanned (e.g. it is not valid UTF-8).
     *
     * NOTE(review): the last range (U+FE00–U+FEFF) also matches variation
     * selectors such as U+FE0F, so an emoji followed by one counts twice —
     * confirm whether that is intended before tightening the pattern.
     */
    private static function countEmoji(string $text): int
    {
        // Match common emoji ranges
        $count = preg_match_all(
            '/[\x{1F300}-\x{1FAD6}\x{2600}-\x{27BF}\x{FE00}-\x{FEFF}]/u',
            $text
        );

        return $count === false ? 0 : $count;
    }

    /**
     * Lazy-load and cache the Markdown converter.
     */
    private static function getConverter(): MarkdownConverter
    {
        if (self::$converter === null) {
            $env = new Environment([
                'html_input' => 'strip',
                'allow_unsafe_links' => false,
                'max_nesting_level' => 10,
            ]);
            $env->addExtension(new CommonMarkCoreExtension());
            $env->addExtension(new AutolinkExtension());
            $env->addExtension(new StrikethroughExtension());

            self::$converter = new MarkdownConverter($env);
        }

        return self::$converter;
    }
}