/**
 * 2. Convert legacy HTML formatting hints from really old legacy content
* 3. Parse subset of Markdown (bold, italic, code, links, line breaks)
* 4. Sanitize the rendered HTML: whitelist-only tags, strip attributes
* 5. Return safe HTML ready for storage or display
*/
class ContentSanitizer
{
/**
 * Maximum number of emoji allowed before triggering a flood error.
 * validate() adds an error once countEmoji() reports more than this value.
 */
public const EMOJI_COUNT_MAX = 50;
/**
 * Maximum ratio of emoji-to-total-characters before content is considered
 * an emoji flood (applies only when emoji count > 5 to avoid false positives
 * on very short strings like a single reaction comment).
 *
 * validate() divides the emoji count by mb_strlen() of the raw input and
 * compares the quotient against this value with a strict "greater than".
 */
public const EMOJI_DENSITY_MAX = 0.40;
// HTML tags we allow in the final rendered output.
// NOTE(review): the code that consumes this list is not visible in this
// excerpt — confirm against sanitizeHtml() before changing it.
private const ALLOWED_TAGS = [
'p', 'br', 'strong', 'em', 'code', 'pre',
'a', 'ul', 'ol', 'li', 'blockquote', 'del',
];
// Allowed attributes per tag, keyed by tag name. Only 'a' has an entry.
// NOTE(review): presumably a tag without an entry keeps no attributes at
// all — verify against the sanitizer implementation.
private const ALLOWED_ATTRS = [
'a' => ['href', 'title', 'rel', 'target'],
];
// Shared converter instance, null until something assigns it.
// NOTE(review): presumably filled lazily by getConverter(), which is not
// visible in this excerpt — confirm.
private static ?MarkdownConverter $converter = null;
// ─────────────────────────────────────────────────────────────────────────
// Public API
// ─────────────────────────────────────────────────────────────────────────
/**
* Convert raw user input (legacy or new) to sanitized HTML.
*
* @param string|null $raw
* @return string Safe HTML
*/
public static function render(?string $raw): string
{
    // Null and whitespace-only input both short-circuit to an empty string.
    if (trim($raw ?? '') === '') {
        return '';
    }

    // Stage 1: rewrite legacy HTML fragments as Markdown-friendly text.
    $markdown = static::legacyHtmlToMarkdown($raw);

    // Stage 2: turn that Markdown into HTML.
    $rendered = static::parseMarkdown($markdown);

    // Stage 3: run the result through the HTML sanitizer and return it.
    return static::sanitizeHtml($rendered);
}
/**
* Normalize previously rendered HTML for display-time policy changes.
* This is useful when stored HTML predates current link attributes or
* when display rules depend on the author rather than the raw content.
*/
public static function sanitizeRenderedHtml(?string $html, bool $allowLinks = true): string
{
    // Missing or blank markup needs no sanitizing pass.
    $isBlank = $html === null || trim($html) === '';

    // Otherwise hand the markup and the caller's link policy to the sanitizer.
    return $isBlank ? '' : static::sanitizeHtml($html, $allowLinks);
}
/**
* Strip ALL HTML from input, returning plain text with newlines preserved.
*/
public static function stripToPlain(?string $html): string
{
    // Nothing to strip: null is treated the same as an empty document.
    if ($html === null) {
        return '';
    }

    // Convert <br> and </p> to line breaks before stripping, so the line
    // structure of the markup survives once every tag has been removed.
    $text = preg_replace(['/<br\s*\/?>/i', '/<\/p>/i'], "\n", $html);

    // preg_replace() yields null on a PCRE failure; fall back to an empty
    // string instead of handing null to strip_tags().
    $text = strip_tags($text ?? '');

    // Decode HTML entities (e.g. &amp; → &) so callers get literal characters.
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // Leading/trailing whitespace (including newlines left by a closing </p>)
    // is dropped; interior newlines are kept.
    return trim($text);
}
/**
* Parse Markdown-lite subset to HTML.
*/
private static function parseMarkdown(string $text): string
{
    // Fetch the converter, convert, and cast the rendered content to string
    // in a single expression; the cast applies to the whole call chain.
    return (string) static::getConverter()->convert($text)->getContent();
}
/**
* Whitelist-based HTML sanitizer.
* Removes all tags not in ALLOWED_TAGS, and strips disallowed attributes.
*/
private static function sanitizeHtml(string $html, bool $allowLinks = true): string
{
// NOTE(review): this body is incomplete as it stands. The loadHTML() call
// below is cut off inside its first argument, $allowLinks is never read, and
// the statements that follow read $text before anything assigns it — they
// convert paragraph ends to newlines, strip tags and decode entities, which
// looks like the tail of a plain-text routine rather than a whitelist
// sanitizer. Restore the original implementation before relying on this
// method; no tag or attribute filtering is visible here.
// Parse with DOMDocument
$doc = new \DOMDocument('1.0', 'UTF-8');
// Suppress warnings from malformed fragments: libxml collects parse errors
// internally instead of raising PHP warnings.
libxml_use_internal_errors(true);
$doc->loadHTML(
'
/i', '/<\/p>/i'], "\n", $html);
$text = strip_tags($text ?? '');
$text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
return trim($text);
}
/**
* Validate that a Markdown-lite string does not contain disallowed patterns.
* Returns an array of validation errors (empty = OK).
*/
public static function validate(string $raw): array
{
    $problems = [];

    // Character count (not bytes); reused by the density guard further down.
    $length = mb_strlen($raw);
    if ($length > 10_000) {
        $problems[] = 'Content exceeds maximum length of 10,000 characters.';
    }

    // Anything shaped like an opening HTML tag is rejected outright.
    if (preg_match('/<[a-z][^>]*>/i', $raw) === 1) {
        $problems[] = 'HTML tags are not allowed. Use Markdown formatting instead.';
    }

    // Absolute cap on emoji.
    $emojiTotal = static::countEmoji($raw);
    if ($emojiTotal > self::EMOJI_COUNT_MAX) {
        $problems[] = 'Too many emoji. Please limit emoji usage.';
    }

    // Density guard: only considered above 5 emoji so that short reaction
    // comments are not flagged; catches floods that stay under the absolute cap.
    $densityApplies = $emojiTotal > 5 && $length > 0;
    if ($densityApplies && ($emojiTotal / $length) > self::EMOJI_DENSITY_MAX) {
        $problems[] = 'Content is mostly emoji. Please add some text.';
    }

    return $problems;
}
/**
* Collapse consecutive runs of the same emoji in $text.
*
* Delegates to LegacySmileyMapper::collapseFlood() so the behaviour is
* consistent between new submissions and migrated legacy content.
*
* Example: "🍺 🍺 🍺 🍺 🍺 🍺 🍺" (7×) → "🍺 🍺 🍺 🍺 🍺 ×7"
*
* @param int $maxRun Keep at most this many consecutive identical emoji.
*/
public static function collapseFlood(string $text, int $maxRun = 5): string
{
    // Pure pass-through: the shared mapper owns the collapsing rules.
    $collapsed = LegacySmileyMapper::collapseFlood($text, $maxRun);

    return $collapsed;
}
// ─────────────────────────────────────────────────────────────────────────
// Private helpers
// ─────────────────────────────────────────────────────────────────────────
/**
* Convert legacy HTML-style formatting to Markdown equivalents.
* This runs BEFORE Markdown parsing to handle old content gracefully.
*/
private static function legacyHtmlToMarkdown(string $html): string
{
$replacements = [
// Bold
'/(.*?)<\/b>/is' => '**$1**',
'/(.*?)<\/strong>/is' => '**$1**',
// Italic
'/(.*?)<\/i>/is' => '*$1*',
'/(.*?)<\/em>/is' => '*$1*',
// Line breaks → actual newlines
'/
/i' => "\n",
// Paragraphs
'/