get('content_moderation.weights', []); // Detect homoglyph / lookalike characters // Common spam tactic: replace Latin chars with Cyrillic, Greek, or special Unicode $suspiciousPatterns = [ // Mixed script detection: Latin + Cyrillic in same word '/\b(?=\S*[\x{0400}-\x{04FF}])(?=\S*[a-zA-Z])\S+\b/u', // Zero-width characters '/[\x{200B}\x{200C}\x{200D}\x{FEFF}\x{00AD}]/u', // Invisible formatting characters '/[\x{2060}\x{2061}\x{2062}\x{2063}\x{2064}]/u', // Fullwidth Latin letters (used to bypass filters) '/[\x{FF01}-\x{FF5E}]/u', // Mathematical alphanumeric symbols used as text '/[\x{1D400}-\x{1D7FF}]/u', ]; $matchCount = 0; foreach ($suspiciousPatterns as $pattern) { if (preg_match($pattern, $content)) { $matchCount++; } } if ($matchCount > 0) { $findings[] = [ 'rule' => 'unicode_obfuscation', 'score' => ($weights['unicode_obfuscation'] ?? 30) * $matchCount, 'reason' => 'Contains suspicious Unicode characters/obfuscation (' . $matchCount . ' pattern(s) matched)', 'links' => [], 'domains' => [], 'keywords' => [], ]; } return $findings; } }