50 lines
1.7 KiB
PHP
50 lines
1.7 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Moderation\Rules;
|
|
|
|
use App\Contracts\Moderation\ModerationRuleInterface;
|
|
|
|
/**
 * Moderation rule that flags content using Unicode tricks to disguise text.
 *
 * Five independent regex checks are run against the raw content. Each check that
 * matches at least once adds one to the match count, and the resulting score is
 * the configured per-pattern weight multiplied by that count.
 */
class UnicodeObfuscationRule implements ModerationRuleInterface
{
    /**
     * Per-pattern score used when `content_moderation.weights.unicode_obfuscation`
     * is not configured.
     */
    private const DEFAULT_WEIGHT = 30;

    /**
     * Patterns that indicate homoglyph / lookalike or invisible-character tricks.
     * All of them use the `u` modifier and therefore require a valid UTF-8 subject.
     */
    private const SUSPICIOUS_PATTERNS = [
        // Mixed script: a whitespace-delimited run that contains both a Cyrillic
        // character (U+0400–U+04FF) and an ASCII Latin letter.
        '/\b(?=\S*[\x{0400}-\x{04FF}])(?=\S*[a-zA-Z])\S+\b/u',
        // Zero-width characters (ZWSP, ZWNJ, ZWJ, BOM/ZWNBSP) and the soft hyphen.
        // NOTE(review): ZWJ/ZWNJ also occur in emoji sequences and some scripts;
        // confirm that flagging them is acceptable for the target audience.
        '/[\x{200B}\x{200C}\x{200D}\x{FEFF}\x{00AD}]/u',
        // Invisible formatting characters: word joiner and invisible math operators.
        '/[\x{2060}\x{2061}\x{2062}\x{2063}\x{2064}]/u',
        // Fullwidth ASCII variants (letters, digits AND punctuation, U+FF01–U+FF5E),
        // used to bypass filters that only look at plain ASCII.
        '/[\x{FF01}-\x{FF5E}]/u',
        // Mathematical alphanumeric symbols (U+1D400–U+1D7FF) used as text.
        '/[\x{1D400}-\x{1D7FF}]/u',
    ];

    /**
     * Inspect the raw content for Unicode obfuscation.
     *
     * @param string $content    Raw content to inspect.
     * @param string $normalized Normalized form of the content; not used by this rule.
     * @param array  $context    Additional context; not used by this rule.
     *
     * @return array Empty when nothing matched; otherwise a list with a single finding
     *               containing the keys `rule`, `score`, `reason`, `links`, `domains`
     *               and `keywords`.
     */
    public function analyze(string $content, string $normalized, array $context = []): array
    {
        // With the `u` modifier preg_match() returns false (not 0) for a subject that
        // is not valid UTF-8, which would make every check below silently pass.
        // Replace malformed byte sequences first so the rest of the text is inspected.
        $subject = mb_check_encoding($content, 'UTF-8')
            ? $content
            : mb_scrub($content, 'UTF-8');

        $matchCount = 0;
        foreach (self::SUSPICIOUS_PATTERNS as $pattern) {
            // Strict comparison: a false return (PCRE error) is treated as "no match"
            // rather than being confused with a result.
            if (preg_match($pattern, $subject) === 1) {
                $matchCount++;
            }
        }

        if ($matchCount === 0) {
            return [];
        }

        // Only look up the configured weights once there is something to score.
        $weights = app('config')->get('content_moderation.weights', []);

        return [
            [
                'rule' => 'unicode_obfuscation',
                'score' => ($weights['unicode_obfuscation'] ?? self::DEFAULT_WEIGHT) * $matchCount,
                'reason' => 'Contains suspicious Unicode characters/obfuscation (' . $matchCount . ' pattern(s) matched)',
                'links' => [],
                'domains' => [],
                'keywords' => [],
            ],
        ];
    }
}
|