57 lines
2.0 KiB
PHP
57 lines
2.0 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Moderation\Rules;
|
|
|
|
use App\Contracts\Moderation\ModerationRuleInterface;
|
|
|
|
/**
 * Moderation rule that flags content in which the same multi-word phrase
 * occurs several times.
 *
 * The normalized text is split into whitespace-separated words. For every
 * phrase length between the configured minimum and maximum, each window of
 * that many consecutive words (an n-gram) is counted, and every n-gram seen
 * at least `min_repetitions` times is reported in a single finding.
 *
 * Settings are read from the `content_moderation.repeated_phrase` config
 * array (`min_phrase_length`, `min_repetitions`, `max_phrase_length`); the
 * finding's score comes from `content_moderation.weights.repeated_phrase`.
 */
class RepeatedPhraseRule implements ModerationRuleInterface
{
    /** Fallback for `min_phrase_length`: shortest phrase, in words, that is examined. */
    private const DEFAULT_MIN_PHRASE_LENGTH = 4;

    /** Fallback for `min_repetitions`: occurrences needed before a phrase is reported. */
    private const DEFAULT_MIN_REPETITIONS = 3;

    /** Fallback for `max_phrase_length`: longest phrase, in words, that is examined. */
    private const DEFAULT_MAX_PHRASE_LENGTH = 8;

    /** Score used when no `repeated_phrase` weight is configured. */
    private const DEFAULT_SCORE = 25;

    /**
     * Look for phrases that are repeated within the normalized content.
     *
     * @param string $content    Not used by this rule (presumably required by the interface signature).
     * @param string $normalized Text that is tokenized and scanned for repeated phrases.
     * @param array  $context    Not used by this rule.
     *
     * @return array<int, array{rule: string, score: mixed, reason: string, links: array, domains: array, keywords: string[]}>
     *         Empty when nothing is repeated often enough, otherwise exactly one finding
     *         listing every repeated phrase.
     */
    public function analyze(string $content, string $normalized, array $context = []): array
    {
        $config = app('config')->get('content_moderation.repeated_phrase', []);
        $weights = app('config')->get('content_moderation.weights', []);

        // Clamp configured values: a phrase length below 1 would produce empty
        // n-grams, and a phrase seen only once is not a repetition.
        $minPhraseLength = max(1, (int) ($config['min_phrase_length'] ?? self::DEFAULT_MIN_PHRASE_LENGTH));
        $minRepetitions = max(2, (int) ($config['min_repetitions'] ?? self::DEFAULT_MIN_REPETITIONS));
        $maxPhraseLength = (int) ($config['max_phrase_length'] ?? self::DEFAULT_MAX_PHRASE_LENGTH);

        $words = $this->tokenize($normalized);
        $wordCount = count($words);

        // Skip texts with fewer words than min length times min repetitions.
        if ($wordCount < $minPhraseLength * $minRepetitions) {
            return [];
        }

        // Phrases longer than half the text are never examined.
        $longestPhrase = min($maxPhraseLength, intdiv($wordCount, 2));

        $repeatedPhrases = $this->findRepeatedPhrases($words, $minPhraseLength, $longestPhrase, $minRepetitions);
        if ($repeatedPhrases === []) {
            return [];
        }

        $keywords = [];
        $descriptions = [];
        foreach ($repeatedPhrases as $phrase => $count) {
            // PHP converts numeric-string array keys to integers; cast back so
            // the reported keywords are always strings.
            $phrase = (string) $phrase;
            $keywords[] = $phrase;
            $descriptions[] = "\"{$phrase}\" ({$count}x)";
        }

        return [
            [
                'rule' => 'repeated_phrase',
                'score' => $weights['repeated_phrase'] ?? self::DEFAULT_SCORE,
                'reason' => 'Contains repeated phrases: ' . implode(', ', $descriptions),
                'links' => [],
                'domains' => [],
                'keywords' => $keywords,
            ],
        ];
    }

    /**
     * Split text into whitespace-separated words, dropping empty tokens.
     *
     * @return string[]
     */
    private function tokenize(string $text): array
    {
        // The `u` modifier makes the pattern operate on UTF-8 characters rather
        // than bytes; PREG_SPLIT_NO_EMPTY discards the empty tokens that leading
        // or trailing whitespace would otherwise produce.
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($words === false) {
            // preg_split() returns false on failure (e.g. input that is not
            // valid UTF-8 under `u`); retry byte-wise instead of giving up.
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
        }

        return $words === false ? [] : $words;
    }

    /**
     * Count every n-gram of each length in [$minLength, $maxLength] and keep
     * those that occur at least $minRepetitions times.
     *
     * Windows advance one word at a time, so counted occurrences may overlap.
     *
     * @param string[] $words
     *
     * @return array<int|string, int> Occurrence count keyed by phrase, in discovery order.
     */
    private function findRepeatedPhrases(array $words, int $minLength, int $maxLength, int $minRepetitions): array
    {
        $wordCount = count($words);
        $repeated = [];

        for ($length = $minLength; $length <= $maxLength; $length++) {
            $counts = [];
            $lastStart = $wordCount - $length;

            for ($start = 0; $start <= $lastStart; $start++) {
                $ngram = implode(' ', array_slice($words, $start, $length));
                $counts[$ngram] = ($counts[$ngram] ?? 0) + 1;
            }

            // array_filter() preserves keys, so phrases stay attached to their counts.
            $repeated += array_filter(
                $counts,
                static fn (int $count): bool => $count >= $minRepetitions
            );
        }

        return $repeated;
    }
}
|