106 lines
3.8 KiB
PHP
106 lines
3.8 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Moderation;
|
|
|
|
use App\Enums\ModerationContentType;
|
|
use App\Models\Artwork;
|
|
use App\Models\ArtworkComment;
|
|
|
|
/**
 * Detects repeated ("campaign") content among artwork comments and artwork descriptions.
 *
 * Text is first reduced to a normalized signature (lower-cased, links replaced by a
 * "[link]" placeholder, punctuation removed, whitespace collapsed). Signatures are then
 * either hashed into a group key or compared against recently stored content.
 */
class DuplicateDetectionService
{
    /** Fallback similarity percentage when the config key is not set. */
    private const DEFAULT_SIMILARITY_THRESHOLD = 84;

    /** Similarity percentage required once link placeholders are removed and the top domain matches. */
    private const LINKLESS_SIMILARITY_THRESHOLD = 72;

    /** Points subtracted from the main threshold when the candidate belongs to the same artwork. */
    private const SAME_ARTWORK_PENALTY = 4;

    /** Number of most recent rows (by id) that are compared against the incoming content. */
    private const CANDIDATE_LIMIT = 80;

    /** Number of leading signature tokens that take part in the group key. */
    private const GROUP_KEY_TOKENS = 12;

    /** Number of distinct domains that take part in the group key. */
    private const GROUP_KEY_DOMAINS = 2;

    /**
     * Normalizes free text into a comparable signature.
     *
     * Steps: lower-case, replace "http(s)://..." and "www...." runs with " [link] ",
     * replace every run of characters that are not letters, digits, whitespace or
     * square brackets with a space, collapse whitespace runs to one space, trim.
     *
     * Whitespace is collapsed BEFORE trimming on purpose: trim() only strips ASCII
     * whitespace, while the Unicode-aware "\s" also matches characters such as a
     * no-break space. Collapsing first turns those into plain spaces, so the result
     * never starts or ends with a space and whitespace-only input yields ''.
     *
     * A failing preg_replace() (null result) degrades to an empty string.
     */
    public function campaignText(string $content): string
    {
        $text = mb_strtolower($content);

        // Applied strictly in this order; each step works on the previous step's output.
        $steps = [
            ['/https?:\/\/\S+/iu', ' [link] '],
            ['/www\.\S+/iu', ' [link] '],
            ['/[^\p{L}\p{N}\s\[\]]+/u', ' '],
            ['/\s+/u', ' '],
        ];

        foreach ($steps as [$pattern, $replacement]) {
            $text = (string) preg_replace($pattern, $replacement, $text);
        }

        return trim($text);
    }

    /**
     * Builds a SHA-256 group key from the first distinct domains and the leading
     * tokens of the normalized content.
     *
     * @param array<int, string> $domains
     */
    public function buildGroupKey(string $content, array $domains = []): string
    {
        $template = $this->campaignText($content);

        // Tokenizing with PREG_SPLIT_NO_EMPTY makes the key independent of any
        // leading/trailing whitespace in the template.
        $tokens = preg_split('/\s+/u', $template, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        $signature = implode(' ', array_slice($tokens, 0, self::GROUP_KEY_TOKENS));

        $distinctDomains = array_values(array_unique($domains));
        $domainPart = implode('|', array_slice($distinctDomains, 0, self::GROUP_KEY_DOMAINS));

        return hash('sha256', $domainPart . '::' . $signature);
    }

    /**
     * Counts recently stored items whose normalized text is a near duplicate of $content.
     *
     * A candidate counts as a match when either
     *  - its signature similarity (similar_text percentage) reaches the configured
     *    threshold, lowered by a small penalty when it belongs to the same artwork, or
     *  - its raw text contains the first given domain and the signatures, compared
     *    without their "[link]" placeholders, reach the lower link-less threshold.
     *
     * Returns 0 when the content normalizes to an empty signature or when the content
     * type is not one of the supported moderation content types.
     *
     * @param array<string, mixed> $context Reads 'content_type', 'content_id' and 'artwork_id'.
     * @param array<int, string> $domains
     */
    public function nearDuplicateCount(string $content, array $context = [], array $domains = []): int
    {
        $type = (string) ($context['content_type'] ?? '');
        $contentId = (int) ($context['content_id'] ?? 0);
        $artworkId = (int) ($context['artwork_id'] ?? 0);

        $signature = $this->campaignText($content);
        if ($signature === '') {
            return 0;
        }

        $candidates = match ($type) {
            ModerationContentType::ArtworkComment->value => ArtworkComment::query()
                ->where('id', '!=', $contentId)
                ->whereNull('deleted_at')
                ->latest('id')
                ->limit(self::CANDIDATE_LIMIT)
                ->get(['id', 'artwork_id', 'raw_content', 'content']),
            ModerationContentType::ArtworkDescription->value => Artwork::query()
                ->where('id', '!=', $contentId)
                ->whereNotNull('description')
                ->latest('id')
                ->limit(self::CANDIDATE_LIMIT)
                ->get(['id', 'description']),
            default => \collect(),
        };

        // Loop-invariant values are resolved once instead of once per candidate.
        $threshold = (float) \app('config')->get(
            'content_moderation.duplicate_detection.near_duplicate_similarity',
            self::DEFAULT_SIMILARITY_THRESHOLD,
        );
        $topDomain = $this->topDomain($domains);
        $linklessSignature = $this->stripLinks($signature);

        $matches = 0;

        foreach ($candidates as $candidate) {
            $candidateText = match ($type) {
                ModerationContentType::ArtworkComment->value => (string) ($candidate->raw_content ?: $candidate->content),
                ModerationContentType::ArtworkDescription->value => (string) ($candidate->description ?? ''),
                default => '',
            };

            if ($candidateText === '') {
                continue;
            }

            $candidateSignature = $this->campaignText($candidateText);

            $similarity = 0.0;
            similar_text($signature, $candidateSignature, $similarity);

            // Comments carry artwork_id; artworks themselves fall back to their own id.
            $sameArtwork = $artworkId > 0
                && (int) ($candidate->artwork_id ?? $candidate->id ?? 0) === $artworkId;
            $requiredSimilarity = $sameArtwork
                ? $threshold - self::SAME_ARTWORK_PENALTY
                : $threshold;

            if ($similarity >= $requiredSimilarity) {
                $matches++;

                continue;
            }

            // Secondary check: same top domain plus similar wording around the links.
            if ($topDomain === null || !str_contains(mb_strtolower($candidateText), $topDomain)) {
                continue;
            }

            $linklessSimilarity = 0.0;
            similar_text($linklessSignature, $this->stripLinks($candidateSignature), $linklessSimilarity);

            if ($linklessSimilarity >= self::LINKLESS_SIMILARITY_THRESHOLD) {
                $matches++;
            }
        }

        return $matches;
    }

    /**
     * Returns the first domain of the list, trimmed and lower-cased, or null when
     * there is none.
     *
     * The list is re-indexed first so that arrays whose keys no longer start at 0
     * still yield their first entry. Blank entries are treated as "no domain":
     * str_contains() reports true for an empty needle, which would otherwise make
     * every candidate pass the domain check.
     *
     * @param array<int, string> $domains
     */
    private function topDomain(array $domains): ?string
    {
        $first = array_values($domains)[0] ?? null;
        if ($first === null) {
            return null;
        }

        $domain = mb_strtolower(trim((string) $first));

        return $domain === '' ? null : $domain;
    }

    /**
     * Removes every "[link]" placeholder from a signature and trims the result.
     */
    private function stripLinks(string $text): string
    {
        return trim(str_replace('[link]', '', $text));
    }
}