Files
SkinbaseNova/app/Services/Traffic/BotClassifier.php

158 lines
4.7 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Services\Traffic;
use Illuminate\Http\Request;
/**
 * Classifies a request as bot or non-bot traffic using only its User-Agent header.
 *
 * Matching is a case-insensitive substring search against fixed signature lists, so
 * the result is a heuristic label, not a verified identity: any client can send any
 * User-Agent string.
 */
final class BotClassifier
{
    /**
     * Signature groups in evaluation order: bot type => [family label => lowercase needles].
     *
     * Named crawler families are deliberately listed BEFORE the generic HTTP-tooling
     * group. Some named crawlers also advertise the HTTP library they are built on
     * (e.g. LinkedInBot sends "LinkedInBot/1.0 (compatible; Mozilla/5.0; Apache-HttpClient ...)"),
     * and testing the tooling needles first would label them "suspicious_bot" and make
     * their own entry unreachable.
     *
     * Within a group, the first family whose needle is found wins.
     */
    private const SIGNATURE_GROUPS = [
        'search_bot' => [
            'Googlebot' => ['googlebot'],
            'Bingbot' => ['bingbot'],
            'DuckDuckBot' => ['duckduckbot'],
            'YandexBot' => ['yandexbot'],
            'Baiduspider' => ['baiduspider'],
            'Applebot' => ['applebot'],
            'Slurp' => ['slurp'],
        ],
        'ai_bot' => [
            'GPTBot' => ['gptbot'],
            'ChatGPT-User' => ['chatgpt-user'],
            'OAI-SearchBot' => ['oai-searchbot'],
            'ClaudeBot' => ['claudebot'],
            'PerplexityBot' => ['perplexitybot'],
            'Bytespider' => ['bytespider'],
            'CCBot' => ['ccbot'],
            'Google-Extended' => ['google-extended'],
            'anthropic-ai' => ['anthropic-ai'],
            'cohere-ai' => ['cohere-ai'],
        ],
        'seo_bot' => [
            'AhrefsBot' => ['ahrefsbot'],
            'SemrushBot' => ['semrushbot'],
            'MJ12bot' => ['mj12bot'],
            'DotBot' => ['dotbot'],
            'PetalBot' => ['petalbot'],
            'DataForSeoBot' => ['dataforseobot'],
            'BLEXBot' => ['blexbot'],
            'MauiBot' => ['mauibot'],
            'serpstatbot' => ['serpstatbot'],
        ],
        'social_bot' => [
            'facebookexternalhit' => ['facebookexternalhit'],
            'Twitterbot' => ['twitterbot'],
            'LinkedInBot' => ['linkedinbot'],
            'Slackbot' => ['slackbot'],
            'Discordbot' => ['discordbot'],
            'TelegramBot' => ['telegrambot'],
            'WhatsApp' => ['whatsapp'],
            'Pinterestbot' => ['pinterestbot'],
        ],
        'monitoring_bot' => [
            'UptimeRobot' => ['uptimerobot'],
            'Pingdom' => ['pingdom'],
            'StatusCake' => ['statuscake'],
            'Better Stack' => ['better stack', 'betterstack'],
            'BetterUptime' => ['betteruptime'],
        ],
        // Generic HTTP libraries, CLI clients and scanners: checked last among the
        // signature groups so they only catch traffic that did not identify itself
        // as one of the named crawlers above.
        'suspicious_bot' => [
            'curl' => ['curl'],
            'wget' => ['wget'],
            'python-requests' => ['python-requests'],
            'libwww-perl' => ['libwww-perl'],
            'Go-http-client' => ['go-http-client'],
            'Java' => ['java/'],
            'scrapy' => ['scrapy'],
            'httpclient' => ['httpclient'],
            'masscan' => ['masscan'],
            'nikto' => ['nikto'],
            'sqlmap' => ['sqlmap'],
        ],
    ];

    /** Catch-all needles for self-declared crawlers that match no known family. */
    private const GENERIC_CRAWLER_KEYWORDS = ['bot', 'crawler', 'spider', 'crawl', 'preview'];

    /** User-Agent strings shorter than this many bytes (after trimming) are treated as suspicious. */
    private const MIN_USER_AGENT_LENGTH = 8;

    /**
     * Classify the request's User-Agent.
     *
     * Checks run in this order and the first hit wins:
     *  1. empty User-Agent            -> suspicious_bot / "Empty UA"
     *  2. signature groups            -> the group's type and the matched family label
     *  3. very short User-Agent       -> suspicious_bot / "Short UA"
     *  4. generic crawler keywords    -> unknown_bot / "Unknown crawler"
     *  5. otherwise                   -> not a bot (type and family are null)
     *
     * @return array{is_bot: bool, type: ?string, family: ?string}
     */
    public function classify(Request $request): array
    {
        // userAgent() may return null; the cast turns that into '' for the empty check.
        $userAgent = trim((string) $request->userAgent());

        if ($userAgent === '') {
            return $this->bot('suspicious_bot', 'Empty UA');
        }

        // All needles are lowercase, so lowercase the haystack once up front.
        $normalized = strtolower($userAgent);

        foreach (self::SIGNATURE_GROUPS as $type => $families) {
            $family = $this->matchFamily($normalized, $families);

            if ($family !== null) {
                return $this->bot($type, $family);
            }
        }

        // Evaluated after the signature groups so a short but recognised UA keeps its family.
        if (strlen($userAgent) < self::MIN_USER_AGENT_LENGTH) {
            return $this->bot('suspicious_bot', 'Short UA');
        }

        if ($this->containsAny($normalized, self::GENERIC_CRAWLER_KEYWORDS)) {
            return $this->bot('unknown_bot', 'Unknown crawler');
        }

        return [
            'is_bot' => false,
            'type' => null,
            'family' => null,
        ];
    }

    /**
     * Return the label of the first family with a needle present in the User-Agent.
     *
     * @param string $normalizedUserAgent lowercased User-Agent string
     * @param array<string, array<int, string>> $families family label => lowercase needles
     *
     * @return ?string the matching family label, or null when no family matches
     */
    private function matchFamily(string $normalizedUserAgent, array $families): ?string
    {
        foreach ($families as $family => $keywords) {
            if ($this->containsAny($normalizedUserAgent, $keywords)) {
                return $family;
            }
        }

        return null;
    }

    /**
     * Tell whether the haystack contains at least one of the given keywords.
     *
     * @param array<int, string> $keywords
     */
    private function containsAny(string $haystack, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            // str_contains() is true for an empty needle, so empty keywords are skipped.
            if ($keyword !== '' && str_contains($haystack, $keyword)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Build the result array for a positive bot classification.
     *
     * @return array{is_bot: bool, type: string, family: string}
     */
    private function bot(string $type, string $family): array
    {
        return [
            'is_bot' => true,
            'type' => $type,
            'family' => $family,
        ];
    }
}