286 lines
9.0 KiB
PHP
286 lines
9.0 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Services\Sitemaps;
|
|
|
|
use App\Models\Artwork;
|
|
use App\Models\User;
|
|
use DOMDocument;
|
|
use DOMXPath;
|
|
|
|
/**
 * Validates the generated sitemap index and the per-family sitemap documents.
 *
 * The service rebuilds the main index and every document of the selected
 * families through the build service, parses the emitted XML and reports:
 * unparsable documents, index entries that differ from the expected manifest,
 * URLs that are not absolute / carry a query string or fragment / hit a
 * forbidden path / fail a family-specific check, URLs that occur more than
 * once, and image locations that are not absolute http(s) URLs (warnings).
 */
final class SitemapValidationService
{
    public function __construct(
        private readonly SitemapBuildService $build,
        private readonly SitemapIndexService $index,
        private readonly SitemapRegistry $registry,
        private readonly SitemapShardService $shards,
    ) {
    }

    /**
     * Run the validation and return a structured report.
     *
     * The returned array has the keys `ok` (bool, true when there are no index
     * errors, no family errors and no duplicate URLs), `index` (`errors` and
     * `url_count`), `families` (one report per validated family), `duplicates`
     * (URLs seen more than once across all validated documents) and `totals`
     * (`families`, `documents`, `urls`, `shards`).
     *
     * @param list<string> $onlyFamilies Family names to restrict the run to; names unknown
     *                                   to the registry are dropped. Empty means "all
     *                                   families the build service reports as enabled".
     *
     * @return array<string, mixed>
     */
    public function validate(array $onlyFamilies = []): array
    {
        $families = $onlyFamilies !== []
            ? array_values(array_filter(
                $onlyFamilies,
                fn (string $family): bool => $this->registry->get($family) !== null,
            ))
            : $this->build->enabledFamilies();

        $expectedIndexLocs = array_map(
            static fn (SitemapIndexItem $item): string => $item->loc,
            array_values(array_filter(
                $this->index->items(),
                fn (SitemapIndexItem $item): bool => $this->isFamilySelected($families, $item->loc),
            )),
        );

        $indexBuild = $this->build->buildIndex(true, false);
        $indexErrors = [];
        $indexXml = $this->loadXml($indexBuild['content']);

        if ($indexXml === null) {
            $indexErrors[] = 'The main sitemap index XML could not be parsed.';
        }

        // NOTE(review): the expected list above is narrowed to the selected families,
        // while the actual list is every sitemap/loc entry of the built index. If
        // buildIndex() does not apply the same narrowing, a run restricted through
        // $onlyFamilies will always report a mismatch here - confirm against
        // SitemapBuildService::buildIndex().
        $actualIndexLocs = $indexXml !== null ? $this->extractLocs($indexXml, 'sitemap') : [];

        // Strict array comparison: the entries must match in both content and order.
        if ($indexXml !== null && $actualIndexLocs !== $expectedIndexLocs) {
            $indexErrors[] = 'Main sitemap index child references do not match the expected shard-aware manifest.';
        }

        $familyReports = [];
        $duplicates = [];
        $seenUrls = [];
        $totalUrlCount = 0;
        $totalShardCount = 0;

        foreach ($families as $family) {
            $builder = $this->registry->get($family);

            if ($builder === null) {
                continue;
            }

            $report = [
                'family' => $family,
                'documents' => 0,
                'url_count' => 0,
                'shard_count' => max(1, $this->shards->shardCount($builder)),
                'errors' => [],
                'warnings' => [],
            ];

            $totalShardCount += $report['shard_count'];

            foreach ($this->build->documentNamesForFamily($family, true) as $name) {
                $built = $this->build->buildNamed($name, true, false);

                if ($built === null) {
                    $report['errors'][] = 'Unable to resolve sitemap [' . $name . '].';

                    continue;
                }

                $document = $this->loadXml($built['content']);

                if ($document === null) {
                    $report['errors'][] = 'Invalid XML emitted for [' . $name . '].';

                    continue;
                }

                // Only documents that were built and parsed successfully are counted.
                $report['documents']++;

                if ($built['type'] === SitemapTarget::TYPE_INDEX) {
                    // Index-type documents list child sitemaps, not page URLs: compare
                    // the references and skip the per-URL checks below.
                    $expectedFamilyLocs = array_map(
                        static fn (SitemapIndexItem $item): string => $item->loc,
                        $this->index->itemsForBuilder($builder),
                    );

                    $actualFamilyLocs = $this->extractLocs($document, 'sitemap');

                    if ($actualFamilyLocs !== $expectedFamilyLocs) {
                        $report['errors'][] = 'Shard compatibility index [' . $name . '] does not reference the expected shard URLs.';
                    }

                    continue;
                }

                $locs = $this->extractLocs($document, 'url');
                $report['url_count'] += count($locs);
                $totalUrlCount += count($locs);

                foreach ($locs as $loc) {
                    // $seenUrls spans all families, so a URL repeated in another
                    // family or shard is recorded as a duplicate as well.
                    if (isset($seenUrls[$loc])) {
                        $duplicates[$loc] = ($duplicates[$loc] ?? 1) + 1;
                    }

                    $seenUrls[$loc] = true;

                    $reason = $this->urlError($family, $loc);

                    if ($reason !== null) {
                        $report['errors'][] = $reason . ' [' . $loc . ']';
                    }
                }

                foreach ($this->extractImageLocs($document) as $imageLoc) {
                    if (! preg_match('/^https?:\/\//i', $imageLoc)) {
                        $report['warnings'][] = 'Non-absolute image URL found [' . $imageLoc . ']';
                    }
                }
            }

            $familyReports[] = $report;
        }

        return [
            'ok' => $indexErrors === [] && $this->familyErrors($familyReports) === [] && $duplicates === [],
            'index' => [
                'errors' => $indexErrors,
                'url_count' => count($actualIndexLocs),
            ],
            'families' => $familyReports,
            'duplicates' => array_keys($duplicates),
            'totals' => [
                'families' => count($familyReports),
                'documents' => array_sum(array_map(
                    static fn (array $report): int => (int) $report['documents'],
                    $familyReports,
                )),
                'urls' => $totalUrlCount,
                'shards' => $totalShardCount,
            ],
        ];
    }

    /**
     * Flatten the `errors` entries of all family reports into a single list.
     *
     * @param list<array<string, mixed>> $reports
     *
     * @return list<string>
     */
    private function familyErrors(array $reports): array
    {
        $errors = [];

        foreach ($reports as $report) {
            foreach ((array) ($report['errors'] ?? []) as $error) {
                $errors[] = (string) $error;
            }
        }

        return $errors;
    }

    /**
     * Parse an XML string, returning null when it is empty or not well-formed.
     *
     * libxml errors are collected internally while parsing and discarded
     * afterwards; the previous libxml error-handling mode is always restored.
     */
    private function loadXml(string $content): ?DOMDocument
    {
        // DOMDocument::loadXML() throws a ValueError for an empty source on PHP 8+.
        // Empty output must be reported as invalid XML, not abort the whole run.
        if ($content === '') {
            return null;
        }

        $document = new DOMDocument();
        $previous = libxml_use_internal_errors(true);

        try {
            $loaded = $document->loadXML($content);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $loaded ? $document : null;
    }

    /**
     * Collect the trimmed, non-empty text of every `loc` element that is a direct
     * child of an element with the given local name, in document order.
     *
     * Matching is done on local names, so the namespace prefix is irrelevant.
     *
     * @param string $nodeName Local name of the parent element (e.g. `url`, `sitemap`, `image`).
     *                         It is concatenated into the XPath expression, so it must be a
     *                         trusted literal.
     *
     * @return list<string>
     */
    private function extractLocs(DOMDocument $document, string $nodeName): array
    {
        $xpath = new DOMXPath($document);
        $nodes = $xpath->query('//*[local-name()="' . $nodeName . '"]/*[local-name()="loc"]');
        $locs = [];

        // query() yields false for an invalid expression; treat that as "no nodes".
        foreach ($nodes ?: [] as $node) {
            $value = trim((string) $node->textContent);

            if ($value !== '') {
                $locs[] = $value;
            }
        }

        return $locs;
    }

    /**
     * Collect the `loc` values found directly inside `image` elements.
     *
     * @return list<string>
     */
    private function extractImageLocs(DOMDocument $document): array
    {
        return $this->extractLocs($document, 'image');
    }

    /**
     * Tell whether a sitemap location belongs to one of the given families.
     *
     * A location matches when it contains `/sitemaps/{family}.xml` or
     * `/sitemaps/{family}-`.
     *
     * NOTE(review): the `{family}-` form also matches a different family whose
     * name merely starts with `{family}-`; confirm no such family names exist.
     *
     * @param list<string> $families
     */
    private function isFamilySelected(array $families, string $loc): bool
    {
        foreach ($families as $family) {
            if (str_contains($loc, '/sitemaps/' . $family . '.xml') || str_contains($loc, '/sitemaps/' . $family . '-')) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check a single sitemap URL and return the reason it is invalid, or null.
     *
     * Checks run in this order: the URL must have a scheme and host, must not
     * carry a query string or fragment, must not contain any configured
     * forbidden path, and must pass the family-specific check when one exists
     * (`artworks`, `users`).
     */
    private function urlError(string $family, string $loc): ?string
    {
        $parts = parse_url($loc);

        if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
            return 'Non-absolute URL emitted';
        }

        if (($parts['query'] ?? '') !== '') {
            return 'Query-string URL emitted';
        }

        if (($parts['fragment'] ?? '') !== '') {
            return 'Fragment URL emitted';
        }

        // Normalise to exactly one leading slash; a missing path becomes "/".
        $path = '/' . ltrim((string) ($parts['path'] ?? '/'), '/');

        foreach ((array) config('sitemaps.validation.forbidden_paths', []) as $forbidden) {
            $needle = (string) $forbidden;

            // str_contains() is true for an empty needle, so an empty/null config
            // entry would flag every URL; "/" is skipped because every path has it.
            if ($needle === '' || $needle === '/') {
                continue;
            }

            if (str_contains($path, $needle)) {
                return 'Non-public path emitted';
            }
        }

        return match ($family) {
            'artworks' => $this->validateArtworkUrl($path),
            'users' => $this->validateUserUrl($path),
            default => null,
        };
    }

    /**
     * Check that a path has the form `/art/{id}` or `/art/{id}/{segment}` and that
     * the artwork with that id is found through the `public()->published()` query.
     *
     * Runs one database lookup per call.
     */
    private function validateArtworkUrl(string $path): ?string
    {
        if (! preg_match('~^/art/(\d+)(?:/[^/?#]+)?$~', $path, $matches)) {
            return 'Non-canonical artwork URL emitted';
        }

        $artwork = Artwork::query()->public()->published()->find((int) $matches[1]);

        return $artwork === null ? 'Non-public artwork URL emitted' : null;
    }

    /**
     * Check that a path has the form `/@{username}` and that an active,
     * non-deleted user with that username (compared case-insensitively) exists.
     *
     * Runs one database lookup per call.
     */
    private function validateUserUrl(string $path): ?string
    {
        if (! preg_match('#^/@([A-Za-z0-9_\-]+)$#', $path, $matches)) {
            return 'Non-canonical user URL emitted';
        }

        $username = strtolower((string) $matches[1]);

        $user = User::query()
            ->where('is_active', true)
            ->whereNull('deleted_at')
            ->whereRaw('LOWER(username) = ?', [$username])
            ->first();

        return $user === null ? 'Non-public user URL emitted' : null;
    }
}