Files
SkinbaseNova/app/Services/Sitemaps/SitemapValidationService.php

286 lines
9.0 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Services\Sitemaps;
use App\Models\Artwork;
use App\Models\User;
use DOMDocument;
use DOMXPath;
final class SitemapValidationService
{
/**
 * Wires up the collaborators used while validating generated sitemaps.
 *
 * @param SitemapBuildService $build    Lists enabled families and builds the index and named documents.
 * @param SitemapIndexService $index    Supplies the index items the built documents are compared against.
 * @param SitemapRegistry     $registry Resolves a family name to its builder (null when unknown).
 * @param SitemapShardService $shards   Reports the shard count for a builder.
 */
public function __construct(
    private readonly SitemapBuildService $build,
    private readonly SitemapIndexService $index,
    private readonly SitemapRegistry $registry,
    private readonly SitemapShardService $shards,
) {}
/**
 * Builds the main sitemap index and every document of the selected families,
 * then reports parse failures, manifest mismatches, invalid URLs and duplicates.
 *
 * @param list<string> $onlyFamilies Family names to restrict the run to. Names the
 *                                   registry does not know are dropped and repeated
 *                                   names are validated once. When empty, the build
 *                                   service's enabled families are used.
 * @return array<string, mixed> Report with the keys `ok`, `index`, `families`,
 *                              `duplicates` and `totals`. `ok` is true only when there
 *                              are no index errors, no family errors and no duplicates.
 */
public function validate(array $onlyFamilies = []): array
{
    // Repeated names are collapsed: validating a family twice would report
    // every one of its URLs as a duplicate.
    $families = $onlyFamilies !== []
        ? array_values(array_unique(array_filter(
            $onlyFamilies,
            fn (string $family): bool => $this->registry->get($family) !== null,
        )))
        : $this->build->enabledFamilies();

    // Expected child references of the main index, limited to the selected families.
    $expectedIndexLocs = array_map(
        static fn (SitemapIndexItem $item): string => $item->loc,
        array_values(array_filter(
            $this->index->items(),
            fn (SitemapIndexItem $item): bool => $this->isFamilySelected($families, $item->loc),
        )),
    );

    $indexBuild = $this->build->buildIndex(true, false);
    $indexErrors = [];
    $indexXml = $this->loadXml($indexBuild['content']);

    if ($indexXml === null) {
        $indexErrors[] = 'The main sitemap index XML could not be parsed.';
    }

    $actualIndexLocs = $indexXml !== null ? $this->extractLocs($indexXml, 'sitemap') : [];

    // The expected list only covers the selected families, so a restricted run
    // narrows the built index the same way before comparing; otherwise entries
    // of unselected families would always be flagged as a mismatch.
    // NOTE(review): assumes buildIndex() emits the full index regardless of the selection - confirm.
    $comparableIndexLocs = $onlyFamilies !== []
        ? array_values(array_filter(
            $actualIndexLocs,
            fn (string $loc): bool => $this->isFamilySelected($families, $loc),
        ))
        : $actualIndexLocs;

    // Strict array comparison: both the set of references and their order must match.
    if ($indexXml !== null && $comparableIndexLocs !== $expectedIndexLocs) {
        $indexErrors[] = 'Main sitemap index child references do not match the expected shard-aware manifest.';
    }

    $familyReports = [];
    $duplicates = [];
    $seenUrls = [];
    $totalUrlCount = 0;
    $totalShardCount = 0;

    foreach ($families as $family) {
        $builder = $this->registry->get($family);

        if ($builder === null) {
            continue;
        }

        $report = [
            'family' => $family,
            'documents' => 0,
            'url_count' => 0,
            'shard_count' => max(1, $this->shards->shardCount($builder)),
            'errors' => [],
            'warnings' => [],
        ];
        $totalShardCount += $report['shard_count'];

        foreach ($this->build->documentNamesForFamily($family, true) as $name) {
            $built = $this->build->buildNamed($name, true, false);

            if ($built === null) {
                $report['errors'][] = 'Unable to resolve sitemap [' . $name . '].';
                continue;
            }

            $document = $this->loadXml($built['content']);

            if ($document === null) {
                $report['errors'][] = 'Invalid XML emitted for [' . $name . '].';
                continue;
            }

            $report['documents']++;

            // Index-type documents only list shard URLs; compare them with the
            // builder's expected items and skip the per-URL checks.
            if ($built['type'] === SitemapTarget::TYPE_INDEX) {
                $expectedFamilyLocs = array_map(
                    static fn (SitemapIndexItem $item): string => $item->loc,
                    $this->index->itemsForBuilder($builder),
                );
                $actualFamilyLocs = $this->extractLocs($document, 'sitemap');

                if ($actualFamilyLocs !== $expectedFamilyLocs) {
                    $report['errors'][] = 'Shard compatibility index [' . $name . '] does not reference the expected shard URLs.';
                }

                continue;
            }

            $locs = $this->extractLocs($document, 'url');
            $report['url_count'] += count($locs);
            $totalUrlCount += count($locs);

            foreach ($locs as $loc) {
                // $seenUrls spans all families, so cross-family repeats are caught too.
                if (isset($seenUrls[$loc])) {
                    $duplicates[$loc] = ($duplicates[$loc] ?? 1) + 1;
                }

                $seenUrls[$loc] = true;
                $reason = $this->urlError($family, $loc);

                if ($reason !== null) {
                    $report['errors'][] = $reason . ' [' . $loc . ']';
                }
            }

            // Image URLs that are not http(s)-absolute only produce warnings.
            foreach ($this->extractImageLocs($document) as $imageLoc) {
                if (! preg_match('/^https?:\/\//i', $imageLoc)) {
                    $report['warnings'][] = 'Non-absolute image URL found [' . $imageLoc . ']';
                }
            }
        }

        $familyReports[] = $report;
    }

    return [
        'ok' => $indexErrors === [] && $this->familyErrors($familyReports) === [] && $duplicates === [],
        'index' => [
            'errors' => $indexErrors,
            // Counts every child reference of the built index, selected or not.
            'url_count' => count($actualIndexLocs),
        ],
        'families' => $familyReports,
        'duplicates' => array_keys($duplicates),
        'totals' => [
            'families' => count($familyReports),
            'documents' => array_sum(array_map(static fn (array $report): int => (int) $report['documents'], $familyReports)),
            'urls' => $totalUrlCount,
            'shards' => $totalShardCount,
        ],
    ];
}
/**
 * Flattens the `errors` entries of all family reports into one list of strings.
 *
 * Reports without an `errors` key contribute nothing.
 *
 * @param list<array<string, mixed>> $reports
 * @return list<string>
 */
private function familyErrors(array $reports): array
{
    $flattened = [];

    foreach ($reports as $familyReport) {
        $messages = (array) ($familyReport['errors'] ?? []);

        if ($messages === []) {
            continue;
        }

        foreach ($messages as $message) {
            $flattened[] = (string) $message;
        }
    }

    return $flattened;
}
/**
 * Parses an XML string into a DOM document while keeping libxml warnings internal.
 *
 * @param string $content Raw XML markup.
 * @return DOMDocument|null The parsed document, or null when the content is blank
 *                          or cannot be parsed.
 */
private function loadXml(string $content): ?DOMDocument
{
    // DOMDocument::loadXML() throws a ValueError for an empty string on PHP 8+;
    // treat blank output as unparseable so the caller can report it instead of crashing.
    if (trim($content) === '') {
        return null;
    }

    $document = new DOMDocument();
    $previous = libxml_use_internal_errors(true);

    try {
        $loaded = $document->loadXML($content);
    } finally {
        // Always discard the collected errors and restore the previous libxml error mode.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    return $loaded ? $document : null;
}
/**
 * Collects the trimmed text of every `loc` element that is a direct child of an
 * element with the given local name. Matching uses local names, so namespace
 * prefixes are ignored; empty values are skipped.
 *
 * @param string $nodeName Local name of the parent element (callers in this class pass `sitemap` or `url`).
 * @return list<string>
 */
private function extractLocs(DOMDocument $document, string $nodeName): array
{
    $expression = '//*[local-name()="' . $nodeName . '"]/*[local-name()="loc"]';
    $matches = (new DOMXPath($document))->query($expression);

    // query() yields false when the expression cannot be evaluated.
    if ($matches === false) {
        return [];
    }

    $values = [];

    foreach ($matches as $locNode) {
        $text = trim((string) $locNode->textContent);

        if ($text === '') {
            continue;
        }

        $values[] = $text;
    }

    return $values;
}
/**
 * Collects the trimmed text of every `loc` element that is a direct child of an
 * element whose local name is `image`; empty values are skipped.
 *
 * @return list<string>
 */
private function extractImageLocs(DOMDocument $document): array
{
    $nodes = (new DOMXPath($document))->query('//*[local-name()="image"]/*[local-name()="loc"]');
    $imageLocs = [];

    // A false result (query could not be evaluated) simply yields no locations.
    if ($nodes !== false) {
        foreach ($nodes as $imageLocNode) {
            $candidate = trim((string) $imageLocNode->textContent);

            if ($candidate !== '') {
                $imageLocs[] = $candidate;
            }
        }
    }

    return $imageLocs;
}
/**
 * Tells whether a sitemap location belongs to one of the given families.
 *
 * A location matches a family when it contains `/sitemaps/<family>.xml` or
 * `/sitemaps/<family>-`.
 *
 * @param array  $families Family names to test against.
 * @param string $loc      Sitemap location to classify.
 */
private function isFamilySelected(array $families, string $loc): bool
{
    foreach ($families as $family) {
        $stem = '/sitemaps/' . $family;

        if (str_contains($loc, $stem . '.xml')) {
            return true;
        }

        if (str_contains($loc, $stem . '-')) {
            return true;
        }
    }

    return false;
}
/**
 * Checks a single sitemap URL and returns a short reason when it must not be listed.
 *
 * Checks run in this order: the URL has a scheme and host, carries no query string,
 * carries no fragment, contains no configured forbidden path, and finally passes
 * the family-specific check (`artworks` and `users` only).
 *
 * @param string $family Family the URL was emitted for.
 * @param string $loc    URL taken from the sitemap document.
 * @return string|null Reason text, or null when no problem was found.
 */
private function urlError(string $family, string $loc): ?string
{
    $parts = parse_url($loc);

    if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
        return 'Non-absolute URL emitted';
    }

    if (($parts['query'] ?? '') !== '') {
        return 'Query-string URL emitted';
    }

    if (($parts['fragment'] ?? '') !== '') {
        return 'Fragment URL emitted';
    }

    // Normalise to exactly one leading slash; a missing path counts as "/".
    $path = '/' . ltrim((string) ($parts['path'] ?? '/'), '/');

    foreach ((array) config('sitemaps.validation.forbidden_paths', []) as $forbidden) {
        $forbidden = (string) $forbidden;

        // str_contains() is true for an empty needle, so a blank entry would
        // reject every URL; "/" is skipped because every path contains it.
        if ($forbidden === '' || $forbidden === '/') {
            continue;
        }

        if (str_contains($path, $forbidden)) {
            return 'Non-public path emitted';
        }
    }

    return match ($family) {
        'artworks' => $this->validateArtworkUrl($path),
        'users' => $this->validateUserUrl($path),
        default => null,
    };
}
/**
 * Validates an artwork path of the form `/art/<digits>` with an optional single
 * trailing segment, then looks the id up through `Artwork::query()->public()->published()`.
 *
 * @param string $path URL path to validate.
 * @return string|null Reason text when the path has the wrong shape or the lookup
 *                     finds nothing, otherwise null.
 */
private function validateArtworkUrl(string $path): ?string
{
    $captures = [];

    if (preg_match('~^/art/(\d+)(?:/[^/?#]+)?$~', $path, $captures) !== 1) {
        return 'Non-canonical artwork URL emitted';
    }

    $artworkId = (int) $captures[1];
    $found = Artwork::query()->public()->published()->find($artworkId);

    if ($found === null) {
        return 'Non-public artwork URL emitted';
    }

    return null;
}
/**
 * Validates a user path of the form `/@<name>` (letters, digits, `_`, `-`), then
 * looks for a user with `is_active` true, a null `deleted_at` and a username that
 * matches case-insensitively.
 *
 * @param string $path URL path to validate.
 * @return string|null Reason text when the path has the wrong shape or no such
 *                     user is found, otherwise null.
 */
private function validateUserUrl(string $path): ?string
{
    $captures = [];

    if (preg_match('#^/@([A-Za-z0-9_\-]+)$#', $path, $captures) !== 1) {
        return 'Non-canonical user URL emitted';
    }

    // Compare lower-cased on both sides so the match ignores case.
    $handle = strtolower((string) $captures[1]);
    $query = User::query()
        ->where('is_active', true)
        ->whereNull('deleted_at')
        ->whereRaw('LOWER(username) = ?', [$handle]);
    $match = $query->first();

    if ($match === null) {
        return 'Non-public user URL emitted';
    }

    return null;
}
}