Files
SkinbaseNova/app/Services/Sitemaps/SitemapReleaseValidator.php

258 lines
8.6 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Services\Sitemaps;
use DOMDocument;
use DOMXPath;
/**
 * Validates a stored sitemap release.
 *
 * The stored documents of a release are compared against a fresh rebuild from
 * SitemapBuildService, and every emitted URL is checked against a few basic
 * rules (absolute, no query string, no fragment, no forbidden path).
 */
final class SitemapReleaseValidator
{
    public function __construct(
        private readonly SitemapBuildService $build,
        private readonly SitemapReleaseManager $releases,
    ) {
    }

    /**
     * Validate every family of the given release and return a report.
     *
     * When the manifest cannot be read the report only contains `ok`,
     * `release_id` and `errors`. Otherwise it additionally contains the
     * per-family reports (`families`) and aggregate counters (`totals`).
     *
     * @return array<string, mixed>
     */
    public function validate(string $releaseId): array
    {
        $manifest = $this->releases->readManifest($releaseId);
        if ($manifest === null) {
            return [
                'ok' => false,
                'release_id' => $releaseId,
                'errors' => ['Release manifest not found.'],
            ];
        }

        $families = (array) ($manifest['families'] ?? []);
        $documents = (array) ($manifest['documents'] ?? []);

        $errors = [];
        $rootError = $this->rootIndexError($releaseId, $families);
        if ($rootError !== null) {
            $errors[] = $rootError;
        }

        $familyNames = array_keys($families);
        $reports = [];
        foreach ($families as $familyName => $family) {
            $report = $this->validateFamily($releaseId, $familyName, $family, $familyNames);
            $reports[] = $report;

            // Family errors are repeated in the top-level list, prefixed with the family name.
            foreach ($report['errors'] as $familyError) {
                $errors[] = $familyName . ': ' . $familyError;
            }
        }

        return [
            'ok' => $errors === [],
            'release_id' => $releaseId,
            'errors' => $errors,
            'families' => $reports,
            'totals' => [
                'families' => count($families),
                'documents' => count($documents),
                'urls' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['url_count'] ?? 0),
                    $families,
                )),
                'shards' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['shard_count'] ?? 0),
                    $families,
                )),
            ],
        ];
    }

    /**
     * Check the stored root index against the family entries of the manifest.
     *
     * @param array<array-key, mixed> $families Manifest families, in manifest order.
     *
     * @return string|null Error message, or null when the root index matches.
     */
    private function rootIndexError(string $releaseId, array $families): ?string
    {
        $rootContent = $this->releases->getDocument($releaseId, SitemapCacheService::INDEX_DOCUMENT);
        $rootXml = is_string($rootContent) ? $this->loadXml($rootContent) : null;
        if ($rootXml === null) {
            return 'Root sitemap.xml is missing or invalid.';
        }

        // The comparison is order-sensitive: one expected URL per family, in manifest order.
        $expectedLocs = array_values(array_map(
            static fn (array $family): string => url('/sitemaps/' . (string) ($family['entry_name'] ?? '') . '.xml'),
            $families,
        ));

        if ($this->extractLocs($rootXml, 'sitemap') !== $expectedLocs) {
            return 'Root sitemap index does not match the manifest family entries.';
        }

        return null;
    }

    /**
     * Validate all stored documents of a single family.
     *
     * @param array<array-key, mixed> $family      Manifest entry of the family.
     * @param list<int|string>        $familyNames Keys of all manifest families, used to rebuild index documents.
     *
     * @return array{family: int|string, documents: int, url_count: int, shard_count: int, errors: list<string>, warnings: list<string>}
     */
    private function validateFamily(string $releaseId, int|string $familyName, array $family, array $familyNames): array
    {
        $errors = [];
        $warnings = [];
        $seenLocs = [];
        $hasDuplicates = false;
        $documentNames = (array) ($family['documents'] ?? []);
        $isGoogleNews = (string) $familyName === (string) config('sitemaps.news.google_variant_name', 'news-google');

        foreach ($documentNames as $documentName) {
            $documentName = (string) $documentName;

            $artifact = $this->releases->getDocument($releaseId, $documentName);
            if (! is_string($artifact) || $artifact === '') {
                $errors[] = 'Missing artifact [' . $documentName . '].';
                continue;
            }

            $artifactXml = $this->loadXml($artifact);
            if ($artifactXml === null) {
                $errors[] = 'Invalid XML in artifact [' . $documentName . '].';
                continue;
            }

            // Rebuild the document so the stored artifact can be compared with what is expected.
            $expected = $documentName === SitemapCacheService::INDEX_DOCUMENT
                ? $this->build->buildIndex(true, false, $familyNames)
                : $this->build->buildNamed($documentName, true, false);
            if ($expected === null) {
                $errors[] = 'Unable to rebuild expected document [' . $documentName . '] for validation.';
                continue;
            }

            $expectedXml = $this->loadXml((string) $expected['content']);
            if ($expectedXml === null) {
                $errors[] = 'Expected document [' . $documentName . '] could not be parsed.';
                continue;
            }

            // Index documents only reference other sitemaps; the URL-level checks do not apply.
            if ((string) $expected['type'] === SitemapTarget::TYPE_INDEX) {
                if ($this->extractLocs($artifactXml, 'sitemap') !== $this->extractLocs($expectedXml, 'sitemap')) {
                    $errors[] = 'Index artifact [' . $documentName . '] does not match expected sitemap references.';
                }
                continue;
            }

            $artifactLocs = $this->extractLocs($artifactXml, 'url');
            if ($artifactLocs !== $this->extractLocs($expectedXml, 'url')) {
                $errors[] = 'URL artifact [' . $documentName . '] does not match expected canonical URLs.';
            }

            foreach ($artifactLocs as $loc) {
                // $seenLocs spans all documents of the family, so duplicates across shards are caught too.
                if (isset($seenLocs[$loc])) {
                    $hasDuplicates = true;
                }
                $seenLocs[$loc] = true;

                $urlError = $this->urlError($loc);
                if ($urlError !== null) {
                    $errors[] = $urlError . ' [' . $loc . ']';
                }
            }

            if ($isGoogleNews && $this->extractNewsTitles($artifactXml) === []) {
                $errors[] = 'Google News sitemap contains no valid news:title elements.';
            }

            foreach ($this->extractImageLocs($artifactXml) as $imageLoc) {
                if (! preg_match('/^https?:\/\//i', $imageLoc)) {
                    $warnings[] = 'Non-absolute image URL [' . $imageLoc . ']';
                }
            }
        }

        if ($hasDuplicates) {
            $errors[] = 'Duplicate URLs detected across family artifacts.';
        }

        return [
            'family' => $familyName,
            'documents' => count($documentNames),
            'url_count' => (int) ($family['url_count'] ?? 0),
            'shard_count' => (int) ($family['shard_count'] ?? 0),
            'errors' => $errors,
            'warnings' => $warnings,
        ];
    }

    /**
     * Parse an XML string without leaking libxml warnings.
     *
     * @return DOMDocument|null The parsed document, or null when the content is empty or not well-formed.
     */
    private function loadXml(string $content): ?DOMDocument
    {
        // DOMDocument::loadXML() throws a ValueError for an empty string on PHP 8+,
        // so an empty document is reported as "invalid" instead of crashing validation.
        if ($content === '') {
            return null;
        }

        $document = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        try {
            $loaded = $document->loadXML($content);
        } finally {
            // Always restore the caller's libxml error mode, even if parsing throws.
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $loaded ? $document : null;
    }

    /**
     * Collect the <loc> values that are direct children of elements with the given local name.
     *
     * Namespaces are ignored; elements are matched by local name only.
     *
     * @return list<string>
     */
    private function extractLocs(DOMDocument $document, string $nodeName): array
    {
        return $this->textValues($document, '//*[local-name()="' . $nodeName . '"]/*[local-name()="loc"]');
    }

    /**
     * Collect the <loc> values of all image elements.
     *
     * @return list<string>
     */
    private function extractImageLocs(DOMDocument $document): array
    {
        return $this->extractLocs($document, 'image');
    }

    /**
     * Collect the non-empty titles of news entries.
     *
     * Only <title> elements that are direct children of a <news> element are
     * considered, so titles belonging to other extensions (for example image
     * titles) do not count as news titles.
     *
     * @return list<string>
     */
    private function extractNewsTitles(DOMDocument $document): array
    {
        return $this->textValues($document, '//*[local-name()="news"]/*[local-name()="title"]');
    }

    /**
     * Evaluate an XPath expression and return the trimmed, non-empty text of every matched node.
     *
     * @return list<string>
     */
    private function textValues(DOMDocument $document, string $expression): array
    {
        $nodes = (new DOMXPath($document))->query($expression);
        if ($nodes === false) {
            return [];
        }

        $values = [];
        foreach ($nodes as $node) {
            $value = trim((string) $node->textContent);
            if ($value !== '') {
                $values[] = $value;
            }
        }

        return $values;
    }

    /**
     * Check a single emitted URL.
     *
     * @return string|null Description of the first violated rule, or null when the URL is acceptable.
     */
    private function urlError(string $loc): ?string
    {
        $parts = parse_url($loc);
        if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
            return 'Non-absolute URL emitted';
        }

        if (($parts['query'] ?? '') !== '') {
            return 'Query-string URL emitted';
        }

        if (($parts['fragment'] ?? '') !== '') {
            return 'Fragment URL emitted';
        }

        $path = '/' . ltrim((string) ($parts['path'] ?? '/'), '/');
        foreach ((array) config('sitemaps.validation.forbidden_paths', []) as $forbidden) {
            $forbidden = (string) $forbidden;

            // An empty needle is contained in every string and "/" in every path;
            // neither can meaningfully mark a path as forbidden.
            if ($forbidden === '' || $forbidden === '/') {
                continue;
            }

            if (str_contains($path, $forbidden)) {
                return 'Non-public path emitted';
            }
        }

        return null;
    }
}