258 lines
8.6 KiB
PHP
258 lines
8.6 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Services\Sitemaps;
|
|
|
|
use DOMDocument;
|
|
use DOMXPath;
|
|
|
|
/**
 * Validates a stored sitemap release by comparing each stored artifact with a
 * freshly rebuilt version of the same document and by checking the emitted
 * URLs for basic hygiene (absolute, no query string, no fragment, no
 * forbidden path segment).
 */
final class SitemapReleaseValidator
{
    public function __construct(
        private readonly SitemapBuildService $build,
        private readonly SitemapReleaseManager $releases,
    ) {
    }

    /**
     * Validate the release identified by $releaseId.
     *
     * When the manifest cannot be read, only `ok`, `release_id` and `errors`
     * are returned. Otherwise the report also carries one entry per family
     * (`families`) and aggregate counters taken from the manifest (`totals`).
     * `ok` is true only when no error was collected.
     *
     * @return array<string, mixed>
     */
    public function validate(string $releaseId): array
    {
        $manifest = $this->releases->readManifest($releaseId);

        if ($manifest === null) {
            return [
                'ok' => false,
                'release_id' => $releaseId,
                'errors' => ['Release manifest not found.'],
            ];
        }

        $families = (array) ($manifest['families'] ?? []);
        $documents = (array) ($manifest['documents'] ?? []);

        $errors = [];

        $rootError = $this->rootIndexError($releaseId, $families);
        if ($rootError !== null) {
            $errors[] = $rootError;
        }

        // Loop-invariant values, resolved once instead of once per document.
        $googleNewsFamily = (string) config('sitemaps.news.google_variant_name', 'news-google');
        $familyNames = array_keys($families);

        $reports = [];

        foreach ($families as $familyName => $family) {
            $report = $this->validateFamily($releaseId, $familyName, $family, $familyNames, $googleNewsFamily);
            $reports[] = $report;

            // Family errors are also surfaced at the top level, prefixed with the family name.
            foreach ($report['errors'] as $familyError) {
                $errors[] = $familyName . ': ' . $familyError;
            }
        }

        return [
            'ok' => $errors === [],
            'release_id' => $releaseId,
            'errors' => $errors,
            'families' => $reports,
            'totals' => [
                'families' => count($families),
                'documents' => count($documents),
                'urls' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['url_count'] ?? 0),
                    $families,
                )),
                'shards' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['shard_count'] ?? 0),
                    $families,
                )),
            ],
        ];
    }

    /**
     * Check that the stored root index lists exactly the manifest's family entries, in manifest order.
     *
     * @param array<int|string, mixed> $families Manifest family entries.
     *
     * @return string|null The error message, or null when the root index matches.
     */
    private function rootIndexError(string $releaseId, array $families): ?string
    {
        $rootContent = $this->releases->getDocument($releaseId, SitemapCacheService::INDEX_DOCUMENT);
        $rootXml = is_string($rootContent) ? $this->loadXml($rootContent) : null;

        if ($rootXml === null) {
            return 'Root sitemap.xml is missing or invalid.';
        }

        $expectedLocs = array_values(array_map(
            static fn (array $family): string => url('/sitemaps/' . (string) ($family['entry_name'] ?? '') . '.xml'),
            $families,
        ));

        return $this->extractLocs($rootXml, 'sitemap') === $expectedLocs
            ? null
            : 'Root sitemap index does not match the manifest family entries.';
    }

    /**
     * Validate every document of one family and build that family's report entry.
     *
     * @param int|string              $familyName       Manifest key of the family.
     * @param array<string, mixed>    $family           Manifest entry of the family.
     * @param array<int, int|string>  $familyNames      Keys of all manifest families, forwarded to the index rebuild.
     * @param string                  $googleNewsFamily Family name that must contain news titles.
     *
     * @return array{family: int|string, documents: int, url_count: int, shard_count: int, errors: list<string>, warnings: list<string>}
     */
    private function validateFamily(
        string $releaseId,
        int|string $familyName,
        array $family,
        array $familyNames,
        string $googleNewsFamily,
    ): array {
        $documentNames = (array) ($family['documents'] ?? []);
        $requiresNewsTitles = (string) $familyName === $googleNewsFamily;

        $errors = [];
        $warnings = [];
        $seenLocs = [];
        $hasDuplicates = false;

        foreach ($documentNames as $rawDocumentName) {
            // Normalise once so lookups, comparisons and messages all see the same value.
            $documentName = (string) $rawDocumentName;

            $artifact = $this->releases->getDocument($releaseId, $documentName);

            if (! is_string($artifact) || $artifact === '') {
                $errors[] = 'Missing artifact [' . $documentName . '].';
                continue;
            }

            $artifactXml = $this->loadXml($artifact);
            if ($artifactXml === null) {
                $errors[] = 'Invalid XML in artifact [' . $documentName . '].';
                continue;
            }

            // Rebuild the document; the boolean flags are passed through exactly as before.
            $expected = $documentName === SitemapCacheService::INDEX_DOCUMENT
                ? $this->build->buildIndex(true, false, $familyNames)
                : $this->build->buildNamed($documentName, true, false);

            if ($expected === null) {
                $errors[] = 'Unable to rebuild expected document [' . $documentName . '] for validation.';
                continue;
            }

            $expectedXml = $this->loadXml((string) $expected['content']);
            if ($expectedXml === null) {
                $errors[] = 'Expected document [' . $documentName . '] could not be parsed.';
                continue;
            }

            // Index documents only reference other sitemaps; URL-level checks do not apply.
            if ((string) $expected['type'] === SitemapTarget::TYPE_INDEX) {
                if ($this->extractLocs($artifactXml, 'sitemap') !== $this->extractLocs($expectedXml, 'sitemap')) {
                    $errors[] = 'Index artifact [' . $documentName . '] does not match expected sitemap references.';
                }

                continue;
            }

            $artifactLocs = $this->extractLocs($artifactXml, 'url');

            if ($artifactLocs !== $this->extractLocs($expectedXml, 'url')) {
                $errors[] = 'URL artifact [' . $documentName . '] does not match expected canonical URLs.';
            }

            foreach ($artifactLocs as $loc) {
                // Duplicates are tracked across all documents of the family, not per document.
                if (isset($seenLocs[$loc])) {
                    $hasDuplicates = true;
                }

                $seenLocs[$loc] = true;

                $urlError = $this->urlError($loc);
                if ($urlError !== null) {
                    $errors[] = $urlError . ' [' . $loc . ']';
                }
            }

            if ($requiresNewsTitles && $this->extractNewsTitles($artifactXml) === []) {
                $errors[] = 'Google News sitemap contains no valid news:title elements.';
            }

            foreach ($this->extractImageLocs($artifactXml) as $imageLoc) {
                if (preg_match('/^https?:\/\//i', $imageLoc) !== 1) {
                    $warnings[] = 'Non-absolute image URL [' . $imageLoc . ']';
                }
            }
        }

        if ($hasDuplicates) {
            $errors[] = 'Duplicate URLs detected across family artifacts.';
        }

        return [
            'family' => $familyName,
            'documents' => count($documentNames),
            'url_count' => (int) ($family['url_count'] ?? 0),
            'shard_count' => (int) ($family['shard_count'] ?? 0),
            'errors' => $errors,
            'warnings' => $warnings,
        ];
    }

    /**
     * Parse an XML string, returning null when it is empty or not well-formed.
     *
     * libxml errors are collected internally while parsing and discarded
     * afterwards; the previous libxml error mode is always restored.
     */
    private function loadXml(string $content): ?DOMDocument
    {
        // DOMDocument::loadXML() throws a ValueError for an empty source on PHP 8,
        // so an empty document is reported as "not parseable" instead of crashing.
        if ($content === '') {
            return null;
        }

        $document = new DOMDocument();
        $previous = libxml_use_internal_errors(true);

        try {
            return $document->loadXML($content) ? $document : null;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
    }

    /**
     * Collect the trimmed, non-empty text of every `loc` child of elements named $nodeName.
     *
     * Elements are matched by local name, so namespace prefixes are ignored.
     *
     * @return list<string>
     */
    private function extractLocs(DOMDocument $document, string $nodeName): array
    {
        return $this->textValues($document, '//*[local-name()="' . $nodeName . '"]/*[local-name()="loc"]');
    }

    /**
     * Collect the trimmed, non-empty text of every `loc` child of `image` elements.
     *
     * @return list<string>
     */
    private function extractImageLocs(DOMDocument $document): array
    {
        return $this->extractLocs($document, 'image');
    }

    /**
     * Collect the trimmed, non-empty text of every `title` child of `news` elements.
     *
     * The query is anchored on the `news` parent so that titles belonging to
     * other sitemap extensions (for example image titles) are not counted.
     *
     * @return list<string>
     */
    private function extractNewsTitles(DOMDocument $document): array
    {
        return $this->textValues($document, '//*[local-name()="news"]/*[local-name()="title"]');
    }

    /**
     * Run an XPath expression and return the trimmed, non-empty text content of the matched nodes.
     *
     * @return list<string>
     */
    private function textValues(DOMDocument $document, string $expression): array
    {
        $nodes = (new DOMXPath($document))->query($expression);

        // query() returns false for an expression it cannot evaluate.
        if ($nodes === false) {
            return [];
        }

        $values = [];

        foreach ($nodes as $node) {
            $value = trim((string) $node->textContent);

            if ($value !== '') {
                $values[] = $value;
            }
        }

        return $values;
    }

    /**
     * Return the first hygiene problem found in an emitted URL, or null when it is acceptable.
     *
     * Checks, in order: scheme and host present, no query string, no fragment,
     * and no configured forbidden path fragment contained in the path.
     */
    private function urlError(string $loc): ?string
    {
        $parts = parse_url($loc);

        if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
            return 'Non-absolute URL emitted';
        }

        if (($parts['query'] ?? '') !== '') {
            return 'Query-string URL emitted';
        }

        if (($parts['fragment'] ?? '') !== '') {
            return 'Fragment URL emitted';
        }

        $path = '/' . ltrim((string) ($parts['path'] ?? '/'), '/');

        foreach ((array) config('sitemaps.validation.forbidden_paths', []) as $forbidden) {
            $needle = (string) $forbidden;

            // '' is contained in every string and '/' in every path; either would flag every URL.
            if ($needle === '' || $needle === '/') {
                continue;
            }

            if (str_contains($path, $needle)) {
                return 'Non-public path emitted';
            }
        }

        return null;
    }
}