Implement creator studio and upload updates
This commit is contained in:
258
app/Services/Sitemaps/SitemapReleaseValidator.php
Normal file
258
app/Services/Sitemaps/SitemapReleaseValidator.php
Normal file
@@ -0,0 +1,258 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services\Sitemaps;
|
||||
|
||||
use DOMDocument;
|
||||
use DOMXPath;
|
||||
|
||||
/**
 * Validates a stored sitemap release against its manifest and against freshly rebuilt documents.
 *
 * For every family listed in the release manifest the stored artifacts are parsed, compared with
 * the documents produced by the build service, and each emitted URL is checked for being
 * absolute, free of query strings and fragments, and outside the configured forbidden paths.
 */
final class SitemapReleaseValidator
{
    public function __construct(
        private readonly SitemapBuildService $build,
        private readonly SitemapReleaseManager $releases,
    ) {
    }

    /**
     * Validate one release and return a structured report.
     *
     * When the manifest cannot be read, only `ok`, `release_id` and `errors` are present.
     * Otherwise the report also carries one entry per family (`families`) and manifest-derived
     * counters (`totals`). `ok` is true only when no error was collected; warnings do not
     * affect it.
     *
     * @return array<string, mixed>
     */
    public function validate(string $releaseId): array
    {
        $manifest = $this->releases->readManifest($releaseId);

        if ($manifest === null) {
            return [
                'ok' => false,
                'release_id' => $releaseId,
                'errors' => ['Release manifest not found.'],
            ];
        }

        $families = (array) ($manifest['families'] ?? []);
        $documents = (array) ($manifest['documents'] ?? []);

        $errors = [];

        $rootError = $this->rootIndexError($releaseId, $families);
        if ($rootError !== null) {
            $errors[] = $rootError;
        }

        // Loop-invariant values: resolved once instead of once per document.
        $familyNames = array_keys($families);
        $googleNewsFamily = (string) config('sitemaps.news.google_variant_name', 'news-google');

        $reports = [];

        foreach ($families as $familyName => $family) {
            $result = $this->validateFamily($releaseId, $familyName, $family, $familyNames, $googleNewsFamily);

            $reports[] = [
                'family' => $familyName,
                'documents' => count((array) ($family['documents'] ?? [])),
                'url_count' => (int) ($family['url_count'] ?? 0),
                'shard_count' => (int) ($family['shard_count'] ?? 0),
                'errors' => $result['errors'],
                'warnings' => $result['warnings'],
            ];

            // Family errors are repeated in the top-level list, prefixed with the family name.
            foreach ($result['errors'] as $familyError) {
                $errors[] = $familyName . ': ' . $familyError;
            }
        }

        return [
            'ok' => $errors === [],
            'release_id' => $releaseId,
            'errors' => $errors,
            'families' => $reports,
            'totals' => [
                'families' => count($families),
                'documents' => count($documents),
                'urls' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['url_count'] ?? 0),
                    $families,
                )),
                'shards' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['shard_count'] ?? 0),
                    $families,
                )),
            ],
        ];
    }

    /**
     * Check that the stored root index lists exactly the manifest's family entries, in order.
     *
     * @param array<array-key, mixed> $families Manifest `families` section.
     *
     * @return string|null Error message, or null when the root index matches.
     */
    private function rootIndexError(string $releaseId, array $families): ?string
    {
        $content = $this->releases->getDocument($releaseId, SitemapCacheService::INDEX_DOCUMENT);
        $xml = is_string($content) ? $this->loadXml($content) : null;

        if ($xml === null) {
            return 'Root sitemap.xml is missing or invalid.';
        }

        $expectedLocs = array_values(array_map(
            static fn (array $family): string => url('/sitemaps/' . (string) ($family['entry_name'] ?? '') . '.xml'),
            $families,
        ));

        if ($this->extractLocs($xml, 'sitemap') !== $expectedLocs) {
            return 'Root sitemap index does not match the manifest family entries.';
        }

        return null;
    }

    /**
     * Validate every document listed for a single family.
     *
     * @param int|string             $familyName       Key of the family in the manifest.
     * @param array<array-key, mixed> $family           Manifest entry of the family; only `documents` is read here.
     * @param list<int|string>       $familyNames      Keys of all manifest families, forwarded to the index rebuild.
     * @param string                 $googleNewsFamily Family name that must carry news titles.
     *
     * @return array{errors: list<string>, warnings: list<string>}
     */
    private function validateFamily(
        string $releaseId,
        int|string $familyName,
        array $family,
        array $familyNames,
        string $googleNewsFamily,
    ): array {
        $errors = [];
        $warnings = [];
        $seenLocs = [];
        $hasDuplicates = false;
        $requiresNewsTitles = (string) $familyName === $googleNewsFamily;

        foreach ((array) ($family['documents'] ?? []) as $document) {
            // Cast once so the lookup, the comparison and the messages all use the same value.
            $documentName = (string) $document;
            $artifact = $this->releases->getDocument($releaseId, $documentName);

            if (! is_string($artifact) || $artifact === '') {
                $errors[] = 'Missing artifact [' . $documentName . '].';

                continue;
            }

            $artifactXml = $this->loadXml($artifact);
            if ($artifactXml === null) {
                $errors[] = 'Invalid XML in artifact [' . $documentName . '].';

                continue;
            }

            // The boolean flags are forwarded unchanged; their meaning is defined by SitemapBuildService.
            $expected = $documentName === SitemapCacheService::INDEX_DOCUMENT
                ? $this->build->buildIndex(true, false, $familyNames)
                : $this->build->buildNamed($documentName, true, false);

            if ($expected === null) {
                $errors[] = 'Unable to rebuild expected document [' . $documentName . '] for validation.';

                continue;
            }

            $expectedXml = $this->loadXml((string) $expected['content']);
            if ($expectedXml === null) {
                $errors[] = 'Expected document [' . $documentName . '] could not be parsed.';

                continue;
            }

            if ((string) $expected['type'] === SitemapTarget::TYPE_INDEX) {
                // Index documents only reference other sitemaps, so only those references are compared.
                if ($this->extractLocs($artifactXml, 'sitemap') !== $this->extractLocs($expectedXml, 'sitemap')) {
                    $errors[] = 'Index artifact [' . $documentName . '] does not match expected sitemap references.';
                }

                continue;
            }

            $artifactLocs = $this->extractLocs($artifactXml, 'url');

            if ($artifactLocs !== $this->extractLocs($expectedXml, 'url')) {
                $errors[] = 'URL artifact [' . $documentName . '] does not match expected canonical URLs.';
            }

            foreach ($artifactLocs as $loc) {
                // $seenLocs spans all documents of the family, so cross-shard repeats are caught too.
                if (isset($seenLocs[$loc])) {
                    $hasDuplicates = true;
                }

                $seenLocs[$loc] = true;

                $urlError = $this->urlError($loc);
                if ($urlError !== null) {
                    $errors[] = $urlError . ' [' . $loc . ']';
                }
            }

            if ($requiresNewsTitles && $this->extractNewsTitles($artifactXml) === []) {
                $errors[] = 'Google News sitemap contains no valid news:title elements.';
            }

            foreach ($this->extractImageLocs($artifactXml) as $imageLoc) {
                if (! preg_match('/^https?:\/\//i', $imageLoc)) {
                    $warnings[] = 'Non-absolute image URL [' . $imageLoc . ']';
                }
            }
        }

        if ($hasDuplicates) {
            $errors[] = 'Duplicate URLs detected across family artifacts.';
        }

        return ['errors' => $errors, 'warnings' => $warnings];
    }

    /**
     * Parse an XML string, returning null instead of raising when it is empty or malformed.
     */
    private function loadXml(string $content): ?DOMDocument
    {
        // DOMDocument::loadXML() throws a ValueError for an empty source on PHP 8; treat it as invalid.
        if (trim($content) === '') {
            return null;
        }

        $document = new DOMDocument();
        $previous = libxml_use_internal_errors(true);

        try {
            // LIBXML_NONET: never touch the network while parsing stored artifacts.
            $loaded = $document->loadXML($content, LIBXML_NONET);
        } finally {
            // Always restore the caller's libxml error handling, even if parsing throws.
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $loaded ? $document : null;
    }

    /**
     * Collect the trimmed, non-empty `loc` children of every element with the given local name.
     *
     * Matching is by local name only, so namespace prefixes are ignored. `$nodeName` is
     * interpolated into the XPath expression and must be a trusted literal.
     *
     * @return list<string>
     */
    private function extractLocs(DOMDocument $document, string $nodeName): array
    {
        return $this->queryTexts($document, '//*[local-name()="' . $nodeName . '"]/*[local-name()="loc"]');
    }

    /**
     * Collect the `loc` values of all image elements in the document.
     *
     * @return list<string>
     */
    private function extractImageLocs(DOMDocument $document): array
    {
        return $this->extractLocs($document, 'image');
    }

    /**
     * Collect the non-empty titles that sit directly inside a `news` element.
     *
     * Restricting the match to children of `news` keeps `image:title` / `video:title`
     * elements from being mistaken for news titles.
     *
     * @return list<string>
     */
    private function extractNewsTitles(DOMDocument $document): array
    {
        return $this->queryTexts($document, '//*[local-name()="news"]/*[local-name()="title"]');
    }

    /**
     * Run an XPath query and return the trimmed, non-empty text content of each matched node.
     *
     * @return list<string>
     */
    private function queryTexts(DOMDocument $document, string $expression): array
    {
        $nodes = (new DOMXPath($document))->query($expression);

        if ($nodes === false) {
            return [];
        }

        $texts = [];

        foreach ($nodes as $node) {
            $value = trim((string) $node->textContent);

            if ($value !== '') {
                $texts[] = $value;
            }
        }

        return $texts;
    }

    /**
     * Describe why a sitemap URL must not be published, or return null when it is acceptable.
     */
    private function urlError(string $loc): ?string
    {
        $parts = parse_url($loc);

        if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
            return 'Non-absolute URL emitted';
        }

        if (($parts['query'] ?? '') !== '') {
            return 'Query-string URL emitted';
        }

        if (($parts['fragment'] ?? '') !== '') {
            return 'Fragment URL emitted';
        }

        // Normalise to exactly one leading slash before matching forbidden fragments.
        $path = '/' . ltrim((string) ($parts['path'] ?? '/'), '/');

        foreach ((array) config('sitemaps.validation.forbidden_paths', []) as $entry) {
            $forbidden = (string) $entry;

            // '/' and '' are contained in every path and would reject every URL, so skip them.
            if ($forbidden === '' || $forbidden === '/') {
                continue;
            }

            if (str_contains($path, $forbidden)) {
                return 'Non-public path emitted';
            }
        }

        return null;
    }
}
|
||||
Reference in New Issue
Block a user