*/
    public function validate(string $releaseId): array
    {
        $manifest = $this->releases->readManifest($releaseId);

        // Without a manifest nothing else can be checked; return the short failure report.
        if ($manifest === null) {
            return [
                'ok' => false,
                'release_id' => $releaseId,
                'errors' => ['Release manifest not found.'],
            ];
        }

        $errors = [];
        $families = (array) ($manifest['families'] ?? []);
        $documents = (array) ($manifest['documents'] ?? []);

        // Root index check: its sitemap locs must equal, in order, one URL per manifest family entry.
        $rootContent = $this->releases->getDocument($releaseId, SitemapCacheService::INDEX_DOCUMENT);
        $rootXml = is_string($rootContent) ? $this->loadXml($rootContent) : null;

        if ($rootXml === null) {
            $errors[] = 'Root sitemap.xml is missing or invalid.';
        } else {
            $rootLocs = $this->extractLocs($rootXml, 'sitemap');
            $expectedRootLocs = array_map(
                fn (string $entryName): string => url('/sitemaps/' . $entryName . '.xml'),
                array_values(array_map(
                    static fn (array $family): string => (string) ($family['entry_name'] ?? ''),
                    $families,
                )),
            );

            if ($rootLocs !== $expectedRootLocs) {
                $errors[] = 'Root sitemap index does not match the manifest family entries.';
            }
        }

        // Loop-invariant: the family name that triggers the extra Google News title check.
        $googleNewsFamily = (string) config('sitemaps.news.google_variant_name', 'news-google');

        $reports = [];

        foreach ($families as $familyName => $family) {
            $familyErrors = [];
            $familyWarnings = [];
            $seenLocs = [];   // loc => true for every URL seen so far in this family
            $duplicates = []; // loc => true for every URL seen more than once in this family

            foreach ((array) ($family['documents'] ?? []) as $documentName) {
                $artifact = $this->releases->getDocument($releaseId, (string) $documentName);

                if (! is_string($artifact) || $artifact === '') {
                    $familyErrors[] = 'Missing artifact [' . $documentName . '].';

                    continue;
                }

                $artifactXml = $this->loadXml($artifact);

                if ($artifactXml === null) {
                    $familyErrors[] = 'Invalid XML in artifact [' . $documentName . '].';

                    continue;
                }

                // Rebuild the document through the build service so the stored artifact
                // can be compared against a freshly generated copy.
                $expected = $documentName === SitemapCacheService::INDEX_DOCUMENT
                    ? $this->build->buildIndex(true, false, array_keys($families))
                    : $this->build->buildNamed((string) $documentName, true, false);

                if ($expected === null) {
                    $familyErrors[] = 'Unable to rebuild expected document [' . $documentName . '] for validation.';

                    continue;
                }

                $expectedXml = $this->loadXml((string) $expected['content']);

                if ($expectedXml === null) {
                    $familyErrors[] = 'Expected document [' . $documentName . '] could not be parsed.';

                    continue;
                }

                // Index documents only reference other sitemaps; compare those references and move on.
                if ((string) $expected['type'] === SitemapTarget::TYPE_INDEX) {
                    if ($this->extractLocs($artifactXml, 'sitemap') !== $this->extractLocs($expectedXml, 'sitemap')) {
                        $familyErrors[] = 'Index artifact [' . $documentName . '] does not match expected sitemap references.';
                    }

                    continue;
                }

                $artifactLocs = $this->extractLocs($artifactXml, 'url');
                $expectedLocs = $this->extractLocs($expectedXml, 'url');

                if ($artifactLocs !== $expectedLocs) {
                    $familyErrors[] = 'URL artifact [' . $documentName . '] does not match expected canonical URLs.';
                }

                foreach ($artifactLocs as $loc) {
                    if (isset($seenLocs[$loc])) {
                        $duplicates[$loc] = true;
                    }

                    $seenLocs[$loc] = true;

                    $urlError = $this->urlError($loc);

                    if ($urlError !== null) {
                        $familyErrors[] = $urlError . ' [' . $loc . ']';
                    }
                }

                if ((string) $familyName === $googleNewsFamily && $this->extractNewsTitles($artifactXml) === []) {
                    $familyErrors[] = 'Google News sitemap contains no valid news:title elements.';
                }

                // Relative image URLs are reported as warnings only; they do not fail the release.
                foreach ($this->extractImageLocs($artifactXml) as $imageLoc) {
                    if (! preg_match('/^https?:\/\//i', $imageLoc)) {
                        $familyWarnings[] = 'Non-absolute image URL [' . $imageLoc . ']';
                    }
                }
            }

            if ($duplicates !== []) {
                $familyErrors[] = 'Duplicate URLs detected across family artifacts.';
            }

            $reports[] = [
                'family' => $familyName,
                'documents' => count((array) ($family['documents'] ?? [])),
                'url_count' => (int) ($family['url_count'] ?? 0),
                'shard_count' => (int) ($family['shard_count'] ?? 0),
                'errors' => $familyErrors,
                'warnings' => $familyWarnings,
            ];

            // Family errors are also surfaced at the top level, prefixed with the family name.
            foreach ($familyErrors as $familyError) {
                $errors[] = $familyName . ': ' . $familyError;
            }
        }

        return [
            'ok' => $errors === [],
            'release_id' => $releaseId,
            'errors' => $errors,
            'families' => $reports,
            'totals' => [
                'families' => count($families),
                'documents' => count($documents),
                'urls' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['url_count'] ?? 0),
                    $families,
                )),
                'shards' => array_sum(array_map(
                    static fn (array $family): int => (int) ($family['shard_count'] ?? 0),
                    $families,
                )),
            ],
        ];
    }

    /**
     * Parse an XML string into a DOM document.
     *
     * libxml errors are collected internally and discarded, and the previous
     * libxml error mode is restored before returning.
     *
     * @return DOMDocument|null The parsed document, or null when the content is blank or not well-formed.
     */
    private function loadXml(string $content): ?DOMDocument
    {
        // DOMDocument::loadXML() throws ValueError on an empty source in PHP 8+,
        // so blank content is reported as unparsable instead of crashing validation.
        if (trim($content) === '') {
            return null;
        }

        $document = new DOMDocument();
        $previous = libxml_use_internal_errors(true);

        try {
            $loaded = $document->loadXML($content);
        } finally {
            // Always restore the caller's libxml error mode, even if loading throws.
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $loaded ? $document : null;
    }

    /**
     * Collect the trimmed, non-empty text of every "loc" element that is a direct
     * child of an element with the given local name (namespace prefixes are ignored).
     *
     * @param string $nodeName Local name of the parent element, e.g. "url" or "sitemap".
     *
     * @return list<string> Values in document order.
     */
    private function extractLocs(DOMDocument $document, string $nodeName): array
    {
        $xpath = new DOMXPath($document);
        $nodes = $xpath->query('//*[local-name()="' . $nodeName . '"]/*[local-name()="loc"]');
        $locs = [];

        // query() returns false for an invalid expression; treat that as "no matches".
        foreach ($nodes ?: [] as $node) {
            $value = trim((string) $node->textContent);

            if ($value !== '') {
                $locs[] = $value;
            }
        }

        return $locs;
    }

    /**
     * Collect the trimmed, non-empty "loc" values found directly under "image" elements.
     *
     * @return list<string> Values in document order.
     */
    private function extractImageLocs(DOMDocument $document): array
    {
        // Same parent/child lookup as extractLocs(), with "image" as the parent element.
        return $this->extractLocs($document, 'image');
    }

    /**
     * Collect the trimmed, non-empty text of "title" elements that are direct
     * children of a "news" element (namespace prefixes are ignored).
     *
     * Restricting the parent keeps titles from other sitemap extensions
     * (for example image or video titles) from satisfying the news title check.
     *
     * @return list<string> Values in document order.
     */
    private function extractNewsTitles(DOMDocument $document): array
    {
        $xpath = new DOMXPath($document);
        $nodes = $xpath->query('//*[local-name()="news"]/*[local-name()="title"]');
        $titles = [];

        // query() returns false for an invalid expression; treat that as "no matches".
        foreach ($nodes ?: [] as $node) {
            $value = trim((string) $node->textContent);

            if ($value !== '') {
                $titles[] = $value;
            }
        }

        return $titles;
    }

    /**
     * Check a single sitemap URL against the emission rules.
     *
     * A URL is rejected when it lacks a scheme or host, carries a query string
     * or fragment, or its path contains one of the configured
     * "sitemaps.validation.forbidden_paths" entries.
     *
     * @return string|null A short error label, or null when the URL is acceptable.
     */
    private function urlError(string $loc): ?string
    {
        $parts = parse_url($loc);

        if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
            return 'Non-absolute URL emitted';
        }

        if (($parts['query'] ?? '') !== '') {
            return 'Query-string URL emitted';
        }

        if (($parts['fragment'] ?? '') !== '') {
            return 'Fragment URL emitted';
        }

        // Normalise to exactly one leading slash so forbidden entries match consistently.
        $path = '/' . ltrim((string) ($parts['path'] ?? '/'), '/');

        foreach ((array) config('sitemaps.validation.forbidden_paths', []) as $forbidden) {
            $needle = (string) $forbidden;

            // "/" would match every path, and str_contains() is always true for an
            // empty needle, so neither can be a meaningful forbidden entry.
            if ($needle === '' || $needle === '/') {
                continue;
            }

            if (str_contains($path, $needle)) {
                return 'Non-public path emitted';
            }
        }

        return null;
    }
}