$onlyFamilies Family names to restrict validation to; an empty list validates every enabled family.
     * @return array<string, mixed> Report with the keys `ok`, `index`, `families`, `duplicates` and `totals`.
     */
    public function validate(array $onlyFamilies = []): array
    {
        // Requested families that the registry does not know are dropped here.
        $families = $onlyFamilies !== []
            ? array_values(array_filter(
                $onlyFamilies,
                fn (string $family): bool => $this->registry->get($family) !== null,
            ))
            : $this->build->enabledFamilies();

        // Index entries we expect to see, limited to the selected families.
        $expectedIndexLocs = array_map(
            static fn (SitemapIndexItem $item): string => $item->loc,
            array_values(array_filter(
                $this->index->items(),
                fn (SitemapIndexItem $item): bool => $this->isFamilySelected($families, $item->loc),
            )),
        );

        $indexBuild = $this->build->buildIndex(true, false);
        $indexErrors = [];
        $indexXml = $this->loadXml($indexBuild['content']);

        if ($indexXml === null) {
            $indexErrors[] = 'The main sitemap index XML could not be parsed.';
        }

        $actualIndexLocs = $indexXml !== null ? $this->extractLocs($indexXml, 'sitemap') : [];

        // Strict comparison: both the set of child sitemaps and their order must match.
        if ($indexXml !== null && $actualIndexLocs !== $expectedIndexLocs) {
            $indexErrors[] = 'Main sitemap index child references do not match the expected shard-aware manifest.';
        }

        $familyReports = [];
        $duplicates = [];
        $seenUrls = [];
        $totalUrlCount = 0;
        $totalShardCount = 0;

        foreach ($families as $family) {
            $builder = $this->registry->get($family);

            if ($builder === null) {
                continue;
            }

            $report = [
                'family' => $family,
                'documents' => 0,
                'url_count' => 0,
                // Every family counts as at least one shard.
                'shard_count' => max(1, $this->shards->shardCount($builder)),
                'errors' => [],
                'warnings' => [],
            ];

            $totalShardCount += $report['shard_count'];

            foreach ($this->build->documentNamesForFamily($family, true) as $name) {
                $built = $this->build->buildNamed($name, true, false);

                if ($built === null) {
                    $report['errors'][] = 'Unable to resolve sitemap [' . $name . '].';

                    continue;
                }

                $document = $this->loadXml($built['content']);

                if ($document === null) {
                    $report['errors'][] = 'Invalid XML emitted for [' . $name . '].';

                    continue;
                }

                $report['documents']++;

                // Index-type documents list child sitemaps instead of URLs, so they are only
                // compared against the expected shard list and skip the per-URL checks below.
                if ($built['type'] === SitemapTarget::TYPE_INDEX) {
                    $expectedFamilyLocs = array_map(
                        static fn (SitemapIndexItem $item): string => $item->loc,
                        $this->index->itemsForBuilder($builder),
                    );
                    $actualFamilyLocs = $this->extractLocs($document, 'sitemap');

                    if ($actualFamilyLocs !== $expectedFamilyLocs) {
                        $report['errors'][] = 'Shard compatibility index [' . $name . '] does not reference the expected shard URLs.';
                    }

                    continue;
                }

                $locs = $this->extractLocs($document, 'url');
                $report['url_count'] += count($locs);
                $totalUrlCount += count($locs);

                foreach ($locs as $loc) {
                    // $seenUrls spans all families, so duplicates are detected across documents too.
                    if (isset($seenUrls[$loc])) {
                        $duplicates[$loc] = ($duplicates[$loc] ?? 1) + 1;
                    }

                    $seenUrls[$loc] = true;

                    $reason = $this->urlError($family, $loc);

                    if ($reason !== null) {
                        $report['errors'][] = $reason . ' [' . $loc . ']';
                    }
                }

                // Relative image URLs only produce warnings; they do not fail validation.
                foreach ($this->extractImageLocs($document) as $imageLoc) {
                    if (! preg_match('/^https?:\/\//i', $imageLoc)) {
                        $report['warnings'][] = 'Non-absolute image URL found [' . $imageLoc . ']';
                    }
                }
            }

            $familyReports[] = $report;
        }

        return [
            'ok' => $indexErrors === [] && $this->familyErrors($familyReports) === [] && $duplicates === [],
            'index' => [
                'errors' => $indexErrors,
                'url_count' => count($actualIndexLocs),
            ],
            'families' => $familyReports,
            'duplicates' => array_keys($duplicates),
            'totals' => [
                'families' => count($familyReports),
                'documents' => array_sum(array_map(
                    static fn (array $report): int => (int) $report['documents'],
                    $familyReports,
                )),
                'urls' => $totalUrlCount,
                'shards' => $totalShardCount,
            ],
        ];
    }

    /**
     * Flattens the `errors` entries of all family reports into one list of strings.
     *
     * @param list<array<string, mixed>> $reports
     * @return list<string>
     */
    private function familyErrors(array $reports): array
    {
        $errors = [];

        foreach ($reports as $report) {
            foreach ((array) ($report['errors'] ?? []) as $error) {
                $errors[] = (string) $error;
            }
        }

        return $errors;
    }

    /**
     * Parses an XML string, returning null when it is empty or cannot be parsed.
     *
     * libxml errors are collected internally and discarded so a parse failure does not
     * emit PHP warnings; the previous libxml error mode is always restored.
     */
    private function loadXml(string $content): ?DOMDocument
    {
        // DOMDocument::loadXML() throws a ValueError for an empty source on PHP 8, which would
        // abort the whole validation run instead of being reported as invalid XML.
        if ($content === '') {
            return null;
        }

        $document = new DOMDocument();
        $previous = libxml_use_internal_errors(true);

        try {
            $loaded = $document->loadXML($content);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $loaded ? $document : null;
    }

    /**
     * Collects the trimmed, non-empty text of every `loc` element that is a direct child of
     * an element with the given local name, ignoring XML namespaces.
     *
     * @return list<string>
     */
    private function extractLocs(DOMDocument $document, string $nodeName): array
    {
        $xpath = new DOMXPath($document);
        $nodes = $xpath->query('//*[local-name()="' . $nodeName . '"]/*[local-name()="loc"]');

        if ($nodes === false) {
            return [];
        }

        $locs = [];

        foreach ($nodes as $node) {
            $value = trim((string) $node->textContent);

            if ($value !== '') {
                $locs[] = $value;
            }
        }

        return $locs;
    }

    /**
     * Collects the `loc` values found directly under `image` elements.
     *
     * @return list<string>
     */
    private function extractImageLocs(DOMDocument $document): array
    {
        return $this->extractLocs($document, 'image');
    }

    /**
     * Tells whether a sitemap location belongs to one of the given families, i.e. whether it
     * contains `/sitemaps/{family}.xml` or `/sitemaps/{family}-`.
     *
     * @param array $families Family names to match against.
     */
    private function isFamilySelected(array $families, string $loc): bool
    {
        foreach ($families as $family) {
            if (
                str_contains($loc, '/sitemaps/' . $family . '.xml')
                || str_contains($loc, '/sitemaps/' . $family . '-')
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Checks a single emitted URL and returns a short reason when it is not acceptable.
     *
     * A URL is rejected when it is not absolute, carries a query string or fragment, contains
     * a configured forbidden path, or fails the family-specific check (artworks and users only).
     *
     * @return string|null The reason, or null when the URL passes every check.
     */
    private function urlError(string $family, string $loc): ?string
    {
        $parts = parse_url($loc);

        if (! is_array($parts) || ! isset($parts['scheme'], $parts['host'])) {
            return 'Non-absolute URL emitted';
        }

        if (($parts['query'] ?? '') !== '') {
            return 'Query-string URL emitted';
        }

        if (($parts['fragment'] ?? '') !== '') {
            return 'Fragment URL emitted';
        }

        // Normalise to exactly one leading slash; a URL without a path is treated as "/".
        $path = '/' . ltrim((string) ($parts['path'] ?? '/'), '/');

        foreach ((array) config('sitemaps.validation.forbidden_paths', []) as $forbidden) {
            $forbidden = (string) $forbidden;

            // str_contains() is always true for an empty needle, so an empty or null config entry
            // would otherwise reject every URL. "/" is skipped because every path contains it.
            if ($forbidden === '' || $forbidden === '/') {
                continue;
            }

            if (str_contains($path, $forbidden)) {
                return 'Non-public path emitted';
            }
        }

        return match ($family) {
            'artworks' => $this->validateArtworkUrl($path),
            'users' => $this->validateUserUrl($path),
            default => null,
        };
    }

    /**
     * Validates an artwork path of the form `/art/{id}` with an optional single trailing segment,
     * then looks the id up among public, published artworks (one query per call).
     *
     * @return string|null The reason the URL is rejected, or null when it is valid.
     */
    private function validateArtworkUrl(string $path): ?string
    {
        if (! preg_match('~^/art/(\d+)(?:/[^/?#]+)?$~', $path, $matches)) {
            return 'Non-canonical artwork URL emitted';
        }

        $artwork = Artwork::query()->public()->published()->find((int) $matches[1]);

        return $artwork === null ? 'Non-public artwork URL emitted' : null;
    }

    /**
     * Validates a user path of the form `/@{username}` and checks, case-insensitively, that an
     * active, non-deleted user with that username exists (one query per call).
     *
     * @return string|null The reason the URL is rejected, or null when it is valid.
     */
    private function validateUserUrl(string $path): ?string
    {
        if (! preg_match('#^/@([A-Za-z0-9_\-]+)$#', $path, $matches)) {
            return 'Non-canonical user URL emitted';
        }

        $username = strtolower((string) $matches[1]);

        $user = User::query()
            ->where('is_active', true)
            ->whereNull('deleted_at')
            ->whereRaw('LOWER(username) = ?', [$username])
            ->first();

        return $user === null ? 'Non-public user URL emitted' : null;
    }
}