Files
SkinbaseNova/app/Http/Controllers/Api/LinkPreviewController.php

229 lines
7.5 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace App\Http\Controllers\Api;
use App\Http\Controllers\Controller;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\TransferException;
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Request;
/**
 * Fetches a remote page and returns a small link-preview payload
 * (canonical URL, title, description, image, site name) as JSON.
 *
 * Because the target URL is user-supplied, every host that is contacted —
 * the initial one and every redirect hop — is resolved and checked against
 * BLOCKED_CIDRS before a request is sent to it (SSRF protection).
 */
class LinkPreviewController extends Controller
{
    /** Total time budget, in seconds, for the outbound request. */
    private const TIMEOUT = 8;

    /** Maximum number of body bytes copied out of the response (512 KiB). */
    private const MAX_BYTES = 524_288;

    /** Only this many leading bytes of the body are scanned for tags. */
    private const HEAD_BYTES = 50_000;

    private const USER_AGENT = 'Skinbase-LinkPreview/1.0 (+https://skinbase.org)';

    /** Blocked IP ranges (SSRF protection). */
    private const BLOCKED_CIDRS = [
        '0.0.0.0/8',
        '10.0.0.0/8',
        '100.64.0.0/10',
        '127.0.0.0/8',
        '169.254.0.0/16',
        '172.16.0.0/12',
        '192.0.0.0/24',
        '192.0.2.0/24',
        '192.168.0.0/16',
        '198.18.0.0/15',
        '198.51.100.0/24',
        '203.0.113.0/24',
        '224.0.0.0/4',
        '240.0.0.0/4',
        '::/128',
        '::1/128',
        '::ffff:0:0/96',
        'fc00::/7',
        'fe80::/10',
    ];

    /**
     * Validate the requested URL, fetch it and return the extracted preview.
     *
     * Responds with HTTP 422 and an `error` key when the URL is malformed,
     * points at a blocked address, or cannot be fetched.
     */
    public function __invoke(Request $request): JsonResponse
    {
        $request->validate([
            'url' => ['required', 'string', 'max:2048'],
        ]);

        $rawUrl = trim((string) $request->input('url'));

        // Must be http(s).
        if (! preg_match('#^https?://#i', $rawUrl)) {
            return response()->json(['error' => 'Invalid URL scheme.'], 422);
        }

        // parse_url() returns false for seriously malformed URLs.
        $parsed = parse_url($rawUrl);
        $host = is_array($parsed) ? (string) ($parsed['host'] ?? '') : '';
        if ($host === '') {
            return response()->json(['error' => 'Invalid URL.'], 422);
        }

        // NOTE: This check is not atomic with the HTTP client's own DNS
        // resolution — a DNS rebinding attack (attacker-controlled DNS with a
        // very short TTL) could pass this check and then resolve to an
        // internal IP for the actual request. Known limitation.
        if (! $this->isAllowedHost($host)) {
            return response()->json(['error' => 'URL not allowed.'], 422);
        }

        try {
            $client = new Client([
                'timeout' => self::TIMEOUT,
                'connect_timeout' => 4,
                // The host check only looks at IPv4 (A) records, so the
                // client must not be allowed to connect over IPv6 instead.
                'force_ip_resolve' => 'v4',
                'allow_redirects' => [
                    'max' => 5,
                    'strict' => false,
                    'protocols' => ['http', 'https'],
                    // Re-validate every redirect hop; otherwise a public URL
                    // could simply redirect to an internal address. Guzzle
                    // passes the next URI as the third argument; throwing
                    // here aborts the transfer before the hop is requested.
                    'on_redirect' => function ($request, $response, $uri): void {
                        if (! $this->isAllowedHost((string) $uri->getHost())) {
                            throw new TransferException('Redirect target not allowed.');
                        }
                    },
                ],
                'headers' => [
                    'User-Agent' => self::USER_AGENT,
                    'Accept' => 'text/html,application/xhtml+xml',
                ],
                'verify' => true,
            ]);

            $response = $client->get($rawUrl);
            $stream = $response->getBody();

            $status = $response->getStatusCode();
            if ($status < 200 || $status >= 400) {
                $stream->close();

                return response()->json(['error' => 'Could not fetch URL.'], 422);
            }

            // Copy at most ~MAX_BYTES — only the HTML <head> is needed.
            // NOTE(review): without Guzzle's `stream` option the handler has
            // presumably already buffered the whole body by this point, so
            // this cap bounds what we process, not what is downloaded; the
            // download itself is bounded by TIMEOUT. Confirm before relying
            // on MAX_BYTES as a transfer limit.
            $body = '';
            try {
                while (! $stream->eof() && strlen($body) < self::MAX_BYTES) {
                    $chunk = $stream->read(4096);
                    if ($chunk === '') {
                        break; // no progress — avoid spinning on a stalled stream
                    }
                    $body .= $chunk;
                }
            } finally {
                $stream->close();
            }
        } catch (TransferException) {
            return response()->json(['error' => 'Could not reach URL.'], 422);
        }

        // Remote pages are not guaranteed to be valid UTF-8; substitute bad
        // byte sequences instead of letting JSON encoding fail.
        return response()->json(
            $this->extractMeta($body, $rawUrl),
            200,
            [],
            JSON_INVALID_UTF8_SUBSTITUTE,
        );
    }

    /**
     * Extract OG / Twitter / fallback meta tags.
     *
     * @param string $html        Raw response body (only the first HEAD_BYTES bytes are scanned).
     * @param string $originalUrl URL that was requested; used as fallback and to resolve relative image paths.
     *
     * @return array{url: string, title: ?string, description: ?string, image: ?string, site_name: ?string}
     */
    private function extractMeta(string $html, string $originalUrl): array
    {
        // Limit to roughly the <head> block for speed.
        $head = substr($html, 0, self::HEAD_BYTES);

        // <meta property|name="…" content="…"> (key first)
        preg_match_all(
            '/<meta\s[^>]*(?:property|name)\s*=\s*["\']([^"\']+)["\'][^>]*content\s*=\s*["\']([^"\']*)["\'][^>]*>/i',
            $head,
            $keyFirst,
            PREG_SET_ORDER,
        );
        // <meta content="…" property|name="…"> (content first)
        preg_match_all(
            '/<meta\s[^>]*content\s*=\s*["\']([^"\']*)["\'][^>]*(?:property|name)\s*=\s*["\']([^"\']+)["\'][^>]*>/i',
            $head,
            $contentFirst,
            PREG_SET_ORDER,
        );

        // First occurrence of a key wins; key-first matches take precedence.
        $map = [];
        foreach ((array) $keyFirst as $match) {
            $map[strtolower($match[1])] ??= $match[2];
        }
        foreach ((array) $contentFirst as $match) {
            $map[strtolower($match[2])] ??= $match[1];
        }

        // Canonical URL: the page is untrusted, so only accept an absolute
        // http(s) URL; anything else (relative path, javascript:, …) falls
        // back to the URL that was actually requested.
        $canonical = $originalUrl;
        $canonicalPatterns = [
            '/<link[^>]+rel\s*=\s*["\']canonical["\'][^>]+href\s*=\s*["\']([^"\']+)["\'][^>]*>/i',
            '/<link[^>]+href\s*=\s*["\']([^"\']+)["\'][^>]+rel\s*=\s*["\']canonical["\'][^>]*>/i',
        ];
        foreach ($canonicalPatterns as $pattern) {
            if (preg_match($pattern, $head, $mc)) {
                $candidate = $this->cleanText($mc[1]);
                if ($candidate !== null && preg_match('#^https?://#i', $candidate)) {
                    $canonical = $candidate;
                }
                break;
            }
        }

        // Title: OG / Twitter first, then the <title> element. Each source is
        // entity-decoded exactly once.
        $title = $this->cleanText($map['og:title'] ?? $map['twitter:title'] ?? null);
        if ($title === null && preg_match('/<title[^>]*>([^<]+)<\/title>/i', $head, $mt)) {
            $title = $this->cleanText($mt[1]);
        }

        // Description
        $description = $this->cleanText(
            $map['og:description']
                ?? $map['twitter:description']
                ?? $map['description']
                ?? null,
        );

        // Image
        $image = $this->cleanText(
            $map['og:image']
                ?? $map['twitter:image']
                ?? $map['twitter:image:src']
                ?? null,
        );

        // Resolve non-absolute image URLs against the requested origin.
        if ($image !== null && ! preg_match('#^https?://#i', $image)) {
            $parts = parse_url($originalUrl) ?: [];
            $scheme = $parts['scheme'] ?? 'https';

            if (str_starts_with($image, '//')) {
                // Protocol-relative: already carries its own host.
                $image = $scheme . ':' . $image;
            } else {
                $authority = $parts['host'] ?? '';
                if (isset($parts['port'])) {
                    $authority .= ':' . $parts['port'];
                }
                $image = $scheme . '://' . $authority . '/' . ltrim($image, '/');
            }
        }

        // Site name: explicit OG value, else the requested host.
        $siteName = $this->cleanText($map['og:site_name'] ?? null)
            ?? (parse_url($originalUrl, PHP_URL_HOST) ?: null);

        return [
            'url' => $canonical,
            'title' => $title,
            'description' => $description,
            'image' => $image,
            'site_name' => $siteName,
        ];
    }

    /**
     * Decode HTML entities in an extracted value and trim it.
     *
     * @return string|null The decoded text, or null when the input is null or empty after trimming.
     */
    private function cleanText(?string $value): ?string
    {
        if ($value === null) {
            return null;
        }

        $decoded = trim(html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8'));

        return $decoded === '' ? null : $decoded;
    }

    /**
     * Whether requests to the given host are permitted.
     *
     * Resolves every IPv4 (A) record of the host and rejects it when
     * resolution fails or any address falls inside BLOCKED_CIDRS. IPv6
     * literals and IPv6-only hosts do not resolve here and are rejected.
     */
    private function isAllowedHost(string $host): bool
    {
        if ($host === '') {
            return false;
        }

        $addresses = gethostbynamel($host);
        if ($addresses === false || $addresses === []) {
            return false; // could not resolve
        }

        foreach ($addresses as $address) {
            if ($this->isBlockedIp($address)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Whether the given string is not a valid IP address or lies inside one
     * of the BLOCKED_CIDRS ranges. Fails closed: anything that is not a
     * valid IP is treated as blocked.
     */
    private function isBlockedIp(string $ip): bool
    {
        if (! filter_var($ip, FILTER_VALIDATE_IP)) {
            return true; // could not resolve
        }

        foreach (self::BLOCKED_CIDRS as $cidr) {
            if ($this->ipInCidr($ip, $cidr)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Whether $ip belongs to the network described by $cidr.
     *
     * Works on the packed binary form for both IPv4 and IPv6, so it does not
     * depend on the platform's integer width. An address of one family never
     * matches a CIDR of the other.
     *
     * @param string $ip   IPv4 or IPv6 address in presentation format.
     * @param string $cidr Network in "address/prefix" form; a missing prefix means a single host.
     */
    private function ipInCidr(string $ip, string $cidr): bool
    {
        $isV6 = str_contains($cidr, ':');
        [$subnet, $bits] = explode('/', $cidr, 2) + [1 => $isV6 ? 128 : 32];

        $familyFlag = $isV6 ? FILTER_FLAG_IPV6 : FILTER_FLAG_IPV4;
        if (! filter_var($ip, FILTER_VALIDATE_IP, $familyFlag)
            || ! filter_var($subnet, FILTER_VALIDATE_IP, $familyFlag)
        ) {
            return false;
        }

        $ipBin = inet_pton($ip);
        $subnetBin = inet_pton($subnet);
        if ($ipBin === false || $subnetBin === false || strlen($ipBin) !== strlen($subnetBin)) {
            return false;
        }

        $length = strlen($subnetBin);
        $bits = max(0, min((int) $bits, $length * 8));

        // Build a byte mask: whole 0xff bytes, one partial byte, zero padding.
        $mask = str_repeat("\xff", intdiv($bits, 8));
        $remain = $bits % 8;
        if ($remain !== 0) {
            $mask .= chr((0xff << (8 - $remain)) & 0xff);
        }
        $mask = str_pad($mask, $length, "\x00");

        return ($ipBin & $mask) === ($subnetBin & $mask);
    }
}