first commit
This commit is contained in:
17
clip/Dockerfile
Normal file
17
clip/Dockerfile
Normal file
@@ -0,0 +1,17 @@
|
||||
# Slim Python base image for the CLIP inference service.
FROM python:3.11-slim

WORKDIR /app

# Flush logs immediately instead of buffering them inside the container.
ENV PYTHONUNBUFFERED=1

# ca-certificates is needed so model/weight downloads over HTTPS succeed.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies first so code-only changes reuse the pip layer cache.
COPY clip/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# Application code plus the shared "common" package.
COPY clip /app
COPY common /app/common

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
161
clip/main.py
Normal file
161
clip/main.py
Normal file
@@ -0,0 +1,161 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
import open_clip
|
||||
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
|
||||
from pydantic import BaseModel, Field
|
||||
import numpy as np
|
||||
|
||||
from common.image_io import fetch_url_bytes, bytes_to_pil, ImageLoadError
|
||||
|
||||
# Model selection is environment-driven so deployments can swap checkpoints
# without code changes.
MODEL_NAME = os.environ.get("MODEL_NAME", "ViT-B-32")
MODEL_PRETRAINED = os.environ.get("MODEL_PRETRAINED", "openai")

# Prefer a GPU when torch can see one; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Starter vocab (replace with DB-driven vocab later)
TAGS: List[str] = [
    "wallpaper",
    "4k wallpaper",
    "8k wallpaper",
    "cyberpunk",
    "neon",
    "city",
    "night",
    "sci-fi",
    "space",
    "fantasy",
    "anime",
    "digital art",
    "abstract",
    "minimal",
    "landscape",
    "nature",
    "mountains",
    "forest",
    "ocean",
    "sunset",
    "photography",
    "portrait",
    "architecture",
    "cars",
    "gaming",
]
|
||||
|
||||
# FastAPI application object; uvicorn serves this as "main:app".
app = FastAPI(title="Skinbase CLIP Service", version="1.0.0")

# Load the CLIP checkpoint once at import time so every request reuses it.
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained=MODEL_PRETRAINED
)
tokenizer = open_clip.get_tokenizer(MODEL_NAME)
# eval() freezes dropout/batch-norm for deterministic inference.
model = model.to(DEVICE).eval()
|
||||
|
||||
|
||||
class AnalyzeRequest(BaseModel):
    """Request body for POST /analyze: tag an image fetched from a URL."""

    # Image URL to fetch; the endpoint rejects requests without it.
    url: Optional[str] = None
    # Maximum number of tags to return (top-k over the vocab).
    limit: int = Field(default=5, ge=1, le=50)
    # Optional minimum confidence; tags scoring below it are dropped.
    threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
||||
|
||||
|
||||
class EmbedRequest(BaseModel):
    """Request body for POST /embed: embed an image fetched from a URL.

    `backend` selects the embedding implementation; `model`/`pretrained`
    override the module defaults for a single request.
    """

    # Image URL to fetch; the endpoint rejects requests without it.
    url: Optional[str] = None
    # Pydantic v2 (pulled in by this FastAPI version) renamed Field(regex=...)
    # to Field(pattern=...); the old keyword raises PydanticUserError at
    # class-definition time, i.e. on import.
    backend: Optional[str] = Field(default="openclip", pattern="^(openclip|hf)$")
    # open_clip model name, or HF repo id when backend="hf".
    model: Optional[str] = None
    # open_clip pretrained tag; ignored by the "hf" backend.
    pretrained: Optional[str] = None
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Liveness probe reporting the active device and model configuration."""
    return {
        "status": "ok",
        "device": DEVICE,
        "model": MODEL_NAME,
        "pretrained": MODEL_PRETRAINED,
    }
|
||||
|
||||
|
||||
def _analyze_image_bytes(data: bytes, limit: int, threshold: Optional[float]):
    """Score image bytes against the TAGS vocab and return the top matches.

    Returns {"tags": [{"tag", "confidence"}, ...], "model", "dim"} where the
    confidences are softmax probabilities over the whole vocab, highest
    first; entries below `threshold` (when given) are dropped.
    """
    pil_img = bytes_to_pil(data)
    pixels = preprocess(pil_img).unsqueeze(0).to(DEVICE)
    tokens = tokenizer(TAGS).to(DEVICE)

    with torch.no_grad():
        img_emb = model.encode_image(pixels)
        txt_emb = model.encode_text(tokens)

    # Unit-normalize both sides so the dot product is cosine similarity.
    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
    txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)

    # Softmax over the vocab turns similarities into probability-like scores.
    probs = (img_emb @ txt_emb.T).softmax(dim=-1)[0]
    top = probs.topk(min(limit, len(TAGS)))

    min_conf = None if threshold is None else float(threshold)
    results = [
        {"tag": TAGS[int(idx)], "confidence": float(score)}
        for score, idx in zip(top.values, top.indices)
        if min_conf is None or float(score) >= min_conf
    ]

    return {"tags": results, "model": MODEL_NAME, "dim": int(txt_emb.shape[-1])}
|
||||
|
||||
|
||||
def _embed_image_bytes(data: bytes, backend: str = "openclip", model_name: Optional[str] = None, pretrained: Optional[str] = None):
    """Embed image bytes into a unit-length CLIP vector.

    backend: "openclip" uses open_clip (reusing the preloaded global model
    when the requested name/pretrained match the module config); any other
    value falls through to the HuggingFace CLIP backend.

    Returns {"vector": list[float], "dim": int, "backend": str, "model": str}.
    """
    img = bytes_to_pil(data)

    if backend == "openclip":
        vec, used_model = _embed_with_openclip(img, model_name, pretrained)
    else:
        vec, used_model = _embed_with_hf(img, model_name)

    return {
        "vector": vec.tolist(),
        "dim": int(np.asarray(vec).shape[-1]),
        "backend": backend,
        # Report the model actually used. The previous version returned None
        # when the HF backend fell back to its default checkpoint.
        "model": used_model,
    }


def _embed_with_openclip(img, model_name: Optional[str], pretrained: Optional[str]):
    """Embed a PIL image with open_clip; returns (vector, model_name_used)."""
    use_model_name = model_name or MODEL_NAME
    use_pretrained = pretrained or MODEL_PRETRAINED

    if use_model_name == MODEL_NAME and use_pretrained == MODEL_PRETRAINED:
        # Fast path: reuse the model loaded at import time.
        _model, _preprocess, device = model, preprocess, DEVICE
    else:
        # NOTE: loads a fresh checkpoint on every call — consider caching if
        # non-default models are requested frequently.
        _model, _, _preprocess = open_clip.create_model_and_transforms(
            use_model_name, pretrained=use_pretrained
        )
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _model = _model.to(device).eval()

    image_input = _preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = _model.encode_image(image_input)
        feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu().numpy()[0], use_model_name


def _embed_with_hf(img, model_name: Optional[str]):
    """Embed a PIL image with HuggingFace CLIP; returns (vector, model_name_used)."""
    from transformers import CLIPProcessor, CLIPModel

    hf_model_name = model_name or "openai/clip-vit-base-patch32"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # NOTE: loads a fresh checkpoint on every call — consider caching.
    hf_model = CLIPModel.from_pretrained(hf_model_name).to(device).eval()
    processor = CLIPProcessor.from_pretrained(hf_model_name)

    inputs = processor(images=img, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        feats = hf_model.get_image_features(**inputs)
        feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu().numpy()[0], hf_model_name
|
||||
|
||||
|
||||
@app.post("/analyze")
def analyze(req: AnalyzeRequest):
    """Tag the image at `req.url` against the starter vocab."""
    if not req.url:
        raise HTTPException(400, "url is required")
    try:
        payload = fetch_url_bytes(req.url)
        return _analyze_image_bytes(payload, req.limit, req.threshold)
    except ImageLoadError as e:
        # Fetch/decode failures are the client's fault, not a server error.
        raise HTTPException(400, str(e))
|
||||
|
||||
|
||||
@app.post("/analyze/file")
async def analyze_file(
    file: UploadFile = File(...),
    limit: int = Form(5),
    threshold: Optional[float] = Form(None),
):
    """Tag an uploaded image file against the starter vocab.

    Mirrors POST /analyze but reads the image from a multipart upload.
    """
    data = await file.read()
    try:
        return _analyze_image_bytes(data, int(limit), threshold)
    except ImageLoadError as e:
        # Undecodable uploads previously escaped as a 500; return 400 for
        # parity with /analyze.
        raise HTTPException(400, str(e))
|
||||
|
||||
|
||||
@app.post("/embed")
def embed(req: EmbedRequest):
    """Return a unit-normalized CLIP embedding for the image at `req.url`."""
    if not req.url:
        raise HTTPException(400, "url is required")
    try:
        raw = fetch_url_bytes(req.url)
        return _embed_image_bytes(
            raw,
            backend=req.backend,
            model_name=req.model,
            pretrained=req.pretrained,
        )
    except ImageLoadError as e:
        # Fetch/decode failures are the client's fault, not a server error.
        raise HTTPException(400, str(e))
|
||||
|
||||
|
||||
@app.post("/embed/file")
async def embed_file(
    file: UploadFile = File(...),
    backend: str = Form("openclip"),
    model: Optional[str] = Form(None),
    pretrained: Optional[str] = Form(None),
):
    """Return a CLIP embedding for an uploaded image file.

    Mirrors POST /embed but reads the image from a multipart upload.
    """
    data = await file.read()
    try:
        return _embed_image_bytes(data, backend=backend, model_name=model, pretrained=pretrained)
    except ImageLoadError as e:
        # Undecodable uploads previously escaped as a 500; return 400 for
        # parity with /embed.
        raise HTTPException(400, str(e))
|
||||
10
clip/requirements.txt
Normal file
10
clip/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
fastapi==0.115.5
|
||||
uvicorn[standard]==0.30.6
|
||||
python-multipart==0.0.9
|
||||
requests==2.32.3
|
||||
pillow==10.4.0
|
||||
torch==2.4.1
|
||||
torchvision==0.19.1
|
||||
open_clip_torch==2.26.1
|
||||
# TODO: pin transformers and numpy (and consider pinning pydantic) like the
# other dependencies, for reproducible builds.
transformers
numpy
|
||||
104
clip/vectorize.py
Normal file
104
clip/vectorize.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from common.image_io import fetch_url_bytes, bytes_to_pil, ImageLoadError
|
||||
|
||||
try:
|
||||
import open_clip
|
||||
except Exception:
|
||||
open_clip = None
|
||||
|
||||
try:
|
||||
from transformers import CLIPProcessor, CLIPModel
|
||||
except Exception:
|
||||
CLIPModel = None
|
||||
CLIPProcessor = None
|
||||
|
||||
|
||||
def load_openclip(model_name: str = "ViT-B-32", pretrained: str = "openai") -> Tuple:
    """Load an open_clip model plus its preprocessing transform.

    Returns (model, preprocess, device) with the model in eval mode on the
    best available device. Raises RuntimeError when open_clip is missing.
    """
    # Guard clause first: fail fast when the optional dependency is absent.
    if open_clip is None:
        raise RuntimeError("open_clip is not installed")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    clip_model, _, transform = open_clip.create_model_and_transforms(
        model_name, pretrained=pretrained
    )
    return clip_model.to(device).eval(), transform, device
|
||||
|
||||
|
||||
def embed_openclip(model, preprocess, device, pil_image: Image.Image) -> np.ndarray:
    """Encode one PIL image to a unit-length open_clip embedding (1-D array)."""
    batch = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        feats = model.encode_image(batch)
    # Unit-normalize so a downstream dot product is cosine similarity.
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu().numpy()[0]
|
||||
|
||||
|
||||
def load_hf_clip(model_name: str = "openai/clip-vit-base-patch32") -> Tuple:
    """Load a HuggingFace CLIP model and its processor.

    Returns (model, processor, device) with the model in eval mode. Raises
    RuntimeError when transformers is missing.
    """
    # Guard clause first: fail fast when the optional dependency is absent.
    if CLIPModel is None or CLIPProcessor is None:
        raise RuntimeError("transformers (CLIP) is not installed")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    hf_model = CLIPModel.from_pretrained(model_name).to(device).eval()
    return hf_model, CLIPProcessor.from_pretrained(model_name), device
|
||||
|
||||
|
||||
def embed_hf_clip(model, processor, device, pil_image: Image.Image) -> np.ndarray:
    """Encode one PIL image to a unit-length HF-CLIP embedding (1-D array)."""
    batch = processor(images=pil_image, return_tensors="pt")
    batch = {key: tensor.to(device) for key, tensor in batch.items()}
    with torch.no_grad():
        feats = model.get_image_features(**batch)
    # Unit-normalize so a downstream dot product is cosine similarity.
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu().numpy()[0]
|
||||
|
||||
|
||||
def load_image(path_or_url: str) -> Image.Image:
    """Open an image from a local path or an http(s) URL."""
    if path_or_url.startswith(("http://", "https://")):
        return bytes_to_pil(fetch_url_bytes(path_or_url))
    # Local files are normalized to RGB here; URL bytes go through
    # bytes_to_pil unchanged, matching the original behavior.
    return Image.open(path_or_url).convert("RGB")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: embed one image with CLIP and save/print the vector."""
    parser = argparse.ArgumentParser(description="Vectorize an image using CLIP (open_clip or HuggingFace)")
    parser.add_argument("input", help="Path to image file or URL")
    parser.add_argument("--backend", choices=("openclip", "hf"), default="openclip")
    parser.add_argument("--model", default=None, help="Model name (backend-specific)")
    parser.add_argument("--pretrained", default="openai", help="open_clip pretrained source (openclip backend)")
    parser.add_argument("--out", default=None, help="Output .npy path (defaults to stdout)")
    args = parser.parse_args()

    try:
        img = load_image(args.input)
    except (ImageLoadError, OSError) as e:
        # OSError covers missing or undecodable local files (open()/PIL),
        # which previously escaped as a raw traceback instead of a clean exit.
        raise SystemExit(f"Failed to load image: {e}")

    if args.backend == "openclip":
        model_name = args.model or os.getenv("MODEL_NAME", "ViT-B-32")
        model, preprocess, device = load_openclip(model_name, pretrained=args.pretrained)
        vec = embed_openclip(model, preprocess, device, img)
    else:
        model_name = args.model or "openai/clip-vit-base-patch32"
        model, processor, device = load_hf_clip(model_name)
        vec = embed_hf_clip(model, processor, device, img)

    # float32 keeps the saved vector compact and consistent across backends.
    vec = np.asarray(vec, dtype=np.float32)

    if args.out:
        np.save(args.out, vec)
        print(f"Saved vector shape={vec.shape} to {args.out}")
    else:
        # Print a short summary and the vector length. Full vector to stdout can be large.
        print(f"vector_shape={vec.shape}")
        print(np.array2string(vec, precision=6, separator=", "))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user