llm: add FastAPI shim, gateway LLM endpoints, tests, and docs
llm/Dockerfile (new file, 53 lines)
@@ -0,0 +1,53 @@
FROM debian:bookworm-slim AS builder

ARG LLAMA_CPP_REPO=https://github.com/ggml-org/llama.cpp.git
ARG LLAMA_CPP_REF=

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src
RUN git clone --depth 1 ${LLAMA_CPP_REPO} llama.cpp \
    && if [ -n "${LLAMA_CPP_REF}" ]; then cd llama.cpp && git fetch --depth 1 origin "${LLAMA_CPP_REF}" && git checkout "${LLAMA_CPP_REF}"; fi

WORKDIR /src/llama.cpp
RUN cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON \
    && cmake --build build --config Release --target llama-server -j"$(nproc)"

FROM python:3.11-slim

RUN apt-get update && apt-get install -y --no-install-recommends \
    bash \
    ca-certificates \
    curl \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY llm/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY --from=builder /src/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY llm/main.py /app/main.py
COPY llm/entrypoint.sh /entrypoint.sh

RUN chmod +x /entrypoint.sh /usr/local/bin/llama-server

ENV MODEL_PATH=/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf \
    LLM_MODEL_NAME=qwen3-1.7b-instruct-q4_k_m \
    LLM_CONTEXT_SIZE=4096 \
    LLM_THREADS=4 \
    LLM_GPU_LAYERS=0 \
    LLM_PORT=8080 \
    LLAMA_SERVER_PORT=8081 \
    LLM_STARTUP_TIMEOUT=120 \
    LLM_EXTRA_ARGS=

EXPOSE 8080

ENTRYPOINT ["/entrypoint.sh"]
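To build this image locally, something along these lines should work from the repo root; the image tag is illustrative, not taken from this commit:

# Build against the default llama.cpp tip (tag name is hypothetical).
docker build -f llm/Dockerfile -t skinbase-llm .

# Or pin llama.cpp to a known ref via the build arg declared above.
docker build -f llm/Dockerfile --build-arg LLAMA_CPP_REF=<commit-or-tag> -t skinbase-llm .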
llm/entrypoint.sh (new file, 25 lines)
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -eu

MODEL_PATH="${MODEL_PATH:-/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf}"
LLM_MODEL_NAME="${LLM_MODEL_NAME:-qwen3-1.7b-instruct-q4_k_m}"
LLM_CONTEXT_SIZE="${LLM_CONTEXT_SIZE:-4096}"
LLM_THREADS="${LLM_THREADS:-4}"
LLM_GPU_LAYERS="${LLM_GPU_LAYERS:-0}"
LLM_PORT="${LLM_PORT:-8080}"
LLAMA_SERVER_PORT="${LLAMA_SERVER_PORT:-8081}"

if [ ! -f "$MODEL_PATH" ]; then
    echo "llm startup failed: model file not found at $MODEL_PATH" >&2
    echo "Mount a GGUF model into ./models/qwen3 and set MODEL_PATH if the filename differs." >&2
    exit 1
fi

if [ ! -r "$MODEL_PATH" ]; then
    echo "llm startup failed: model file is not readable at $MODEL_PATH" >&2
    exit 1
fi

echo "Starting llm shim model=$LLM_MODEL_NAME model_path=$MODEL_PATH public_port=$LLM_PORT upstream_port=$LLAMA_SERVER_PORT ctx=$LLM_CONTEXT_SIZE threads=$LLM_THREADS gpu_layers=$LLM_GPU_LAYERS"

exec python -m uvicorn main:app --host 0.0.0.0 --port "$LLM_PORT"
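Given the checks above, a minimal run sketch, assuming the image was tagged skinbase-llm as in the earlier example and a GGUF file sits in ./models/qwen3 (paths, tag, and thread count are all illustrative):

# Mount the model directory read-only and override a couple of the defaults.
docker run --rm -p 8080:8080 \
  -v "$(pwd)/models/qwen3:/models:ro" \
  -e MODEL_PATH=/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf \
  -e LLM_THREADS=8 \
  skinbase-llm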
llm/main.py (new file, 211 lines)
@@ -0,0 +1,211 @@
from __future__ import annotations

import asyncio
import logging
import os
import shlex
import subprocess
import time
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any, Dict, Optional

import httpx
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse

logger = logging.getLogger("llm")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")

MODEL_PATH = os.getenv("MODEL_PATH", "/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf")
LLM_MODEL_NAME = os.getenv("LLM_MODEL_NAME", "qwen3-1.7b-instruct-q4_k_m")
LLM_CONTEXT_SIZE = int(os.getenv("LLM_CONTEXT_SIZE", "4096"))
LLM_THREADS = int(os.getenv("LLM_THREADS", "4"))
LLM_GPU_LAYERS = int(os.getenv("LLM_GPU_LAYERS", "0"))
LLAMA_SERVER_PORT = int(os.getenv("LLAMA_SERVER_PORT", "8081"))
LLM_STARTUP_TIMEOUT = float(os.getenv("LLM_STARTUP_TIMEOUT", "120"))
LLM_EXTRA_ARGS = os.getenv("LLM_EXTRA_ARGS", "")

_llama_process: subprocess.Popen[bytes] | None = None
_http_client: httpx.AsyncClient | None = None


def _upstream_base_url() -> str:
    return f"http://127.0.0.1:{LLAMA_SERVER_PORT}"


def _ensure_http_client() -> httpx.AsyncClient:
    if _http_client is None:
        raise RuntimeError("HTTP client not initialised")
    return _http_client


def _validate_model_path() -> None:
    model_file = Path(MODEL_PATH)
    if not model_file.is_file():
        raise RuntimeError(f"model file not found at {MODEL_PATH}")
    if not os.access(model_file, os.R_OK):
        raise RuntimeError(f"model file is not readable at {MODEL_PATH}")


def _build_llama_command() -> list[str]:
    command = [
        "/usr/local/bin/llama-server",
        "--host",
        "127.0.0.1",
        "--port",
        str(LLAMA_SERVER_PORT),
        "--model",
        MODEL_PATH,
        "--alias",
        LLM_MODEL_NAME,
        "--ctx-size",
        str(LLM_CONTEXT_SIZE),
        "--threads",
        str(LLM_THREADS),
        "--n-gpu-layers",
        str(LLM_GPU_LAYERS),
    ]
    if LLM_EXTRA_ARGS.strip():
        command.extend(shlex.split(LLM_EXTRA_ARGS))
    return command


def _llama_running() -> bool:
    return _llama_process is not None and _llama_process.poll() is None


async def _wait_for_llama_ready() -> None:
    deadline = time.monotonic() + LLM_STARTUP_TIMEOUT
    last_error: Optional[Exception] = None

    while time.monotonic() < deadline:
        if _llama_process is not None and _llama_process.poll() is not None:
            raise RuntimeError(f"llama-server exited with code {_llama_process.poll()}")

        try:
            response = await _ensure_http_client().get(f"{_upstream_base_url()}/v1/models", timeout=5)
            if response.status_code == 200:
                logger.info("llm service: llama-server ready")
                return
        except Exception as exc:
            last_error = exc

        await asyncio.sleep(1)

    raise RuntimeError(f"llama-server did not become ready within {LLM_STARTUP_TIMEOUT}s: {last_error}")


async def _stop_llama_process() -> None:
    global _llama_process

    if _llama_process is None:
        return

    if _llama_process.poll() is None:
        _llama_process.terminate()
        try:
            await asyncio.to_thread(_llama_process.wait, timeout=10)
        except subprocess.TimeoutExpired:
            _llama_process.kill()
            await asyncio.to_thread(_llama_process.wait, timeout=5)

    _llama_process = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    global _http_client, _llama_process

    _validate_model_path()
    _http_client = httpx.AsyncClient(timeout=httpx.Timeout(120, connect=5))

    command = _build_llama_command()
    logger.info("llm service: starting llama-server model=%s ctx=%s threads=%s gpu_layers=%s upstream_port=%s", LLM_MODEL_NAME, LLM_CONTEXT_SIZE, LLM_THREADS, LLM_GPU_LAYERS, LLAMA_SERVER_PORT)
    _llama_process = subprocess.Popen(command)

    try:
        await _wait_for_llama_ready()
        yield
    finally:
        await _stop_llama_process()
        if _http_client is not None:
            await _http_client.aclose()
            _http_client = None


app = FastAPI(title="Skinbase LLM Service", version="1.0.0", lifespan=lifespan)


def _health_payload(status: str) -> Dict[str, Any]:
    return {
        "status": status,
        "model": Path(MODEL_PATH).name,
        "model_alias": LLM_MODEL_NAME,
        "context_size": LLM_CONTEXT_SIZE,
        "threads": LLM_THREADS,
        "gpu_layers": LLM_GPU_LAYERS,
    }


async def _proxy_request(method: str, path: str, *, body: bytes | None = None) -> Dict[str, Any]:
    if not _llama_running():
        raise HTTPException(status_code=503, detail="llama-server is not running")

    headers = {"content-type": "application/json"} if body is not None else None
    try:
        response = await _ensure_http_client().request(
            method,
            f"{_upstream_base_url()}{path}",
            content=body,
            headers=headers,
            timeout=httpx.Timeout(120, connect=5),
        )
    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=504, detail=f"llama-server timed out: {exc}")
    except httpx.RequestError as exc:
        raise HTTPException(status_code=503, detail=f"llama-server unavailable: {exc}")

    if response.status_code >= 400:
        detail: Any
        try:
            detail = response.json()
        except Exception:
            detail = response.text[:1000]
        raise HTTPException(status_code=response.status_code, detail=detail)

    try:
        return response.json()
    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"llama-server returned invalid JSON: {exc}")


@app.exception_handler(HTTPException)
async def handle_http_exception(_: Request, exc: HTTPException):
    return JSONResponse(status_code=exc.status_code, content={"error": {"code": "llm_service_error", "message": str(exc.detail)}})


@app.get("/health")
async def health():
    if not _llama_running():
        return JSONResponse(status_code=503, content=_health_payload("unavailable"))

    try:
        response = await _ensure_http_client().get(f"{_upstream_base_url()}/v1/models", timeout=5)
        if response.status_code != 200:
            return JSONResponse(status_code=503, content=_health_payload("degraded"))
    except Exception:
        return JSONResponse(status_code=503, content=_health_payload("degraded"))

    return _health_payload("ok")


@app.get("/v1/models")
async def list_models():
    return await _proxy_request("GET", "/v1/models")


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    body = await request.body()
    return await _proxy_request("POST", "/v1/chat/completions", body=body)
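Once the container is up, the three routes above can be exercised end to end; a sketch with curl, assuming the default port mapping (the model alias matches the LLM_MODEL_NAME default, and the chat payload follows llama-server's OpenAI-compatible schema):

# Shim health, which also probes the upstream llama-server.
curl -s http://localhost:8080/health

# Model listing, proxied to llama-server.
curl -s http://localhost:8080/v1/models

# Chat completion, forwarded verbatim to llama-server.
curl -s http://localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "qwen3-1.7b-instruct-q4_k_m",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 64
  }'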
llm/requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
fastapi==0.115.5
uvicorn[standard]==0.30.6
httpx==0.27.2
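For iterating on the shim outside Docker, the pinned dependencies install cleanly into a virtualenv; note that main.py spawns /usr/local/bin/llama-server by absolute path, so a local binary would have to be placed or symlinked there first. A sketch, not part of this commit:

python -m venv .venv && . .venv/bin/activate
pip install -r llm/requirements.txt
MODEL_PATH=./models/qwen3/Qwen3-1.7B-Instruct-Q4_K_M.gguf \
  python -m uvicorn main:app --app-dir llm --port 8080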