llm: add FastAPI shim, gateway LLM endpoints, tests, and docs
This commit is contained in:
53
llm/Dockerfile
Normal file
53
llm/Dockerfile
Normal file
@@ -0,0 +1,53 @@
|
||||
FROM debian:bookworm-slim AS builder

# Build llama-server from source. Repo and ref are overridable at build time.
ARG LLAMA_CPP_REPO=https://github.com/ggml-org/llama.cpp.git
# Optional branch/tag/commit to build; empty means the default branch HEAD.
ARG LLAMA_CPP_REF=

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src
# Shallow clone for speed. When a ref is requested, fetch just that ref and
# detach onto FETCH_HEAD: a single-ref shallow fetch only populates FETCH_HEAD
# (no local or remote-tracking refs are created), so a plain
# `git checkout "$LLAMA_CPP_REF"` would fail for branch/tag names.
RUN git clone --depth 1 ${LLAMA_CPP_REPO} llama.cpp \
    && if [ -n "${LLAMA_CPP_REF}" ]; then \
           git -C llama.cpp fetch --depth 1 origin "${LLAMA_CPP_REF}" \
           && git -C llama.cpp checkout --detach FETCH_HEAD; \
       fi

WORKDIR /src/llama.cpp
# GGML_NATIVE=OFF keeps the binary portable across CPUs (no -march=native),
# since the image may run on hardware other than the build host.
RUN cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON \
    && cmake --build build --config Release --target llama-server -j"$(nproc)"
|
||||
|
||||
FROM python:3.11-slim

# Runtime deps: bash for the entrypoint script, curl for HTTP probes,
# libgomp1 (OpenMP runtime) required by the llama-server binary.
RUN apt-get update && apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        curl \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python deps before copying app code so the pip layer stays cached
# when only main.py changes.
COPY llm/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# COPY --chmod (BuildKit) sets the execute bit at copy time, which avoids the
# extra image layer that a follow-up `RUN chmod +x` would duplicate the files
# into.
COPY --from=builder --chmod=0755 /src/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY llm/main.py /app/main.py
COPY --chmod=0755 llm/entrypoint.sh /entrypoint.sh

# Runtime configuration read by entrypoint.sh / main.py; all values are
# overridable with `docker run -e`. LLM_PORT is the public shim port,
# LLAMA_SERVER_PORT the internal llama-server port.
ENV MODEL_PATH=/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf \
    LLM_MODEL_NAME=qwen3-1.7b-instruct-q4_k_m \
    LLM_CONTEXT_SIZE=4096 \
    LLM_THREADS=4 \
    LLM_GPU_LAYERS=0 \
    LLM_PORT=8080 \
    LLAMA_SERVER_PORT=8081 \
    LLM_STARTUP_TIMEOUT=120 \
    LLM_EXTRA_ARGS=

# NOTE(review): the container runs as root; consider adding a non-root USER
# once write/read permissions on the mounted /models volume are confirmed.
EXPOSE 8080

ENTRYPOINT ["/entrypoint.sh"]
|
||||
Reference in New Issue
Block a user