From: Stefan Gasser Date: Sat, 17 Jan 2026 13:10:36 +0000 (+0100) Subject: All-in-one Docker image with prebuilt language support (#42) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=d50099e8d0c2d4f49174e51521580ea804d16a07;p=sgasser-llm-shield.git All-in-one Docker image with prebuilt language support (#42) * All-in-one Docker image with dev mode support Simplify deployment with a single container that includes both the proxy and Presidio PII detection. No git clone required - just docker run. Changes: - Combined Dockerfile: Multi-stage build with Presidio + Bun in one container - supervisord: Process manager for running both services - Prebuilt images: ghcr.io/sgasser/pasteguard:en (~2.7GB) and :eu (~12GB) - Release workflow: GitHub Actions to build en/eu images on version tags - Dev mode: docker compose up presidio -d for local development with hot-reload - Updated docs: curl-based installation, no cloning needed for production - Language models: Changed from _md to _lg for better PII detection accuracy Image tags: - en (default/latest): English only - eu: European languages (en, de, es, fr, it, nl, pl, pt, ro) * Move Docker files into docker/ directory Reorganize Docker-related files for cleaner project structure: - Dockerfile → docker/Dockerfile - supervisord.conf → docker/supervisord.conf - presidio/ → docker/presidio/ Keep docker-compose.yml in root for convenience (docker compose up works). Update all path references in workflow, configs, and docs. * Fix CI: update Dockerfile path * Include config.example.yaml in Docker image for zero-config startup * Simplify docs: zero-config quickstart * Add missing ENV detection entities to config example Add ENV_PASSWORD, ENV_SECRET, and CONNECTION_STRING to the secrets_detection entities section to match what's documented and implemented. * Auto-configure languages per Docker image - Add PASTEGUARD_LANGUAGES env var to Dockerfile (set from LANGUAGES build arg) - Update config.example.yaml to use env var with fallback to 'en' - Support comma-separated string for languages config (e.g., "en,de,fr") - EN image now auto-enables English, EU image auto-enables all 9 EU languages Users can still override via config.yaml with array syntax if needed. * Update docs: languages are auto-configured per image * Clarify runtime vs build-time env vars in docs * Update docs: languages are auto-configured per Docker image * Remove confusing env var override example from docs * List specific EU languages in docs instead of 'All 9' --- diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8400186..301a330 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,4 +38,4 @@ jobs: - uses: actions/checkout@v4 - name: Test Docker build - run: docker build -t pasteguard:test . \ No newline at end of file + run: docker build -f docker/Dockerfile -t pasteguard:test . \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..8e8fcaf --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,63 @@ +name: Release + +on: + push: + tags: ["v*"] + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build: + name: Build (${{ matrix.tag }}) + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + strategy: + matrix: + include: + - tag: en + languages: "en" + latest: true + - tag: eu + languages: "en,de,es,fr,it,nl,pl,pt,ro" + latest: false + steps: + - name: Free disk space + uses: endersonmenezes/free-disk-space@v3 + with: + remove_android: true + remove_dotnet: true + remove_haskell: true + remove_tool_cache: true + + - uses: actions/checkout@v6 + + - uses: docker/setup-buildx-action@v3 + + - uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract version from tag + id: version + run: echo "version=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT + + - uses: docker/build-push-action@v6 + with: + context: . + file: docker/Dockerfile + platforms: linux/amd64,linux/arm64 + push: true + build-args: LANGUAGES=${{ matrix.languages }} + tags: | + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ matrix.tag }} + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}-${{ matrix.tag }} + ${{ matrix.latest && format('{0}/{1}:latest', env.REGISTRY, env.IMAGE_NAME) || '' }} + ${{ matrix.latest && format('{0}/{1}:{2}', env.REGISTRY, env.IMAGE_NAME, steps.version.outputs.version) || '' }} + cache-from: type=gha,scope=${{ matrix.tag }} + cache-to: type=gha,mode=max,scope=${{ matrix.tag }} diff --git a/CLAUDE.md b/CLAUDE.md index 837c9ce..68bedcb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -60,20 +60,25 @@ See @config.example.yaml for full configuration. ## Setup -**Production:** `docker compose up -d` - -**Development:** +**Production:** ```bash cp config.example.yaml config.yaml -docker compose up presidio-analyzer -d -bun install && bun run dev +docker compose up -d +``` + +**Development:** Presidio in Docker, Bun locally with hot-reload: +```bash +docker compose up presidio -d +bun run dev ``` -**Dependencies:** -- Presidio (port 5002) - required -- Ollama (port 11434) - route mode only +**Multi-language:** Use EU image or build custom: +```bash +PASTEGUARD_TAG=eu docker compose up -d +LANGUAGES=en,de,ja docker compose up -d --build +``` -**Multi-language PII:** Build with `LANGUAGES=en,de,fr docker compose build`. See @presidio/languages.yaml for 24 available languages. +See @docker/presidio/languages.yaml for 24 available languages. ## Testing diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7fe6e18..7e69d7c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,7 +7,7 @@ Thank you for considering contributing to PasteGuard! 1. Fork and clone the repository 2. Install dependencies: `bun install` 3. Copy config: `cp config.example.yaml config.yaml` -4. Start Presidio: `docker compose up presidio-analyzer -d` +4. Start Presidio: `docker compose up presidio -d` 5. Run dev server: `bun run dev` ## Code Quality diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 3e2741e..0000000 --- a/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM oven/bun:1-slim - -WORKDIR /app - -# Install dependencies -COPY package.json bun.lock ./ -RUN bun install --frozen-lockfile --production - -# Copy source -COPY src ./src -COPY tsconfig.json ./ - -EXPOSE 3000 - -CMD ["bun", "run", "src/index.ts"] diff --git a/README.md b/README.md index 0631791..c7db376 100644 --- a/README.md +++ b/README.md @@ -19,14 +19,7 @@
-

- PasteGuard Demo -

-

- Your App → PasteGuard → OpenAI — PII never reaches external servers -

- -
+PasteGuard Dashboard ## What is PasteGuard? @@ -82,20 +75,22 @@ PasteGuard sits between your app and your provider. It's OpenAI-compatible — j ## Quick Start ```bash -git clone https://github.com/sgasser/pasteguard.git -cd pasteguard -cp config.example.yaml config.yaml -docker compose up -d +docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:en ``` Point your app to `http://localhost:3000/openai/v1` instead of `https://api.openai.com/v1`. Dashboard: [http://localhost:3000/dashboard](http://localhost:3000/dashboard) -PasteGuard Dashboard -

Every request logged with masked content preview

+### Multiple Languages + +For European languages (German, Spanish, French, Italian, Dutch, Polish, Portuguese, Romanian): + +```bash +docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:eu +``` -For multiple languages, configuration options, and more: **[Read the docs →](https://pasteguard.com/docs/quickstart)** +For custom config, persistent logs, or other languages: **[Read the docs →](https://pasteguard.com/docs/installation)** ## Integrations @@ -129,9 +124,6 @@ Works with any OpenAI-compatible tool: - GitHub tokens - JWT tokens - Bearer tokens -- Env passwords -- Env secrets -- Connection strings ## Tech Stack diff --git a/config.example.yaml b/config.example.yaml index d21ceb7..6f09440 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -43,19 +43,19 @@ pii_detection: # Supported languages for PII detection # Auto-detects language from input text and uses appropriate model # - # Languages must match what was installed during docker build: - # LANGUAGES=en,de docker-compose build + # Docker: Uses PASTEGUARD_LANGUAGES env var (auto-configured per image) + # Local: Uncomment the array below and comment out the env var line # # Available (24 languages): ca, zh, hr, da, nl, en, fi, fr, de, el, # it, ja, ko, lt, mk, nb, pl, pt, ro, ru, sl, es, sv, uk - # See presidio/languages.yaml for full list with details - languages: - - en - # Add more languages to match your Docker build: - # - de - # - fr - # - es - # - it + # See docker/presidio/languages.yaml for full list with details + languages: ${PASTEGUARD_LANGUAGES:-en} + # languages: + # - en + # - de + # - fr + # - es + # - it # Fallback language if detected language is not in the list above fallback_language: en @@ -106,9 +106,9 @@ secrets_detection: # - BEARER_TOKEN: Bearer tokens in Authorization-style contexts # # Environment Variables (opt-in): - # - ENV_PASSWORD: Password variables (DB_PASSWORD=..., ADMIN_PWD=...) - # - ENV_SECRET: Secret variables (APP_SECRET=..., JWT_SECRET=...) - # - CONNECTION_STRING: Database URLs with credentials (postgres://user:pass@host/db) + # - ENV_PASSWORD: DB_PASSWORD=..., ADMIN_PWD=... (8+ char values) + # - ENV_SECRET: APP_SECRET=..., JWT_SECRET=... (8+ char values) + # - CONNECTION_STRING: postgres://user:pass@host, mongodb://... entities: - OPENSSH_PRIVATE_KEY - PEM_PRIVATE_KEY @@ -118,7 +118,6 @@ secrets_detection: # - API_KEY_GITHUB # - JWT_TOKEN # - BEARER_TOKEN - # Uncomment to detect environment variable credentials: # - ENV_PASSWORD # - ENV_SECRET # - CONNECTION_STRING diff --git a/docker-compose.yml b/docker-compose.yml index f891957..7462c2b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,36 +1,52 @@ +# PasteGuard - Docker Compose (All-in-One Image) +# +# Production: +# docker compose up -d +# +# Development (only Presidio, run Bun locally with hot-reload): +# docker compose up presidio -d +# bun run dev +# +# European languages: +# PASTEGUARD_TAG=eu docker compose up -d +# +# Custom languages (local build): +# LANGUAGES=en,de,ja docker compose up -d --build + services: + # Production: Full all-in-one pasteguard: - build: . + image: ghcr.io/sgasser/pasteguard:${PASTEGUARD_TAG:-en} + build: + context: . + dockerfile: docker/Dockerfile + args: + LANGUAGES: ${LANGUAGES:-en} ports: - "3000:3000" env_file: - - .env - environment: - - PRESIDIO_URL=http://presidio-analyzer:3000 + - path: .env + required: false volumes: - ./config.yaml:/app/config.yaml:ro - ./data:/app/data - depends_on: - presidio-analyzer: - condition: service_healthy restart: unless-stopped - presidio-analyzer: + # Development: Only Presidio (for local Bun with hot-reload) + presidio: + profiles: ["dev"] + image: ghcr.io/sgasser/pasteguard:${PASTEGUARD_TAG:-en} build: - context: ./presidio + context: . + dockerfile: docker/Dockerfile args: - # Languages to install for PII detection - # Available: ca, zh, hr, da, nl, en, fi, fr, de, el, it, ja, ko, - # lt, mk, nb, pl, pt, ro, ru, sl, es, sv, uk - # See presidio/languages.yaml for full list - # Example: LANGUAGES=en,de,fr docker-compose build LANGUAGES: ${LANGUAGES:-en} ports: - - "5002:3000" + - "5002:5002" + environment: + - START_APP=false + - PORT=5002 + - WORKERS=1 healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3000/health')"] - interval: 30s - timeout: 10s - retries: 5 - start_period: 60s + disable: true restart: unless-stopped diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..16aaa36 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,104 @@ +# PasteGuard - All-in-One Image +# Single container with Proxy + PII Detection +# +# Build: docker build -f docker/Dockerfile --build-arg LANGUAGES=en -t pasteguard:en . +# Run: docker run -p 3000:3000 -v ./config.yaml:/app/config.yaml -v ./data:/app/data pasteguard:en + +ARG LANGUAGES="en" + +# ============================================================================= +# Stage 1: Generate Presidio configuration files +# ============================================================================= +FROM python:3.11-slim AS generator + +WORKDIR /build + +RUN pip install --no-cache-dir pyyaml + +COPY docker/presidio/languages.yaml /build/ +COPY docker/presidio/generate-configs.py /build/ + +ARG LANGUAGES +RUN python generate-configs.py \ + --languages="${LANGUAGES}" \ + --registry=/build/languages.yaml \ + --output=/output + +# ============================================================================= +# Stage 2: Build Bun application +# ============================================================================= +FROM oven/bun:1-slim AS bun-builder + +WORKDIR /app + +COPY package.json bun.lock ./ +RUN bun install --frozen-lockfile --production + +COPY src ./src +COPY tsconfig.json ./ + +# ============================================================================= +# Stage 3: Final combined image +# ============================================================================= +FROM mcr.microsoft.com/presidio-analyzer:latest + +ARG LANGUAGES + +# Install supervisor for process management +RUN apt-get update && apt-get install -y --no-install-recommends \ + supervisor \ + curl \ + unzip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust only if Japanese is included +RUN if echo "${LANGUAGES}" | grep -q "ja"; then \ + apt-get update && apt-get install -y --no-install-recommends build-essential \ + && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install Bun +RUN curl -fsSL https://bun.sh/install | bash +ENV PATH="/root/.bun/bin:${PATH}" + +# Copy Presidio configuration +COPY --from=generator /output/nlp-config.yaml /usr/bin/presidio_analyzer/conf/default.yaml +COPY --from=generator /output/recognizers-config.yaml /usr/bin/presidio_analyzer/conf/default_recognizers.yaml +COPY --from=generator /output/analyzer-config.yaml /usr/bin/presidio_analyzer/conf/default_analyzer.yaml + +# Install spaCy models +COPY --from=generator /output/install-models.sh /tmp/ +RUN chmod +x /tmp/install-models.sh && /tmp/install-models.sh && rm /tmp/install-models.sh + +# Copy Bun application +WORKDIR /app +COPY --from=bun-builder /app/node_modules ./node_modules +COPY --from=bun-builder /app/src ./src +COPY --from=bun-builder /app/package.json ./ +COPY --from=bun-builder /app/tsconfig.json ./ +COPY config.example.yaml ./ + +# Create data directory +RUN mkdir -p /app/data + +# Copy supervisor configuration +COPY docker/supervisord.conf /etc/supervisor/conf.d/pasteguard.conf + +# Environment defaults +ENV PRESIDIO_URL=http://localhost:5002 +ENV PORT=5002 +ENV WORKERS=1 +ENV START_APP=true +ENV PASTEGUARD_LANGUAGES=${LANGUAGES} + +EXPOSE 3000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD curl -f http://localhost:3000/health || exit 1 + +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/pasteguard.conf"] diff --git a/presidio/scripts/generate-configs.py b/docker/presidio/generate-configs.py similarity index 91% rename from presidio/scripts/generate-configs.py rename to docker/presidio/generate-configs.py index 02be7b4..2444c5f 100644 --- a/presidio/scripts/generate-configs.py +++ b/docker/presidio/generate-configs.py @@ -57,11 +57,25 @@ def generate_nlp_config(languages: list[str], registry: dict) -> dict: "models": models, "ner_model_configuration": { "model_to_presidio_entity_mapping": { + # Standard labels (most languages) "PER": "PERSON", "PERSON": "PERSON", "LOC": "LOCATION", "GPE": "LOCATION", "ORG": "ORGANIZATION", + # Polish (NKJP corpus) + "persName": "PERSON", + "placeName": "LOCATION", + "geogName": "LOCATION", + "orgName": "ORGANIZATION", + # Korean + "PS": "PERSON", + "LC": "LOCATION", + "OG": "ORGANIZATION", + # Swedish + "PRS": "PERSON", + # Norwegian + "GPE_LOC": "LOCATION", }, "low_confidence_score_multiplier": 0.4, "low_score_entity_names": ["ORG"], @@ -148,7 +162,8 @@ def generate_install_script(languages: list[str], registry: dict) -> str: model = registry["languages"][lang]["model"] url = f"https://github.com/explosion/spacy-models/releases/download/{model}-{version}/{model}-{version}-py3-none-any.whl" lines.append(f'echo "Installing {model} for {lang}..."') - lines.append(f"pip install --no-cache-dir {url}") + # Use poetry run pip to install in the correct virtual environment + lines.append(f"poetry run pip install --no-cache-dir {url}") lines.append("") lines.append('echo "All models installed successfully"') diff --git a/presidio/languages.yaml b/docker/presidio/languages.yaml similarity index 79% rename from presidio/languages.yaml rename to docker/presidio/languages.yaml index b27bbd4..750707e 100644 --- a/presidio/languages.yaml +++ b/docker/presidio/languages.yaml @@ -1,10 +1,7 @@ # PasteGuard Language Registry -# All 24 spaCy languages with trained pipelines +# 24 spaCy languages with large models for accurate PII detection # -# Usage: Set LANGUAGES build arg to select which to install -# LANGUAGES=en,de docker-compose build -# -# To add a custom language, add an entry here with model name +# Usage: LANGUAGES=en,de docker compose build spacy_version: "3.8.0" @@ -16,31 +13,31 @@ languages: # Catalan ca: name: Catalan - model: ca_core_news_md + model: ca_core_news_lg phone_context: [telèfon, número, mòbil, trucada, trucar] # Chinese zh: name: Chinese - model: zh_core_web_md + model: zh_core_web_lg phone_context: [电话, 手机, 号码, 打电话, 电话号码, 手机号码] # Croatian hr: name: Croatian - model: hr_core_news_md + model: hr_core_news_lg phone_context: [telefon, broj, mobitel, poziv, nazovi, zvati] # Danish da: name: Danish - model: da_core_news_md + model: da_core_news_lg phone_context: [telefon, nummer, mobil, mobiltelefon, opkald, ringe] # Dutch nl: name: Dutch - model: nl_core_news_md + model: nl_core_news_lg phone_context: [telefoon, nummer, mobiel, mobieltje, GSM, bellen] # English (Presidio defaults) @@ -52,13 +49,13 @@ languages: # Finnish fi: name: Finnish - model: fi_core_news_md + model: fi_core_news_lg phone_context: [puhelin, numero, kännykkä, matkapuhelin, soittaa, puhelinnumero] # French fr: name: French - model: fr_core_news_md + model: fr_core_news_lg phone_context: [téléphone, numéro, portable, mobile, appeler, tél] # German @@ -70,89 +67,89 @@ languages: # Greek el: name: Greek - model: el_core_news_md + model: el_core_news_lg phone_context: [τηλέφωνο, αριθμός, κινητό, κλήση, τηλεφωνώ, καλώ] # Italian it: name: Italian - model: it_core_news_md + model: it_core_news_lg phone_context: [telefono, numero, cellulare, telefonino, chiamare, chiamata] # Japanese ja: name: Japanese - model: ja_core_news_md + model: ja_core_news_lg phone_context: [電話, 携帯, 番号, スマホ, ケータイ, 電話番号] # Korean ko: name: Korean - model: ko_core_news_md + model: ko_core_news_lg phone_context: [전화, 휴대폰, 핸드폰, 번호, 전화번호, 통화] # Lithuanian lt: name: Lithuanian - model: lt_core_news_md + model: lt_core_news_lg phone_context: [telefonas, numeris, mobilusis, skambutis, skambinti] # Macedonian mk: name: Macedonian - model: mk_core_news_md + model: mk_core_news_lg phone_context: [телефон, број, мобилен, повик, звони] # Norwegian Bokmål nb: name: Norwegian - model: nb_core_news_md + model: nb_core_news_lg phone_context: [telefon, nummer, mobil, mobiltelefon, samtale, ringe] # Polish pl: name: Polish - model: pl_core_news_md + model: pl_core_news_lg phone_context: [telefon, numer, komórka, komórkowy, dzwoń, zadzwoń] # Portuguese pt: name: Portuguese - model: pt_core_news_md + model: pt_core_news_lg phone_context: [telefone, número, celular, telemóvel, ligar, telefonar] # Romanian ro: name: Romanian - model: ro_core_news_md + model: ro_core_news_lg phone_context: [telefon, număr, mobil, apel, suna] # Russian ru: name: Russian - model: ru_core_news_md + model: ru_core_news_lg phone_context: [телефон, номер, мобильник, мобила, сотовый, звонок] # Slovenian sl: name: Slovenian - model: sl_core_news_md + model: sl_core_news_lg phone_context: [telefon, številka, mobilnik, mobilec, klic, pokliči] # Spanish es: name: Spanish - model: es_core_news_md + model: es_core_news_lg phone_context: [teléfono, número, móvil, celular, llamar, llamada] # Swedish sv: name: Swedish - model: sv_core_news_md + model: sv_core_news_lg phone_context: [telefon, nummer, mobil, mobiltelefon, samtal, ringa] # Ukrainian uk: name: Ukrainian - model: uk_core_news_md + model: uk_core_news_lg phone_context: [телефон, номер, мобільний, мобілка, дзвінок, дзвони] diff --git a/docker/supervisord.conf b/docker/supervisord.conf new file mode 100644 index 0000000..d3d0f89 --- /dev/null +++ b/docker/supervisord.conf @@ -0,0 +1,32 @@ +[supervisord] +nodaemon=true +user=root +logfile=/var/log/supervisor/supervisord.log +pidfile=/var/run/supervisord.pid +loglevel=info + +[program:presidio] +command=poetry run gunicorn -w %(ENV_WORKERS)s -b 0.0.0.0:%(ENV_PORT)s --timeout 300 --preload "app:create_app()" +directory=/usr/bin +autostart=true +autorestart=true +startsecs=10 +startretries=3 +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=10 + +[program:pasteguard] +command=/root/.bun/bin/bun run src/index.ts +directory=/app +autostart=%(ENV_START_APP)s +autorestart=true +startsecs=5 +startretries=3 +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +priority=20 diff --git a/docs/concepts/pii-detection.mdx b/docs/concepts/pii-detection.mdx index f2c0036..915475d 100644 --- a/docs/concepts/pii-detection.mdx +++ b/docs/concepts/pii-detection.mdx @@ -29,15 +29,20 @@ PasteGuard supports 24 languages. The language is auto-detected from your input **Available languages:** Catalan, Chinese, Croatian, Danish, Dutch, English, Finnish, French, German, Greek, Italian, Japanese, Korean, Lithuanian, Macedonian, Norwegian, Polish, Portuguese, Romanian, Russian, Slovenian, Spanish, Swedish, Ukrainian -### Configure Languages +### Docker Images -Languages must be installed during Docker build: +Languages are auto-configured per image — no config changes needed: + +- **`:en` image** → English only +- **`:eu` image** → English, German, Spanish, French, Italian, Dutch, Polish, Portuguese, Romanian + +For custom languages, build locally: ```bash -LANGUAGES=en,de,fr docker compose build +LANGUAGES=en,de,ja docker compose up -d --build ``` -If only one language is specified, language detection is skipped for better performance. +If only one language is configured, language detection is skipped for better performance. ## Confidence Scoring diff --git a/docs/configuration/pii-detection.mdx b/docs/configuration/pii-detection.mdx index 2097aef..3c8ad6f 100644 --- a/docs/configuration/pii-detection.mdx +++ b/docs/configuration/pii-detection.mdx @@ -8,7 +8,7 @@ description: Configure PII detection settings ```yaml pii_detection: presidio_url: http://localhost:5002 - languages: [en, de] + languages: ${PASTEGUARD_LANGUAGES:-en} # Auto-configured per Docker image fallback_language: en score_threshold: 0.7 entities: @@ -26,41 +26,51 @@ pii_detection: | Option | Default | Description | |--------|---------|-------------| | `presidio_url` | `http://localhost:5002` | Presidio analyzer URL | -| `languages` | `[en]` | Languages to detect. Must match Docker build | +| `languages` | (per image) | Languages to detect. Auto-configured in Docker images | | `fallback_language` | `en` | Fallback if detected language not in list | | `score_threshold` | `0.7` | Minimum confidence (0.0-1.0) | | `entities` | See below | Entity types to detect | ## Languages -Languages must be installed during Docker build: +Languages are auto-configured per Docker image: + +- **`:en` image** → English only +- **`:eu` image** → English, German, Spanish, French, Italian, Dutch, Polish, Portuguese, Romanian + +For custom language builds: ```bash -LANGUAGES=en,de,fr docker compose build +LANGUAGES=en,de,ja docker compose up -d --build ``` Available languages (24): `ca`, `zh`, `hr`, `da`, `nl`, `en`, `fi`, `fr`, `de`, `el`, `it`, `ja`, `ko`, `lt`, `mk`, `nb`, `pl`, `pt`, `ro`, `ru`, `sl`, `es`, `sv`, `uk` -### Single Language +### Override Languages -If only one language is specified, language detection is skipped for better performance: +For local development or custom setups, override via config: ```yaml pii_detection: - languages: [en] + languages: + - en + - de ``` ### Fallback Language -If the detected language isn't in your list, the fallback is used: +If the detected language isn't in your configured list, the fallback is used: ```yaml pii_detection: - languages: [en, de] - fallback_language: en # Used for French text, etc. + fallback_language: en # Used for unsupported languages ``` +### Performance + +If only one language is configured, language detection is skipped for better performance. + ## Entities | Entity | Examples | diff --git a/docs/installation.mdx b/docs/installation.mdx new file mode 100644 index 0000000..328e6c0 --- /dev/null +++ b/docs/installation.mdx @@ -0,0 +1,112 @@ +--- +title: Installation +description: Docker images and deployment options +--- + +# Installation + +PasteGuard provides prebuilt Docker images for quick deployment. No build step required. + +## Docker Image + +PasteGuard is a single all-in-one container that includes both the proxy and PII detection: + +``` +ghcr.io/sgasser/pasteguard +``` + +| Tag | Languages | Size | Use Case | +|-----|-----------|------|----------| +| `en` / `latest` | English | ~2.7GB | Default, English-only teams | +| `eu` | en, de, es, fr, it, nl, pl, pt, ro | ~12GB | European businesses | + +## Quick Start + +**Zero-config (works out of the box):** + +```bash +docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:en +``` + +**With custom config and persistent logs:** + +```bash +curl -O https://raw.githubusercontent.com/sgasser/pasteguard/main/config.example.yaml +mv config.example.yaml config.yaml +mkdir -p data +docker run -d --name pasteguard -p 3000:3000 \ + -v ./config.yaml:/app/config.yaml:ro \ + -v ./data:/app/data \ + ghcr.io/sgasser/pasteguard:en +``` + +**Verify:** + +```bash +curl http://localhost:3000/health +``` + +Dashboard: [http://localhost:3000/dashboard](http://localhost:3000/dashboard) + +## European Languages + +For German, Spanish, French, Italian, Dutch, Polish, Portuguese, and Romanian: + +```bash +docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:eu +``` + +Languages are auto-configured per image — no config changes needed. The EU image automatically enables all 9 European languages. + +## Custom Language Builds + +For languages not in prebuilt images (Nordic, Asian, Eastern European), clone and build: + +```bash +git clone https://github.com/sgasser/pasteguard.git +cd pasteguard +LANGUAGES=en,de,ja docker compose up -d --build +``` + +### Available Languages (24) + +| Code | Language | Code | Language | +|------|----------|------|----------| +| `en` | English | `ja` | Japanese | +| `de` | German | `ko` | Korean | +| `fr` | French | `zh` | Chinese | +| `es` | Spanish | `sv` | Swedish | +| `it` | Italian | `da` | Danish | +| `nl` | Dutch | `nb` | Norwegian | +| `pt` | Portuguese | `fi` | Finnish | +| `pl` | Polish | `el` | Greek | +| `ru` | Russian | `ro` | Romanian | +| `uk` | Ukrainian | `hr` | Croatian | +| `ca` | Catalan | `sl` | Slovenian | +| `lt` | Lithuanian | `mk` | Macedonian | + +## Environment Variables + +**Runtime (docker run):** + +| Variable | Default | Description | +|----------|---------|-------------| +| `PASTEGUARD_LANGUAGES` | (per image) | Override enabled languages at runtime | + +**Build-time (docker compose build):** + +| Variable | Default | Description | +|----------|---------|-------------| +| `PASTEGUARD_TAG` | `en` | Image tag for docker compose | +| `LANGUAGES` | `en` | Languages to include when building locally | + +## Next Steps + + + + Make your first request + + + Customize PasteGuard + + diff --git a/docs/mint.json b/docs/mint.json index a73e45c..a8801bd 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -34,7 +34,7 @@ "navigation": [ { "group": "Getting Started", - "pages": ["introduction", "quickstart", "integrations"] + "pages": ["introduction", "quickstart", "installation", "integrations"] }, { "group": "Concepts", diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index e37bd0f..1d98b8b 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -8,34 +8,16 @@ description: Get PasteGuard running in 2 minutes ## 1. Start PasteGuard ```bash -git clone https://github.com/sgasser/pasteguard.git -cd pasteguard -cp config.example.yaml config.yaml -docker compose up -d +docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:en ``` -PasteGuard runs on `http://localhost:3000`. Dashboard at `http://localhost:3000/dashboard`. +That's it! PasteGuard runs on `http://localhost:3000` with sensible defaults. - -By default, only English is installed (~1.5GB image). To add more languages: +Dashboard: [http://localhost:3000/dashboard](http://localhost:3000/dashboard) -```bash -# English + German + French (~2.5GB) -LANGUAGES=en,de,fr docker compose up -d --build -``` - -Update `config.yaml` to match: - -```yaml -pii_detection: - languages: - - en - - de - - fr -``` - -See [PII Detection Config](/configuration/pii-detection) for all 24 available languages. - + +For custom configuration, European languages, or persistent logs, see [Installation](/installation). + ## 2. Make a Request @@ -119,6 +101,9 @@ Open `http://localhost:3000/dashboard` in your browser to see: ## What's Next? + + Docker images and language options + How PII masking works @@ -128,7 +113,4 @@ Open `http://localhost:3000/dashboard` in your browser to see: Customize detection and providers - - Explore the API - diff --git a/presidio/Dockerfile b/presidio/Dockerfile deleted file mode 100644 index a99b387..0000000 --- a/presidio/Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -# PasteGuard Presidio Analyzer -# Multi-language PII detection with configurable language support -# -# Build with specific languages: -# docker build --build-arg LANGUAGES=en,de,fr -t presidio-analyzer . -# -# Or via docker-compose: -# LANGUAGES=en,de docker-compose build presidio-analyzer - -ARG LANGUAGES="en" - -# ============================================================================= -# Stage 1: Generate configuration files from language selection -# ============================================================================= -FROM python:3.11-slim AS generator - -WORKDIR /build - -# Install PyYAML for config generation -RUN pip install --no-cache-dir pyyaml - -# Copy registry and generator script -COPY languages.yaml /build/ -COPY scripts/generate-configs.py /build/ - -# Generate configs for selected languages -ARG LANGUAGES -RUN python generate-configs.py \ - --languages="${LANGUAGES}" \ - --registry=/build/languages.yaml \ - --output=/output - -# ============================================================================= -# Stage 2: Final Presidio Analyzer image -# ============================================================================= -FROM mcr.microsoft.com/presidio-analyzer:latest - -# Copy generated configuration files -COPY --from=generator /output/nlp-config.yaml /usr/bin/presidio_analyzer/conf/default.yaml -COPY --from=generator /output/recognizers-config.yaml /usr/bin/presidio_analyzer/conf/default_recognizers.yaml -COPY --from=generator /output/analyzer-config.yaml /usr/bin/presidio_analyzer/conf/default_analyzer.yaml - -# Copy and run model installation script -COPY --from=generator /output/install-models.sh /tmp/ -RUN chmod +x /tmp/install-models.sh && /tmp/install-models.sh && rm /tmp/install-models.sh - -# Use --preload to load models once in master process (shared via copy-on-write) -# Timeout 300s for initial model loading, workers start fast after preload -CMD ["/bin/sh", "-c", "poetry run gunicorn -w $WORKERS -b 0.0.0.0:$PORT --timeout 300 --preload 'app:create_app()'"] diff --git a/src/config.ts b/src/config.ts index 0cfe236..88698f1 100644 --- a/src/config.ts +++ b/src/config.ts @@ -24,7 +24,7 @@ const MaskingSchema = z.object({ }); // All 25 spaCy languages with trained pipelines -// See presidio/languages.yaml for full list +// See docker/presidio/languages.yaml for full list const SupportedLanguages = [ "ca", // Catalan "zh", // Chinese @@ -54,9 +54,20 @@ const SupportedLanguages = [ const LanguageEnum = z.enum(SupportedLanguages); +// Accept either array or comma-separated string for languages +// This allows using env vars like PASTEGUARD_LANGUAGES=en,de,fr +const LanguagesSchema = z + .union([z.array(LanguageEnum), z.string()]) + .transform((val) => { + if (Array.isArray(val)) return val; + return val.split(",").map((s) => s.trim()) as (typeof SupportedLanguages)[number][]; + }) + .pipe(z.array(LanguageEnum)) + .default(["en"]); + const PIIDetectionSchema = z.object({ presidio_url: z.string().url(), - languages: z.array(LanguageEnum).default(["en"]), + languages: LanguagesSchema, fallback_language: LanguageEnum.default("en"), score_threshold: z.coerce.number().min(0).max(1).default(0.7), entities: z