- uses: actions/checkout@v4
- name: Test Docker build
- run: docker build -t pasteguard:test .
\ No newline at end of file
+ run: docker build -f docker/Dockerfile -t pasteguard:test .
\ No newline at end of file
--- /dev/null
+name: Release
+
+# Builds and publishes the container images to GHCR whenever a version tag
+# (v*) is pushed. One matrix job runs per language variant.
+on:
+  push:
+    tags: ["v*"]
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build:
+    name: Build (${{ matrix.tag }})
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Required so GITHUB_TOKEN can push images to the registry.
+      packages: write
+    strategy:
+      matrix:
+        # tag:       image tag suffix for this variant
+        # languages: value passed as the LANGUAGES build arg
+        # latest:    whether this variant also receives the unsuffixed tags
+        include:
+          - tag: en
+            languages: "en"
+            latest: true
+          - tag: eu
+            languages: "en,de,es,fr,it,nl,pl,pt,ro"
+            latest: false
+    steps:
+      - name: Free disk space
+        uses: endersonmenezes/free-disk-space@v3
+        with:
+          remove_android: true
+          remove_dotnet: true
+          remove_haskell: true
+          remove_tool_cache: true
+
+      - uses: actions/checkout@v6
+
+      - uses: docker/setup-buildx-action@v3
+
+      - uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract version from tag
+        id: version
+        # Strips the "refs/tags/v" prefix, so tag v1.2.3 yields version 1.2.3.
+        # The output file path is quoted so the redirect is safe against
+        # word splitting.
+        run: echo "version=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
+
+      - uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: docker/Dockerfile
+          platforms: linux/amd64,linux/arm64
+          push: true
+          build-args: LANGUAGES=${{ matrix.languages }}
+          # Every variant gets ":<tag>" and ":<version>-<tag>". The last two
+          # expressions add ":latest" and ":<version>" only when matrix.latest
+          # is true; otherwise they evaluate to an empty string.
+          tags: |
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ matrix.tag }}
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}-${{ matrix.tag }}
+            ${{ matrix.latest && format('{0}/{1}:latest', env.REGISTRY, env.IMAGE_NAME) || '' }}
+            ${{ matrix.latest && format('{0}/{1}:{2}', env.REGISTRY, env.IMAGE_NAME, steps.version.outputs.version) || '' }}
+          # Separate cache scope per variant so the builds do not evict each other.
+          cache-from: type=gha,scope=${{ matrix.tag }}
+          cache-to: type=gha,mode=max,scope=${{ matrix.tag }}
## Setup
-**Production:** `docker compose up -d`
-
-**Development:**
+**Production:**
```bash
cp config.example.yaml config.yaml
-docker compose up presidio-analyzer -d
-bun install && bun run dev
+docker compose up -d
+```
+
+**Development:** Presidio in Docker, Bun locally with hot-reload:
+```bash
+docker compose up presidio -d
+bun run dev
```
-**Dependencies:**
-- Presidio (port 5002) - required
-- Ollama (port 11434) - route mode only
+**Multi-language:** Use EU image or build custom:
+```bash
+PASTEGUARD_TAG=eu docker compose up -d
+LANGUAGES=en,de,ja docker compose up -d --build
+```
-**Multi-language PII:** Build with `LANGUAGES=en,de,fr docker compose build`. See @presidio/languages.yaml for 24 available languages.
+See @docker/presidio/languages.yaml for 24 available languages.
## Testing
1. Fork and clone the repository
2. Install dependencies: `bun install`
3. Copy config: `cp config.example.yaml config.yaml`
-4. Start Presidio: `docker compose up presidio-analyzer -d`
+4. Start Presidio: `docker compose up presidio -d`
5. Run dev server: `bun run dev`
## Code Quality
+++ /dev/null
-FROM oven/bun:1-slim
-
-WORKDIR /app
-
-# Install dependencies
-COPY package.json bun.lock ./
-RUN bun install --frozen-lockfile --production
-
-# Copy source
-COPY src ./src
-COPY tsconfig.json ./
-
-EXPOSE 3000
-
-CMD ["bun", "run", "src/index.ts"]
<br/>
-<p align="center">
- <img src="assets/demo.gif" width="720" alt="PasteGuard Demo">
-</p>
-<p align="center">
- <em>Your App → PasteGuard → OpenAI — PII never reaches external servers</em>
-</p>
-
-<br/>
+<img src="assets/dashboard.png" width="100%" alt="PasteGuard Dashboard">
## What is PasteGuard?
## Quick Start
```bash
-git clone https://github.com/sgasser/pasteguard.git
-cd pasteguard
-cp config.example.yaml config.yaml
-docker compose up -d
+docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:en
```
Point your app to `http://localhost:3000/openai/v1` instead of `https://api.openai.com/v1`.
Dashboard: [http://localhost:3000/dashboard](http://localhost:3000/dashboard)
-<img src="assets/dashboard.png" width="100%" alt="PasteGuard Dashboard">
-<p><em>Every request logged with masked content preview</em></p>
+### Multiple Languages
+
+For European languages (German, Spanish, French, Italian, Dutch, Polish, Portuguese, Romanian):
+
+```bash
+docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:eu
+```
-For multiple languages, configuration options, and more: **[Read the docs →](https://pasteguard.com/docs/quickstart)**
+For custom config, persistent logs, or other languages: **[Read the docs →](https://pasteguard.com/docs/installation)**
## Integrations
- GitHub tokens
- JWT tokens
- Bearer tokens
-- Env passwords
-- Env secrets
-- Connection strings
## Tech Stack
# Supported languages for PII detection
# Auto-detects language from input text and uses appropriate model
#
- # Languages must match what was installed during docker build:
- # LANGUAGES=en,de docker-compose build
+ # Docker: Uses PASTEGUARD_LANGUAGES env var (auto-configured per image)
+ # Local: Uncomment the array below and comment out the env var line
#
# Available (24 languages): ca, zh, hr, da, nl, en, fi, fr, de, el,
# it, ja, ko, lt, mk, nb, pl, pt, ro, ru, sl, es, sv, uk
- # See presidio/languages.yaml for full list with details
- languages:
- - en
- # Add more languages to match your Docker build:
- # - de
- # - fr
- # - es
- # - it
+ # See docker/presidio/languages.yaml for full list with details
+ languages: ${PASTEGUARD_LANGUAGES:-en}
+ # languages:
+ # - en
+ # - de
+ # - fr
+ # - es
+ # - it
# Fallback language if detected language is not in the list above
fallback_language: en
# - BEARER_TOKEN: Bearer tokens in Authorization-style contexts
#
# Environment Variables (opt-in):
- # - ENV_PASSWORD: Password variables (DB_PASSWORD=..., ADMIN_PWD=...)
- # - ENV_SECRET: Secret variables (APP_SECRET=..., JWT_SECRET=...)
- # - CONNECTION_STRING: Database URLs with credentials (postgres://user:pass@host/db)
+ # - ENV_PASSWORD: DB_PASSWORD=..., ADMIN_PWD=... (8+ char values)
+ # - ENV_SECRET: APP_SECRET=..., JWT_SECRET=... (8+ char values)
+ # - CONNECTION_STRING: postgres://user:pass@host, mongodb://...
entities:
- OPENSSH_PRIVATE_KEY
- PEM_PRIVATE_KEY
# - API_KEY_GITHUB
# - JWT_TOKEN
# - BEARER_TOKEN
- # Uncomment to detect environment variable credentials:
# - ENV_PASSWORD
# - ENV_SECRET
# - CONNECTION_STRING
+# PasteGuard - Docker Compose (All-in-One Image)
+#
+# Production:
+# docker compose up -d
+#
+# Development (only Presidio, run Bun locally with hot-reload):
+# docker compose up presidio -d
+# bun run dev
+#
+# European languages:
+# PASTEGUARD_TAG=eu docker compose up -d
+#
+# Custom languages (local build):
+# LANGUAGES=en,de,ja docker compose up -d --build
+
services:
+ # Production: Full all-in-one
pasteguard:
- build: .
+ image: ghcr.io/sgasser/pasteguard:${PASTEGUARD_TAG:-en}
+ build:
+ context: .
+ dockerfile: docker/Dockerfile
+ args:
+ LANGUAGES: ${LANGUAGES:-en}
ports:
- "3000:3000"
env_file:
- - .env
- environment:
- - PRESIDIO_URL=http://presidio-analyzer:3000
+ - path: .env
+ required: false
volumes:
- ./config.yaml:/app/config.yaml:ro
- ./data:/app/data
- depends_on:
- presidio-analyzer:
- condition: service_healthy
restart: unless-stopped
- presidio-analyzer:
+ # Development: Only Presidio (for local Bun with hot-reload)
+ presidio:
+ profiles: ["dev"]
+ image: ghcr.io/sgasser/pasteguard:${PASTEGUARD_TAG:-en}
build:
- context: ./presidio
+ context: .
+ dockerfile: docker/Dockerfile
args:
- # Languages to install for PII detection
- # Available: ca, zh, hr, da, nl, en, fi, fr, de, el, it, ja, ko,
- # lt, mk, nb, pl, pt, ro, ru, sl, es, sv, uk
- # See presidio/languages.yaml for full list
- # Example: LANGUAGES=en,de,fr docker-compose build
LANGUAGES: ${LANGUAGES:-en}
ports:
- - "5002:3000"
+ - "5002:5002"
+ environment:
+ - START_APP=false
+ - PORT=5002
+ - WORKERS=1
healthcheck:
- test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3000/health')"]
- interval: 30s
- timeout: 10s
- retries: 5
- start_period: 60s
+ disable: true
restart: unless-stopped
--- /dev/null
+# PasteGuard - All-in-One Image
+# Single container with Proxy + PII Detection
+#
+# Build: docker build -f docker/Dockerfile --build-arg LANGUAGES=en -t pasteguard:en .
+# Run: docker run -p 3000:3000 -v ./config.yaml:/app/config.yaml -v ./data:/app/data pasteguard:en
+
+# Global build argument; each stage that needs it re-declares it below.
+ARG LANGUAGES="en"
+
+# =============================================================================
+# Stage 1: Render the Presidio configuration for the selected languages
+# =============================================================================
+FROM python:3.11-slim AS generator
+
+# PyYAML is the only package installed for the generator script.
+RUN pip install --no-cache-dir pyyaml
+
+WORKDIR /build
+
+# Language registry plus the script that reads it.
+COPY docker/presidio/languages.yaml docker/presidio/generate-configs.py /build/
+
+# Bring the global LANGUAGES value into this stage, then write the generated
+# files to /output, where the final stage copies them from.
+ARG LANGUAGES
+RUN python generate-configs.py \
+    --languages="${LANGUAGES}" \
+    --registry=/build/languages.yaml \
+    --output=/output
+
+# =============================================================================
+# Stage 2: Resolve production dependencies and stage the Bun sources
+# =============================================================================
+FROM oven/bun:1-slim AS bun-builder
+
+WORKDIR /app
+
+# Manifest and lockfile are copied on their own so the install layer is only
+# rebuilt when dependencies change.
+COPY package.json bun.lock ./
+RUN bun install --frozen-lockfile --production
+
+# Application sources; the final stage copies these out of /app.
+COPY tsconfig.json ./
+COPY src ./src
+
+# =============================================================================
+# Stage 3: Final combined image
+# =============================================================================
+FROM mcr.microsoft.com/presidio-analyzer:latest
+
+# Re-declare the global build arg so it is visible in this stage.
+ARG LANGUAGES
+
+# Install supervisor (process management) together with curl and unzip.
+# curl is used by the installers below and by the health check.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    supervisor \
+    curl \
+    unzip \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust only if Japanese is included.
+# - grep -w matches "ja" as a whole word, so an entry that merely contains
+#   those letters cannot trigger the toolchain install.
+# - The installer is downloaded to a file before it is executed. With
+#   "curl | sh" a failed download still exits 0 (the pipeline reports the
+#   status of sh), which would silently produce an image without Rust.
+RUN if echo "${LANGUAGES}" | grep -qw "ja"; then \
+        apt-get update && apt-get install -y --no-install-recommends build-essential \
+        && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh \
+        && sh /tmp/rustup-init.sh -y \
+        && rm /tmp/rustup-init.sh \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Bun. Download first, then run, so a failed download fails the build
+# instead of piping an empty script into bash.
+RUN curl -fsSL https://bun.sh/install -o /tmp/bun-install.sh \
+    && bash /tmp/bun-install.sh \
+    && rm /tmp/bun-install.sh
+ENV PATH="/root/.bun/bin:${PATH}"
+
+# Copy Presidio configuration generated in stage 1
+COPY --from=generator /output/nlp-config.yaml /usr/bin/presidio_analyzer/conf/default.yaml
+COPY --from=generator /output/recognizers-config.yaml /usr/bin/presidio_analyzer/conf/default_recognizers.yaml
+COPY --from=generator /output/analyzer-config.yaml /usr/bin/presidio_analyzer/conf/default_analyzer.yaml
+
+# Install spaCy models using the script generated in stage 1
+COPY --from=generator /output/install-models.sh /tmp/
+RUN chmod +x /tmp/install-models.sh && /tmp/install-models.sh && rm /tmp/install-models.sh
+
+# Copy Bun application
+WORKDIR /app
+COPY --from=bun-builder /app/node_modules ./node_modules
+COPY --from=bun-builder /app/src ./src
+COPY --from=bun-builder /app/package.json ./
+COPY --from=bun-builder /app/tsconfig.json ./
+COPY config.example.yaml ./
+
+# Create data directory
+RUN mkdir -p /app/data
+
+# Copy supervisor configuration
+COPY docker/supervisord.conf /etc/supervisor/conf.d/pasteguard.conf
+
+# Environment defaults
+# PORT and WORKERS are read by the Presidio program in supervisord.conf;
+# START_APP controls whether supervisord autostarts the Bun application.
+ENV PRESIDIO_URL=http://localhost:5002
+ENV PORT=5002
+ENV WORKERS=1
+ENV START_APP=true
+# Record the languages baked into this image
+ENV PASTEGUARD_LANGUAGES=${LANGUAGES}
+
+EXPOSE 3000
+
+# Health check against the Bun application on port 3000
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:3000/health || exit 1
+
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/pasteguard.conf"]
"models": models,
"ner_model_configuration": {
"model_to_presidio_entity_mapping": {
+ # Standard labels (most languages)
"PER": "PERSON",
"PERSON": "PERSON",
"LOC": "LOCATION",
"GPE": "LOCATION",
"ORG": "ORGANIZATION",
+ # Polish (NKJP corpus)
+ "persName": "PERSON",
+ "placeName": "LOCATION",
+ "geogName": "LOCATION",
+ "orgName": "ORGANIZATION",
+ # Korean
+ "PS": "PERSON",
+ "LC": "LOCATION",
+ "OG": "ORGANIZATION",
+ # Swedish
+ "PRS": "PERSON",
+ # Norwegian
+ "GPE_LOC": "LOCATION",
},
"low_confidence_score_multiplier": 0.4,
"low_score_entity_names": ["ORG"],
model = registry["languages"][lang]["model"]
url = f"https://github.com/explosion/spacy-models/releases/download/{model}-{version}/{model}-{version}-py3-none-any.whl"
lines.append(f'echo "Installing {model} for {lang}..."')
- lines.append(f"pip install --no-cache-dir {url}")
+ # Use poetry run pip to install in the correct virtual environment
+ lines.append(f"poetry run pip install --no-cache-dir {url}")
lines.append("")
lines.append('echo "All models installed successfully"')
# PasteGuard Language Registry
-# All 24 spaCy languages with trained pipelines
+# 24 spaCy languages with large models for accurate PII detection
#
-# Usage: Set LANGUAGES build arg to select which to install
-# LANGUAGES=en,de docker-compose build
-#
-# To add a custom language, add an entry here with model name
+# Usage: LANGUAGES=en,de docker compose build
spacy_version: "3.8.0"
# Catalan
ca:
name: Catalan
- model: ca_core_news_md
+ model: ca_core_news_lg
phone_context: [telèfon, número, mòbil, trucada, trucar]
# Chinese
zh:
name: Chinese
- model: zh_core_web_md
+ model: zh_core_web_lg
phone_context: [电话, 手机, 号码, 打电话, 电话号码, 手机号码]
# Croatian
hr:
name: Croatian
- model: hr_core_news_md
+ model: hr_core_news_lg
phone_context: [telefon, broj, mobitel, poziv, nazovi, zvati]
# Danish
da:
name: Danish
- model: da_core_news_md
+ model: da_core_news_lg
phone_context: [telefon, nummer, mobil, mobiltelefon, opkald, ringe]
# Dutch
nl:
name: Dutch
- model: nl_core_news_md
+ model: nl_core_news_lg
phone_context: [telefoon, nummer, mobiel, mobieltje, GSM, bellen]
# English (Presidio defaults)
# Finnish
fi:
name: Finnish
- model: fi_core_news_md
+ model: fi_core_news_lg
phone_context: [puhelin, numero, kännykkä, matkapuhelin, soittaa, puhelinnumero]
# French
fr:
name: French
- model: fr_core_news_md
+ model: fr_core_news_lg
phone_context: [téléphone, numéro, portable, mobile, appeler, tél]
# German
# Greek
el:
name: Greek
- model: el_core_news_md
+ model: el_core_news_lg
phone_context: [τηλέφωνο, αριθμός, κινητό, κλήση, τηλεφωνώ, καλώ]
# Italian
it:
name: Italian
- model: it_core_news_md
+ model: it_core_news_lg
phone_context: [telefono, numero, cellulare, telefonino, chiamare, chiamata]
# Japanese
ja:
name: Japanese
- model: ja_core_news_md
+ model: ja_core_news_lg
phone_context: [電話, 携帯, 番号, スマホ, ケータイ, 電話番号]
# Korean
ko:
name: Korean
- model: ko_core_news_md
+ model: ko_core_news_lg
phone_context: [전화, 휴대폰, 핸드폰, 번호, 전화번호, 통화]
# Lithuanian
lt:
name: Lithuanian
- model: lt_core_news_md
+ model: lt_core_news_lg
phone_context: [telefonas, numeris, mobilusis, skambutis, skambinti]
# Macedonian
mk:
name: Macedonian
- model: mk_core_news_md
+ model: mk_core_news_lg
phone_context: [телефон, број, мобилен, повик, звони]
# Norwegian Bokmål
nb:
name: Norwegian
- model: nb_core_news_md
+ model: nb_core_news_lg
phone_context: [telefon, nummer, mobil, mobiltelefon, samtale, ringe]
# Polish
pl:
name: Polish
- model: pl_core_news_md
+ model: pl_core_news_lg
phone_context: [telefon, numer, komórka, komórkowy, dzwoń, zadzwoń]
# Portuguese
pt:
name: Portuguese
- model: pt_core_news_md
+ model: pt_core_news_lg
phone_context: [telefone, número, celular, telemóvel, ligar, telefonar]
# Romanian
ro:
name: Romanian
- model: ro_core_news_md
+ model: ro_core_news_lg
phone_context: [telefon, număr, mobil, apel, suna]
# Russian
ru:
name: Russian
- model: ru_core_news_md
+ model: ru_core_news_lg
phone_context: [телефон, номер, мобильник, мобила, сотовый, звонок]
# Slovenian
sl:
name: Slovenian
- model: sl_core_news_md
+ model: sl_core_news_lg
phone_context: [telefon, številka, mobilnik, mobilec, klic, pokliči]
# Spanish
es:
name: Spanish
- model: es_core_news_md
+ model: es_core_news_lg
phone_context: [teléfono, número, móvil, celular, llamar, llamada]
# Swedish
sv:
name: Swedish
- model: sv_core_news_md
+ model: sv_core_news_lg
phone_context: [telefon, nummer, mobil, mobiltelefon, samtal, ringa]
# Ukrainian
uk:
name: Ukrainian
- model: uk_core_news_md
+ model: uk_core_news_lg
phone_context: [телефон, номер, мобільний, мобілка, дзвінок, дзвони]
--- /dev/null
+; supervisord is launched by the image's CMD and stays in the foreground.
+[supervisord]
+user=root
+nodaemon=true
+loglevel=info
+logfile=/var/log/supervisor/supervisord.log
+pidfile=/var/run/supervisord.pid
+
+; Presidio analyzer served by gunicorn. Worker count and bind port are taken
+; from the WORKERS and PORT environment variables.
+[program:presidio]
+directory=/usr/bin
+command=poetry run gunicorn -w %(ENV_WORKERS)s -b 0.0.0.0:%(ENV_PORT)s --timeout 300 --preload "app:create_app()"
+priority=10
+autostart=true
+autorestart=true
+startsecs=10
+startretries=3
+; Send output to the container's stdout/stderr; maxbytes=0 disables rotation.
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+; Bun application. Autostart follows the START_APP environment variable, so
+; START_APP=false leaves only Presidio running.
+[program:pasteguard]
+directory=/app
+command=/root/.bun/bin/bun run src/index.ts
+priority=20
+autostart=%(ENV_START_APP)s
+autorestart=true
+startsecs=5
+startretries=3
+; Send output to the container's stdout/stderr; maxbytes=0 disables rotation.
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
**Available languages:** Catalan, Chinese, Croatian, Danish, Dutch, English, Finnish, French, German, Greek, Italian, Japanese, Korean, Lithuanian, Macedonian, Norwegian, Polish, Portuguese, Romanian, Russian, Slovenian, Spanish, Swedish, Ukrainian
-### Configure Languages
+### Docker Images
-Languages must be installed during Docker build:
+Languages are auto-configured per image — no config changes needed:
+
+- **`:en` image** → English only
+- **`:eu` image** → English, German, Spanish, French, Italian, Dutch, Polish, Portuguese, Romanian
+
+For custom languages, build locally:
```bash
-LANGUAGES=en,de,fr docker compose build
+LANGUAGES=en,de,ja docker compose up -d --build
```
-If only one language is specified, language detection is skipped for better performance.
+If only one language is configured, language detection is skipped for better performance.
## Confidence Scoring
```yaml
pii_detection:
presidio_url: http://localhost:5002
- languages: [en, de]
+ languages: ${PASTEGUARD_LANGUAGES:-en} # Auto-configured per Docker image
fallback_language: en
score_threshold: 0.7
entities:
| Option | Default | Description |
|--------|---------|-------------|
| `presidio_url` | `http://localhost:5002` | Presidio analyzer URL |
-| `languages` | `[en]` | Languages to detect. Must match Docker build |
+| `languages` | (per image) | Languages to detect. Auto-configured in Docker images |
| `fallback_language` | `en` | Fallback if detected language not in list |
| `score_threshold` | `0.7` | Minimum confidence (0.0-1.0) |
| `entities` | See below | Entity types to detect |
## Languages
-Languages must be installed during Docker build:
+Languages are auto-configured per Docker image:
+
+- **`:en` image** → English only
+- **`:eu` image** → English, German, Spanish, French, Italian, Dutch, Polish, Portuguese, Romanian
+
+For custom language builds:
```bash
-LANGUAGES=en,de,fr docker compose build
+LANGUAGES=en,de,ja docker compose up -d --build
```
Available languages (24):
`ca`, `zh`, `hr`, `da`, `nl`, `en`, `fi`, `fr`, `de`, `el`, `it`, `ja`, `ko`, `lt`, `mk`, `nb`, `pl`, `pt`, `ro`, `ru`, `sl`, `es`, `sv`, `uk`
-### Single Language
+### Override Languages
-If only one language is specified, language detection is skipped for better performance:
+For local development or custom setups, override via config:
```yaml
pii_detection:
- languages: [en]
+ languages:
+ - en
+ - de
```
### Fallback Language
-If the detected language isn't in your list, the fallback is used:
+If the detected language isn't in your configured list, the fallback is used:
```yaml
pii_detection:
- languages: [en, de]
- fallback_language: en # Used for French text, etc.
+ fallback_language: en # Used for unsupported languages
```
+### Performance
+
+If only one language is configured, language detection is skipped for better performance.
+
## Entities
| Entity | Examples |
--- /dev/null
+---
+title: Installation
+description: Docker images and deployment options
+---
+
+# Installation
+
+PasteGuard provides prebuilt Docker images for quick deployment. No build step required.
+
+## Docker Image
+
+PasteGuard is a single all-in-one container that includes both the proxy and PII detection:
+
+```
+ghcr.io/sgasser/pasteguard
+```
+
+| Tag | Languages | Size | Use Case |
+|-----|-----------|------|----------|
+| `en` / `latest` | English | ~2.7GB | Default, English-only teams |
+| `eu` | en, de, es, fr, it, nl, pl, pt, ro | ~12GB | European businesses |
+
+## Quick Start
+
+**Zero-config (works out of the box):**
+
+```bash
+docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:en
+```
+
+**With custom config and persistent logs:**
+
+```bash
+curl -O https://raw.githubusercontent.com/sgasser/pasteguard/main/config.example.yaml
+mv config.example.yaml config.yaml
+mkdir -p data
+docker run -d --name pasteguard -p 3000:3000 \
+ -v ./config.yaml:/app/config.yaml:ro \
+ -v ./data:/app/data \
+ ghcr.io/sgasser/pasteguard:en
+```
+
+**Verify:**
+
+```bash
+curl http://localhost:3000/health
+```
+
+Dashboard: [http://localhost:3000/dashboard](http://localhost:3000/dashboard)
+
+## European Languages
+
+For German, Spanish, French, Italian, Dutch, Polish, Portuguese, and Romanian:
+
+```bash
+docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:eu
+```
+
+Languages are auto-configured per image — no config changes needed. The EU image automatically enables all 9 of its languages (English plus the eight listed above).
+
+## Custom Language Builds
+
+For languages not in prebuilt images (Nordic, Asian, Eastern European), clone and build:
+
+```bash
+git clone https://github.com/sgasser/pasteguard.git
+cd pasteguard
+LANGUAGES=en,de,ja docker compose up -d --build
+```
+
+### Available Languages (24)
+
+| Code | Language | Code | Language |
+|------|----------|------|----------|
+| `en` | English | `ja` | Japanese |
+| `de` | German | `ko` | Korean |
+| `fr` | French | `zh` | Chinese |
+| `es` | Spanish | `sv` | Swedish |
+| `it` | Italian | `da` | Danish |
+| `nl` | Dutch | `nb` | Norwegian |
+| `pt` | Portuguese | `fi` | Finnish |
+| `pl` | Polish | `el` | Greek |
+| `ru` | Russian | `ro` | Romanian |
+| `uk` | Ukrainian | `hr` | Croatian |
+| `ca` | Catalan | `sl` | Slovenian |
+| `lt` | Lithuanian | `mk` | Macedonian |
+
+## Environment Variables
+
+**Runtime (docker run):**
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `PASTEGUARD_LANGUAGES` | (per image) | Override enabled languages at runtime (comma-separated, e.g. `en,de`); only languages built into the image can be used |
+
+**Docker Compose (`docker compose up` / `docker compose build`):**
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `PASTEGUARD_TAG` | `en` | Image tag for docker compose |
+| `LANGUAGES` | `en` | Languages to include when building locally |
+
+## Next Steps
+
+<CardGroup cols={2}>
+ <Card title="Quickstart" icon="rocket" href="/quickstart">
+ Make your first request
+ </Card>
+ <Card title="Configuration" icon="gear" href="/configuration/overview">
+ Customize PasteGuard
+ </Card>
+</CardGroup>
"navigation": [
{
"group": "Getting Started",
- "pages": ["introduction", "quickstart", "integrations"]
+ "pages": ["introduction", "quickstart", "installation", "integrations"]
},
{
"group": "Concepts",
## 1. Start PasteGuard
```bash
-git clone https://github.com/sgasser/pasteguard.git
-cd pasteguard
-cp config.example.yaml config.yaml
-docker compose up -d
+docker run -d --name pasteguard -p 3000:3000 ghcr.io/sgasser/pasteguard:en
```
-PasteGuard runs on `http://localhost:3000`. Dashboard at `http://localhost:3000/dashboard`.
+That's it! PasteGuard runs on `http://localhost:3000` with sensible defaults.
-<Accordion title="Multiple Languages">
-By default, only English is installed (~1.5GB image). To add more languages:
+Dashboard: [http://localhost:3000/dashboard](http://localhost:3000/dashboard)
-```bash
-# English + German + French (~2.5GB)
-LANGUAGES=en,de,fr docker compose up -d --build
-```
-
-Update `config.yaml` to match:
-
-```yaml
-pii_detection:
- languages:
- - en
- - de
- - fr
-```
-
-See [PII Detection Config](/configuration/pii-detection) for all 24 available languages.
-</Accordion>
+<Note>
+For custom configuration, European languages, or persistent logs, see [Installation](/installation).
+</Note>
## 2. Make a Request
## What's Next?
<CardGroup cols={2}>
+ <Card title="Installation" icon="download" href="/installation">
+ Docker images and language options
+ </Card>
<Card title="Mask Mode" icon="eye-slash" href="/concepts/mask-mode">
How PII masking works
</Card>
<Card title="Configuration" icon="gear" href="/configuration/overview">
Customize detection and providers
</Card>
- <Card title="API Reference" icon="code" href="/api-reference/chat-completions">
- Explore the API
- </Card>
</CardGroup>
+++ /dev/null
-# PasteGuard Presidio Analyzer
-# Multi-language PII detection with configurable language support
-#
-# Build with specific languages:
-# docker build --build-arg LANGUAGES=en,de,fr -t presidio-analyzer .
-#
-# Or via docker-compose:
-# LANGUAGES=en,de docker-compose build presidio-analyzer
-
-ARG LANGUAGES="en"
-
-# =============================================================================
-# Stage 1: Generate configuration files from language selection
-# =============================================================================
-FROM python:3.11-slim AS generator
-
-WORKDIR /build
-
-# Install PyYAML for config generation
-RUN pip install --no-cache-dir pyyaml
-
-# Copy registry and generator script
-COPY languages.yaml /build/
-COPY scripts/generate-configs.py /build/
-
-# Generate configs for selected languages
-ARG LANGUAGES
-RUN python generate-configs.py \
- --languages="${LANGUAGES}" \
- --registry=/build/languages.yaml \
- --output=/output
-
-# =============================================================================
-# Stage 2: Final Presidio Analyzer image
-# =============================================================================
-FROM mcr.microsoft.com/presidio-analyzer:latest
-
-# Copy generated configuration files
-COPY --from=generator /output/nlp-config.yaml /usr/bin/presidio_analyzer/conf/default.yaml
-COPY --from=generator /output/recognizers-config.yaml /usr/bin/presidio_analyzer/conf/default_recognizers.yaml
-COPY --from=generator /output/analyzer-config.yaml /usr/bin/presidio_analyzer/conf/default_analyzer.yaml
-
-# Copy and run model installation script
-COPY --from=generator /output/install-models.sh /tmp/
-RUN chmod +x /tmp/install-models.sh && /tmp/install-models.sh && rm /tmp/install-models.sh
-
-# Use --preload to load models once in master process (shared via copy-on-write)
-# Timeout 300s for initial model loading, workers start fast after preload
-CMD ["/bin/sh", "-c", "poetry run gunicorn -w $WORKERS -b 0.0.0.0:$PORT --timeout 300 --preload 'app:create_app()'"]
});
// All 24 spaCy languages with trained pipelines
-// See presidio/languages.yaml for full list
+// See docker/presidio/languages.yaml for full list
const SupportedLanguages = [
"ca", // Catalan
"zh", // Chinese
const LanguageEnum = z.enum(SupportedLanguages);
+// Languages may be supplied as an array or as a single comma-separated
+// string, so a value such as PASTEGUARD_LANGUAGES=en,de,fr can be substituted
+// into the config. String input is split on commas and each piece trimmed;
+// the result is then validated against LanguageEnum like any array input.
+const LanguagesSchema = z
+  .union([z.array(LanguageEnum), z.string()])
+  .transform((input) => {
+    if (typeof input === "string") {
+      const codes = input.split(",").map((code) => code.trim());
+      return codes as (typeof SupportedLanguages)[number][];
+    }
+    return input;
+  })
+  .pipe(z.array(LanguageEnum))
+  .default(["en"]);
+
const PIIDetectionSchema = z.object({
presidio_url: z.string().url(),
- languages: z.array(LanguageEnum).default(["en"]),
+ languages: LanguagesSchema,
fallback_language: LanguageEnum.default("en"),
score_threshold: z.coerce.number().min(0).max(1).default(0.7),
entities: z