From: Stefan Gasser Date: Tue, 20 Jan 2026 20:52:02 +0000 (+0100) Subject: Add whitelist config for masking exclusions (#53) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=e3c4c300a6f970649bf40005c193d6e520eb4fb0;p=sgasser-llm-shield.git Add whitelist config for masking exclusions (#53) Adds masking.whitelist config option to exclude specific text patterns from PII masking. Useful for preventing false positives on known text like company names or product identifiers. - Add whitelist property to MaskingSchema (default: empty array) - Add filterWhitelistedEntities function to filter detected PII - Patterns match if detected text is contained in whitelist entry or whitelist entry is contained in detected text --- diff --git a/config.example.yaml b/config.example.yaml index f7c5faa..319d059 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -42,6 +42,10 @@ masking: show_markers: false marker_text: "[protected]" + # Text patterns that are never masked (protects against false positives) + # whitelist: + # - "Company Name Inc." 
+ # PII Detection settings (Microsoft Presidio) pii_detection: presidio_url: ${PRESIDIO_URL:-http://localhost:5002} diff --git a/src/config.ts b/src/config.ts index f22f698..f26ac32 100644 --- a/src/config.ts +++ b/src/config.ts @@ -22,6 +22,7 @@ const OpenAIProviderSchema = z.object({ const MaskingSchema = z.object({ show_markers: z.boolean().default(false), marker_text: z.string().default("[protected]"), + whitelist: z.array(z.string()).default([]), }); const LanguageEnum = z.enum(SUPPORTED_LANGUAGES); diff --git a/src/pii/detect.test.ts b/src/pii/detect.test.ts index 9b2169b..0e60030 100644 --- a/src/pii/detect.test.ts +++ b/src/pii/detect.test.ts @@ -1,7 +1,7 @@ import { afterEach, describe, expect, mock, test } from "bun:test"; import { openaiExtractor } from "../masking/extractors/openai"; import type { OpenAIMessage, OpenAIRequest } from "../providers/openai/types"; -import { PIIDetector } from "./detect"; +import { filterWhitelistedEntities, PIIDetector } from "./detect"; const originalFetch = globalThis.fetch; @@ -210,4 +210,51 @@ describe("PIIDetector", () => { expect(healthy).toBe(false); }); }); + + describe("filterWhitelistedEntities", () => { + test("filters entities matching whitelist pattern", () => { + const text = "You are Claude Code, Anthropic's official CLI for Claude."; + const entities = [{ entity_type: "PERSON", start: 8, end: 14, score: 0.9 }]; + const whitelist = ["You are Claude Code, Anthropic's official CLI for Claude."]; + + const result = filterWhitelistedEntities(text, entities, whitelist); + + expect(result).toHaveLength(0); + }); + + test("keeps entities not in whitelist", () => { + const text = "Contact John Doe at john@example.com"; + const entities = [ + { entity_type: "PERSON", start: 8, end: 16, score: 0.9 }, + { entity_type: "EMAIL_ADDRESS", start: 20, end: 36, score: 0.95 }, + ]; + const whitelist = ["Claude"]; + + const result = filterWhitelistedEntities(text, entities, whitelist); + + expect(result).toHaveLength(2); + }); 
/**
 * Removes detected PII entities whose text matches a whitelist entry, so that
 * known-safe text (company names, product identifiers, ...) is never masked.
 *
 * An entity matches when its detected text is contained in a whitelist entry
 * or a whitelist entry is contained in the detected text (case-sensitive
 * substring comparison in both directions).
 *
 * @param text - The text the entity offsets refer to.
 * @param entities - Detected entities; `start`/`end` are used as `text.slice` offsets.
 * @param whitelist - Patterns that must never be masked. Blank entries are ignored.
 * @returns The entities that do not match any whitelist entry. The input array
 *   itself is returned when there is no usable whitelist entry.
 */
export function filterWhitelistedEntities(
  text: string,
  entities: PIIEntity[],
  whitelist: string[],
): PIIEntity[] {
  // Every string contains "", so an empty entry would whitelist every entity
  // and silently disable masking. Whitespace-only entries are dropped as well,
  // since they would match any multi-word entity.
  const patterns = whitelist.filter((pattern) => pattern.trim().length > 0);
  if (patterns.length === 0) return entities;

  return entities.filter((entity) => {
    const detectedText = text.slice(entity.start, entity.end);

    // A zero-length detection is trivially "contained" in every pattern; keep
    // it so the result does not depend on unrelated whitelist entries.
    if (detectedText.length === 0) return true;

    return !patterns.some(
      (pattern) => pattern.includes(detectedText) || detectedText.includes(pattern),
    );
  });
}
new Set(config.pii_detection.scan_roles) : null; + const whitelist = config.masking.whitelist; const spanEntities: PIIEntity[][] = await Promise.all( spans.map(async (span) => { @@ -110,7 +126,8 @@ export class PIIDetector { return []; } if (!span.text) return []; - return this.detectPII(span.text, langResult.language); + const entities = await this.detectPII(span.text, langResult.language); + return filterWhitelistedEntities(span.text, entities, whitelist); }), ); diff --git a/src/pii/mask.test.ts b/src/pii/mask.test.ts index 39ba8b8..ff54a2a 100644 --- a/src/pii/mask.test.ts +++ b/src/pii/mask.test.ts @@ -17,11 +17,13 @@ import { const defaultConfig: MaskingConfig = { show_markers: false, marker_text: "[protected]", + whitelist: [], }; const configWithMarkers: MaskingConfig = { show_markers: true, marker_text: "[protected]", + whitelist: [], }; /** Helper to create a minimal request from messages */ diff --git a/src/providers/openai/stream-transformer.test.ts b/src/providers/openai/stream-transformer.test.ts index df9134c..797e738 100644 --- a/src/providers/openai/stream-transformer.test.ts +++ b/src/providers/openai/stream-transformer.test.ts @@ -6,6 +6,7 @@ import { createUnmaskingStream } from "./stream-transformer"; const defaultConfig: MaskingConfig = { show_markers: false, marker_text: "[protected]", + whitelist: [], }; /**