From: Stefan Gasser Date: Sat, 17 Jan 2026 19:32:54 +0000 (+0100) Subject: Add per-part PII/secrets detection for multimodal messages (#47) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=050273513d27dc4e97ffad6807122bde22199e61;p=sgasser-llm-shield.git Add per-part PII/secrets detection for multimodal messages (#47) Each text block in multimodal messages is now analyzed separately rather than concatenating all text together. This enables precise masking while preserving message structure. Changes: - Per-part detection for both PII and secrets - Symmetric pii/ and secrets/ module structure - Shared utilities in utils/message-transform.ts - Rename "redact" → "mask" for consistency - Centralize MaskResult, Span interfaces Breaking changes: - Header: X-PasteGuard-Secrets-Redacted → X-PasteGuard-Secrets-Masked - Config: secrets_detection.action "redact" → "mask" --- diff --git a/config.example.yaml b/config.example.yaml index 6f09440..ec1b4e3 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -86,10 +86,10 @@ secrets_detection: enabled: true # Action to take when secrets are detected: - # redact: Replace secrets with placeholders, unmask in response (default) + # mask: Replace secrets with placeholders, unmask in response (default) # block: Block the request with HTTP 400 # route_local: Route to local provider (only works in route mode) - action: redact + action: mask # Secret types to detect # Private Keys (enabled by default): diff --git a/docs/api-reference/chat-completions.mdx b/docs/api-reference/chat-completions.mdx index cd922be..d98673f 100644 --- a/docs/api-reference/chat-completions.mdx +++ b/docs/api-reference/chat-completions.mdx @@ -123,4 +123,4 @@ PasteGuard adds headers to indicate PII and secrets handling: | `X-PasteGuard-Language-Fallback` | `true` if configured language was not available | | `X-PasteGuard-Secrets-Detected` | `true` if secrets were found | | `X-PasteGuard-Secrets-Types` | Comma-separated list of detected secret types | 
-| `X-PasteGuard-Secrets-Redacted` | `true` if secrets were redacted | +| `X-PasteGuard-Secrets-Masked` | `true` if secrets were masked | diff --git a/docs/concepts/secrets-detection.mdx b/docs/concepts/secrets-detection.mdx index 0966b01..0daca06 100644 --- a/docs/concepts/secrets-detection.mdx +++ b/docs/concepts/secrets-detection.mdx @@ -5,7 +5,7 @@ description: Detect and protect private keys, API keys, tokens, and environment # Secrets Detection -PasteGuard detects secrets before PII detection and can block, redact, or route requests containing sensitive credentials. +PasteGuard detects secrets before PII detection and can block, mask, or route requests containing sensitive credentials. ## Supported Secret Types @@ -43,15 +43,15 @@ PasteGuard detects secrets before PII detection and can block, redact, or route | Action | Description | |--------|-------------| -| `redact` | Replace secrets with placeholders, restore in response (default) | +| `mask` | Replace secrets with placeholders, restore in response (default) | | `block` | Return HTTP 400, request never reaches LLM | | `route_local` | Route to local LLM (requires route mode) | -### Redact (Default) +### Mask (Default) ```yaml secrets_detection: - action: redact + action: mask ``` Secrets are replaced with placeholders and restored in the response (like PII masking). 
@@ -85,8 +85,8 @@ X-PasteGuard-Secrets-Detected: true X-PasteGuard-Secrets-Types: OPENSSH_PRIVATE_KEY,API_KEY_OPENAI ``` -If secrets were redacted: +If secrets were masked: ``` -X-PasteGuard-Secrets-Redacted: true +X-PasteGuard-Secrets-Masked: true ``` diff --git a/docs/configuration/secrets-detection.mdx b/docs/configuration/secrets-detection.mdx index c813523..d5b3f62 100644 --- a/docs/configuration/secrets-detection.mdx +++ b/docs/configuration/secrets-detection.mdx @@ -8,7 +8,7 @@ description: Configure detection of private keys, API keys, tokens, and environm ```yaml secrets_detection: enabled: true - action: redact + action: mask entities: - OPENSSH_PRIVATE_KEY - PEM_PRIVATE_KEY @@ -21,7 +21,7 @@ secrets_detection: | Option | Default | Description | |--------|---------|-------------| | `enabled` | `true` | Enable secrets detection | -| `action` | `redact` | Action when secrets found | +| `action` | `mask` | Action when secrets found | | `entities` | Private keys | Secret types to detect | | `max_scan_chars` | `200000` | Max characters to scan (0 = unlimited) | | `log_detected_types` | `true` | Log detected types (never logs content) | @@ -30,15 +30,15 @@ secrets_detection: | Action | Description | |--------|-------------| -| `redact` | Replace secrets with placeholders, restore in response (default) | +| `mask` | Replace secrets with placeholders, restore in response (default) | | `block` | Return HTTP 400, request never reaches LLM | | `route_local` | Route to local LLM (requires route mode) | -### Redact (Default) +### Mask (Default) ```yaml secrets_detection: - action: redact + action: mask ``` ### Block diff --git a/src/config.ts b/src/config.ts index 88698f1..461c188 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,6 +1,7 @@ import { existsSync, readFileSync, statSync } from "node:fs"; import { parse as parseYaml } from "yaml"; import { z } from "zod"; +import { SUPPORTED_LANGUAGES } from "./constants/languages"; // Schema definitions @@ -23,36 +24,7 
@@ const MaskingSchema = z.object({ marker_text: z.string().default("[protected]"), }); -// All 25 spaCy languages with trained pipelines -// See docker/presidio/languages.yaml for full list -const SupportedLanguages = [ - "ca", // Catalan - "zh", // Chinese - "hr", // Croatian - "da", // Danish - "nl", // Dutch - "en", // English - "fi", // Finnish - "fr", // French - "de", // German - "el", // Greek - "it", // Italian - "ja", // Japanese - "ko", // Korean - "lt", // Lithuanian - "mk", // Macedonian - "nb", // Norwegian - "pl", // Polish - "pt", // Portuguese - "ro", // Romanian - "ru", // Russian - "sl", // Slovenian - "es", // Spanish - "sv", // Swedish - "uk", // Ukrainian -] as const; - -const LanguageEnum = z.enum(SupportedLanguages); +const LanguageEnum = z.enum(SUPPORTED_LANGUAGES); // Accept either array or comma-separated string for languages // This allows using env vars like PASTEGUARD_LANGUAGES=en,de,fr @@ -60,7 +32,7 @@ const LanguagesSchema = z .union([z.array(LanguageEnum), z.string()]) .transform((val) => { if (Array.isArray(val)) return val; - return val.split(",").map((s) => s.trim()) as (typeof SupportedLanguages)[number][]; + return val.split(",").map((s) => s.trim()) as (typeof SUPPORTED_LANGUAGES)[number][]; }) .pipe(z.array(LanguageEnum)) .default(["en"]); @@ -121,7 +93,7 @@ const SecretEntityTypes = [ const SecretsDetectionSchema = z.object({ enabled: z.boolean().default(true), - action: z.enum(["block", "redact", "route_local"]).default("redact"), + action: z.enum(["block", "mask", "route_local"]).default("mask"), entities: z.array(z.enum(SecretEntityTypes)).default(["OPENSSH_PRIVATE_KEY", "PEM_PRIVATE_KEY"]), max_scan_chars: z.coerce.number().int().min(0).default(200000), log_detected_types: z.boolean().default(true), @@ -165,7 +137,7 @@ const ConfigSchema = z }, { message: - "secrets_detection.action 'route_local' is not compatible with mode 'mask'. 
Use mode 'route' or change secrets_detection.action to 'block' or 'redact'", + "secrets_detection.action 'route_local' is not compatible with mode 'mask'. Use mode 'route' or change secrets_detection.action to 'block' or 'mask'", }, ); diff --git a/src/constants/languages.ts b/src/constants/languages.ts new file mode 100644 index 0000000..56b214f --- /dev/null +++ b/src/constants/languages.ts @@ -0,0 +1,32 @@ +/** + * All 24 spaCy languages with trained pipelines + * See docker/presidio/languages.yaml for full list + */ +export const SUPPORTED_LANGUAGES = [ + "ca", // Catalan + "zh", // Chinese + "hr", // Croatian + "da", // Danish + "nl", // Dutch + "en", // English + "fi", // Finnish + "fr", // French + "de", // German + "el", // Greek + "it", // Italian + "ja", // Japanese + "ko", // Korean + "lt", // Lithuanian + "mk", // Macedonian + "nb", // Norwegian + "pl", // Polish + "pt", // Portuguese + "ro", // Romanian + "ru", // Russian + "sl", // Slovenian + "es", // Spanish + "sv", // Swedish + "uk", // Ukrainian +] as const; + +export type SupportedLanguage = (typeof SUPPORTED_LANGUAGES)[number]; diff --git a/src/index.ts b/src/index.ts index 003ea38..c041135 100644 --- a/src/index.ts +++ b/src/index.ts @@ -4,12 +4,12 @@ import { createMiddleware } from "hono/factory"; import { HTTPException } from "hono/http-exception"; import { logger } from "hono/logger"; import { getConfig } from "./config"; +import { getPIIDetector } from "./pii/detect"; import { dashboardRoutes } from "./routes/dashboard"; import { healthRoutes } from "./routes/health"; import { infoRoutes } from "./routes/info"; import { proxyRoutes } from "./routes/proxy"; import { getLogger } from "./services/logger"; -import { getPIIDetector } from "./services/pii-detector"; type Variables = { requestId: string; @@ -106,9 +106,7 @@ async function validateStartup() { if (config.secrets_detection.action === "route_local" && config.mode === "mask") { console.error("\n❌ Configuration error detected!\n"); 
console.error(" secrets_detection.action 'route_local' is not compatible with mode 'mask'."); - console.error( - " Use mode 'route' or change secrets_detection.action to 'block' or 'redact'.\n", - ); + console.error(" Use mode 'route' or change secrets_detection.action to 'block' or 'mask'.\n"); console.error("[STARTUP] ✗ Invalid configuration. Exiting for safety."); process.exit(1); } diff --git a/src/services/pii-detector.test.ts b/src/pii/detect.test.ts similarity index 79% rename from src/services/pii-detector.test.ts rename to src/pii/detect.test.ts index 6c748d7..46be2ff 100644 --- a/src/services/pii-detector.test.ts +++ b/src/pii/detect.test.ts @@ -1,5 +1,5 @@ import { afterEach, describe, expect, mock, test } from "bun:test"; -import { PIIDetector } from "./pii-detector"; +import { PIIDetector } from "./detect"; const originalFetch = globalThis.fetch; @@ -62,10 +62,16 @@ describe("PIIDetector", () => { const result = await detector.analyzeMessages(messages); expect(result.hasPII).toBe(true); - expect(result.entitiesByMessage).toHaveLength(3); - expect(result.entitiesByMessage[0]).toHaveLength(1); - expect(result.entitiesByMessage[1]).toHaveLength(1); - expect(result.entitiesByMessage[2]).toHaveLength(1); + // Per-message, per-part: messageEntities[msgIdx][partIdx] = entities + expect(result.messageEntities).toHaveLength(3); + // Each message has 1 part (string content) + expect(result.messageEntities[0]).toHaveLength(1); + expect(result.messageEntities[1]).toHaveLength(1); + expect(result.messageEntities[2]).toHaveLength(1); + // Each part has 1 entity + expect(result.messageEntities[0][0]).toHaveLength(1); + expect(result.messageEntities[1][0]).toHaveLength(1); + expect(result.messageEntities[2][0]).toHaveLength(1); }); test("detects PII in system message when user message has none", async () => { @@ -82,8 +88,8 @@ describe("PIIDetector", () => { const result = await detector.analyzeMessages(messages); expect(result.hasPII).toBe(true); - 
expect(result.entitiesByMessage[0]).toHaveLength(1); - expect(result.entitiesByMessage[0][0].entity_type).toBe("PERSON"); + expect(result.messageEntities[0][0]).toHaveLength(1); + expect(result.messageEntities[0][0][0].entity_type).toBe("PERSON"); }); test("detects PII in earlier user message", async () => { @@ -101,7 +107,7 @@ describe("PIIDetector", () => { const result = await detector.analyzeMessages(messages); expect(result.hasPII).toBe(true); - expect(result.entitiesByMessage[0]).toHaveLength(1); + expect(result.messageEntities[0][0]).toHaveLength(1); }); test("returns empty result for no messages", async () => { @@ -111,8 +117,8 @@ describe("PIIDetector", () => { const result = await detector.analyzeMessages([]); expect(result.hasPII).toBe(false); - expect(result.entitiesByMessage).toHaveLength(0); - expect(result.newEntities).toHaveLength(0); + expect(result.messageEntities).toHaveLength(0); + expect(result.allEntities).toHaveLength(0); }); test("handles multimodal content", async () => { @@ -134,7 +140,12 @@ describe("PIIDetector", () => { const result = await detector.analyzeMessages(messages); expect(result.hasPII).toBe(true); - expect(result.entitiesByMessage[0]).toHaveLength(1); + // Multimodal message has 2 parts + expect(result.messageEntities[0]).toHaveLength(2); + // First part (text) has 1 entity + expect(result.messageEntities[0][0]).toHaveLength(1); + // Second part (image) has no entities + expect(result.messageEntities[0][1]).toHaveLength(0); }); test("skips messages with empty content", async () => { @@ -150,8 +161,10 @@ describe("PIIDetector", () => { const result = await detector.analyzeMessages(messages); - expect(result.entitiesByMessage).toHaveLength(2); - expect(result.entitiesByMessage[0]).toHaveLength(0); + expect(result.messageEntities).toHaveLength(2); + // First message (empty string) has 1 part with no entities + expect(result.messageEntities[0]).toHaveLength(1); + expect(result.messageEntities[0][0]).toHaveLength(0); }); }); diff 
--git a/src/services/pii-detector.ts b/src/pii/detect.ts similarity index 75% rename from src/services/pii-detector.ts rename to src/pii/detect.ts index 444f130..ae078f5 100644 --- a/src/services/pii-detector.ts +++ b/src/pii/detect.ts @@ -1,6 +1,6 @@ import { getConfig } from "../config"; +import { getLanguageDetector, type SupportedLanguage } from "../services/language-detector"; import { extractTextContent, type MessageContent } from "../utils/content"; -import { getLanguageDetector, type SupportedLanguage } from "./language-detector"; export interface PIIEntity { entity_type: string; @@ -16,10 +16,16 @@ interface AnalyzeRequest { score_threshold?: number; } +/** + * Per-message, per-part PII detection result + * Structure: messageEntities[msgIdx][partIdx] = entities for that part + */ export interface PIIDetectionResult { hasPII: boolean; - entitiesByMessage: PIIEntity[][]; - newEntities: PIIEntity[]; + /** Per-message, per-part entities */ + messageEntities: PIIEntity[][][]; + /** Flattened list of all entities (for summary/logging) */ + allEntities: PIIEntity[]; scanTimeMs: number; language: SupportedLanguage; languageFallback: boolean; @@ -78,33 +84,65 @@ export class PIIDetector { } } + /** + * Analyzes messages for PII with per-part granularity + * + * For string content, entities are in messageEntities[msgIdx][0]. + * For array content (multimodal), each text part is scanned separately. + */ async analyzeMessages( messages: Array<{ role: string; content: MessageContent }>, ): Promise<PIIDetectionResult> { const startTime = Date.now(); const config = getConfig(); + // Detect language from the last user message const lastUserMsg = messages.findLast((m) => m.role === "user"); const langText = lastUserMsg ? extractTextContent(lastUserMsg.content) : ""; const langResult = langText ? 
getLanguageDetector().detect(langText) : { language: config.pii_detection.fallback_language, usedFallback: true }; - const scannedRoles = ["system", "developer", "user", "assistant"]; + const scannedRoles = ["system", "developer", "user", "assistant", "tool"]; - const entitiesByMessage = await Promise.all( - messages.map((message) => { - const text = extractTextContent(message.content); - return text && scannedRoles.includes(message.role) - ? this.detectPII(text, langResult.language) - : Promise.resolve([]); + // Detect PII per message, per content part + const messageEntities: PIIEntity[][][] = await Promise.all( + messages.map(async (message) => { + if (!scannedRoles.includes(message.role)) { + return []; + } + + // String content → wrap in single-element array + if (typeof message.content === "string") { + const entities = message.content + ? await this.detectPII(message.content, langResult.language) + : []; + return [entities]; + } + + // Array content (multimodal) → per-part detection + if (Array.isArray(message.content)) { + return await Promise.all( + message.content.map(async (part) => { + if (part.type === "text" && typeof part.text === "string") { + return await this.detectPII(part.text, langResult.language); + } + return []; + }), + ); + } + + // Null/undefined content + return []; }), ); + const allEntities = messageEntities.flat(2); + return { - hasPII: entitiesByMessage.some((e) => e.length > 0), - entitiesByMessage, - newEntities: entitiesByMessage.flat(), + hasPII: allEntities.length > 0, + messageEntities, + allEntities, scanTimeMs: Date.now() - startTime, language: langResult.language, languageFallback: langResult.usedFallback, diff --git a/src/pii/mask.test.ts b/src/pii/mask.test.ts new file mode 100644 index 0000000..fdd582e --- /dev/null +++ b/src/pii/mask.test.ts @@ -0,0 +1,347 @@ +import { describe, expect, test } from "bun:test"; +import type { MaskingConfig } from "../config"; +import type { ChatMessage } from "../services/llm-client"; 
+import { createPIIResult } from "../test-utils/detection-results"; +import type { PIIEntity } from "./detect"; +import { + createMaskingContext, + flushMaskingBuffer, + mask, + maskMessages, + unmask, + unmaskResponse, + unmaskStreamChunk, +} from "./mask"; + +const defaultConfig: MaskingConfig = { + show_markers: false, + marker_text: "[protected]", +}; + +const configWithMarkers: MaskingConfig = { + show_markers: true, + marker_text: "[protected]", +}; + +describe("PII placeholder format", () => { + test("uses [[TYPE_N]] format", () => { + const entities: PIIEntity[] = [{ entity_type: "EMAIL_ADDRESS", start: 0, end: 16, score: 1.0 }]; + const result = mask("john@example.com", entities); + + expect(result.masked).toBe("[[EMAIL_ADDRESS_1]]"); + }); + + test("increments counter per entity type", () => { + const entities: PIIEntity[] = [ + { entity_type: "EMAIL_ADDRESS", start: 0, end: 7, score: 1.0 }, + { entity_type: "EMAIL_ADDRESS", start: 12, end: 19, score: 1.0 }, + ]; + + const result = mask("a@b.com and c@d.com", entities); + + expect(result.masked).toBe("[[EMAIL_ADDRESS_1]] and [[EMAIL_ADDRESS_2]]"); + }); + + test("tracks different entity types separately", () => { + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 0, end: 11, score: 0.9 }, + { entity_type: "EMAIL_ADDRESS", start: 13, end: 26, score: 1.0 }, + ]; + + const result = mask("Hans Müller: hans@firma.de", entities); + + expect(result.masked).toBe("[[PERSON_1]]: [[EMAIL_ADDRESS_1]]"); + }); +}); + +describe("marker feature", () => { + test("adds markers when show_markers is true", () => { + const context = createMaskingContext(); + context.mapping["[[EMAIL_ADDRESS_1]]"] = "john@example.com"; + + const result = unmask("Email: [[EMAIL_ADDRESS_1]]", context, configWithMarkers); + expect(result).toBe("Email: [protected]john@example.com"); + }); + + test("no markers when show_markers is false", () => { + const context = createMaskingContext(); + context.mapping["[[EMAIL_ADDRESS_1]]"] = 
"john@example.com"; + + const result = unmask("Email: [[EMAIL_ADDRESS_1]]", context, defaultConfig); + expect(result).toBe("Email: john@example.com"); + }); + + test("markers work with streaming", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "John Doe"; + + const { output } = unmaskStreamChunk("", "Hello [[PERSON_1]]!", context, configWithMarkers); + expect(output).toBe("Hello [protected]John Doe!"); + }); + + test("markers work with response unmasking", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "John Doe"; + + const response = { + id: "test", + object: "chat.completion" as const, + created: 1234567890, + model: "gpt-4", + choices: [ + { + index: 0, + message: { role: "assistant" as const, content: "Hello [[PERSON_1]]" }, + finish_reason: "stop" as const, + }, + ], + }; + + const result = unmaskResponse(response, context, configWithMarkers); + expect(result.choices[0].message.content).toBe("Hello [protected]John Doe"); + }); +}); + +describe("maskMessages with PIIDetectionResult", () => { + test("masks multiple messages using detection result", () => { + const messages: ChatMessage[] = [ + { role: "user", content: "My email is test@example.com" }, + { role: "assistant", content: "Got it" }, + { role: "user", content: "Also john@test.com" }, + ]; + + const detection = createPIIResult([ + [[{ entity_type: "EMAIL_ADDRESS", start: 12, end: 28, score: 1.0 }]], + [[]], + [[{ entity_type: "EMAIL_ADDRESS", start: 5, end: 18, score: 1.0 }]], + ]); + + const { masked, context } = maskMessages(messages, detection); + + expect(masked[0].content).toBe("My email is [[EMAIL_ADDRESS_1]]"); + expect(masked[1].content).toBe("Got it"); + expect(masked[2].content).toBe("Also [[EMAIL_ADDRESS_2]]"); + expect(context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("test@example.com"); + expect(context.mapping["[[EMAIL_ADDRESS_2]]"]).toBe("john@test.com"); + }); + + test("handles multimodal content", () => { + const 
messages: ChatMessage[] = [ + { + role: "user", + content: [ + { type: "text", text: "Contact john@test.com" }, + { type: "image_url", image_url: { url: "https://example.com/img.jpg" } }, + ], + }, + ]; + + const detection = createPIIResult([ + [[{ entity_type: "EMAIL_ADDRESS", start: 8, end: 21, score: 1.0 }], []], + ]); + + const { masked } = maskMessages(messages, detection); + + const content = masked[0].content as Array<{ type: string; text?: string }>; + expect(content[0].text).toBe("Contact [[EMAIL_ADDRESS_1]]"); + expect(content[1].type).toBe("image_url"); + }); +}); + +describe("streaming with PII placeholders", () => { + test("buffers partial [[TYPE placeholder", () => { + const context = createMaskingContext(); + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; + + const { output, remainingBuffer } = unmaskStreamChunk( + "", + "Hello [[EMAIL_ADD", + context, + defaultConfig, + ); + + expect(output).toBe("Hello "); + expect(remainingBuffer).toBe("[[EMAIL_ADD"); + }); + + test("completes buffered placeholder across chunks", () => { + const context = createMaskingContext(); + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; + + const { output, remainingBuffer } = unmaskStreamChunk( + "[[EMAIL_ADD", + "RESS_1]] there", + context, + defaultConfig, + ); + + expect(output).toBe("test@test.com there"); + expect(remainingBuffer).toBe(""); + }); + + test("flushes remaining buffer at end of stream", () => { + const context = createMaskingContext(); + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; + + const flushed = flushMaskingBuffer("[[EMAIL_ADD", context, defaultConfig); + expect(flushed).toBe("[[EMAIL_ADD"); + }); +}); + +describe("PII conflict resolution", () => { + test("handles overlapping entities with same start - keeps longer", () => { + const text = "Given Eric's feedback"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 6, end: 10, score: 0.85 }, + { entity_type: "PERSON", start: 6, end: 12, score: 0.8 }, 
+ ]; + + const { masked, context } = mask(text, entities); + + expect(masked).toBe("Given [[PERSON_1]] feedback"); + expect(context.mapping["[[PERSON_1]]"]).toBe("Eric's"); + }); + + test("handles partially overlapping entities of same type - merges them", () => { + const text = "Contact John Smith Jones please"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 8, end: 18, score: 0.9 }, + { entity_type: "PERSON", start: 13, end: 25, score: 0.7 }, + ]; + + const { masked } = mask(text, entities); + + expect(masked).toBe("Contact [[PERSON_1]]please"); + }); + + test("keeps adjacent non-overlapping entities", () => { + const text = "HansMüller"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, + { entity_type: "PERSON", start: 4, end: 10, score: 0.9 }, + ]; + + const { masked } = mask(text, entities); + + expect(masked).toBe("[[PERSON_1]][[PERSON_2]]"); + }); +}); + +describe("mask -> unmask roundtrip", () => { + test("preserves original data through roundtrip", () => { + const originalText = "Contact Hans Müller at hans@firma.de or call +49123456789"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 8, end: 19, score: 0.9 }, + { entity_type: "EMAIL_ADDRESS", start: 23, end: 36, score: 1.0 }, + { entity_type: "PHONE_NUMBER", start: 45, end: 57, score: 0.95 }, + ]; + + const { masked, context } = mask(originalText, entities); + + expect(masked).not.toContain("Hans Müller"); + expect(masked).not.toContain("hans@firma.de"); + expect(masked).not.toContain("+49123456789"); + + const llmResponse = `I see ${masked.match(/\[\[PERSON_1\]\]/)?.[0]}, email ${masked.match(/\[\[EMAIL_ADDRESS_1\]\]/)?.[0]}`; + const unmasked = unmask(llmResponse, context, defaultConfig); + + expect(unmasked).toContain("Hans Müller"); + expect(unmasked).toContain("hans@firma.de"); + }); +}); + +describe("HTML context handling", () => { + test("unmasks placeholders in HTML without encoding issues", () => { + const 
context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "Dr. Sarah Chen"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "sarah.chen@hospital.org"; + + const htmlResponse = `

Contact [[PERSON_1]] at [[EMAIL_ADDRESS_1]]

`; + const result = unmask(htmlResponse, context, defaultConfig); + + expect(result).toBe("

Contact Dr. Sarah Chen at sarah.chen@hospital.org

"); + }); + + test("works with complex HTML structures", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "Dr. Sarah Chen"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "sarah@hospital.org"; + + const complexHtml = ` +
+

[[PERSON_1]]

+ [[EMAIL_ADDRESS_1]] +
+ `; + + const result = unmask(complexHtml, context, defaultConfig); + + expect(result).toContain("Dr. Sarah Chen"); + expect(result).toContain("sarah@hospital.org"); + expect(result).not.toContain("[["); + }); +}); + +describe("unmaskResponse", () => { + test("unmasks all choices in response", () => { + const context = createMaskingContext(); + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; + context.mapping["[[PERSON_1]]"] = "John Doe"; + + const response = { + id: "chatcmpl-123", + object: "chat.completion" as const, + created: 1234567890, + model: "gpt-4", + choices: [ + { + index: 0, + message: { + role: "assistant" as const, + content: "Contact [[PERSON_1]] at [[EMAIL_ADDRESS_1]]", + }, + finish_reason: "stop" as const, + }, + ], + usage: { prompt_tokens: 10, completion_tokens: 20, total_tokens: 30 }, + }; + + const result = unmaskResponse(response, context, defaultConfig); + + expect(result.choices[0].message.content).toBe("Contact John Doe at test@test.com"); + expect(result.id).toBe("chatcmpl-123"); + expect(result.model).toBe("gpt-4"); + }); +}); + +describe("edge cases", () => { + test("handles unicode in masked text", () => { + const text = "Kontakt: François Müller"; + const entities: PIIEntity[] = [{ entity_type: "PERSON", start: 9, end: 24, score: 0.9 }]; + + const { masked, context } = mask(text, entities); + expect(masked).toBe("Kontakt: [[PERSON_1]]"); + + const unmasked = unmask(masked, context, defaultConfig); + expect(unmasked).toBe("Kontakt: François Müller"); + }); + + test("handles empty text", () => { + const { masked, context } = mask("", []); + expect(masked).toBe(""); + expect(unmask("", context, defaultConfig)).toBe(""); + }); + + test("reuses placeholder for duplicate values", () => { + const text = "a@b.com and again a@b.com"; + const entities: PIIEntity[] = [ + { entity_type: "EMAIL_ADDRESS", start: 0, end: 7, score: 1.0 }, + { entity_type: "EMAIL_ADDRESS", start: 18, end: 25, score: 1.0 }, + ]; + + const result = 
mask(text, entities); + + expect(result.masked).toBe("[[EMAIL_ADDRESS_1]] and again [[EMAIL_ADDRESS_1]]"); + expect(Object.keys(result.context.mapping)).toHaveLength(1); + }); +}); diff --git a/src/pii/mask.ts b/src/pii/mask.ts new file mode 100644 index 0000000..13d1c13 --- /dev/null +++ b/src/pii/mask.ts @@ -0,0 +1,131 @@ +import type { MaskingConfig } from "../config"; +import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client"; +import { resolveConflicts } from "../utils/conflict-resolver"; +import { + createPlaceholderContext, + flushBuffer, + incrementAndGenerate, + type MaskResult, + type PlaceholderContext, + processStreamChunk, + replaceWithPlaceholders, + restorePlaceholders, + restoreResponsePlaceholders, + transformMessagesPerPart, +} from "../utils/message-transform"; +import { + generatePlaceholder as generatePlaceholderFromFormat, + PII_PLACEHOLDER_FORMAT, +} from "../utils/placeholders"; +import type { PIIDetectionResult, PIIEntity } from "./detect"; + +export type { MaskResult } from "../utils/message-transform"; + +/** + * Creates a new masking context for a request + */ +export function createMaskingContext(): PlaceholderContext { + return createPlaceholderContext(); +} + +/** + * Generates a placeholder for a PII entity type + */ +function generatePlaceholder(entityType: string, context: PlaceholderContext): string { + return incrementAndGenerate(entityType, context, (type, count) => + generatePlaceholderFromFormat(PII_PLACEHOLDER_FORMAT, type, count), + ); +} + +/** + * Creates formatValue function from masking config + */ +function getFormatValue(config: MaskingConfig): ((original: string) => string) | undefined { + return config.show_markers ? 
(original: string) => `${config.marker_text}${original}` : undefined; +} + +/** + * Masks PII entities in text, replacing them with placeholders + */ +export function mask( + text: string, + entities: PIIEntity[], + context?: PlaceholderContext, +): MaskResult { + const ctx = context || createMaskingContext(); + const masked = replaceWithPlaceholders( + text, + entities, + ctx, + (e) => e.entity_type, + generatePlaceholder, + resolveConflicts, + ); + return { masked, context: ctx }; +} + +/** + * Unmasks text by replacing placeholders with original values + * + * Optionally adds markers to indicate protected content + */ +export function unmask(text: string, context: PlaceholderContext, config: MaskingConfig): string { + return restorePlaceholders(text, context, getFormatValue(config)); +} + +/** + * Masks messages using per-part entity detection results + * + * Uses transformMessagesPerPart for the common iteration pattern. + */ +export function maskMessages( + messages: ChatMessage[], + detection: PIIDetectionResult, +): { masked: ChatMessage[]; context: PlaceholderContext } { + const context = createMaskingContext(); + + const masked = transformMessagesPerPart( + messages, + detection.messageEntities, + (text, entities, ctx) => mask(text, entities, ctx).masked, + context, + ); + + return { masked, context }; +} + +/** + * Streaming unmask helper - processes chunks and unmasks when complete placeholders are found + * + * Returns the unmasked portion and any remaining buffer that might contain partial placeholders + */ +export function unmaskStreamChunk( + buffer: string, + newChunk: string, + context: PlaceholderContext, + config: MaskingConfig, +): { output: string; remainingBuffer: string } { + return processStreamChunk(buffer, newChunk, context, (text, ctx) => unmask(text, ctx, config)); +} + +/** + * Flushes remaining buffer at end of stream + */ +export function flushMaskingBuffer( + buffer: string, + context: PlaceholderContext, + config: MaskingConfig, +): 
string { + return flushBuffer(buffer, context, (text, ctx) => unmask(text, ctx, config)); +} + +/** + * Unmasks a chat completion response by replacing placeholders in all choices + */ +export function unmaskResponse( + response: ChatCompletionResponse, + context: PlaceholderContext, + config: MaskingConfig, +): ChatCompletionResponse { + return restoreResponsePlaceholders(response, context, getFormatValue(config)); +} diff --git a/src/routes/info.ts b/src/routes/info.ts index 7b3ab3d..76d525c 100644 --- a/src/routes/info.ts +++ b/src/routes/info.ts @@ -1,8 +1,8 @@ import { Hono } from "hono"; import pkg from "../../package.json"; import { getConfig } from "../config"; +import { getPIIDetector } from "../pii/detect"; import { getRouter } from "../services/decision"; -import { getPIIDetector } from "../services/pii-detector"; export const infoRoutes = new Hono(); diff --git a/src/routes/proxy.ts b/src/routes/proxy.ts index e275d21..99b91b5 100644 --- a/src/routes/proxy.ts +++ b/src/routes/proxy.ts @@ -4,12 +4,9 @@ import { Hono } from "hono"; import { proxy } from "hono/proxy"; import { z } from "zod"; import { getConfig, type MaskingConfig } from "../config"; -import { - detectSecrets, - extractTextFromRequest, - type SecretsDetectionResult, -} from "../secrets/detect"; -import { type RedactionContext, redactSecrets, unredactResponse } from "../secrets/redact"; +import { unmaskResponse as unmaskPIIResponse } from "../pii/mask"; +import { detectSecretsInMessages, type MessageSecretsResult } from "../secrets/detect"; +import { maskMessages as maskSecretsMessages, unmaskSecretsResponse } from "../secrets/mask"; import { getRouter, type MaskDecision, type RoutingDecision } from "../services/decision"; import { type ChatCompletionRequest, @@ -19,9 +16,9 @@ import { type LLMResult, } from "../services/llm-client"; import { logRequest, type RequestLogData } from "../services/logger"; -import { unmaskResponse } from "../services/masking"; import { createUnmaskingStream } 
from "../services/stream-transformer"; -import { type ContentPart, extractTextContent } from "../utils/content"; +import { extractTextContent } from "../utils/content"; +import type { PlaceholderContext } from "../utils/message-transform"; // Request validation schema const ChatCompletionSchema = z @@ -57,7 +54,7 @@ function createErrorLogData( statusCode: number, errorMessage: string, decision?: RoutingDecision, - secretsResult?: SecretsDetectionResult, + secretsResult?: MessageSecretsResult, maskedContent?: string, ): RequestLogData { const config = getConfig(); @@ -68,7 +65,7 @@ function createErrorLogData( model: body.model || "unknown", piiDetected: decision?.piiResult.hasPII ?? false, entities: decision - ? [...new Set(decision.piiResult.newEntities.map((e) => e.entity_type))] + ? [...new Set(decision.piiResult.allEntities.map((e) => e.entity_type))] : [], latencyMs: Date.now() - startTime, scanTimeMs: decision?.piiResult.scanTimeMs ?? 0, @@ -110,14 +107,13 @@ proxyRoutes.post( const router = getRouter(); // Track secrets detection state for response handling - let secretsResult: SecretsDetectionResult | undefined; - let redactionContext: RedactionContext | undefined; - let secretsRedacted = false; + let secretsResult: MessageSecretsResult | undefined; + let secretsMaskingContext: PlaceholderContext | undefined; + let secretsMasked = false; - // Secrets detection runs before PII detection + // Secrets detection runs before PII detection (per-part) if (config.secrets_detection.enabled) { - const text = extractTextFromRequest(body); - secretsResult = detectSecrets(text, config.secrets_detection); + secretsResult = detectSecretsInMessages(body.messages, config.secrets_detection); if (secretsResult.detected) { const secretTypes = secretsResult.matches.map((m) => m.type); @@ -125,16 +121,14 @@ proxyRoutes.post( // Block action - return 400 error if (config.secrets_detection.action === "block") { - // Set headers before returning error 
c.header("X-PasteGuard-Secrets-Detected", "true"); c.header("X-PasteGuard-Secrets-Types", secretTypesStr); - // Log metadata only (no secret content) logRequest( { timestamp: new Date().toISOString(), mode: config.mode, - provider: "openai", // Note: Request never reached provider + provider: "openai", model: body.model || "unknown", piiDetected: false, entities: [], @@ -161,12 +155,12 @@ proxyRoutes.post( ); } - // Redact action - replace secrets with placeholders and continue - if (config.secrets_detection.action === "redact") { - const redactedMessages = redactMessagesWithSecrets(body.messages, secretsResult); - body = { ...body, messages: redactedMessages.messages }; - redactionContext = redactedMessages.context; - secretsRedacted = true; + // Mask action - replace secrets with placeholders (per-part) + if (config.secrets_detection.action === "mask") { + const result = maskSecretsMessages(body.messages, secretsResult); + body = { ...body, messages: result.masked }; + secretsMaskingContext = result.context; + secretsMasked = true; } // route_local action is handled in handleCompletion via secretsResult @@ -204,134 +198,12 @@ proxyRoutes.post( startTime, router, secretsResult, - redactionContext, - secretsRedacted, + secretsMaskingContext, + secretsMasked, ); }, ); -/** - * Redacts secrets in all messages based on detection result - * Returns redacted messages and the redaction context for unredaction - */ -function redactMessagesWithSecrets( - messages: ChatMessage[], - secretsResult: SecretsDetectionResult, -): { messages: ChatMessage[]; context: RedactionContext } { - // Build a map of message content to redactions - // Since we concatenated all messages with \n, we need to track positions per message - let currentOffset = 0; - const messagePositions: { start: number; end: number }[] = []; - - for (const msg of messages) { - const text = extractTextContent(msg.content); - const length = text.length; - messagePositions.push({ start: currentOffset, end: 
currentOffset + length }); - currentOffset += length + 1; // +1 for \n separator - } - - // Create redaction context - let context: RedactionContext = { - mapping: {}, - reverseMapping: {}, - counters: {}, - }; - - // Apply redactions to each message - const redactedMessages = messages.map((msg, i) => { - // Handle null/undefined content - if (!msg.content) { - return msg; - } - - // Handle array content (multimodal messages) - if (Array.isArray(msg.content)) { - const msgPos = messagePositions[i]; - - // Filter redactions for this message - const messageRedactions = (secretsResult.redactions || []) - .filter((r) => r.start >= msgPos.start && r.end <= msgPos.end) - .map((r) => ({ - ...r, - start: r.start - msgPos.start, - end: r.end - msgPos.start, - })); - - if (messageRedactions.length === 0) { - return msg; - } - - // Track offset position within the concatenated text for this message - // (matches how extractTextContent joins parts with \n) - let partOffset = 0; - - // Redact only text parts of array content with proper offset tracking - const redactedContent = msg.content.map((part: ContentPart) => { - if (part.type === "text" && typeof part.text === "string") { - const partLength = part.text.length; - - // Find redactions that apply to this specific part - const partRedactions = messageRedactions - .filter((r) => r.start < partOffset + partLength && r.end > partOffset) - .map((r) => ({ - ...r, - start: Math.max(0, r.start - partOffset), - end: Math.min(partLength, r.end - partOffset), - })); - - if (partRedactions.length > 0) { - const { redacted, context: updatedContext } = redactSecrets( - part.text, - partRedactions, - context, - ); - context = updatedContext; - partOffset += partLength + 1; // +1 for \n separator - return { ...part, text: redacted }; - } - - partOffset += partLength + 1; // +1 for \n separator - return part; - } - return part; - }); - - return { ...msg, content: redactedContent }; - } - - // Handle string content (text-only messages) - if 
(typeof msg.content !== "string") { - return msg; - } - - const msgPos = messagePositions[i]; - - // Filter redactions that fall within this message's position - const messageRedactions = (secretsResult.redactions || []) - .filter((r) => r.start >= msgPos.start && r.end <= msgPos.end) - .map((r) => ({ - ...r, - start: r.start - msgPos.start, - end: r.end - msgPos.start, - })); - - if (messageRedactions.length === 0) { - return msg; - } - - const { redacted, context: updatedContext } = redactSecrets( - msg.content, - messageRedactions, - context, - ); - context = updatedContext; - - return { ...msg, content: redacted }; - }); - - return { messages: redactedMessages, context }; -} - /** * Handle chat completion for both route and mask modes */ @@ -341,9 +213,9 @@ async function handleCompletion( decision: RoutingDecision, startTime: number, router: ReturnType, - secretsResult?: SecretsDetectionResult, - redactionContext?: RedactionContext, - secretsRedacted?: boolean, + secretsResult?: MessageSecretsResult, + secretsMaskingContext?: PlaceholderContext, + secretsMasked?: boolean, ) { const client = router.getClient(decision.provider); const maskingConfig = router.getMaskingConfig(); @@ -377,8 +249,8 @@ async function handleCompletion( c.header("X-PasteGuard-Secrets-Detected", "true"); c.header("X-PasteGuard-Secrets-Types", secretsTypes.join(",")); } - if (secretsRedacted) { - c.header("X-PasteGuard-Secrets-Redacted", "true"); + if (secretsMasked) { + c.header("X-PasteGuard-Secrets-Masked", "true"); } try { @@ -394,7 +266,7 @@ async function handleCompletion( maskingConfig, secretsDetected, secretsTypes, - redactionContext, + secretsMaskingContext, ); } @@ -407,7 +279,7 @@ async function handleCompletion( maskingConfig, secretsDetected, secretsTypes, - redactionContext, + secretsMaskingContext, ); } catch (error) { console.error("LLM request error:", error); @@ -476,7 +348,7 @@ function handleStreamingResponse( maskingConfig: MaskingConfig, secretsDetected?: boolean, 
secretsTypes?: string[], - redactionContext?: RedactionContext, + secretsMaskingContext?: PlaceholderContext, ) { logRequest( createLogData( @@ -497,14 +369,14 @@ function handleStreamingResponse( // Determine if we need to transform the stream const needsPIIUnmasking = isMaskDecision(decision); - const needsSecretsUnredaction = redactionContext !== undefined; + const needsSecretsUnmasking = secretsMaskingContext !== undefined; - if (needsPIIUnmasking || needsSecretsUnredaction) { + if (needsPIIUnmasking || needsSecretsUnmasking) { const unmaskingStream = createUnmaskingStream( result.response, needsPIIUnmasking ? decision.maskingContext : undefined, maskingConfig, - redactionContext, + secretsMaskingContext, ); return c.body(unmaskingStream); } @@ -524,7 +396,7 @@ function handleJsonResponse( maskingConfig: MaskingConfig, secretsDetected?: boolean, secretsTypes?: string[], - redactionContext?: RedactionContext, + secretsMaskingContext?: PlaceholderContext, ) { logRequest( createLogData( @@ -543,12 +415,12 @@ function handleJsonResponse( // First unmask PII if needed if (isMaskDecision(decision)) { - response = unmaskResponse(response, decision.maskingContext, maskingConfig); + response = unmaskPIIResponse(response, decision.maskingContext, maskingConfig); } - // Then unredact secrets if needed - if (redactionContext) { - response = unredactResponse(response, redactionContext); + // Then unmask secrets if needed + if (secretsMaskingContext) { + response = unmaskSecretsResponse(response, secretsMaskingContext); } return c.json(response); @@ -572,7 +444,7 @@ function createLogData( provider: decision.provider, model: result.model, piiDetected: decision.piiResult.hasPII, - entities: [...new Set(decision.piiResult.newEntities.map((e) => e.entity_type))], + entities: [...new Set(decision.piiResult.allEntities.map((e) => e.entity_type))], latencyMs: Date.now() - startTime, scanTimeMs: decision.piiResult.scanTimeMs, promptTokens: response?.usage?.prompt_tokens, diff --git 
a/src/secrets/detect.test.ts b/src/secrets/detect.test.ts index c73506c..0337d83 100644 --- a/src/secrets/detect.test.ts +++ b/src/secrets/detect.test.ts @@ -1,7 +1,6 @@ import { describe, expect, test } from "bun:test"; import type { SecretsDetectionConfig } from "../config"; -import type { ChatCompletionRequest } from "../services/llm-client"; -import { detectSecrets, extractTextFromRequest } from "./detect"; +import { detectSecrets } from "./detect"; const defaultConfig: SecretsDetectionConfig = { enabled: true, @@ -50,8 +49,8 @@ describe("detectSecrets", () => { expect(result.matches).toHaveLength(1); expect(result.matches[0].type).toBe("OPENSSH_PRIVATE_KEY"); expect(result.matches[0].count).toBe(1); - expect(result.redactions).toBeDefined(); - expect(result.redactions?.length).toBe(1); + expect(result.locations).toBeDefined(); + expect(result.locations?.length).toBe(1); }); test("detects RSA private key", () => { @@ -85,7 +84,7 @@ describe("detectSecrets", () => { expect(result.matches).toHaveLength(1); expect(result.matches[0].type).toBe("OPENSSH_PRIVATE_KEY"); expect(result.matches[0].count).toBe(2); - expect(result.redactions?.length).toBe(2); + expect(result.locations?.length).toBe(2); }); test("detects multiple secrets of different types", () => { @@ -169,13 +168,13 @@ describe("detectSecrets", () => { expect(result.matches[0].count).toBe(1); // Should be 1, not 2 }); - test("redactions are sorted by start position descending", () => { + test("locations are sorted by start position descending", () => { const text = `${opensshKey}\n\n${rsaKey}`; const result = detectSecrets(text, defaultConfig); - expect(result.redactions).toBeDefined(); - if (result.redactions && result.redactions.length > 1) { - for (let i = 0; i < result.redactions.length - 1; i++) { - expect(result.redactions[i].start).toBeGreaterThan(result.redactions[i + 1].start); + expect(result.locations).toBeDefined(); + if (result.locations && result.locations.length > 1) { + for (let i = 0; i < 
result.locations.length - 1; i++) { + expect(result.locations[i].start).toBeGreaterThan(result.locations[i + 1].start); } } }); @@ -203,8 +202,8 @@ describe("detectSecrets - API Keys", () => { expect(result.matches).toHaveLength(1); expect(result.matches[0].type).toBe("API_KEY_OPENAI"); expect(result.matches[0].count).toBe(1); - expect(result.redactions).toBeDefined(); - expect(result.redactions?.[0].type).toBe("API_KEY_OPENAI"); + expect(result.locations).toBeDefined(); + expect(result.locations?.[0].type).toBe("API_KEY_OPENAI"); }); test("detects AWS access key", () => { @@ -408,13 +407,13 @@ ADMIN_PWD=adminpass123`; expect(result.detected).toBe(false); }); - test("redaction positions are correct", () => { + test("location positions are correct", () => { const text = "config: DB_PASSWORD=mysecretpassword123 here"; const result = detectSecrets(text, passwordConfig); - expect(result.redactions).toBeDefined(); - expect(result.redactions?.length).toBe(1); - const redacted = text.slice(result.redactions![0].start, result.redactions![0].end); - expect(redacted).toBe("DB_PASSWORD=mysecretpassword123"); + expect(result.locations).toBeDefined(); + expect(result.locations?.length).toBe(1); + const matched = text.slice(result.locations![0].start, result.locations![0].end); + expect(matched).toBe("DB_PASSWORD=mysecretpassword123"); }); }); @@ -481,13 +480,13 @@ SESSION_SECRET=session_key_here`; expect(result.detected).toBe(false); }); - test("redaction positions are correct", () => { + test("location positions are correct", () => { const text = "export APP_SECRET=mysupersecretvalue123 # comment"; const result = detectSecrets(text, secretConfig); - expect(result.redactions).toBeDefined(); - expect(result.redactions?.length).toBe(1); - const redacted = text.slice(result.redactions![0].start, result.redactions![0].end); - expect(redacted).toBe("APP_SECRET=mysupersecretvalue123"); + expect(result.locations).toBeDefined(); + expect(result.locations?.length).toBe(1); + const 
matched = text.slice(result.locations![0].start, result.locations![0].end); + expect(matched).toBe("APP_SECRET=mysupersecretvalue123"); }); }); @@ -596,13 +595,13 @@ CACHE=redis://default:pass@redis:6379`; expect(result.detected).toBe(false); }); - test("redaction covers full connection string", () => { + test("location covers full connection string", () => { const text = "export DB=postgres://admin:secret123@db.example.com:5432/prod"; const result = detectSecrets(text, connConfig); - expect(result.redactions).toBeDefined(); - expect(result.redactions?.length).toBe(1); - const redacted = text.slice(result.redactions![0].start, result.redactions![0].end); - expect(redacted).toBe("postgres://admin:secret123@db.example.com:5432/prod"); + expect(result.locations).toBeDefined(); + expect(result.locations?.length).toBe(1); + const matched = text.slice(result.locations![0].start, result.locations![0].end); + expect(matched).toBe("postgres://admin:secret123@db.example.com:5432/prod"); }); }); @@ -636,81 +635,16 @@ ${rsaKey} expect(result.matches.length).toBeGreaterThanOrEqual(4); }); - test("redaction positions are correct for all types", () => { + test("location positions are correct for all types", () => { const text = `Key: ${awsAccessKey} and ${githubToken}`; const result = detectSecrets(text, allConfig); - expect(result.redactions).toBeDefined(); - expect(result.redactions?.length).toBe(2); + expect(result.locations).toBeDefined(); + expect(result.locations?.length).toBe(2); - // Verify redactions point to correct positions - for (const redaction of result.redactions || []) { - const extracted = text.slice(redaction.start, redaction.end); + // Verify locations point to correct positions + for (const location of result.locations || []) { + const extracted = text.slice(location.start, location.end); expect(extracted.length).toBeGreaterThan(10); } }); }); - -describe("extractTextFromRequest", () => { - test("extracts text from simple messages", () => { - const request: 
ChatCompletionRequest = { - messages: [ - { role: "user", content: "Hello world" }, - { role: "assistant", content: "Hi there" }, - ], - }; - const text = extractTextFromRequest(request); - expect(text).toBe("Hello world\nHi there"); - }); - - test("extracts text from system messages", () => { - const request: ChatCompletionRequest = { - messages: [ - { role: "system", content: "You are helpful" }, - { role: "user", content: "Hello" }, - ], - }; - const text = extractTextFromRequest(request); - expect(text).toBe("You are helpful\nHello"); - }); - - test("filters out empty messages", () => { - const request: ChatCompletionRequest = { - messages: [ - { role: "user", content: "Hello" }, - { role: "assistant", content: "" }, - { role: "user", content: "World" }, - ], - }; - const text = extractTextFromRequest(request); - expect(text).toBe("Hello\nWorld"); - }); - - test("handles single message", () => { - const request: ChatCompletionRequest = { - messages: [{ role: "user", content: "Test" }], - }; - const text = extractTextFromRequest(request); - expect(text).toBe("Test"); - }); - - test("handles empty messages array", () => { - const request: ChatCompletionRequest = { - messages: [], - }; - const text = extractTextFromRequest(request); - expect(text).toBe(""); - }); - - test("extracts all message content in order", () => { - const request: ChatCompletionRequest = { - messages: [ - { role: "system", content: "System" }, - { role: "user", content: "User1" }, - { role: "assistant", content: "Assistant" }, - { role: "user", content: "User2" }, - ], - }; - const text = extractTextFromRequest(request); - expect(text).toBe("System\nUser1\nAssistant\nUser2"); - }); -}); diff --git a/src/secrets/detect.ts b/src/secrets/detect.ts index 241a846..36e5edd 100644 --- a/src/secrets/detect.ts +++ b/src/secrets/detect.ts @@ -1,32 +1,22 @@ import type { SecretsDetectionConfig } from "../config"; -import type { ChatCompletionRequest } from "../services/llm-client"; -import { 
extractTextContent } from "../utils/content"; +import type { ChatMessage } from "../services/llm-client"; +import type { ContentPart } from "../utils/content"; import { patternDetectors } from "./patterns"; -import type { SecretsDetectionResult, SecretsMatch, SecretsRedaction } from "./patterns/types"; +import type { + MessageSecretsResult, + SecretLocation, + SecretsDetectionResult, + SecretsMatch, +} from "./patterns/types"; -// Re-export types from patterns module for backwards compatibility export type { + MessageSecretsResult, SecretEntityType, + SecretLocation, SecretsDetectionResult, SecretsMatch, - SecretsRedaction, } from "./patterns/types"; -/** - * Extracts all text content from an OpenAI chat completion request - * - * Concatenates content from all messages (system, user, assistant) for secrets scanning. - * Handles both string content (text-only) and array content (multimodal messages). - * - * Returns concatenated text for secrets scanning. - */ -export function extractTextFromRequest(body: ChatCompletionRequest): string { - return body.messages - .map((message) => extractTextContent(message.content)) - .filter((text) => text.length > 0) - .join("\n"); -} - /** * Detects secret material (e.g. 
private keys, API keys, tokens) in text * @@ -54,7 +44,7 @@ export function detectSecrets( // Aggregate results from all pattern detectors const allMatches: SecretsMatch[] = []; - const allRedactions: SecretsRedaction[] = []; + const allLocations: SecretLocation[] = []; for (const detector of patternDetectors) { // Skip detectors that don't handle any enabled types @@ -63,17 +53,80 @@ export function detectSecrets( const result = detector.detect(textToScan, enabledTypes); allMatches.push(...result.matches); - if (result.redactions) { - allRedactions.push(...result.redactions); + if (result.locations) { + allLocations.push(...result.locations); } } - // Sort redactions by start position (descending) for safe replacement - allRedactions.sort((a, b) => b.start - a.start); + // Sort locations by start position (descending) for safe replacement + allLocations.sort((a, b) => b.start - a.start); return { detected: allMatches.length > 0, matches: allMatches, - redactions: allRedactions.length > 0 ? allRedactions : undefined, + locations: allLocations.length > 0 ? allLocations : undefined, + }; +} + +/** + * Detects secrets in chat messages with per-part granularity + * + * For string content, partIdx is always 0. + * For array content (multimodal), each text part is scanned separately. + * This avoids complex offset mapping when applying masks. 
+ */ +export function detectSecretsInMessages( + messages: ChatMessage[], + config: SecretsDetectionConfig, +): MessageSecretsResult { + if (!config.enabled) { + return { + detected: false, + matches: [], + messageLocations: messages.map(() => []), + }; + } + + const matchCounts = new Map(); + + const messageLocations: SecretLocation[][][] = messages.map((message) => { + // String content → single part at index 0 + if (typeof message.content === "string") { + const result = detectSecrets(message.content, config); + for (const match of result.matches) { + matchCounts.set(match.type, (matchCounts.get(match.type) || 0) + match.count); + } + return [result.locations || []]; + } + + // Array content (multimodal) → one array per part + if (Array.isArray(message.content)) { + return message.content.map((part: ContentPart) => { + if (part.type !== "text" || typeof part.text !== "string") { + return []; + } + const result = detectSecrets(part.text, config); + for (const match of result.matches) { + matchCounts.set(match.type, (matchCounts.get(match.type) || 0) + match.count); + } + return result.locations || []; + }); + } + + // Null/undefined content + return []; + }); + + const allMatches: SecretsMatch[] = []; + for (const [type, count] of matchCounts) { + allMatches.push({ type: type as SecretLocation["type"], count }); + } + + const hasLocations = messageLocations.some((msg) => msg.some((part) => part.length > 0)); + + return { + detected: hasLocations, + matches: allMatches, + messageLocations, }; } diff --git a/src/secrets/mask.test.ts b/src/secrets/mask.test.ts new file mode 100644 index 0000000..e58cb46 --- /dev/null +++ b/src/secrets/mask.test.ts @@ -0,0 +1,272 @@ +import { describe, expect, test } from "bun:test"; +import { createSecretsResult } from "../test-utils/detection-results"; +import type { SecretLocation } from "./detect"; +import { + createSecretsMaskingContext, + flushSecretsMaskingBuffer, + maskMessages, + maskSecrets, + unmaskSecrets, + 
unmaskSecretsResponse, + unmaskSecretsStreamChunk, +} from "./mask"; + +const sampleSecret = "sk-proj-abc123def456ghi789jkl012mno345pqr678stu901vwx"; + +describe("secrets placeholder format", () => { + test("uses [[SECRET_MASKED_TYPE_N]] format", () => { + const text = `My API key is ${sampleSecret}`; + const locations: SecretLocation[] = [ + { start: 14, end: 14 + sampleSecret.length, type: "API_KEY_OPENAI" }, + ]; + const result = maskSecrets(text, locations); + + expect(result.masked).toBe("My API key is [[SECRET_MASKED_API_KEY_OPENAI_1]]"); + }); + + test("increments counter per secret type", () => { + const anotherSecret = "sk-proj-xyz789abc123def456ghi789jkl012mno345pqr678"; + const text = `Key1: ${sampleSecret} Key2: ${anotherSecret}`; + const locations: SecretLocation[] = [ + { start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }, + { + start: 6 + sampleSecret.length + 7, + end: 6 + sampleSecret.length + 7 + anotherSecret.length, + type: "API_KEY_OPENAI", + }, + ]; + const result = maskSecrets(text, locations); + + expect(result.masked).toContain("[[SECRET_MASKED_API_KEY_OPENAI_1]]"); + expect(result.masked).toContain("[[SECRET_MASKED_API_KEY_OPENAI_2]]"); + }); + + test("tracks different secret types separately", () => { + const awsKey = "AKIAIOSFODNN7EXAMPLE"; + const text = `OpenAI: ${sampleSecret} AWS: ${awsKey}`; + const locations: SecretLocation[] = [ + { start: 8, end: 8 + sampleSecret.length, type: "API_KEY_OPENAI" }, + { + start: 8 + sampleSecret.length + 6, + end: 8 + sampleSecret.length + 6 + awsKey.length, + type: "API_KEY_AWS", + }, + ]; + const result = maskSecrets(text, locations); + + expect(result.masked).toContain("[[SECRET_MASKED_API_KEY_OPENAI_1]]"); + expect(result.masked).toContain("[[SECRET_MASKED_API_KEY_AWS_1]]"); + }); +}); + +describe("maskMessages with MessageSecretsResult", () => { + test("masks secrets in multiple messages", () => { + const messages = [ + { role: "user" as const, content: `My key is ${sampleSecret}` 
}, + { role: "assistant" as const, content: "I'll help you with that." }, + ]; + const detection = createSecretsResult([ + [[{ start: 10, end: 10 + sampleSecret.length, type: "API_KEY_OPENAI" }]], + [[]], + ]); + + const { masked, context } = maskMessages(messages, detection); + + expect(masked[0].content).toContain("[[SECRET_MASKED_API_KEY_OPENAI_1]]"); + expect(masked[0].content).not.toContain(sampleSecret); + expect(masked[1].content).toBe("I'll help you with that."); + expect(Object.keys(context.mapping)).toHaveLength(1); + }); + + test("shares context across messages - same secret gets same placeholder", () => { + const messages = [ + { role: "user" as const, content: `Key1: ${sampleSecret}` }, + { role: "user" as const, content: `Key2: ${sampleSecret}` }, + ]; + const detection = createSecretsResult([ + [[{ start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }]], + [[{ start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }]], + ]); + + const { masked, context } = maskMessages(messages, detection); + + expect(masked[0].content).toBe("Key1: [[SECRET_MASKED_API_KEY_OPENAI_1]]"); + expect(masked[1].content).toBe("Key2: [[SECRET_MASKED_API_KEY_OPENAI_1]]"); + expect(Object.keys(context.mapping)).toHaveLength(1); + }); + + test("handles multimodal array content", () => { + const messages = [ + { + role: "user" as const, + content: [ + { type: "text", text: `Key: ${sampleSecret}` }, + { type: "image_url", image_url: { url: "https://example.com/img.jpg" } }, + ], + }, + ]; + const detection = createSecretsResult([ + [[{ start: 5, end: 5 + sampleSecret.length, type: "API_KEY_OPENAI" }], []], + ]); + + const { masked } = maskMessages(messages, detection); + + const content = masked[0].content as Array<{ type: string; text?: string }>; + expect(content[0].text).toBe("Key: [[SECRET_MASKED_API_KEY_OPENAI_1]]"); + expect(content[1].type).toBe("image_url"); + }); +}); + +describe("streaming with secrets placeholders", () => { + test("buffers partial 
[[SECRET_MASKED placeholder", () => { + const context = createSecretsMaskingContext(); + context.mapping["[[SECRET_MASKED_API_KEY_OPENAI_1]]"] = sampleSecret; + + const { output, remainingBuffer } = unmaskSecretsStreamChunk("", "Key: [[SECRET_MAS", context); + + expect(output).toBe("Key: "); + expect(remainingBuffer).toBe("[[SECRET_MAS"); + }); + + test("completes buffered placeholder across chunks", () => { + const context = createSecretsMaskingContext(); + context.mapping["[[SECRET_MASKED_API_KEY_OPENAI_1]]"] = sampleSecret; + + const { output, remainingBuffer } = unmaskSecretsStreamChunk( + "[[SECRET_MAS", + "KED_API_KEY_OPENAI_1]] done", + context, + ); + + expect(output).toBe(`${sampleSecret} done`); + expect(remainingBuffer).toBe(""); + }); + + test("flushes incomplete buffer as-is", () => { + const context = createSecretsMaskingContext(); + const result = flushSecretsMaskingBuffer("[[SECRET_MAS", context); + expect(result).toBe("[[SECRET_MAS"); + }); +}); + +describe("mask -> unmask roundtrip", () => { + test("preserves original data through roundtrip", () => { + const originalText = ` +Here are my credentials: +OpenAI API Key: ${sampleSecret} +Please store them securely. 
+`; + const locations: SecretLocation[] = [ + { + start: originalText.indexOf(sampleSecret), + end: originalText.indexOf(sampleSecret) + sampleSecret.length, + type: "API_KEY_OPENAI", + }, + ]; + + const { masked, context } = maskSecrets(originalText, locations); + + expect(masked).not.toContain(sampleSecret); + expect(masked).toContain("[[SECRET_MASKED_API_KEY_OPENAI_1]]"); + + const restored = unmaskSecrets(masked, context); + expect(restored).toBe(originalText); + }); +}); + +describe("unmaskSecretsResponse", () => { + test("unmasks all choices in response", () => { + const context = createSecretsMaskingContext(); + context.mapping["[[SECRET_MASKED_API_KEY_OPENAI_1]]"] = sampleSecret; + + const response = { + id: "test", + object: "chat.completion" as const, + created: Date.now(), + model: "gpt-4", + choices: [ + { + index: 0, + message: { + role: "assistant" as const, + content: "Your key is [[SECRET_MASKED_API_KEY_OPENAI_1]]", + }, + finish_reason: "stop" as const, + }, + ], + }; + + const result = unmaskSecretsResponse(response, context); + expect(result.choices[0].message.content).toBe(`Your key is ${sampleSecret}`); + }); + + test("preserves response structure", () => { + const context = createSecretsMaskingContext(); + const response = { + id: "test-id", + object: "chat.completion" as const, + created: 12345, + model: "gpt-4-turbo", + choices: [ + { + index: 0, + message: { role: "assistant" as const, content: "Hello" }, + finish_reason: "stop" as const, + }, + ], + usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 }, + }; + + const result = unmaskSecretsResponse(response, context); + expect(result.id).toBe("test-id"); + expect(result.model).toBe("gpt-4-turbo"); + expect(result.usage).toEqual({ prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 }); + }); +}); + +describe("edge cases", () => { + test("returns original text when no locations", () => { + const text = "Hello world"; + const result = maskSecrets(text, []); + 
expect(result.masked).toBe("Hello world"); + expect(Object.keys(result.context.mapping)).toHaveLength(0); + }); + + test("reuses placeholder for duplicate secret values", () => { + const text = `Key1: ${sampleSecret} Key2: ${sampleSecret}`; + const locations: SecretLocation[] = [ + { start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }, + { + start: 6 + sampleSecret.length + 7, + end: 6 + sampleSecret.length * 2 + 7, + type: "API_KEY_OPENAI", + }, + ]; + const result = maskSecrets(text, locations); + + expect(result.masked).toBe( + "Key1: [[SECRET_MASKED_API_KEY_OPENAI_1]] Key2: [[SECRET_MASKED_API_KEY_OPENAI_1]]", + ); + expect(Object.keys(result.context.mapping)).toHaveLength(1); + }); + + test("preserves context across multiple calls", () => { + const context = createSecretsMaskingContext(); + + maskSecrets( + `Key: ${sampleSecret}`, + [{ start: 5, end: 5 + sampleSecret.length, type: "API_KEY_OPENAI" }], + context, + ); + + const anotherSecret = "sk-proj-xyz789abc123def456ghi789jkl012mno345pqr678"; + const result2 = maskSecrets( + `Another: ${anotherSecret}`, + [{ start: 9, end: 9 + anotherSecret.length, type: "API_KEY_OPENAI" }], + context, + ); + + expect(result2.masked).toBe("Another: [[SECRET_MASKED_API_KEY_OPENAI_2]]"); + expect(Object.keys(context.mapping)).toHaveLength(2); + }); +}); diff --git a/src/secrets/mask.ts b/src/secrets/mask.ts new file mode 100644 index 0000000..0c8cf19 --- /dev/null +++ b/src/secrets/mask.ts @@ -0,0 +1,115 @@ +import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client"; +import { resolveOverlaps } from "../utils/conflict-resolver"; +import { + createPlaceholderContext, + flushBuffer, + incrementAndGenerate, + type MaskResult, + type PlaceholderContext, + processStreamChunk, + replaceWithPlaceholders, + restorePlaceholders, + restoreResponsePlaceholders, + transformMessagesPerPart, +} from "../utils/message-transform"; +import { generateSecretPlaceholder } from "../utils/placeholders"; +import 
type { MessageSecretsResult, SecretLocation } from "./detect"; + +export type { MaskResult } from "../utils/message-transform"; + +/** + * Creates a new secrets masking context for a request + */ +export function createSecretsMaskingContext(): PlaceholderContext { + return createPlaceholderContext(); +} + +/** + * Generates a placeholder for a secret type + * + * Format: [[SECRET_MASKED_{TYPE}_{N}]] e.g. [[SECRET_MASKED_API_KEY_OPENAI_1]] + */ +function generatePlaceholder(secretType: string, context: PlaceholderContext): string { + return incrementAndGenerate(secretType, context, generateSecretPlaceholder); +} + +/** + * Masks secrets in text, replacing them with placeholders + */ +export function maskSecrets( + text: string, + locations: SecretLocation[], + context?: PlaceholderContext, +): MaskResult { + const ctx = context || createSecretsMaskingContext(); + const masked = replaceWithPlaceholders( + text, + locations, + ctx, + (loc) => loc.type, + generatePlaceholder, + resolveOverlaps, + ); + return { masked, context: ctx }; +} + +/** + * Unmasks text by replacing placeholders with original secrets + * + * @param text - Text containing secret placeholders + * @param context - Masking context with mappings + */ +export function unmaskSecrets(text: string, context: PlaceholderContext): string { + return restorePlaceholders(text, context); +} + +/** + * Masks secrets in messages using per-part detection results + * + * Uses transformMessagesPerPart for the common iteration pattern. 
+ */ +export function maskMessages( + messages: ChatMessage[], + detection: MessageSecretsResult, +): { masked: ChatMessage[]; context: PlaceholderContext } { + const context = createSecretsMaskingContext(); + + const masked = transformMessagesPerPart( + messages, + detection.messageLocations, + (text, locations, ctx) => maskSecrets(text, locations, ctx).masked, + context, + ); + + return { masked, context }; +} + +/** + * Streaming unmask helper - processes chunks and unmasks when complete placeholders are found + * + * Returns the unmasked portion and any remaining buffer that might contain partial placeholders. + */ +export function unmaskSecretsStreamChunk( + buffer: string, + newChunk: string, + context: PlaceholderContext, +): { output: string; remainingBuffer: string } { + return processStreamChunk(buffer, newChunk, context, unmaskSecrets); +} + +/** + * Flushes remaining buffer at end of stream + */ +export function flushSecretsMaskingBuffer(buffer: string, context: PlaceholderContext): string { + return flushBuffer(buffer, context, unmaskSecrets); +} + +/** + * Unmasks a chat completion response by replacing placeholders in all choices + */ +export function unmaskSecretsResponse( + response: ChatCompletionResponse, + context: PlaceholderContext, +): ChatCompletionResponse { + return restoreResponsePlaceholders(response, context); +} diff --git a/src/secrets/multimodal.test.ts b/src/secrets/multimodal.test.ts index b58be8c..e5a4a2a 100644 --- a/src/secrets/multimodal.test.ts +++ b/src/secrets/multimodal.test.ts @@ -1,11 +1,25 @@ import { describe, expect, test } from "bun:test"; +import type { PIIDetectionResult, PIIEntity } from "../pii/detect"; +import { maskMessages } from "../pii/mask"; import type { ChatMessage } from "../services/llm-client"; -import { maskMessages } from "../services/masking"; -import type { PIIEntity } from "../services/pii-detector"; import type { ContentPart } from "../utils/content"; +/** + * Helper to create PIIDetectionResult 
from per-part entities + */ +function createPIIResult(messageEntities: PIIEntity[][][]): PIIDetectionResult { + return { + hasPII: messageEntities.flat(2).length > 0, + messageEntities, + allEntities: messageEntities.flat(2), + scanTimeMs: 0, + language: "en", + languageFallback: false, + }; +} + describe("Multimodal content handling", () => { - describe("PII masking with offset tracking", () => { + describe("PII masking with per-part entities", () => { test("masks PII in multimodal array content", () => { const messages: ChatMessage[] = [ { @@ -18,16 +32,19 @@ describe("Multimodal content handling", () => { }, ]; - // Concatenated text: "My email is john@example.com and\nmy phone is 555-1234" - // Entities for this concatenated text: - const entities: PIIEntity[] = [ - { entity_type: "EMAIL_ADDRESS", start: 12, end: 28, score: 0.9 }, // john@example.com in part 0 - { entity_type: "PHONE_NUMBER", start: 45, end: 53, score: 0.85 }, // 555-1234 in part 2 (after newline) - ]; - - const entitiesByMessage = [entities]; + // Per-part entities: messageEntities[msgIdx][partIdx] = entities + const detection = createPIIResult([ + [ + // Part 0: email entity (positions relative to part text) + [{ entity_type: "EMAIL_ADDRESS", start: 12, end: 28, score: 0.9 }], + // Part 1: image, no entities + [], + // Part 2: phone entity (positions relative to part text) + [{ entity_type: "PHONE_NUMBER", start: 12, end: 20, score: 0.85 }], + ], + ]); - const { masked } = maskMessages(messages, entitiesByMessage); + const { masked } = maskMessages(messages, detection); // Verify the content is still an array expect(Array.isArray(masked[0].content)).toBe(true); @@ -50,8 +67,6 @@ describe("Multimodal content handling", () => { }); test("returns masked array instead of original unmasked array", () => { - // This tests the bug fix: previously array content was extracted and masked, - // but then the original array was returned unchanged const messages: ChatMessage[] = [ { role: "user", @@ -59,12 
+74,17 @@ describe("Multimodal content handling", () => { }, ]; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 8, end: 13, score: 0.9 }, // Alice - { entity_type: "EMAIL_ADDRESS", start: 17, end: 33, score: 0.95 }, // alice@secret.com - ]; + const detection = createPIIResult([ + [ + // Part 0 entities + [ + { entity_type: "PERSON", start: 8, end: 13, score: 0.9 }, + { entity_type: "EMAIL_ADDRESS", start: 17, end: 33, score: 0.95 }, + ], + ], + ]); - const { masked } = maskMessages(messages, [entities]); + const { masked } = maskMessages(messages, detection); // Verify content is still array expect(Array.isArray(masked[0].content)).toBe(true); @@ -78,40 +98,58 @@ describe("Multimodal content handling", () => { expect(maskedContent[0].text).toContain("[[EMAIL_ADDRESS_1]]"); }); - test("handles entities spanning multiple parts with proper offsets", () => { + test("handles multiple text parts independently", () => { const messages: ChatMessage[] = [ { role: "user", content: [ - { type: "text", text: "First part with email@" }, - { type: "text", text: "example.com in two parts" }, + { type: "text", text: "First: john@example.com" }, + { type: "text", text: "Second: jane@example.com" }, ], }, ]; - // In concatenated text: "First part with email@\nexample.com in two parts" - // Email spans from position 16 to 39 (crossing the newline at position 22) - const entities: PIIEntity[] = [ - { entity_type: "EMAIL_ADDRESS", start: 16, end: 34, score: 0.9 }, - ]; + const detection = createPIIResult([ + [ + // Part 0 entity + [{ entity_type: "EMAIL_ADDRESS", start: 7, end: 23, score: 0.9 }], + // Part 1 entity + [{ entity_type: "EMAIL_ADDRESS", start: 8, end: 24, score: 0.9 }], + ], + ]); - const { masked } = maskMessages(messages, [entities]); + const { masked } = maskMessages(messages, detection); const maskedContent = masked[0].content as ContentPart[]; - // Both parts should be affected by the email entity - // Part 0: "First part with [[EMAIL" or similar - 
// Part 1: "ADDRESS_1]] in two parts" or similar - // The exact split depends on how the masking handles cross-boundary entities + expect(maskedContent[0].text).toBe("First: [[EMAIL_ADDRESS_1]]"); + expect(maskedContent[1].text).toBe("Second: [[EMAIL_ADDRESS_2]]"); + }); + + test("handles mixed string and array content messages", () => { + const messages: ChatMessage[] = [ + { role: "system", content: "You are helpful" }, + { + role: "user", + content: [{ type: "text", text: "My name is John" }], + }, + { role: "assistant", content: "Hello John!" }, + ]; + + const detection = createPIIResult([ + // Message 0 (system): no PII + [[]], + // Message 1 (user multimodal): PII in part 0 + [[{ entity_type: "PERSON", start: 11, end: 15, score: 0.9 }]], + // Message 2 (assistant): PII in part 0 + [[{ entity_type: "PERSON", start: 6, end: 10, score: 0.9 }]], + ]); - // At minimum, verify that the entity is masked somewhere - const fullMasked = maskedContent - .filter((p) => p.type === "text") - .map((p) => p.text) - .join("\n"); + const { masked } = maskMessages(messages, detection); - expect(fullMasked).toContain("[[EMAIL_ADDRESS_"); - expect(fullMasked).not.toContain("email@example.com"); + expect(masked[0].content).toBe("You are helpful"); + expect((masked[1].content as ContentPart[])[0].text).toBe("My name is [[PERSON_1]]"); + expect(masked[2].content).toBe("Hello [[PERSON_1]]!"); }); }); }); diff --git a/src/secrets/patterns/api-keys.ts b/src/secrets/patterns/api-keys.ts index 438c1cd..57e33e8 100644 --- a/src/secrets/patterns/api-keys.ts +++ b/src/secrets/patterns/api-keys.ts @@ -1,4 +1,4 @@ -import type { PatternDetector, SecretsMatch, SecretsRedaction } from "./types"; +import type { PatternDetector, SecretLocation, SecretsMatch } from "./types"; import { detectPattern } from "./utils"; /** @@ -14,31 +14,31 @@ export const apiKeysDetector: PatternDetector = { detect(text: string, enabledTypes: Set) { const matches: SecretsMatch[] = []; - const redactions: 
SecretsRedaction[] = []; + const locations: SecretLocation[] = []; // OpenAI API keys: sk-... followed by alphanumeric chars // Modern format: sk-proj-... or sk-... with 48+ total chars if (enabledTypes.has("API_KEY_OPENAI")) { const openaiPattern = /sk-[a-zA-Z0-9_-]{45,}/g; - detectPattern(text, openaiPattern, "API_KEY_OPENAI", matches, redactions); + detectPattern(text, openaiPattern, "API_KEY_OPENAI", matches, locations); } // AWS access keys: AKIA followed by 16 uppercase alphanumeric chars if (enabledTypes.has("API_KEY_AWS")) { const awsPattern = /AKIA[0-9A-Z]{16}/g; - detectPattern(text, awsPattern, "API_KEY_AWS", matches, redactions); + detectPattern(text, awsPattern, "API_KEY_AWS", matches, locations); } // GitHub tokens: ghp_, gho_, ghu_, ghs_, ghr_ followed by 36+ alphanumeric chars if (enabledTypes.has("API_KEY_GITHUB")) { const githubPattern = /gh[pousr]_[a-zA-Z0-9]{36,}/g; - detectPattern(text, githubPattern, "API_KEY_GITHUB", matches, redactions); + detectPattern(text, githubPattern, "API_KEY_GITHUB", matches, locations); } return { detected: matches.length > 0, matches, - redactions: redactions.length > 0 ? redactions : undefined, + locations: locations.length > 0 ? 
locations : undefined, }; }, }; diff --git a/src/secrets/patterns/env-vars.ts b/src/secrets/patterns/env-vars.ts index 3b5c602..fa1a7f4 100644 --- a/src/secrets/patterns/env-vars.ts +++ b/src/secrets/patterns/env-vars.ts @@ -1,4 +1,4 @@ -import type { PatternDetector, SecretsMatch, SecretsRedaction } from "./types"; +import type { PatternDetector, SecretLocation, SecretsMatch } from "./types"; import { detectPattern } from "./utils"; /** @@ -14,21 +14,21 @@ export const envVarsDetector: PatternDetector = { detect(text: string, enabledTypes: Set) { const matches: SecretsMatch[] = []; - const redactions: SecretsRedaction[] = []; + const locations: SecretLocation[] = []; // Environment variable password patterns: _PASSWORD or _PWD suffix with value (8+ chars) // Case-insensitive for variable name, supports = and : assignment, quoted/unquoted values if (enabledTypes.has("ENV_PASSWORD")) { const passwordPattern = /[A-Za-z_][A-Za-z0-9_]*(?:PASSWORD|_PWD)\s*[=:]\s*['"]?[^\s'"]{8,}['"]?/gi; - detectPattern(text, passwordPattern, "ENV_PASSWORD", matches, redactions); + detectPattern(text, passwordPattern, "ENV_PASSWORD", matches, locations); } // Environment variable secret patterns: _SECRET suffix with value (8+ chars) // Case-insensitive for variable name, supports = and : assignment, quoted/unquoted values if (enabledTypes.has("ENV_SECRET")) { const secretPattern = /[A-Za-z_][A-Za-z0-9_]*_SECRET\s*[=:]\s*['"]?[^\s'"]{8,}['"]?/gi; - detectPattern(text, secretPattern, "ENV_SECRET", matches, redactions); + detectPattern(text, secretPattern, "ENV_SECRET", matches, locations); } // Database connection strings with embedded passwords (user:password@host format) @@ -36,13 +36,13 @@ export const envVarsDetector: PatternDetector = { if (enabledTypes.has("CONNECTION_STRING")) { const connectionPattern = /(?:postgres(?:ql)?|mysql|mariadb|mongodb(?:\+srv)?|redis|amqps?):\/\/[^:]+:[^@\s]+@[^\s'"]+/gi; - detectPattern(text, connectionPattern, "CONNECTION_STRING", matches, redactions); 
+ detectPattern(text, connectionPattern, "CONNECTION_STRING", matches, locations); } return { detected: matches.length > 0, matches, - redactions: redactions.length > 0 ? redactions : undefined, + locations: locations.length > 0 ? locations : undefined, }; }, }; diff --git a/src/secrets/patterns/private-keys.ts b/src/secrets/patterns/private-keys.ts index 2325eea..1c94424 100644 --- a/src/secrets/patterns/private-keys.ts +++ b/src/secrets/patterns/private-keys.ts @@ -1,8 +1,8 @@ import type { PatternDetector, + SecretLocation, SecretsDetectionResult, SecretsMatch, - SecretsRedaction, } from "./types"; import { detectPattern } from "./utils"; @@ -18,13 +18,13 @@ export const privateKeysDetector: PatternDetector = { detect(text: string, enabledTypes: Set): SecretsDetectionResult { const matches: SecretsMatch[] = []; - const redactions: SecretsRedaction[] = []; + const locations: SecretLocation[] = []; // OpenSSH private key pattern if (enabledTypes.has("OPENSSH_PRIVATE_KEY")) { const opensshPattern = /-----BEGIN OPENSSH PRIVATE KEY-----[\s\S]*?-----END OPENSSH PRIVATE KEY-----/g; - detectPattern(text, opensshPattern, "OPENSSH_PRIVATE_KEY", matches, redactions); + detectPattern(text, opensshPattern, "OPENSSH_PRIVATE_KEY", matches, locations); } // PEM private key patterns @@ -34,7 +34,7 @@ export const privateKeysDetector: PatternDetector = { // RSA PRIVATE KEY const rsaPattern = /-----BEGIN RSA PRIVATE KEY-----[\s\S]*?-----END RSA PRIVATE KEY-----/g; - detectPattern(text, rsaPattern, "PEM_PRIVATE_KEY", matches, redactions, matchedPositions); + detectPattern(text, rsaPattern, "PEM_PRIVATE_KEY", matches, locations, matchedPositions); // Remove PEM_PRIVATE_KEY from matches to accumulate all PEM types together const pemMatch = matches.find((m) => m.type === "PEM_PRIVATE_KEY"); @@ -51,7 +51,7 @@ export const privateKeysDetector: PatternDetector = { privateKeyPattern, "PEM_PRIVATE_KEY", tempMatches, - redactions, + locations, matchedPositions, ); totalPemCount += 
tempMatches[0]?.count || 0; @@ -65,7 +65,7 @@ export const privateKeysDetector: PatternDetector = { encryptedPattern, "PEM_PRIVATE_KEY", tempMatches2, - redactions, + locations, matchedPositions, ); totalPemCount += tempMatches2[0]?.count || 0; @@ -78,7 +78,7 @@ export const privateKeysDetector: PatternDetector = { return { detected: matches.length > 0, matches, - redactions: redactions.length > 0 ? redactions : undefined, + locations: locations.length > 0 ? locations : undefined, }; }, }; diff --git a/src/secrets/patterns/tokens.ts b/src/secrets/patterns/tokens.ts index c5a6c02..ff940b6 100644 --- a/src/secrets/patterns/tokens.ts +++ b/src/secrets/patterns/tokens.ts @@ -1,4 +1,4 @@ -import type { PatternDetector, SecretsMatch, SecretsRedaction } from "./types"; +import type { PatternDetector, SecretLocation, SecretsMatch } from "./types"; import { detectPattern } from "./utils"; /** @@ -13,26 +13,26 @@ export const tokensDetector: PatternDetector = { detect(text: string, enabledTypes: Set) { const matches: SecretsMatch[] = []; - const redactions: SecretsRedaction[] = []; + const locations: SecretLocation[] = []; // JWT tokens: three base64url segments separated by dots // Header starts with eyJ (base64 for {"...), minimum 20 chars per segment if (enabledTypes.has("JWT_TOKEN")) { const jwtPattern = /eyJ[a-zA-Z0-9_-]{20,}\.eyJ[a-zA-Z0-9_-]{20,}\.[a-zA-Z0-9_-]{20,}/g; - detectPattern(text, jwtPattern, "JWT_TOKEN", matches, redactions); + detectPattern(text, jwtPattern, "JWT_TOKEN", matches, locations); } // Bearer tokens in Authorization-style contexts // Matches "Bearer " followed by a token (at least 40 chars to reduce placeholder matches) if (enabledTypes.has("BEARER_TOKEN")) { const bearerPattern = /Bearer\s+[a-zA-Z0-9._-]{40,}/gi; - detectPattern(text, bearerPattern, "BEARER_TOKEN", matches, redactions); + detectPattern(text, bearerPattern, "BEARER_TOKEN", matches, locations); } return { detected: matches.length > 0, matches, - redactions: redactions.length > 0 
? redactions : undefined, + locations: locations.length > 0 ? locations : undefined, }; }, }; diff --git a/src/secrets/patterns/types.ts b/src/secrets/patterns/types.ts index 95b6f27..1c19985 100644 --- a/src/secrets/patterns/types.ts +++ b/src/secrets/patterns/types.ts @@ -18,7 +18,10 @@ export interface SecretsMatch { count: number; } -export interface SecretsRedaction { +/** + * Location of a detected secret in text + */ +export interface SecretLocation { start: number; end: number; type: SecretEntityType; @@ -27,7 +30,18 @@ export interface SecretsRedaction { export interface SecretsDetectionResult { detected: boolean; matches: SecretsMatch[]; - redactions?: SecretsRedaction[]; + locations?: SecretLocation[]; +} + +/** + * Per-message, per-part secrets detection result + * Structure: messageLocations[msgIdx][partIdx] = locations for that part + */ +export interface MessageSecretsResult { + detected: boolean; + matches: SecretsMatch[]; + /** Per-message, per-part secret locations */ + messageLocations: SecretLocation[][][]; } /** diff --git a/src/secrets/patterns/utils.ts b/src/secrets/patterns/utils.ts index 6d124d2..58f14fa 100644 --- a/src/secrets/patterns/utils.ts +++ b/src/secrets/patterns/utils.ts @@ -1,14 +1,14 @@ -import type { SecretsMatch, SecretsRedaction } from "./types"; +import type { SecretLocation, SecretsMatch } from "./types"; /** - * Helper to detect secrets matching a pattern and collect matches/redactions + * Helper to detect secrets matching a pattern and collect matches/locations */ export function detectPattern( text: string, pattern: RegExp, entityType: string, matches: SecretsMatch[], - redactions: SecretsRedaction[], + locations: SecretLocation[], existingPositions?: Set, ): number { let count = 0; @@ -19,10 +19,10 @@ export function detectPattern( count++; existingPositions?.add(match.index); - redactions.push({ + locations.push({ start: match.index, end: match.index + match[0].length, - type: entityType as SecretsRedaction["type"], + 
type: entityType as SecretLocation["type"], }); } } diff --git a/src/secrets/redact.test.ts b/src/secrets/redact.test.ts deleted file mode 100644 index c865705..0000000 --- a/src/secrets/redact.test.ts +++ /dev/null @@ -1,368 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import type { SecretsRedaction } from "./detect"; -import { - createRedactionContext, - flushRedactionBuffer, - redactMessagesSecrets, - redactSecrets, - unredactResponse, - unredactSecrets, - unredactStreamChunk, -} from "./redact"; - -const sampleSecret = "sk-proj-abc123def456ghi789jkl012mno345pqr678stu901vwx"; - -describe("redactSecrets", () => { - test("returns original text when no redactions", () => { - const text = "Hello world"; - const result = redactSecrets(text, []); - expect(result.redacted).toBe("Hello world"); - expect(Object.keys(result.context.mapping)).toHaveLength(0); - }); - - test("redacts single secret", () => { - const text = `My API key is ${sampleSecret}`; - const redactions: SecretsRedaction[] = [ - { start: 14, end: 14 + sampleSecret.length, type: "API_KEY_OPENAI" }, - ]; - const result = redactSecrets(text, redactions); - - expect(result.redacted).toBe("My API key is [[SECRET_REDACTED_API_KEY_OPENAI_1]]"); - expect(result.context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"]).toBe(sampleSecret); - }); - - test("redacts multiple secrets of same type", () => { - const text = `Key1: ${sampleSecret} Key2: ${sampleSecret}`; - const redactions: SecretsRedaction[] = [ - { start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }, - { - start: 6 + sampleSecret.length + 7, - end: 6 + sampleSecret.length * 2 + 7, - type: "API_KEY_OPENAI", - }, - ]; - const result = redactSecrets(text, redactions); - - // Same secret value should get same placeholder - expect(result.redacted).toBe( - "Key1: [[SECRET_REDACTED_API_KEY_OPENAI_1]] Key2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]", - ); - expect(Object.keys(result.context.mapping)).toHaveLength(1); - }); - - 
test("redacts multiple secrets of different types", () => { - const awsKey = "AKIAIOSFODNN7EXAMPLE"; - const text = `OpenAI: ${sampleSecret} AWS: ${awsKey}`; - const redactions: SecretsRedaction[] = [ - { start: 8, end: 8 + sampleSecret.length, type: "API_KEY_OPENAI" }, - { - start: 8 + sampleSecret.length + 6, - end: 8 + sampleSecret.length + 6 + awsKey.length, - type: "API_KEY_AWS", - }, - ]; - const result = redactSecrets(text, redactions); - - expect(result.redacted).toContain("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); - expect(result.redacted).toContain("[[SECRET_REDACTED_API_KEY_AWS_1]]"); - expect(Object.keys(result.context.mapping)).toHaveLength(2); - }); - - test("preserves context across multiple calls", () => { - const context = createRedactionContext(); - const text1 = `Key: ${sampleSecret}`; - const redactions1: SecretsRedaction[] = [ - { start: 5, end: 5 + sampleSecret.length, type: "API_KEY_OPENAI" }, - ]; - redactSecrets(text1, redactions1, context); - - const anotherSecret = "sk-proj-xyz789abc123def456ghi789jkl012mno345pqr678"; - const text2 = `Another: ${anotherSecret}`; - const redactions2: SecretsRedaction[] = [ - { start: 9, end: 9 + anotherSecret.length, type: "API_KEY_OPENAI" }, - ]; - const result2 = redactSecrets(text2, redactions2, context); - - // Second secret should get incremented counter - expect(result2.redacted).toBe("Another: [[SECRET_REDACTED_API_KEY_OPENAI_2]]"); - expect(Object.keys(context.mapping)).toHaveLength(2); - }); -}); - -describe("unredactSecrets", () => { - test("returns original text when no mappings", () => { - const context = createRedactionContext(); - const text = "Hello world"; - const result = unredactSecrets(text, context); - expect(result).toBe("Hello world"); - }); - - test("restores single secret", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - - const text = "My API key is [[SECRET_REDACTED_API_KEY_OPENAI_1]]"; - const result = 
unredactSecrets(text, context); - - expect(result).toBe(`My API key is ${sampleSecret}`); - }); - - test("restores multiple secrets", () => { - const context = createRedactionContext(); - const awsKey = "AKIAIOSFODNN7EXAMPLE"; - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - context.mapping["[[SECRET_REDACTED_API_KEY_AWS_1]]"] = awsKey; - - const text = - "OpenAI: [[SECRET_REDACTED_API_KEY_OPENAI_1]] AWS: [[SECRET_REDACTED_API_KEY_AWS_1]]"; - const result = unredactSecrets(text, context); - - expect(result).toBe(`OpenAI: ${sampleSecret} AWS: ${awsKey}`); - }); - - test("restores repeated placeholders", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - - const text = - "Key1: [[SECRET_REDACTED_API_KEY_OPENAI_1]] Key2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]"; - const result = unredactSecrets(text, context); - - expect(result).toBe(`Key1: ${sampleSecret} Key2: ${sampleSecret}`); - }); -}); - -describe("redact -> unredact roundtrip", () => { - test("preserves original data through roundtrip", () => { - const originalText = ` -Here are my credentials: -OpenAI API Key: ${sampleSecret} -Please store them securely. 
-`; - const redactions: SecretsRedaction[] = [ - { - start: originalText.indexOf(sampleSecret), - end: originalText.indexOf(sampleSecret) + sampleSecret.length, - type: "API_KEY_OPENAI", - }, - ]; - - const { redacted, context } = redactSecrets(originalText, redactions); - - // Verify secret is not in redacted text - expect(redacted).not.toContain(sampleSecret); - expect(redacted).toContain("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); - - // Unredact and verify original is restored - const restored = unredactSecrets(redacted, context); - expect(restored).toBe(originalText); - }); - - test("handles empty redactions array", () => { - const text = "No secrets here"; - const { redacted, context } = redactSecrets(text, []); - const restored = unredactSecrets(redacted, context); - expect(restored).toBe(text); - }); -}); - -describe("redactMessagesSecrets", () => { - test("redacts secrets in multiple messages", () => { - const messages = [ - { role: "user" as const, content: `My key is ${sampleSecret}` }, - { role: "assistant" as const, content: "I'll help you with that." 
}, - ]; - const redactionsByMessage: SecretsRedaction[][] = [ - [{ start: 10, end: 10 + sampleSecret.length, type: "API_KEY_OPENAI" }], - [], - ]; - - const { redacted, context } = redactMessagesSecrets(messages, redactionsByMessage); - - expect(redacted[0].content).toContain("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); - expect(redacted[0].content).not.toContain(sampleSecret); - expect(redacted[1].content).toBe("I'll help you with that."); - expect(Object.keys(context.mapping)).toHaveLength(1); - }); - - test("preserves message roles", () => { - const messages = [ - { role: "system" as const, content: "You are helpful" }, - { role: "user" as const, content: `Key: ${sampleSecret}` }, - ]; - const redactionsByMessage: SecretsRedaction[][] = [ - [], - [{ start: 5, end: 5 + sampleSecret.length, type: "API_KEY_OPENAI" }], - ]; - - const { redacted } = redactMessagesSecrets(messages, redactionsByMessage); - - expect(redacted[0].role).toBe("system"); - expect(redacted[1].role).toBe("user"); - }); - - test("shares context across messages", () => { - const messages = [ - { role: "user" as const, content: `Key1: ${sampleSecret}` }, - { role: "user" as const, content: `Key2: ${sampleSecret}` }, - ]; - const redactionsByMessage: SecretsRedaction[][] = [ - [{ start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }], - [{ start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }], - ]; - - const { redacted, context } = redactMessagesSecrets(messages, redactionsByMessage); - - // Same secret should get same placeholder across messages - expect(redacted[0].content).toBe("Key1: [[SECRET_REDACTED_API_KEY_OPENAI_1]]"); - expect(redacted[1].content).toBe("Key2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]"); - expect(Object.keys(context.mapping)).toHaveLength(1); - }); -}); - -describe("streaming unredact", () => { - test("unredacts complete placeholder in chunk", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = 
sampleSecret; - - const { output, remainingBuffer } = unredactStreamChunk( - "", - "Key: [[SECRET_REDACTED_API_KEY_OPENAI_1]] end", - context, - ); - - expect(output).toBe(`Key: ${sampleSecret} end`); - expect(remainingBuffer).toBe(""); - }); - - test("buffers partial placeholder", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - - const { output, remainingBuffer } = unredactStreamChunk("", "Key: [[SECRET_RED", context); - - expect(output).toBe("Key: "); - expect(remainingBuffer).toBe("[[SECRET_RED"); - }); - - test("completes buffered placeholder", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - - const { output, remainingBuffer } = unredactStreamChunk( - "[[SECRET_RED", - "ACTED_API_KEY_OPENAI_1]] done", - context, - ); - - expect(output).toBe(`${sampleSecret} done`); - expect(remainingBuffer).toBe(""); - }); - - test("handles text without placeholders", () => { - const context = createRedactionContext(); - - const { output, remainingBuffer } = unredactStreamChunk("", "Hello world", context); - - expect(output).toBe("Hello world"); - expect(remainingBuffer).toBe(""); - }); - - test("flushes remaining buffer", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - - const result = flushRedactionBuffer(" { - const context = createRedactionContext(); - const result = flushRedactionBuffer("", context); - expect(result).toBe(""); - }); -}); - -describe("unredactResponse", () => { - test("unredacts all choices in response", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - - const response = { - id: "test", - object: "chat.completion" as const, - created: Date.now(), - model: "gpt-4", - choices: [ - { - index: 0, - message: { - role: "assistant" as const, - 
content: "Your key is [[SECRET_REDACTED_API_KEY_OPENAI_1]]", - }, - finish_reason: "stop" as const, - }, - ], - }; - - const result = unredactResponse(response, context); - expect(result.choices[0].message.content).toBe(`Your key is ${sampleSecret}`); - }); - - test("handles multiple choices", () => { - const context = createRedactionContext(); - context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - - const response = { - id: "test", - object: "chat.completion" as const, - created: Date.now(), - model: "gpt-4", - choices: [ - { - index: 0, - message: { - role: "assistant" as const, - content: "Choice 1: [[SECRET_REDACTED_API_KEY_OPENAI_1]]", - }, - finish_reason: "stop" as const, - }, - { - index: 1, - message: { - role: "assistant" as const, - content: "Choice 2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]", - }, - finish_reason: "stop" as const, - }, - ], - }; - - const result = unredactResponse(response, context); - expect(result.choices[0].message.content).toBe(`Choice 1: ${sampleSecret}`); - expect(result.choices[1].message.content).toBe(`Choice 2: ${sampleSecret}`); - }); - - test("preserves response structure", () => { - const context = createRedactionContext(); - const response = { - id: "test-id", - object: "chat.completion" as const, - created: 12345, - model: "gpt-4-turbo", - choices: [ - { - index: 0, - message: { role: "assistant" as const, content: "Hello" }, - finish_reason: "stop" as const, - }, - ], - usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 }, - }; - - const result = unredactResponse(response, context); - expect(result.id).toBe("test-id"); - expect(result.model).toBe("gpt-4-turbo"); - expect(result.usage).toEqual({ prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 }); - }); -}); diff --git a/src/secrets/redact.ts b/src/secrets/redact.ts deleted file mode 100644 index 9526512..0000000 --- a/src/secrets/redact.ts +++ /dev/null @@ -1,211 +0,0 @@ -import { findPartialPlaceholderStart, 
generateSecretPlaceholder } from "../constants/placeholders"; -import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client"; -import { resolveOverlaps } from "../utils/conflict-resolver"; -import { extractTextContent } from "../utils/content"; -import type { SecretsRedaction } from "./detect"; - -/** - * Context for tracking secret redaction mappings - * Similar to MaskingContext for PII but for secrets - */ -export interface RedactionContext { - /** Maps placeholder -> original secret */ - mapping: Record; - /** Maps original secret -> placeholder */ - reverseMapping: Record; - /** Counter per secret type for sequential numbering */ - counters: Record; -} - -export interface RedactionResult { - redacted: string; - context: RedactionContext; -} - -/** - * Creates a new redaction context for a request - */ -export function createRedactionContext(): RedactionContext { - return { - mapping: {}, - reverseMapping: {}, - counters: {}, - }; -} - -/** - * Generates a placeholder for a secret type - * - * Format: [[SECRET_REDACTED_{TYPE}_{N}]] e.g. [[SECRET_REDACTED_API_KEY_OPENAI_1]] - */ -function generatePlaceholder(secretType: string, context: RedactionContext): string { - const count = (context.counters[secretType] || 0) + 1; - context.counters[secretType] = count; - - return generateSecretPlaceholder(secretType, count); -} - -/** - * Redacts secrets in text, replacing them with placeholders - * - * Stores mapping in context for later unredaction. - * Redactions must be provided sorted by start position descending (as returned by detectSecrets). 
- * - * @param text - The text to redact secrets from - * @param redactions - Array of redaction positions (sorted by start position descending) - * @param context - Optional existing context to reuse (for multiple messages) - */ -export function redactSecrets( - text: string, - redactions: SecretsRedaction[], - context?: RedactionContext, -): RedactionResult { - const ctx = context || createRedactionContext(); - - if (redactions.length === 0) { - return { redacted: text, context: ctx }; - } - - // Resolve conflicts between overlapping redactions - const resolved = resolveOverlaps(redactions); - - // First pass: sort by start position ascending to assign placeholders in order of appearance - const sortedByStart = [...resolved].sort((a, b) => a.start - b.start); - - // Assign placeholders in order of appearance - const redactionPlaceholders = new Map(); - for (const redaction of sortedByStart) { - const originalValue = text.slice(redaction.start, redaction.end); - - // Check if we already have a placeholder for this exact value - let placeholder = ctx.reverseMapping[originalValue]; - - if (!placeholder) { - placeholder = generatePlaceholder(redaction.type, ctx); - ctx.mapping[placeholder] = originalValue; - ctx.reverseMapping[originalValue] = placeholder; - } - - redactionPlaceholders.set(redaction, placeholder); - } - - // Second pass: replace from end to start to maintain correct string positions - const sortedByEnd = [...resolved].sort((a, b) => b.start - a.start); - - let result = text; - for (const redaction of sortedByEnd) { - const placeholder = redactionPlaceholders.get(redaction)!; - result = result.slice(0, redaction.start) + placeholder + result.slice(redaction.end); - } - - return { redacted: result, context: ctx }; -} - -/** - * Unredacts text by replacing placeholders with original secrets - * - * @param text - Text containing secret placeholders - * @param context - Redaction context with mappings - */ -export function unredactSecrets(text: string, 
context: RedactionContext): string { - let result = text; - - // Sort placeholders by length descending to avoid partial replacements - const placeholders = Object.keys(context.mapping).sort((a, b) => b.length - a.length); - - for (const placeholder of placeholders) { - const originalValue = context.mapping[placeholder]; - // Replace all occurrences of the placeholder - result = result.split(placeholder).join(originalValue); - } - - return result; -} - -/** - * Redacts secrets in multiple messages (for chat completions) - * - * @param messages - Chat messages to redact - * @param redactionsByMessage - Redactions for each message (indexed by message position) - */ -export function redactMessagesSecrets( - messages: ChatMessage[], - redactionsByMessage: SecretsRedaction[][], -): { redacted: ChatMessage[]; context: RedactionContext } { - const context = createRedactionContext(); - - const redacted = messages.map((msg, i) => { - const redactions = redactionsByMessage[i] || []; - const text = extractTextContent(msg.content); - const { redacted: redactedContent } = redactSecrets(text, redactions, context); - - // If original content was a string, return redacted string - // Otherwise return original content (arrays are handled in proxy.ts) - return { ...msg, content: typeof msg.content === "string" ? redactedContent : msg.content }; - }); - - return { redacted, context }; -} - -/** - * Streaming unredact helper - processes chunks and unredacts when complete placeholders are found - * - * Similar to PII unmasking but for secrets. - * Returns the unredacted portion and any remaining buffer that might contain partial placeholders. 
- */ -export function unredactStreamChunk( - buffer: string, - newChunk: string, - context: RedactionContext, -): { output: string; remainingBuffer: string } { - const combined = buffer + newChunk; - - const partialStart = findPartialPlaceholderStart(combined); - - if (partialStart === -1) { - // No partial placeholder, safe to unredact everything - return { - output: unredactSecrets(combined, context), - remainingBuffer: "", - }; - } - - // Partial placeholder detected, buffer it - const safeToProcess = combined.slice(0, partialStart); - const toBuffer = combined.slice(partialStart); - - return { - output: unredactSecrets(safeToProcess, context), - remainingBuffer: toBuffer, - }; -} - -/** - * Flushes remaining buffer at end of stream - */ -export function flushRedactionBuffer(buffer: string, context: RedactionContext): string { - if (!buffer) return ""; - return unredactSecrets(buffer, context); -} - -/** - * Unredacts a chat completion response by replacing placeholders in all choices - */ -export function unredactResponse( - response: ChatCompletionResponse, - context: RedactionContext, -): ChatCompletionResponse { - return { - ...response, - choices: response.choices.map((choice) => ({ - ...choice, - message: { - ...choice.message, - content: - typeof choice.message.content === "string" - ? 
unredactSecrets(choice.message.content, context) - : choice.message.content, - }, - })), - }; -} diff --git a/src/services/decision.test.ts b/src/services/decision.test.ts index 3d4985a..8a87c24 100644 --- a/src/services/decision.test.ts +++ b/src/services/decision.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from "bun:test"; -import type { SecretsDetectionResult, SecretsMatch } from "../secrets/detect"; -import type { PIIDetectionResult } from "./pii-detector"; +import type { PIIDetectionResult } from "../pii/detect"; +import type { MessageSecretsResult, SecretsMatch } from "../secrets/detect"; /** * Pure routing logic extracted for testing @@ -8,8 +8,8 @@ import type { PIIDetectionResult } from "./pii-detector"; */ function decideRoute( piiResult: PIIDetectionResult, - secretsResult?: SecretsDetectionResult, - secretsAction?: "block" | "redact" | "route_local", + secretsResult?: MessageSecretsResult, + secretsAction?: "block" | "mask" | "route_local", ): { provider: "openai" | "local"; reason: string } { // Check for secrets route_local action first (takes precedence) if (secretsResult?.detected && secretsAction === "route_local") { @@ -21,7 +21,7 @@ function decideRoute( } if (piiResult.hasPII) { - const entityTypes = [...new Set(piiResult.newEntities.map((e) => e.entity_type))]; + const entityTypes = [...new Set(piiResult.allEntities.map((e) => e.entity_type))]; return { provider: "local", reason: `PII detected: ${entityTypes.join(", ")}`, @@ -41,7 +41,7 @@ function createPIIResult( hasPII: boolean, entities: Array<{ entity_type: string }> = [], ): PIIDetectionResult { - const newEntities = entities.map((e) => ({ + const allEntities = entities.map((e) => ({ entity_type: e.entity_type, start: 0, end: 10, @@ -50,8 +50,8 @@ function createPIIResult( return { hasPII, - newEntities, - entitiesByMessage: [newEntities], + allEntities, + messageEntities: [[allEntities]], language: "en", languageFallback: false, scanTimeMs: 50, @@ -104,16 +104,16 @@ 
describe("decideRoute", () => { }); /** - * Helper to create a mock SecretsDetectionResult + * Helper to create a mock MessageSecretsResult */ function createSecretsResult( detected: boolean, matches: SecretsMatch[] = [], -): SecretsDetectionResult { +): MessageSecretsResult { return { detected, matches, - redactions: matches.map((m, i) => ({ start: i * 100, end: i * 100 + 50, type: m.type })), + messageLocations: [], }; } @@ -175,14 +175,14 @@ describe("decideRoute with secrets", () => { }); }); - describe("with redact action", () => { - test("ignores secrets detection for routing (redacted before PII check)", () => { + describe("with mask action", () => { + test("ignores secrets detection for routing (masked before PII check)", () => { const piiResult = createPIIResult(false); const secretsResult = createSecretsResult(true, [{ type: "BEARER_TOKEN", count: 1 }]); - const result = decideRoute(piiResult, secretsResult, "redact"); + const result = decideRoute(piiResult, secretsResult, "mask"); - // With redact action, we route based on PII, not secrets + // With mask action, we route based on PII, not secrets expect(result.provider).toBe("openai"); expect(result.reason).toBe("No PII detected"); }); diff --git a/src/services/decision.ts b/src/services/decision.ts index da1bdbf..a6bef8b 100644 --- a/src/services/decision.ts +++ b/src/services/decision.ts @@ -1,8 +1,9 @@ import { type Config, getConfig } from "../config"; -import type { SecretsDetectionResult } from "../secrets/detect"; -import { type ChatMessage, LLMClient } from "../services/llm-client"; -import { createMaskingContext, type MaskingContext, maskMessages } from "../services/masking"; -import { getPIIDetector, type PIIDetectionResult } from "../services/pii-detector"; +import { getPIIDetector, type PIIDetectionResult } from "../pii/detect"; +import { createMaskingContext, maskMessages } from "../pii/mask"; +import type { MessageSecretsResult } from "../secrets/detect"; +import type { PlaceholderContext } 
from "../utils/message-transform"; +import { type ChatMessage, LLMClient } from "./llm-client"; /** * Routing decision result for route mode @@ -23,7 +24,7 @@ export interface MaskDecision { reason: string; piiResult: PIIDetectionResult; maskedMessages: ChatMessage[]; - maskingContext: MaskingContext; + maskingContext: PlaceholderContext; } export type RoutingDecision = RouteDecision | MaskDecision; @@ -61,13 +62,13 @@ export class Router { */ async decide( messages: ChatMessage[], - secretsResult?: SecretsDetectionResult, + secretsResult?: MessageSecretsResult, ): Promise { const detector = getPIIDetector(); const piiResult = await detector.analyzeMessages(messages); if (this.config.mode === "mask") { - return await this.decideMask(messages, piiResult); + return this.decideMask(messages, piiResult); } return this.decideRoute(piiResult, secretsResult); @@ -82,7 +83,7 @@ export class Router { */ private decideRoute( piiResult: PIIDetectionResult, - secretsResult?: SecretsDetectionResult, + secretsResult?: MessageSecretsResult, ): RouteDecision { // Check for secrets route_local action first (takes precedence) if (secretsResult?.detected && this.config.secrets_detection.action === "route_local") { @@ -97,7 +98,7 @@ export class Router { // Route based on PII detection if (piiResult.hasPII) { - const entityTypes = [...new Set(piiResult.newEntities.map((e) => e.entity_type))]; + const entityTypes = [...new Set(piiResult.allEntities.map((e) => e.entity_type))]; return { mode: "route", provider: "local", @@ -115,10 +116,7 @@ export class Router { }; } - private async decideMask( - messages: ChatMessage[], - piiResult: PIIDetectionResult, - ): Promise { + private decideMask(messages: ChatMessage[], piiResult: PIIDetectionResult): MaskDecision { if (!piiResult.hasPII) { return { mode: "mask", @@ -130,9 +128,9 @@ export class Router { }; } - const { masked, context } = maskMessages(messages, piiResult.entitiesByMessage); + const { masked, context } = maskMessages(messages, 
piiResult); - const entityTypes = [...new Set(piiResult.newEntities.map((e) => e.entity_type))]; + const entityTypes = [...new Set(piiResult.allEntities.map((e) => e.entity_type))]; return { mode: "mask", diff --git a/src/services/language-detector.ts b/src/services/language-detector.ts index 4442432..dd48541 100644 --- a/src/services/language-detector.ts +++ b/src/services/language-detector.ts @@ -1,32 +1,8 @@ import eld from "eld/small"; import { getConfig } from "../config"; +import type { SupportedLanguage } from "../constants/languages"; -// All 24 spaCy languages with trained pipelines -export type SupportedLanguage = - | "ca" - | "zh" - | "hr" - | "da" - | "nl" - | "en" - | "fi" - | "fr" - | "de" - | "el" - | "it" - | "ja" - | "ko" - | "lt" - | "mk" - | "nb" - | "pl" - | "pt" - | "ro" - | "ru" - | "sl" - | "es" - | "sv" - | "uk"; +export type { SupportedLanguage } from "../constants/languages"; export interface LanguageDetectionResult { language: SupportedLanguage; diff --git a/src/services/masking.test.ts b/src/services/masking.test.ts deleted file mode 100644 index bbfa58e..0000000 --- a/src/services/masking.test.ts +++ /dev/null @@ -1,656 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import type { MaskingConfig } from "../config"; -import type { ChatMessage } from "./llm-client"; -import { - createMaskingContext, - flushStreamBuffer, - mask, - maskMessages, - unmask, - unmaskResponse, - unmaskStreamChunk, -} from "./masking"; -import type { PIIEntity } from "./pii-detector"; - -const defaultConfig: MaskingConfig = { - show_markers: false, - marker_text: "[protected]", -}; - -const configWithMarkers: MaskingConfig = { - show_markers: true, - marker_text: "[protected]", -}; - -describe("mask", () => { - test("returns original text when no entities", () => { - const result = mask("Hello world", []); - expect(result.masked).toBe("Hello world"); - expect(Object.keys(result.context.mapping)).toHaveLength(0); - }); - - test("masks single email 
entity", () => { - // "Contact: john@example.com please" - // ^9 ^25 - const entities: PIIEntity[] = [{ entity_type: "EMAIL_ADDRESS", start: 9, end: 25, score: 1.0 }]; - - const result = mask("Contact: john@example.com please", entities); - - expect(result.masked).toBe("Contact: [[EMAIL_ADDRESS_1]] please"); - expect(result.context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("john@example.com"); - }); - - test("masks multiple entities of same type", () => { - const text = "Emails: a@b.com and c@d.com"; - const entities: PIIEntity[] = [ - { entity_type: "EMAIL_ADDRESS", start: 8, end: 15, score: 1.0 }, - { entity_type: "EMAIL_ADDRESS", start: 20, end: 27, score: 1.0 }, - ]; - - const result = mask(text, entities); - - expect(result.masked).toBe("Emails: [[EMAIL_ADDRESS_1]] and [[EMAIL_ADDRESS_2]]"); - expect(result.context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("a@b.com"); - expect(result.context.mapping["[[EMAIL_ADDRESS_2]]"]).toBe("c@d.com"); - }); - - test("masks multiple entity types", () => { - const text = "Hans Müller: hans@firma.de"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 0, end: 11, score: 0.9 }, - { entity_type: "EMAIL_ADDRESS", start: 13, end: 26, score: 1.0 }, - ]; - - const result = mask(text, entities); - - expect(result.masked).toBe("[[PERSON_1]]: [[EMAIL_ADDRESS_1]]"); - expect(result.context.mapping["[[PERSON_1]]"]).toBe("Hans Müller"); - expect(result.context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("hans@firma.de"); - }); - - test("reuses placeholder for duplicate values", () => { - const text = "a@b.com and again a@b.com"; - const entities: PIIEntity[] = [ - { entity_type: "EMAIL_ADDRESS", start: 0, end: 7, score: 1.0 }, - { entity_type: "EMAIL_ADDRESS", start: 18, end: 25, score: 1.0 }, - ]; - - const result = mask(text, entities); - - // Same value should get same placeholder - expect(result.masked).toBe("[[EMAIL_ADDRESS_1]] and again [[EMAIL_ADDRESS_1]]"); - expect(Object.keys(result.context.mapping)).toHaveLength(1); - }); - - 
test("handles adjacent entities", () => { - const text = "HansMüller"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, - { entity_type: "PERSON", start: 4, end: 10, score: 0.9 }, - ]; - - const result = mask(text, entities); - - expect(result.masked).toBe("[[PERSON_1]][[PERSON_2]]"); - }); - - test("preserves context across calls", () => { - const context = createMaskingContext(); - - const result1 = mask( - "Email: a@b.com", - [{ entity_type: "EMAIL_ADDRESS", start: 7, end: 14, score: 1.0 }], - context, - ); - - expect(result1.masked).toBe("Email: [[EMAIL_ADDRESS_1]]"); - - const result2 = mask( - "Another: c@d.com", - [{ entity_type: "EMAIL_ADDRESS", start: 9, end: 16, score: 1.0 }], - context, - ); - - // Should continue numbering - expect(result2.masked).toBe("Another: [[EMAIL_ADDRESS_2]]"); - expect(context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("a@b.com"); - expect(context.mapping["[[EMAIL_ADDRESS_2]]"]).toBe("c@d.com"); - }); -}); - -describe("unmask", () => { - test("returns original text when no mappings", () => { - const context = createMaskingContext(); - const result = unmask("Hello world", context, defaultConfig); - expect(result).toBe("Hello world"); - }); - - test("restores single placeholder", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "john@example.com"; - - const result = unmask("Reply to [[EMAIL_ADDRESS_1]]", context, defaultConfig); - expect(result).toBe("Reply to john@example.com"); - }); - - test("restores multiple placeholders", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "Hans Müller"; - context.mapping["[[EMAIL_ADDRESS_1]]"] = "hans@firma.de"; - - const result = unmask( - "Hello [[PERSON_1]], your email [[EMAIL_ADDRESS_1]] is confirmed", - context, - defaultConfig, - ); - expect(result).toBe("Hello Hans Müller, your email hans@firma.de is confirmed"); - }); - - test("restores repeated placeholders", () => { 
- const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - - const result = unmask("[[EMAIL_ADDRESS_1]] and [[EMAIL_ADDRESS_1]]", context, defaultConfig); - expect(result).toBe("test@test.com and test@test.com"); - }); - - test("adds markers when configured", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "john@example.com"; - - const result = unmask("Email: [[EMAIL_ADDRESS_1]]", context, configWithMarkers); - expect(result).toBe("Email: [protected]john@example.com"); - }); - - test("handles partial placeholder (no match)", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - - const result = unmask("Text with [[EMAIL_ADDRESS_2]]", context, defaultConfig); - expect(result).toBe("Text with [[EMAIL_ADDRESS_2]]"); // No match, unchanged - }); -}); - -describe("mask -> unmask roundtrip", () => { - test("preserves original data through roundtrip", () => { - const originalText = "Contact Hans Müller at hans@firma.de or call +49123456789"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 8, end: 19, score: 0.9 }, - { entity_type: "EMAIL_ADDRESS", start: 23, end: 36, score: 1.0 }, - { entity_type: "PHONE_NUMBER", start: 45, end: 57, score: 0.95 }, - ]; - - const { masked, context } = mask(originalText, entities); - - // Verify masking worked - expect(masked).not.toContain("Hans Müller"); - expect(masked).not.toContain("hans@firma.de"); - expect(masked).not.toContain("+49123456789"); - - // Simulate LLM response that echoes placeholders - const llmResponse = `I see your contact info: ${masked.match(/\[\[PERSON_1\]\]/)?.[0]}, email ${masked.match(/\[\[EMAIL_ADDRESS_1\]\]/)?.[0]}`; - - const unmasked = unmask(llmResponse, context, defaultConfig); - - expect(unmasked).toContain("Hans Müller"); - expect(unmasked).toContain("hans@firma.de"); - }); - - test("handles empty entities array", () => { - const text = 
"No PII here"; - const { masked, context } = mask(text, []); - const unmasked = unmask(masked, context, defaultConfig); - - expect(unmasked).toBe(text); - }); -}); - -describe("maskMessages", () => { - test("masks multiple messages", () => { - const messages: ChatMessage[] = [ - { role: "user", content: "My email is test@example.com" }, - { role: "assistant", content: "Got it" }, - { role: "user", content: "Also john@test.com" }, - ]; - - const entitiesByMessage: PIIEntity[][] = [ - [{ entity_type: "EMAIL_ADDRESS", start: 12, end: 28, score: 1.0 }], - [], - [{ entity_type: "EMAIL_ADDRESS", start: 5, end: 18, score: 1.0 }], - ]; - - const { masked, context } = maskMessages(messages, entitiesByMessage); - - expect(masked[0].content).toBe("My email is [[EMAIL_ADDRESS_1]]"); - expect(masked[1].content).toBe("Got it"); - expect(masked[2].content).toBe("Also [[EMAIL_ADDRESS_2]]"); - - expect(context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("test@example.com"); - expect(context.mapping["[[EMAIL_ADDRESS_2]]"]).toBe("john@test.com"); - }); - - test("preserves message roles", () => { - const messages: ChatMessage[] = [ - { role: "system", content: "You are helpful" }, - { role: "user", content: "Hi" }, - ]; - - const { masked } = maskMessages(messages, [[], []]); - - expect(masked[0].role).toBe("system"); - expect(masked[1].role).toBe("user"); - }); -}); - -describe("streaming unmask", () => { - test("unmasks complete placeholder in chunk", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - - const { output, remainingBuffer } = unmaskStreamChunk( - "", - "Hello [[EMAIL_ADDRESS_1]]!", - context, - defaultConfig, - ); - - expect(output).toBe("Hello test@test.com!"); - expect(remainingBuffer).toBe(""); - }); - - test("buffers partial placeholder", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - - const { output, remainingBuffer } = unmaskStreamChunk( - "", - 
"Hello [[EMAIL_ADD", - context, - defaultConfig, - ); - - expect(output).toBe("Hello "); - expect(remainingBuffer).toBe("[[EMAIL_ADD"); - }); - - test("completes buffered placeholder", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - - const { output, remainingBuffer } = unmaskStreamChunk( - "[[EMAIL_ADD", - "RESS_1]] there", - context, - defaultConfig, - ); - - expect(output).toBe("test@test.com there"); - expect(remainingBuffer).toBe(""); - }); - - test("handles text without placeholders", () => { - const context = createMaskingContext(); - - const { output, remainingBuffer } = unmaskStreamChunk( - "", - "Just normal text", - context, - defaultConfig, - ); - - expect(output).toBe("Just normal text"); - expect(remainingBuffer).toBe(""); - }); - - test("flushes remaining buffer", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - - // Partial that never completes - const flushed = flushStreamBuffer("[[EMAIL_ADD", context, defaultConfig); - - // Should return as-is since no complete placeholder - expect(flushed).toBe("[[EMAIL_ADD"); - }); -}); - -describe("unmaskResponse", () => { - test("unmasks all choices in response", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - context.mapping["[[PERSON_1]]"] = "John Doe"; - - const response = { - id: "chatcmpl-123", - object: "chat.completion" as const, - created: 1234567890, - model: "gpt-4", - choices: [ - { - index: 0, - message: { - role: "assistant" as const, - content: "Contact [[PERSON_1]] at [[EMAIL_ADDRESS_1]]", - }, - finish_reason: "stop" as const, - }, - ], - usage: { - prompt_tokens: 10, - completion_tokens: 20, - total_tokens: 30, - }, - }; - - const result = unmaskResponse(response, context, defaultConfig); - - expect(result.choices[0].message.content).toBe("Contact John Doe at test@test.com"); - 
expect(result.id).toBe("chatcmpl-123"); - expect(result.model).toBe("gpt-4"); - }); - - test("handles multiple choices", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "a@b.com"; - - const response = { - id: "chatcmpl-456", - object: "chat.completion" as const, - created: 1234567890, - model: "gpt-4", - choices: [ - { - index: 0, - message: { role: "assistant" as const, content: "First: [[EMAIL_ADDRESS_1]]" }, - finish_reason: "stop" as const, - }, - { - index: 1, - message: { role: "assistant" as const, content: "Second: [[EMAIL_ADDRESS_1]]" }, - finish_reason: "stop" as const, - }, - ], - }; - - const result = unmaskResponse(response, context, defaultConfig); - - expect(result.choices[0].message.content).toBe("First: a@b.com"); - expect(result.choices[1].message.content).toBe("Second: a@b.com"); - }); - - test("preserves response structure", () => { - const context = createMaskingContext(); - const response = { - id: "test-id", - object: "chat.completion" as const, - created: 999, - model: "test-model", - choices: [ - { - index: 0, - message: { role: "assistant" as const, content: "No placeholders" }, - finish_reason: null, - }, - ], - usage: { prompt_tokens: 5, completion_tokens: 10, total_tokens: 15 }, - }; - - const result = unmaskResponse(response, context, defaultConfig); - - expect(result.id).toBe("test-id"); - expect(result.object).toBe("chat.completion"); - expect(result.created).toBe(999); - expect(result.model).toBe("test-model"); - expect(result.usage).toEqual({ prompt_tokens: 5, completion_tokens: 10, total_tokens: 15 }); - }); -}); - -describe("edge cases", () => { - test("handles unicode in masked text", () => { - const text = "Kontakt: François Müller"; - const entities: PIIEntity[] = [{ entity_type: "PERSON", start: 9, end: 24, score: 0.9 }]; - - const { masked, context } = mask(text, entities); - expect(masked).toBe("Kontakt: [[PERSON_1]]"); - - const unmasked = unmask(masked, context, defaultConfig); 
- expect(unmasked).toBe("Kontakt: François Müller"); - }); - - test("handles empty text", () => { - const { masked, context } = mask("", []); - expect(masked).toBe(""); - expect(unmask("", context, defaultConfig)).toBe(""); - }); - - test("handles placeholder-like text that is not a real placeholder", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - - const result = unmask("Use [[UNKNOWN_1]] format", context, defaultConfig); - expect(result).toBe("Use [[UNKNOWN_1]] format"); - }); -}); - -describe("HTML context handling (issue #36)", () => { - test("unmasks placeholders in HTML without encoding issues", () => { - // With [[]] format, placeholders are not affected by HTML encoding - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "Dr. Sarah Chen"; - context.mapping["[[EMAIL_ADDRESS_1]]"] = "sarah.chen@hospital.org"; - - // [[]] brackets don't get HTML-encoded, so they work directly - const htmlResponse = `

Contact [[PERSON_1]] at [[EMAIL_ADDRESS_1]]

`; - - const result = unmask(htmlResponse, context, defaultConfig); - - expect(result).toBe("

Contact Dr. Sarah Chen at sarah.chen@hospital.org

"); - }); - - test("unmasks placeholders in HTML title attributes", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "Jane Smith"; - - // [[]] works in HTML attributes without encoding - const htmlWithAttr = `Click here`; - - const result = unmask(htmlWithAttr, context, defaultConfig); - - expect(result).toBe(`Click here`); - }); - - test("unmasks placeholders in mailto links", () => { - const context = createMaskingContext(); - context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@example.com"; - - const mailtoHtml = `Send email`; - - const result = unmask(mailtoHtml, context, defaultConfig); - - expect(result).toBe(`Send email`); - }); - - test("handles multiple occurrences of same placeholder in HTML", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "Alice"; - - const response = `

[[PERSON_1]] said hello.

[[PERSON_1]] waved goodbye.

`; - - const result = unmask(response, context, defaultConfig); - - expect(result).toBe("

Alice said hello.

Alice waved goodbye.

"); - }); - - test("works with complex HTML structures", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "Dr. Sarah Chen"; - context.mapping["[[EMAIL_ADDRESS_1]]"] = "sarah@hospital.org"; - context.mapping["[[PHONE_NUMBER_1]]"] = "+1-555-0123"; - - const complexHtml = ` -
-

[[PERSON_1]]

- [[EMAIL_ADDRESS_1]] - Call: [[PHONE_NUMBER_1]] -
- `; - - const result = unmask(complexHtml, context, defaultConfig); - - expect(result).toContain("Dr. Sarah Chen"); - expect(result).toContain("sarah@hospital.org"); - expect(result).toContain("+1-555-0123"); - expect(result).not.toContain("[["); - expect(result).not.toContain("]]"); - }); -}); - -describe("streaming with [[]] placeholders (issue #36)", () => { - test("handles complete placeholder in chunk", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "John Doe"; - - const { output, remainingBuffer } = unmaskStreamChunk( - "", - "Hello [[PERSON_1]]!", - context, - defaultConfig, - ); - - expect(output).toBe("Hello John Doe!"); - expect(remainingBuffer).toBe(""); - }); - - test("buffers partial placeholder at end of chunk", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "John Doe"; - - // Partial placeholder at end: [[PERS - const { output, remainingBuffer } = unmaskStreamChunk( - "", - "Hello [[PERS", - context, - defaultConfig, - ); - - expect(output).toBe("Hello "); - expect(remainingBuffer).toBe("[[PERS"); - }); - - test("completes buffered placeholder across chunks", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "John Doe"; - - const { output, remainingBuffer } = unmaskStreamChunk( - "[[PERS", - "ON_1]] there", - context, - defaultConfig, - ); - - expect(output).toBe("John Doe there"); - expect(remainingBuffer).toBe(""); - }); - - test("handles placeholder split at closing brackets", () => { - const context = createMaskingContext(); - context.mapping["[[PERSON_1]]"] = "John Doe"; - - // First chunk ends with incomplete closing - const result1 = unmaskStreamChunk("", "Hello [[PERSON_1]", context, defaultConfig); - expect(result1.output).toBe("Hello "); - expect(result1.remainingBuffer).toBe("[[PERSON_1]"); - - // Second chunk completes it - const result2 = unmaskStreamChunk(result1.remainingBuffer, "] world", context, defaultConfig); - 
expect(result2.output).toBe("John Doe world"); - expect(result2.remainingBuffer).toBe(""); - }); -}); - -describe("overlapping entities (issue #33)", () => { - test("handles overlapping entities with same start - keeps longer", () => { - // Bug: Presidio returns both "Eric" and "Eric's" as separate PERSON entities - const text = "Given Eric's feedback"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 6, end: 10, score: 0.85 }, // "Eric" - { entity_type: "PERSON", start: 6, end: 12, score: 0.8 }, // "Eric's" - ]; - - const { masked, context } = mask(text, entities); - - // Longer span wins when same start position - expect(masked).toBe("Given [[PERSON_1]] feedback"); - expect(context.mapping["[[PERSON_1]]"]).toBe("Eric's"); - }); - - test("handles partially overlapping entities of same type - merges them", () => { - const text = "Contact John Smith Jones please"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 8, end: 18, score: 0.9 }, // "John Smith" - { entity_type: "PERSON", start: 13, end: 25, score: 0.7 }, // "Smith Jones" - ]; - - const { masked } = mask(text, entities); - - // Presidio behavior: same-type overlapping entities are MERGED - // Merged entity spans 8-25 ("John Smith Jones"), keeps highest score - expect(masked).toBe("Contact [[PERSON_1]]please"); - }); - - test("handles nested entities - keeps outer (starts first)", () => { - const text = "Dr. John Smith is here"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 0, end: 14, score: 0.9 }, // "Dr. 
John Smith" - { entity_type: "PERSON", start: 4, end: 8, score: 0.85 }, // "John" - ]; - - const { masked } = mask(text, entities); - - expect(masked).toBe("[[PERSON_1]] is here"); - }); - - test("keeps adjacent non-overlapping entities", () => { - const text = "HansMüller"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, // "Hans" - { entity_type: "PERSON", start: 4, end: 10, score: 0.9 }, // "Müller" - ]; - - const { masked } = mask(text, entities); - - expect(masked).toBe("[[PERSON_1]][[PERSON_2]]"); - }); - - test("handles multiple independent overlap groups", () => { - const text = "Laura Smith met Eric's friend Bob Jones Jr"; - const entities: PIIEntity[] = [ - // Group 1: same start - longer wins - { entity_type: "PERSON", start: 0, end: 5, score: 0.85 }, // "Laura" - { entity_type: "PERSON", start: 0, end: 11, score: 0.9 }, // "Laura Smith" - // Group 2: same start - longer wins - { entity_type: "PERSON", start: 16, end: 20, score: 0.85 }, // "Eric" - { entity_type: "PERSON", start: 16, end: 22, score: 0.8 }, // "Eric's" - // Group 3: same start - longer wins - { entity_type: "PERSON", start: 30, end: 33, score: 0.7 }, // "Bob" - { entity_type: "PERSON", start: 30, end: 42, score: 0.9 }, // "Bob Jones Jr" - ]; - - const { masked } = mask(text, entities); - - expect(masked).toBe("[[PERSON_1]] met [[PERSON_2]] friend [[PERSON_3]]"); - }); - - test("entity consistency - same value gets same placeholder", () => { - const text = "Eric met Eric again"; - const entities: PIIEntity[] = [ - { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, // "Eric" - { entity_type: "PERSON", start: 9, end: 13, score: 0.9 }, // "Eric" - ]; - - const { masked, context } = mask(text, entities); - - expect(masked).toBe("[[PERSON_1]] met [[PERSON_1]] again"); - expect(Object.keys(context.mapping)).toHaveLength(1); - }); -}); diff --git a/src/services/masking.ts b/src/services/masking.ts deleted file mode 100644 index b9dbeb9..0000000 
--- a/src/services/masking.ts +++ /dev/null @@ -1,247 +0,0 @@ -import type { MaskingConfig } from "../config"; -import { - findPartialPlaceholderStart, - generatePlaceholder as generatePlaceholderFromFormat, - PII_PLACEHOLDER_FORMAT, -} from "../constants/placeholders"; -import { resolveConflicts } from "../utils/conflict-resolver"; -import { extractTextContent } from "../utils/content"; -import type { ChatCompletionResponse, ChatMessage } from "./llm-client"; -import type { PIIEntity } from "./pii-detector"; - -export interface MaskingContext { - mapping: Record; - reverseMapping: Record; - counters: Record; -} - -export interface MaskResult { - masked: string; - context: MaskingContext; -} - -/** - * Creates a new masking context for a request - */ -export function createMaskingContext(): MaskingContext { - return { - mapping: {}, - reverseMapping: {}, - counters: {}, - }; -} - -/** - * Generates a placeholder for a PII entity type - */ -function generatePlaceholder(entityType: string, context: MaskingContext): string { - const count = (context.counters[entityType] || 0) + 1; - context.counters[entityType] = count; - - return generatePlaceholderFromFormat(PII_PLACEHOLDER_FORMAT, entityType, count); -} - -/** - * Masks PII entities in text, replacing them with placeholders - * - * First assigns placeholders in order of appearance (start position ascending), - * then replaces from end to start to maintain correct string positions - */ -export function mask(text: string, entities: PIIEntity[], context?: MaskingContext): MaskResult { - const ctx = context || createMaskingContext(); - - if (entities.length === 0) { - return { masked: text, context: ctx }; - } - - // Resolve conflicts between overlapping entities using Presidio's algorithm - // Presidio can return overlapping entities (e.g., "Eric" and "Eric's") - const resolved = resolveConflicts(entities); - - // First pass: sort by start position ascending to assign placeholders in order - const sortedByStart = 
[...resolved].sort((a, b) => a.start - b.start); - - // Assign placeholders in order of appearance - const entityPlaceholders = new Map(); - for (const entity of sortedByStart) { - const originalValue = text.slice(entity.start, entity.end); - - // Check if we already have a placeholder for this exact value - let placeholder = ctx.reverseMapping[originalValue]; - - if (!placeholder) { - placeholder = generatePlaceholder(entity.entity_type, ctx); - ctx.mapping[placeholder] = originalValue; - ctx.reverseMapping[originalValue] = placeholder; - } - - entityPlaceholders.set(entity, placeholder); - } - - // Second pass: sort by start position descending for replacement - // This ensures string indices remain valid as we replace - const sortedByEnd = [...resolved].sort((a, b) => b.start - a.start); - - let result = text; - for (const entity of sortedByEnd) { - const placeholder = entityPlaceholders.get(entity)!; - result = result.slice(0, entity.start) + placeholder + result.slice(entity.end); - } - - return { masked: result, context: ctx }; -} - -/** - * Unmasks text by replacing placeholders with original values - * - * Optionally adds markers to indicate protected content - */ -export function unmask(text: string, context: MaskingContext, config: MaskingConfig): string { - let result = text; - - // Sort placeholders by length descending to avoid partial replacements - const placeholders = Object.keys(context.mapping).sort((a, b) => b.length - a.length); - - for (const placeholder of placeholders) { - const originalValue = context.mapping[placeholder]; - const replacement = config.show_markers - ? 
`${config.marker_text}${originalValue}` - : originalValue; - - // Replace all occurrences of the placeholder - result = result.split(placeholder).join(replacement); - } - - return result; -} - -/** - * Masks multiple messages (for chat completions) - */ -export function maskMessages( - messages: ChatMessage[], - entitiesByMessage: PIIEntity[][], -): { masked: ChatMessage[]; context: MaskingContext } { - const context = createMaskingContext(); - - const masked = messages.map((msg, i) => { - const entities = entitiesByMessage[i] || []; - - // Handle array content (multimodal messages) - if (Array.isArray(msg.content)) { - if (entities.length === 0) { - return msg; - } - - // Track offset position within the concatenated text for this message - // (matches how extractTextContent joins parts with \n) - let partOffset = 0; - - // Mask only text parts with proper offset tracking - const maskedContent = msg.content.map((part) => { - if (part.type === "text" && typeof part.text === "string") { - const partLength = part.text.length; - - // Find entities that apply to this specific part - const partEntities = entities - .filter((e) => e.start < partOffset + partLength && e.end > partOffset) - .map((e) => ({ - ...e, - start: Math.max(0, e.start - partOffset), - end: Math.min(partLength, e.end - partOffset), - })); - - if (partEntities.length > 0) { - const { masked: maskedText } = mask(part.text, partEntities, context); - partOffset += partLength + 1; // +1 for \n separator - return { ...part, text: maskedText }; - } - - partOffset += partLength + 1; // +1 for \n separator - return part; - } - return part; - }); - - return { ...msg, content: maskedContent }; - } - - // Handle string content (text-only messages) - const text = extractTextContent(msg.content); - const { masked: maskedContent } = mask(text, entities, context); - - // If original content was a string, return masked string - // Otherwise return original content - return { ...msg, content: typeof msg.content === 
"string" ? maskedContent : msg.content }; - }); - - return { masked, context }; -} - -/** - * Streaming unmask helper - processes chunks and unmasks when complete placeholders are found - * - * Returns the unmasked portion and any remaining buffer that might contain partial placeholders - */ -export function unmaskStreamChunk( - buffer: string, - newChunk: string, - context: MaskingContext, - config: MaskingConfig, -): { output: string; remainingBuffer: string } { - const combined = buffer + newChunk; - - const partialStart = findPartialPlaceholderStart(combined); - - if (partialStart === -1) { - // No partial placeholder, safe to unmask everything - return { - output: unmask(combined, context, config), - remainingBuffer: "", - }; - } - - // Partial placeholder detected, buffer it - const safeToProcess = combined.slice(0, partialStart); - const toBuffer = combined.slice(partialStart); - - return { - output: unmask(safeToProcess, context, config), - remainingBuffer: toBuffer, - }; -} - -/** - * Flushes remaining buffer at end of stream - */ -export function flushStreamBuffer( - buffer: string, - context: MaskingContext, - config: MaskingConfig, -): string { - if (!buffer) return ""; - return unmask(buffer, context, config); -} - -/** - * Unmasks a chat completion response by replacing placeholders in all choices - */ -export function unmaskResponse( - response: ChatCompletionResponse, - context: MaskingContext, - config: MaskingConfig, -): ChatCompletionResponse { - return { - ...response, - choices: response.choices.map((choice) => ({ - ...choice, - message: { - ...choice.message, - content: - typeof choice.message.content === "string" - ? 
unmask(choice.message.content, context, config) - : choice.message.content, - }, - })), - }; -} diff --git a/src/services/stream-transformer.test.ts b/src/services/stream-transformer.test.ts index b79cc56..eba9922 100644 --- a/src/services/stream-transformer.test.ts +++ b/src/services/stream-transformer.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from "bun:test"; import type { MaskingConfig } from "../config"; -import { createMaskingContext } from "./masking"; +import { createMaskingContext } from "../pii/mask"; import { createUnmaskingStream } from "./stream-transformer"; const defaultConfig: MaskingConfig = { diff --git a/src/services/stream-transformer.ts b/src/services/stream-transformer.ts index aa69b2f..ea64add 100644 --- a/src/services/stream-transformer.ts +++ b/src/services/stream-transformer.ts @@ -1,10 +1,7 @@ import type { MaskingConfig } from "../config"; -import { - flushRedactionBuffer, - type RedactionContext, - unredactStreamChunk, -} from "../secrets/redact"; -import { flushStreamBuffer, type MaskingContext, unmaskStreamChunk } from "./masking"; +import { flushMaskingBuffer, unmaskStreamChunk } from "../pii/mask"; +import { flushSecretsMaskingBuffer, unmaskSecretsStreamChunk } from "../secrets/mask"; +import type { PlaceholderContext } from "../utils/message-transform"; /** * Creates a transform stream that unmasks SSE content @@ -12,13 +9,13 @@ import { flushStreamBuffer, type MaskingContext, unmaskStreamChunk } from "./mas * Processes Server-Sent Events (SSE) chunks, buffering partial placeholders * and unmasking complete ones before forwarding to the client. * - * Supports both PII unmasking and secret unredaction, or either alone. + * Supports both PII unmasking and secrets unmasking, or either alone. 
*/ export function createUnmaskingStream( source: ReadableStream, - piiContext: MaskingContext | undefined, + piiContext: PlaceholderContext | undefined, config: MaskingConfig, - secretsContext?: RedactionContext, + secretsContext?: PlaceholderContext, ): ReadableStream { const decoder = new TextDecoder(); const encoder = new TextEncoder(); @@ -39,14 +36,14 @@ export function createUnmaskingStream( // Flush PII buffer first if (piiBuffer && piiContext) { - flushed = flushStreamBuffer(piiBuffer, piiContext, config); + flushed = flushMaskingBuffer(piiBuffer, piiContext, config); } else if (piiBuffer) { flushed = piiBuffer; } // Then flush secrets buffer if (secretsBuffer && secretsContext) { - flushed += flushRedactionBuffer(secretsBuffer, secretsContext); + flushed += flushSecretsMaskingBuffer(secretsBuffer, secretsContext); } else if (secretsBuffer) { flushed += secretsBuffer; } @@ -101,9 +98,9 @@ export function createUnmaskingStream( processedContent = output; } - // Then unredact secrets if context provided + // Then unmask secrets if context provided if (secretsContext && processedContent) { - const { output, remainingBuffer } = unredactStreamChunk( + const { output, remainingBuffer } = unmaskSecretsStreamChunk( secretsBuffer, processedContent, secretsContext, diff --git a/src/test-utils/detection-results.ts b/src/test-utils/detection-results.ts new file mode 100644 index 0000000..30cfc48 --- /dev/null +++ b/src/test-utils/detection-results.ts @@ -0,0 +1,51 @@ +/** + * Test utilities for creating detection results + * + * Shared helpers for creating PIIDetectionResult and MessageSecretsResult + * from per-message, per-part data in tests. 
+ */ + +import type { SupportedLanguage } from "../constants/languages"; +import type { PIIDetectionResult, PIIEntity } from "../pii/detect"; +import type { MessageSecretsResult, SecretLocation } from "../secrets/detect"; + +/** + * Creates a PIIDetectionResult from per-message, per-part entities + * + * @param messageEntities - Nested array: messageEntities[msgIdx][partIdx] = entities[] + * @param options - Optional overrides for language, scanTimeMs, etc. + */ +export function createPIIResult( + messageEntities: PIIEntity[][][], + options: { + language?: SupportedLanguage; + languageFallback?: boolean; + detectedLanguage?: string; + scanTimeMs?: number; + } = {}, +): PIIDetectionResult { + const allEntities = messageEntities.flat(2); + return { + hasPII: allEntities.length > 0, + messageEntities, + allEntities, + scanTimeMs: options.scanTimeMs ?? 0, + language: options.language ?? "en", + languageFallback: options.languageFallback ?? false, + detectedLanguage: options.detectedLanguage, + }; +} + +/** + * Creates a MessageSecretsResult from per-message, per-part locations + * + * @param messageLocations - Nested array: messageLocations[msgIdx][partIdx] = locations[] + */ +export function createSecretsResult(messageLocations: SecretLocation[][][]): MessageSecretsResult { + const hasLocations = messageLocations.some((msg) => msg.some((part) => part.length > 0)); + return { + detected: hasLocations, + matches: [], // Matches are aggregated separately in real detection + messageLocations, + }; +} diff --git a/src/utils/conflict-resolver.ts b/src/utils/conflict-resolver.ts index 5c2c090..1ae4148 100644 --- a/src/utils/conflict-resolver.ts +++ b/src/utils/conflict-resolver.ts @@ -1,19 +1,23 @@ // Conflict resolution based on Microsoft Presidio's logic // https://github.com/microsoft/presidio/blob/main/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py -export interface EntityWithScore { +/** + * Base interface for items with position (used by both PII and 
secrets) + */ +export interface Span { start: number; end: number; - score: number; - entity_type: string; } -interface Interval { - start: number; - end: number; +/** + * Extended interface for PII entities with confidence scores + */ +export interface EntityWithScore extends Span { + score: number; + entity_type: string; } -function overlaps(a: Interval, b: Interval): boolean { +function overlaps(a: Span, b: Span): boolean { return a.start < b.end && b.start < a.end; } @@ -28,7 +32,7 @@ function groupBy(items: T[], keyFn: (item: T) => string): Map { return groups; } -function mergeOverlapping(intervals: T[], merge: (a: T, b: T) => T): T[] { +function mergeOverlapping(intervals: T[], merge: (a: T, b: T) => T): T[] { if (intervals.length <= 1) return [...intervals]; const sorted = [...intervals].sort((a, b) => a.start - b.start); @@ -92,11 +96,14 @@ export function resolveConflicts(entities: T[]): T[] return removeConflicting(afterMerge); } -/** For secrets without scores. Keeps non-overlapping, longer wins ties. */ -export function resolveOverlaps(entities: T[]): T[] { - if (entities.length <= 1) return [...entities]; +/** + * Simple conflict resolution for items without scores (secrets) + * Keeps non-overlapping spans, longer span wins ties. 
+ */ +export function resolveOverlaps(items: T[]): T[] { + if (items.length <= 1) return [...items]; - const sorted = [...entities].sort((a, b) => { + const sorted = [...items].sort((a, b) => { if (a.start !== b.start) return a.start - b.start; return b.end - b.start - (a.end - a.start); }); diff --git a/src/utils/content.test.ts b/src/utils/content.test.ts index 0040d07..3b60a2b 100644 --- a/src/utils/content.test.ts +++ b/src/utils/content.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from "bun:test"; -import { type ContentPart, extractTextContent, hasTextContent } from "./content"; +import { type ContentPart, extractTextContent } from "./content"; describe("extractTextContent", () => { test("returns empty string for null", () => { @@ -47,33 +47,3 @@ describe("extractTextContent", () => { expect(extractTextContent([])).toBe(""); }); }); - -describe("hasTextContent", () => { - test("returns false for null", () => { - expect(hasTextContent(null)).toBe(false); - }); - - test("returns false for undefined", () => { - expect(hasTextContent(undefined)).toBe(false); - }); - - test("returns true for non-empty string", () => { - expect(hasTextContent("Hello")).toBe(true); - }); - - test("returns false for empty string", () => { - expect(hasTextContent("")).toBe(false); - }); - - test("returns true for array with text", () => { - const content: ContentPart[] = [{ type: "text", text: "Hello" }]; - expect(hasTextContent(content)).toBe(true); - }); - - test("returns false for array without text", () => { - const content: ContentPart[] = [ - { type: "image_url", image_url: { url: "https://example.com/image.jpg" } }, - ]; - expect(hasTextContent(content)).toBe(false); - }); -}); diff --git a/src/utils/content.ts b/src/utils/content.ts index 5e52983..7a256de 100644 --- a/src/utils/content.ts +++ b/src/utils/content.ts @@ -67,13 +67,3 @@ export function extractTextContent(content: MessageContent): string { // Unexpected type - return empty string return ""; } - -/** - 
* Checks if content has any text - * - * @param content - The message content to check - * @returns true if content contains text, false otherwise - */ -export function hasTextContent(content: MessageContent): boolean { - return extractTextContent(content).length > 0; -} diff --git a/src/utils/message-transform.test.ts b/src/utils/message-transform.test.ts new file mode 100644 index 0000000..0c68abc --- /dev/null +++ b/src/utils/message-transform.test.ts @@ -0,0 +1,555 @@ +import { describe, expect, test } from "bun:test"; +import type { ChatMessage } from "../services/llm-client"; +import type { Span } from "./conflict-resolver"; +import { + createPlaceholderContext, + flushBuffer, + incrementAndGenerate, + processStreamChunk, + replaceWithPlaceholders, + restorePlaceholders, + restoreResponsePlaceholders, + transformMessagesPerPart, +} from "./message-transform"; + +/** + * Simple placeholder format for testing: [[TYPE_N]] + */ +function testPlaceholder(type: string, count: number): string { + return `[[${type}_${count}]]`; +} + +/** + * Simple conflict resolver that keeps non-overlapping items (first wins) + */ +function simpleResolveConflicts(items: T[]): T[] { + if (items.length <= 1) return [...items]; + const sorted = [...items].sort((a, b) => a.start - b.start); + const result: T[] = [sorted[0]]; + for (let i = 1; i < sorted.length; i++) { + const current = sorted[i]; + const last = result[result.length - 1]; + if (current.start >= last.end) { + result.push(current); + } + } + return result; +} + +interface TestItem extends Span { + type: string; +} + +describe("createPlaceholderContext", () => { + test("creates empty context", () => { + const ctx = createPlaceholderContext(); + expect(ctx.mapping).toEqual({}); + expect(ctx.reverseMapping).toEqual({}); + expect(ctx.counters).toEqual({}); + }); +}); + +describe("incrementAndGenerate", () => { + test("increments counter and generates placeholder", () => { + const ctx = createPlaceholderContext(); + + const p1 
= incrementAndGenerate("EMAIL", ctx, testPlaceholder); + expect(p1).toBe("[[EMAIL_1]]"); + expect(ctx.counters.EMAIL).toBe(1); + + const p2 = incrementAndGenerate("EMAIL", ctx, testPlaceholder); + expect(p2).toBe("[[EMAIL_2]]"); + expect(ctx.counters.EMAIL).toBe(2); + }); + + test("tracks different types separately", () => { + const ctx = createPlaceholderContext(); + + incrementAndGenerate("EMAIL", ctx, testPlaceholder); + incrementAndGenerate("PERSON", ctx, testPlaceholder); + incrementAndGenerate("EMAIL", ctx, testPlaceholder); + + expect(ctx.counters.EMAIL).toBe(2); + expect(ctx.counters.PERSON).toBe(1); + }); +}); + +describe("replaceWithPlaceholders", () => { + test("returns original text when no items", () => { + const ctx = createPlaceholderContext(); + const result = replaceWithPlaceholders( + "Hello world", + [], + ctx, + (item: TestItem) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + expect(result).toBe("Hello world"); + }); + + test("replaces single item", () => { + const ctx = createPlaceholderContext(); + const items: TestItem[] = [{ start: 0, end: 5, type: "WORD" }]; + + const result = replaceWithPlaceholders( + "Hello world", + items, + ctx, + (item) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + expect(result).toBe("[[WORD_1]] world"); + expect(ctx.mapping["[[WORD_1]]"]).toBe("Hello"); + }); + + test("replaces multiple items", () => { + const ctx = createPlaceholderContext(); + const items: TestItem[] = [ + { start: 0, end: 5, type: "WORD" }, + { start: 6, end: 11, type: "WORD" }, + ]; + + const result = replaceWithPlaceholders( + "Hello world", + items, + ctx, + (item) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + expect(result).toBe("[[WORD_1]] [[WORD_2]]"); + }); + + test("reuses placeholder for duplicate values", () => { + const ctx = 
createPlaceholderContext(); + const items: TestItem[] = [ + { start: 0, end: 3, type: "WORD" }, + { start: 8, end: 11, type: "WORD" }, + ]; + + const result = replaceWithPlaceholders( + "foo bar foo", + items, + ctx, + (item) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + expect(result).toBe("[[WORD_1]] bar [[WORD_1]]"); + expect(Object.keys(ctx.mapping)).toHaveLength(1); + }); + + test("preserves context across calls", () => { + const ctx = createPlaceholderContext(); + + replaceWithPlaceholders( + "Hello", + [{ start: 0, end: 5, type: "WORD" }], + ctx, + (item: TestItem) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + const result = replaceWithPlaceholders( + "World", + [{ start: 0, end: 5, type: "WORD" }], + ctx, + (item: TestItem) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + expect(result).toBe("[[WORD_2]]"); + expect(ctx.mapping["[[WORD_1]]"]).toBe("Hello"); + expect(ctx.mapping["[[WORD_2]]"]).toBe("World"); + }); + + test("handles adjacent items", () => { + const ctx = createPlaceholderContext(); + const items: TestItem[] = [ + { start: 0, end: 2, type: "A" }, + { start: 2, end: 4, type: "B" }, + ]; + + const result = replaceWithPlaceholders( + "AABB", + items, + ctx, + (item) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + expect(result).toBe("[[A_1]][[B_1]]"); + }); +}); + +describe("restorePlaceholders", () => { + test("returns original text when no mappings", () => { + const ctx = createPlaceholderContext(); + expect(restorePlaceholders("Hello world", ctx)).toBe("Hello world"); + }); + + test("restores single placeholder", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[WORD_1]]"] = "Hello"; + + expect(restorePlaceholders("[[WORD_1]] world", ctx)).toBe("Hello 
world"); + }); + + test("restores multiple placeholders", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[A_1]]"] = "Hello"; + ctx.mapping["[[B_1]]"] = "World"; + + expect(restorePlaceholders("[[A_1]] [[B_1]]", ctx)).toBe("Hello World"); + }); + + test("restores repeated placeholders", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "test"; + + expect(restorePlaceholders("[[X_1]] and [[X_1]]", ctx)).toBe("test and test"); + }); + + test("applies formatValue function", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "secret"; + + const result = restorePlaceholders("Value: [[X_1]]", ctx, (v) => `[REDACTED:${v}]`); + expect(result).toBe("Value: [REDACTED:secret]"); + }); + + test("leaves unknown placeholders unchanged", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "known"; + + expect(restorePlaceholders("[[X_1]] [[Y_1]]", ctx)).toBe("known [[Y_1]]"); + }); +}); + +describe("replace -> restore roundtrip", () => { + test("preserves original data", () => { + const ctx = createPlaceholderContext(); + const original = "Contact john@example.com or call +1234567890"; + const items: TestItem[] = [ + { start: 8, end: 24, type: "EMAIL" }, + { start: 33, end: 44, type: "PHONE" }, + ]; + + const replaced = replaceWithPlaceholders( + original, + items, + ctx, + (item) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + expect(replaced).not.toContain("john@example.com"); + expect(replaced).not.toContain("+1234567890"); + + const restored = restorePlaceholders(replaced, ctx); + expect(restored).toBe(original); + }); +}); + +describe("transformMessagesPerPart", () => { + test("transforms string content", () => { + const messages: ChatMessage[] = [{ role: "user", content: "Hello world" }]; + const perPartData = [[[{ marker: true }]]]; + + const result = transformMessagesPerPart( + messages, + perPartData, + (text, 
data) => (data.length > 0 ? text.toUpperCase() : text), + {}, + ); + + expect(result[0].content).toBe("HELLO WORLD"); + }); + + test("skips messages without data", () => { + const messages: ChatMessage[] = [ + { role: "user", content: "Keep this" }, + { role: "assistant", content: "And this" }, + ]; + const perPartData = [[[]], [[]]]; + + const result = transformMessagesPerPart( + messages, + perPartData, + (text) => text.toUpperCase(), + {}, + ); + + expect(result[0].content).toBe("Keep this"); + expect(result[1].content).toBe("And this"); + }); + + test("transforms array content (multimodal)", () => { + const messages: ChatMessage[] = [ + { + role: "user", + content: [ + { type: "text", text: "Hello" }, + { type: "image_url", image_url: { url: "https://example.com/img.jpg" } }, + ], + }, + ]; + const perPartData = [[[{ marker: true }], []]]; + + const result = transformMessagesPerPart( + messages, + perPartData, + (text, data) => (data.length > 0 ? text.toUpperCase() : text), + {}, + ); + + const content = result[0].content as Array<{ type: string; text?: string }>; + expect(content[0].text).toBe("HELLO"); + expect(content[1].type).toBe("image_url"); + }); + + test("preserves message roles", () => { + const messages: ChatMessage[] = [ + { role: "system", content: "sys" }, + { role: "user", content: "usr" }, + { role: "assistant", content: "ast" }, + ]; + const perPartData = [[[]], [[]], [[]]]; + + const result = transformMessagesPerPart(messages, perPartData, (t) => t, {}); + + expect(result[0].role).toBe("system"); + expect(result[1].role).toBe("user"); + expect(result[2].role).toBe("assistant"); + }); + + test("passes context to transform function", () => { + const messages: ChatMessage[] = [{ role: "user", content: "test" }]; + const perPartData = [[[{ id: 1 }]]]; + const ctx = { prefix: ">> " }; + + const result = transformMessagesPerPart( + messages, + perPartData, + (text, _data, context: { prefix: string }) => context.prefix + text, + ctx, + ); + + 
expect(result[0].content).toBe(">> test"); + }); +}); + +describe("restoreResponsePlaceholders", () => { + test("restores placeholders in response choices", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "secret"; + + const response = { + id: "test", + choices: [{ message: { content: "Value: [[X_1]]" } }], + }; + + const result = restoreResponsePlaceholders(response, ctx); + expect(result.choices[0].message.content).toBe("Value: secret"); + }); + + test("handles multiple choices", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "val"; + + const response = { + id: "test", + choices: [{ message: { content: "A: [[X_1]]" } }, { message: { content: "B: [[X_1]]" } }], + }; + + const result = restoreResponsePlaceholders(response, ctx); + expect(result.choices[0].message.content).toBe("A: val"); + expect(result.choices[1].message.content).toBe("B: val"); + }); + + test("preserves response structure", () => { + const ctx = createPlaceholderContext(); + const response = { + id: "resp-123", + model: "test-model", + choices: [{ message: { content: "text" } }], + usage: { tokens: 10 }, + }; + + const result = restoreResponsePlaceholders(response, ctx); + expect(result.id).toBe("resp-123"); + expect(result.model).toBe("test-model"); + expect(result.usage).toEqual({ tokens: 10 }); + }); + + test("applies formatValue function", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "secret"; + + const response = { + id: "test", + choices: [{ message: { content: "[[X_1]]" } }], + }; + + const result = restoreResponsePlaceholders(response, ctx, (v) => `<${v}>`); + expect(result.choices[0].message.content).toBe("<secret>"); + }); + + test("handles non-string content", () => { + const ctx = createPlaceholderContext(); + const response = { + id: "test", + choices: [{ message: { content: null } }], + }; + + const result = restoreResponsePlaceholders(response, ctx); + 
expect(result.choices[0].message.content).toBe(null); + }); +}); + +describe("processStreamChunk", () => { + test("processes complete text without placeholders", () => { + const ctx = createPlaceholderContext(); + const restore = (text: string) => text; + + const { output, remainingBuffer } = processStreamChunk("", "Hello world", ctx, restore); + + expect(output).toBe("Hello world"); + expect(remainingBuffer).toBe(""); + }); + + test("processes complete placeholder", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "secret"; + + const { output, remainingBuffer } = processStreamChunk( + "", + "Value: [[X_1]]!", + ctx, + restorePlaceholders, + ); + + expect(output).toBe("Value: secret!"); + expect(remainingBuffer).toBe(""); + }); + + test("buffers partial placeholder at end", () => { + const ctx = createPlaceholderContext(); + + const { output, remainingBuffer } = processStreamChunk( + "", + "Hello [[PARTIAL", + ctx, + restorePlaceholders, + ); + + expect(output).toBe("Hello "); + expect(remainingBuffer).toBe("[[PARTIAL"); + }); + + test("completes buffered placeholder", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "done"; + + const { output, remainingBuffer } = processStreamChunk( + "[[X_", + "1]] end", + ctx, + restorePlaceholders, + ); + + expect(output).toBe("done end"); + expect(remainingBuffer).toBe(""); + }); + + test("handles multiple chunks with partial placeholders", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[LONG_PLACEHOLDER_1]]"] = "value"; + + // First chunk + const r1 = processStreamChunk("", "Start [[LONG_", ctx, restorePlaceholders); + expect(r1.output).toBe("Start "); + expect(r1.remainingBuffer).toBe("[[LONG_"); + + // Second chunk + const r2 = processStreamChunk(r1.remainingBuffer, "PLACEHOLDER_", ctx, restorePlaceholders); + expect(r2.output).toBe(""); + expect(r2.remainingBuffer).toBe("[[LONG_PLACEHOLDER_"); + + // Third chunk completes it + const r3 = 
processStreamChunk(r2.remainingBuffer, "1]] end", ctx, restorePlaceholders); + expect(r3.output).toBe("value end"); + expect(r3.remainingBuffer).toBe(""); + }); +}); + +describe("flushBuffer", () => { + test("returns empty string for empty buffer", () => { + const ctx = createPlaceholderContext(); + expect(flushBuffer("", ctx, restorePlaceholders)).toBe(""); + }); + + test("flushes incomplete placeholder as-is", () => { + const ctx = createPlaceholderContext(); + expect(flushBuffer("[[INCOMPLETE", ctx, restorePlaceholders)).toBe("[[INCOMPLETE"); + }); + + test("restores complete placeholder in buffer", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[X_1]]"] = "final"; + + expect(flushBuffer("[[X_1]]", ctx, restorePlaceholders)).toBe("final"); + }); +}); + +describe("edge cases", () => { + test("handles unicode text", () => { + const ctx = createPlaceholderContext(); + const items: TestItem[] = [{ start: 0, end: 11, type: "NAME" }]; + + const result = replaceWithPlaceholders( + "François Müller", + items, + ctx, + (item) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + + // Note: JS string indices are UTF-16 code units + expect(ctx.mapping["[[NAME_1]]"]).toBe("François Mü"); + + const restored = restorePlaceholders(result, ctx); + expect(restored).toContain("François Mü"); + }); + + test("handles empty text", () => { + const ctx = createPlaceholderContext(); + const result = replaceWithPlaceholders( + "", + [], + ctx, + (item: TestItem) => item.type, + (type, ctx) => incrementAndGenerate(type, ctx, testPlaceholder), + simpleResolveConflicts, + ); + expect(result).toBe(""); + }); + + test("handles placeholder-like text that is not in mapping", () => { + const ctx = createPlaceholderContext(); + ctx.mapping["[[A_1]]"] = "known"; + + const result = restorePlaceholders("[[A_1]] and [[B_1]]", ctx); + expect(result).toBe("known and [[B_1]]"); + }); +}); diff --git 
a/src/utils/message-transform.ts b/src/utils/message-transform.ts new file mode 100644 index 0000000..424d152 --- /dev/null +++ b/src/utils/message-transform.ts @@ -0,0 +1,282 @@ +/** + * Generic utilities for per-part message transformations + * + * Both PII masking and secrets masking need to: + * 1. Iterate over messages and their content parts + * 2. Apply transformations based on per-part detection data + * 3. Handle string vs array content uniformly + * + * This module provides shared infrastructure to avoid duplication. + */ + +import type { ChatMessage } from "../services/llm-client"; +import type { Span } from "./conflict-resolver"; +import type { ContentPart } from "./content"; +import { findPartialPlaceholderStart } from "./placeholders"; + +/** + * Generic context for placeholder-based transformations + * Used by both PII masking and secrets masking + */ +export interface PlaceholderContext { + /** Maps placeholder -> original value */ + mapping: Record<string, string>; + /** Maps original value -> placeholder (for deduplication) */ + reverseMapping: Record<string, string>; + /** Counter per type for sequential numbering */ + counters: Record<string, number>; +} + +/** + * Result of masking text with placeholders + * Used by both PII masking and secrets masking + */ +export interface MaskResult { + /** Text with sensitive data replaced by placeholders */ + masked: string; + /** Context for unmasking (maps placeholders to original values) */ + context: PlaceholderContext; +} + +/** + * Creates a new placeholder context + */ +export function createPlaceholderContext(): PlaceholderContext { + return { + mapping: {}, + reverseMapping: {}, + counters: {}, + }; +} + +/** + * Increments counter for type and generates placeholder using format function + * + * Shared counter logic for both PII masking and secrets masking. 
/**
 * Increments counter for type and generates placeholder using format function
 *
 * Shared counter logic for both PII masking and secrets masking.
 *
 * @param type - Type label whose counter is advanced
 * @param context - Placeholder context; `context.counters[type]` is updated in place
 * @param format - Builds the placeholder string from the type and the new count
 * @returns Whatever `format` returns for the incremented count
 */
export function incrementAndGenerate(
  type: string,
  context: PlaceholderContext,
  format: (type: string, count: number) => string,
): string {
  // Only trust a counter stored on the object itself: a plain `counters[type]`
  // lookup would return inherited members (e.g. the `constructor` function)
  // for type names that collide with Object.prototype.
  const previous = Object.prototype.hasOwnProperty.call(context.counters, type)
    ? context.counters[type]
    : 0;
  const count = (previous || 0) + 1;
  context.counters[type] = count;
  return format(type, count);
}

/**
 * Transforms messages using per-part data
 *
 * Generic function that handles the common pattern of:
 * - Iterating over messages
 * - Handling string vs array content
 * - Applying a transform function per text part
 *
 * String content uses `perPartData[msgIdx][0]`; array content uses
 * `perPartData[msgIdx][partIdx]` for each part. Input messages are never
 * mutated. A message is returned as the same object when its content is a
 * string with no data, or is neither a string nor an array; array content
 * always yields a new message object, with untouched parts reused as-is.
 *
 * @param messages - Chat messages to transform
 * @param perPartData - Per-message, per-part data: data[msgIdx][partIdx]
 * @param transform - Function to transform text using the part data
 * @param context - Shared context passed to all transform calls
 * @returns A new array with one entry per input message
 */
export function transformMessagesPerPart<TData, TContext>(
  messages: ChatMessage[],
  perPartData: TData[][][],
  transform: (text: string, data: TData[], context: TContext) => string,
  context: TContext,
): ChatMessage[] {
  return messages.map((msg, msgIdx) => {
    const partData = perPartData[msgIdx] || [];

    // String content → data is in partData[0]
    if (typeof msg.content === "string") {
      const data = partData[0] || [];
      if (data.length === 0) return msg;
      return { ...msg, content: transform(msg.content, data, context) };
    }

    // Array content (multimodal) → data is per-part
    if (Array.isArray(msg.content)) {
      const transformedContent = msg.content.map((part: ContentPart, partIdx: number) => {
        const data = partData[partIdx] || [];
        // Only text parts that actually have data are rewritten
        if (part.type === "text" && typeof part.text === "string" && data.length > 0) {
          return { ...part, text: transform(part.text, data, context) };
        }
        return part;
      });
      return { ...msg, content: transformedContent };
    }

    // Null/undefined content
    return msg;
  });
}
/**
 * Restores placeholders in text with original values
 *
 * Generic function used by both PII unmasking and secrets unmasking.
 *
 * All placeholders are substituted in a single left-to-right pass, so text
 * that was just restored is never scanned again: an original value that
 * happens to contain another placeholder's text is returned exactly as it was
 * stored. Placeholder-like text that is not a key of `context.mapping` is left
 * untouched.
 *
 * @param text - Text containing placeholders
 * @param context - Context with placeholder mappings
 * @param formatValue - Optional function to format restored values (e.g., add markers)
 * @returns The text with every known placeholder replaced
 */
export function restorePlaceholders(
  text: string,
  context: PlaceholderContext,
  formatValue?: (original: string) => string,
): string {
  // Longest first, so that when one placeholder is a prefix of another the
  // alternation below tries the longer one before the shorter one.
  const placeholders = Object.keys(context.mapping)
    .filter((placeholder) => placeholder.length > 0)
    .sort((a, b) => b.length - a.length);

  if (placeholders.length === 0 || text.length === 0) {
    return text;
  }

  // Placeholders contain regex metacharacters such as "[" and "]"; escape
  // them so each one is matched literally.
  const pattern = new RegExp(
    placeholders
      .map((placeholder) => placeholder.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
      .join("|"),
    "g",
  );

  // A callback replacement avoids "$"-sequences in restored values being
  // interpreted as replacement patterns.
  return text.replace(pattern, (placeholder) => {
    const originalValue = context.mapping[placeholder];
    return formatValue ? formatValue(originalValue) : originalValue;
  });
}

/**
 * Restores placeholders in a chat completion response
 *
 * Returns a shallow copy of the response in which each choice's
 * `message.content` has been passed through {@link restorePlaceholders} when
 * it is a string; non-string content is carried over unchanged. The input
 * response is not modified.
 *
 * @param response - The response object with choices
 * @param context - Context with placeholder mappings
 * @param formatValue - Optional function to format restored values
 * @returns A copy of the response with placeholders restored
 */
export function restoreResponsePlaceholders<
  T extends { choices: Array<{ message: { content: unknown } }> },
>(response: T, context: PlaceholderContext, formatValue?: (original: string) => string): T {
  return {
    ...response,
    choices: response.choices.map((choice) => ({
      ...choice,
      message: {
        ...choice.message,
        content:
          typeof choice.message.content === "string"
            ? restorePlaceholders(choice.message.content, context, formatValue)
            : choice.message.content,
      },
    })),
  } as T;
}
/**
 * Replaces items in text with placeholders
 *
 * Generic function used by both PII masking and secrets masking.
 * Handles: conflict resolution, placeholder assignment, and replacement.
 *
 * Placeholders are assigned in order of appearance in the text. A value that
 * already has an entry in `context.reverseMapping` reuses its placeholder;
 * otherwise a new one is generated and recorded in both `context.mapping` and
 * `context.reverseMapping` (the context is updated in place).
 *
 * @param text - Text to process
 * @param items - Items with start/end positions to replace
 * @param context - Placeholder context for tracking mappings
 * @param getType - Function to get the type string from an item
 * @param generatePlaceholder - Function to generate placeholder for a type
 * @param resolveConflicts - Function to resolve overlapping items
 * @returns The text with each resolved item replaced by its placeholder;
 *   the input text unchanged when `items` is empty
 */
export function replaceWithPlaceholders<T extends Span>(
  text: string,
  items: T[],
  context: PlaceholderContext,
  getType: (item: T) => string,
  generatePlaceholder: (type: string, context: PlaceholderContext) => string,
  resolveConflicts: (items: T[]) => T[],
): string {
  if (items.length === 0) {
    return text;
  }

  // Resolve conflicts between overlapping items
  const resolved = resolveConflicts(items);

  // First pass: sort by start position ascending to assign placeholders in order
  const sortedByStart = [...resolved].sort((a, b) => a.start - b.start);

  // Assign placeholders in order of appearance
  const itemPlaceholders = new Map<T, string>();
  for (const item of sortedByStart) {
    const originalValue = text.slice(item.start, item.end);

    // Check if we already have a placeholder for this exact value. The lookup
    // is restricted to own properties: a bare `reverseMapping[value]` would
    // return inherited members for values such as "constructor" or "toString".
    let placeholder: string | undefined = Object.prototype.hasOwnProperty.call(
      context.reverseMapping,
      originalValue,
    )
      ? context.reverseMapping[originalValue]
      : undefined;

    if (!placeholder) {
      placeholder = generatePlaceholder(getType(item), context);
      context.mapping[placeholder] = originalValue;
      context.reverseMapping[originalValue] = placeholder;
    }

    itemPlaceholders.set(item, placeholder);
  }

  // Second pass: sort by start position descending for replacement
  // This ensures string indices remain valid as we replace
  const sortedByStartDesc = [...resolved].sort((a, b) => b.start - a.start);

  let result = text;
  for (const item of sortedByStartDesc) {
    const placeholder = itemPlaceholders.get(item);
    // Every resolved item was given a placeholder above; the guard only
    // satisfies the type checker.
    if (placeholder === undefined) continue;
    result = result.slice(0, item.start) + placeholder + result.slice(item.end);
  }

  return result;
}
/**
 * Processes a stream chunk, buffering partial placeholders
 *
 * Generic function used by both PII unmasking and secrets unmasking.
 *
 * The previous buffer and the new chunk are concatenated and handed to
 * `findPartialPlaceholderStart` (imported from ./placeholders). When that
 * returns -1 the whole text is restored and emitted; otherwise the text
 * before the returned index is restored and emitted, and the rest is held
 * back so it can be completed by a later chunk.
 * NOTE(review): the returned index is presumably where a trailing, still
 * incomplete placeholder begins — confirm against ./placeholders.
 *
 * @param buffer - Previous buffer content
 * @param newChunk - New chunk to process
 * @param context - Placeholder context
 * @param restore - Function to restore placeholders in text
 * @returns `output` to emit now and `remainingBuffer` to pass to the next call
 */
export function processStreamChunk(
  buffer: string,
  newChunk: string,
  context: PlaceholderContext,
  restore: (text: string, ctx: PlaceholderContext) => string,
): { output: string; remainingBuffer: string } {
  const combined = buffer + newChunk;

  const partialStart = findPartialPlaceholderStart(combined);

  if (partialStart === -1) {
    // No partial placeholder, safe to restore everything
    return {
      output: restore(combined, context),
      remainingBuffer: "",
    };
  }

  // Partial placeholder detected, buffer it
  const safeToProcess = combined.slice(0, partialStart);
  const toBuffer = combined.slice(partialStart);

  return {
    output: restore(safeToProcess, context),
    remainingBuffer: toBuffer,
  };
}

/**
 * Flushes remaining buffer at end of stream
 *
 * An empty buffer yields an empty string without calling `restore`; anything
 * else is passed through `restore` as-is, so text that never became a known
 * placeholder is emitted unchanged by a restore function that only replaces
 * known placeholders.
 *
 * @param buffer - Remaining buffer content
 * @param context - Placeholder context
 * @param restore - Function to restore placeholders in text
 * @returns The restored remainder of the stream
 */
export function flushBuffer(
  buffer: string,
  context: PlaceholderContext,
  restore: (text: string, ctx: PlaceholderContext) => string,
): string {
  if (!buffer) return "";
  return restore(buffer, context);
}
/* Start of the next file's section in the enclosing patch (continues on the following line): diff --git a/src/constants/placeholders.test.ts b/src/utils/placeholders.test.ts similarity index 93% rename from src/constants/placeholders.test.ts rename to src/utils/placeholders.test.ts index 48a3938..39d4bb9 100644 --- a/src/constants/placeholders.test.ts +++ b/src/utils/placeholders.test.ts @@ -23,7 +23,7 @@ describe("placeholder constants", () => { test("secret format uses correct delimiters", () => { expect(SECRET_PLACEHOLDER_FORMAT).toContain(PLACEHOLDER_DELIMITERS.start); expect(SECRET_PLACEHOLDER_FORMAT).toContain(PLACEHOLDER_DELIMITERS.end); - */
expect(SECRET_PLACEHOLDER_FORMAT).toBe("[[SECRET_REDACTED_{N}]]"); + expect(SECRET_PLACEHOLDER_FORMAT).toBe("[[SECRET_MASKED_{N}]]"); }); }); @@ -42,12 +42,12 @@ describe("generatePlaceholder", () => { describe("generateSecretPlaceholder", () => { test("generates secret placeholder", () => { const result = generateSecretPlaceholder("API_KEY_OPENAI", 1); - expect(result).toBe("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); + expect(result).toBe("[[SECRET_MASKED_API_KEY_OPENAI_1]]"); }); test("generates secret placeholder with different type and count", () => { const result = generateSecretPlaceholder("PEM_PRIVATE_KEY", 2); - expect(result).toBe("[[SECRET_REDACTED_PEM_PRIVATE_KEY_2]]"); + expect(result).toBe("[[SECRET_MASKED_PEM_PRIVATE_KEY_2]]"); }); }); diff --git a/src/constants/placeholders.ts b/src/utils/placeholders.ts similarity index 87% rename from src/constants/placeholders.ts rename to src/utils/placeholders.ts index 91bedf3..708c84b 100644 --- a/src/constants/placeholders.ts +++ b/src/utils/placeholders.ts @@ -1,5 +1,5 @@ /** - * Placeholder constants for PII masking and secrets redaction + * Placeholder constants for PII masking and secrets masking * Single source of truth for all placeholder-related logic */ @@ -11,8 +11,8 @@ export const PLACEHOLDER_DELIMITERS = { /** PII placeholder format: [[TYPE_N]] e.g. [[PERSON_1]], [[EMAIL_ADDRESS_2]] */ export const PII_PLACEHOLDER_FORMAT = "[[{TYPE}_{N}]]"; -/** Secrets placeholder format: [[SECRET_REDACTED_TYPE_N]] e.g. [[SECRET_REDACTED_API_KEY_OPENAI_1]] */ -export const SECRET_PLACEHOLDER_FORMAT = "[[SECRET_REDACTED_{N}]]"; +/** Secrets placeholder format: [[SECRET_MASKED_TYPE_N]] e.g. [[SECRET_MASKED_API_KEY_OPENAI_1]] */ +export const SECRET_PLACEHOLDER_FORMAT = "[[SECRET_MASKED_{N}]]"; /** * Generates a placeholder string from the format