From: Stefan Gasser Date: Fri, 16 Jan 2026 15:40:04 +0000 (+0100) Subject: fix: use [[]] delimiters for placeholders to prevent HTML encoding issues (#38) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=d239944614c8be1153cdb8ccfc2afb2b48a94f30;p=sgasser-llm-shield.git fix: use [[]] delimiters for placeholders to prevent HTML encoding issues (#38) Fixes #36 - HTML-encoded placeholders now unmask correctly Changes: - Changed placeholder format from <TYPE_N> to [[TYPE_N]] - Created src/constants/placeholders.ts as single source of truth - Removed configurable redact_placeholder (was bug - streaming hardcoded [[) - Updated dashboard regex for yellow highlighting - Added tests for HTML/JSON/URL contexts The [[]] delimiters are safe in HTML, JSON, and URLs - they don't get entity-encoded like <> did. --- diff --git a/README.md b/README.md index fd52fe4..7ba5333 100644 --- a/README.md +++ b/README.md @@ -58,10 +58,10 @@ Works with OpenAI, Azure, and any OpenAI-compatible API. Just change one URL. You send: "Write a follow-up email to Dr. Sarah Chen (sarah.chen@hospital.org) about next week's project meeting" -LLM receives: "Write a follow-up email to <PERSON_1> (<EMAIL_ADDRESS_1>) +LLM receives: "Write a follow-up email to [[PERSON_1]] ([[EMAIL_ADDRESS_1]]) about next week's project meeting" -LLM responds: "Dear <PERSON_1>, Following up on our discussion..." +LLM responds: "Dear [[PERSON_1]], Following up on our discussion..." You receive: "Dear Dr. Sarah Chen, Following up on our discussion..."
``` diff --git a/config.example.yaml b/config.example.yaml index 2adad62..7245246 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -139,10 +139,6 @@ secrets_detection: # The 200KB default covers typical use cases max_scan_chars: 200000 - # Placeholder format for redaction (only used if action: redact) - # {N} will be replaced with type and sequential number (e.g., API_KEY_OPENAI_1) - redact_placeholder: "<SECRET_REDACTED_{N}>" - # Log detected secret types (never logs secret content) # Even if logging.log_content is true, secret content is never logged log_detected_types: true diff --git a/docs/api-reference/dashboard-api.mdx b/docs/api-reference/dashboard-api.mdx index 1545a9e..af02035 100644 --- a/docs/api-reference/dashboard-api.mdx +++ b/docs/api-reference/dashboard-api.mdx @@ -49,7 +49,7 @@ curl "http://localhost:3000/dashboard/api/logs?limit=100&offset=0" "language": "en", "language_fallback": false, "detected_language": "en", - "masked_content": "Hello <EMAIL_ADDRESS_1>", + "masked_content": "Hello [[EMAIL_ADDRESS_1]]", "secrets_detected": 0, "secrets_types": null } diff --git a/docs/concepts/mask-mode.mdx b/docs/concepts/mask-mode.mdx index df1e05e..3d1ed52 100644 --- a/docs/concepts/mask-mode.mdx +++ b/docs/concepts/mask-mode.mdx @@ -17,10 +17,10 @@ Mask mode replaces PII with placeholders before sending to your LLM provider. Th PasteGuard finds: `Dr. Sarah Chen` (PERSON), `sarah.chen@hospital.org` (EMAIL) - Provider receives: `"Write a follow-up email to <PERSON_1> (<EMAIL_ADDRESS_1>)"` + Provider receives: `"Write a follow-up email to [[PERSON_1]] ([[EMAIL_ADDRESS_1]])"` - Provider responds: `"Dear <PERSON_1>, Following up on our discussion..."` + Provider responds: `"Dear [[PERSON_1]], Following up on our discussion..."` You receive: `"Dear Dr. 
Sarah Chen, Following up on our discussion..."` diff --git a/docs/configuration/logging.mdx b/docs/configuration/logging.mdx index 26a1861..284608e 100644 --- a/docs/configuration/logging.mdx +++ b/docs/configuration/logging.mdx @@ -87,4 +87,4 @@ Only metadata (timestamps, models, PII detected) is logged. - Secret content is **never** logged, even if `log_content: true` - Only secret types are logged if `log_detected_types: true` -- Masked content shows placeholders like `<EMAIL_ADDRESS_1>`, not real PII +- Masked content shows placeholders like `[[EMAIL_ADDRESS_1]]`, not real PII diff --git a/docs/configuration/secrets-detection.mdx b/docs/configuration/secrets-detection.mdx index 8d35ef2..c813523 100644 --- a/docs/configuration/secrets-detection.mdx +++ b/docs/configuration/secrets-detection.mdx @@ -25,7 +25,6 @@ secrets_detection: | `entities` | Private keys | Secret types to detect | | `max_scan_chars` | `200000` | Max characters to scan (0 = unlimited) | | `log_detected_types` | `true` | Log detected types (never logs content) | -| `redact_placeholder` | `<SECRET_REDACTED_{N}>` | Placeholder format for redaction | ## Actions diff --git a/docs/introduction.mdx b/docs/introduction.mdx index 28878bc..d5afdb9 100644 --- a/docs/introduction.mdx +++ b/docs/introduction.mdx @@ -35,11 +35,11 @@ Two privacy modes: ``` - Detected: `Dr. Sarah Chen` → `<PERSON_1>`, `sarah.chen@hospital.org` → `<EMAIL_ADDRESS_1>` + Detected: `Dr. 
Sarah Chen` → `[[PERSON_1]]`, `sarah.chen@hospital.org` → `[[EMAIL_ADDRESS_1]]` ``` - Write a follow-up email to () + Write a follow-up email to [[PERSON_1]] ([[EMAIL_ADDRESS_1]]) ``` diff --git a/src/config.ts b/src/config.ts index 73af18b..10bd6ad 100644 --- a/src/config.ts +++ b/src/config.ts @@ -111,7 +111,6 @@ const SecretsDetectionSchema = z.object({ action: z.enum(["block", "redact", "route_local"]).default("redact"), entities: z.array(z.enum(SecretEntityTypes)).default(["OPENSSH_PRIVATE_KEY", "PEM_PRIVATE_KEY"]), max_scan_chars: z.coerce.number().int().min(0).default(200000), - redact_placeholder: z.string().default(""), log_detected_types: z.boolean().default(true), }); diff --git a/src/constants/placeholders.test.ts b/src/constants/placeholders.test.ts new file mode 100644 index 0000000..48a3938 --- /dev/null +++ b/src/constants/placeholders.test.ts @@ -0,0 +1,90 @@ +import { describe, expect, test } from "bun:test"; +import { + findPartialPlaceholderStart, + generatePlaceholder, + generateSecretPlaceholder, + PII_PLACEHOLDER_FORMAT, + PLACEHOLDER_DELIMITERS, + SECRET_PLACEHOLDER_FORMAT, +} from "./placeholders"; + +describe("placeholder constants", () => { + test("delimiters are correct", () => { + expect(PLACEHOLDER_DELIMITERS.start).toBe("[["); + expect(PLACEHOLDER_DELIMITERS.end).toBe("]]"); + }); + + test("PII format uses correct delimiters", () => { + expect(PII_PLACEHOLDER_FORMAT).toContain(PLACEHOLDER_DELIMITERS.start); + expect(PII_PLACEHOLDER_FORMAT).toContain(PLACEHOLDER_DELIMITERS.end); + expect(PII_PLACEHOLDER_FORMAT).toBe("[[{TYPE}_{N}]]"); + }); + + test("secret format uses correct delimiters", () => { + expect(SECRET_PLACEHOLDER_FORMAT).toContain(PLACEHOLDER_DELIMITERS.start); + expect(SECRET_PLACEHOLDER_FORMAT).toContain(PLACEHOLDER_DELIMITERS.end); + expect(SECRET_PLACEHOLDER_FORMAT).toBe("[[SECRET_REDACTED_{N}]]"); + }); +}); + +describe("generatePlaceholder", () => { + test("generates PII placeholder", () => { + const result = 
generatePlaceholder(PII_PLACEHOLDER_FORMAT, "PERSON", 1); + expect(result).toBe("[[PERSON_1]]"); + }); + + test("generates placeholder with different type and count", () => { + const result = generatePlaceholder(PII_PLACEHOLDER_FORMAT, "EMAIL_ADDRESS", 3); + expect(result).toBe("[[EMAIL_ADDRESS_3]]"); + }); +}); + +describe("generateSecretPlaceholder", () => { + test("generates secret placeholder", () => { + const result = generateSecretPlaceholder("API_KEY_OPENAI", 1); + expect(result).toBe("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); + }); + + test("generates secret placeholder with different type and count", () => { + const result = generateSecretPlaceholder("PEM_PRIVATE_KEY", 2); + expect(result).toBe("[[SECRET_REDACTED_PEM_PRIVATE_KEY_2]]"); + }); +}); + +describe("findPartialPlaceholderStart", () => { + test("returns -1 for empty string", () => { + expect(findPartialPlaceholderStart("")).toBe(-1); + }); + + test("returns -1 when no placeholder pattern", () => { + expect(findPartialPlaceholderStart("Hello world")).toBe(-1); + }); + + test("returns -1 when placeholder is complete", () => { + expect(findPartialPlaceholderStart("Hello [[PERSON_1]] world")).toBe(-1); + }); + + test("returns -1 when multiple complete placeholders", () => { + expect(findPartialPlaceholderStart("[[PERSON_1]] and [[EMAIL_1]]")).toBe(-1); + }); + + test("returns position of partial placeholder at end", () => { + const text = "Hello [[PERSON"; + expect(findPartialPlaceholderStart(text)).toBe(6); + }); + + test("returns position of partial placeholder with complete one before", () => { + const text = "[[PERSON_1]] Hello [[EMAIL"; + expect(findPartialPlaceholderStart(text)).toBe(19); + }); + + test("handles just opening delimiter", () => { + const text = "Hello [["; + expect(findPartialPlaceholderStart(text)).toBe(6); + }); + + test("handles text ending with single bracket", () => { + // Single [ is not a placeholder start, so should return -1 + expect(findPartialPlaceholderStart("Hello 
[")).toBe(-1); + }); +}); diff --git a/src/constants/placeholders.ts b/src/constants/placeholders.ts new file mode 100644 index 0000000..91bedf3 --- /dev/null +++ b/src/constants/placeholders.ts @@ -0,0 +1,54 @@ +/** + * Placeholder constants for PII masking and secrets redaction + * Single source of truth for all placeholder-related logic + */ + +export const PLACEHOLDER_DELIMITERS = { + start: "[[", + end: "]]", +} as const; + +/** PII placeholder format: [[TYPE_N]] e.g. [[PERSON_1]], [[EMAIL_ADDRESS_2]] */ +export const PII_PLACEHOLDER_FORMAT = "[[{TYPE}_{N}]]"; + +/** Secrets placeholder format: [[SECRET_REDACTED_TYPE_N]] e.g. [[SECRET_REDACTED_API_KEY_OPENAI_1]] */ +export const SECRET_PLACEHOLDER_FORMAT = "[[SECRET_REDACTED_{N}]]"; + +/** + * Generates a placeholder string from the format + */ +export function generatePlaceholder(format: string, type: string, count: number): string { + return format.replace("{TYPE}", type).replace("{N}", String(count)); +} + +/** + * Generates a secret placeholder string + * {N} is replaced with TYPE_COUNT e.g. 
API_KEY_OPENAI_1 + */ +export function generateSecretPlaceholder(type: string, count: number): string { + return SECRET_PLACEHOLDER_FORMAT.replace("{N}", `${type}_${count}`); +} + +/** + * Streaming buffer helper - finds safe position to process text + * that may contain partial placeholders + * + * Returns the position where it's safe to split, or -1 if entire string is safe + */ +export function findPartialPlaceholderStart(text: string): number { + const placeholderStart = text.lastIndexOf(PLACEHOLDER_DELIMITERS.start); + + if (placeholderStart === -1) { + return -1; // No potential placeholder, entire string is safe + } + + // Check if there's a complete placeholder after the last [[ + const afterStart = text.slice(placeholderStart); + const hasCompletePlaceholder = afterStart.includes(PLACEHOLDER_DELIMITERS.end); + + if (hasCompletePlaceholder) { + return -1; // Placeholder is complete, entire string is safe + } + + return placeholderStart; // Return position where partial placeholder starts +} diff --git a/src/routes/proxy.ts b/src/routes/proxy.ts index 509fc5d..7e6407b 100644 --- a/src/routes/proxy.ts +++ b/src/routes/proxy.ts @@ -4,7 +4,7 @@ import { Hono } from "hono"; import { HTTPException } from "hono/http-exception"; import { proxy } from "hono/proxy"; import { z } from "zod"; -import { getConfig, type MaskingConfig, type SecretsDetectionConfig } from "../config"; +import { getConfig, type MaskingConfig } from "../config"; import { detectSecrets, extractTextFromRequest, @@ -138,11 +138,7 @@ proxyRoutes.post( // Redact action - replace secrets with placeholders and continue if (config.secrets_detection.action === "redact") { - const redactedMessages = redactMessagesWithSecrets( - body.messages, - secretsResult, - config.secrets_detection, - ); + const redactedMessages = redactMessagesWithSecrets(body.messages, secretsResult); body = { ...body, messages: redactedMessages.messages }; redactionContext = redactedMessages.context; secretsRedacted = true; @@ 
-180,7 +176,6 @@ proxyRoutes.post( function redactMessagesWithSecrets( messages: ChatMessage[], secretsResult: SecretsDetectionResult, - config: SecretsDetectionConfig, ): { messages: ChatMessage[]; context: RedactionContext } { // Build a map of message content to redactions // Since we concatenated all messages with \n, we need to track positions per message @@ -247,7 +242,6 @@ function redactMessagesWithSecrets( const { redacted, context: updatedContext } = redactSecrets( part.text, partRedactions, - config, context, ); context = updatedContext; @@ -287,7 +281,6 @@ function redactMessagesWithSecrets( const { redacted, context: updatedContext } = redactSecrets( msg.content, messageRedactions, - config, context, ); context = updatedContext; diff --git a/src/secrets/detect.test.ts b/src/secrets/detect.test.ts index 35c6409..c73506c 100644 --- a/src/secrets/detect.test.ts +++ b/src/secrets/detect.test.ts @@ -8,7 +8,6 @@ const defaultConfig: SecretsDetectionConfig = { action: "block", entities: ["OPENSSH_PRIVATE_KEY", "PEM_PRIVATE_KEY"], max_scan_chars: 200000, - redact_placeholder: "", log_detected_types: true, }; diff --git a/src/secrets/multimodal.test.ts b/src/secrets/multimodal.test.ts index 5dafa9c..b58be8c 100644 --- a/src/secrets/multimodal.test.ts +++ b/src/secrets/multimodal.test.ts @@ -1,25 +1,10 @@ import { describe, expect, test } from "bun:test"; -import type { SecretsDetectionConfig } from "../config"; import type { ChatMessage } from "../services/llm-client"; import { maskMessages } from "../services/masking"; import type { PIIEntity } from "../services/pii-detector"; import type { ContentPart } from "../utils/content"; describe("Multimodal content handling", () => { - const _secretsConfig: SecretsDetectionConfig = { - enabled: true, - action: "redact", - entities: ["API_KEY_OPENAI"], - max_scan_chars: 200000, - redact_placeholder: "", - log_detected_types: true, - }; - - describe("Secrets redaction with offset tracking", () => { - // Note: Secrets 
are not expected to span across newlines in real scenarios - // The offset tracking is implemented to handle PII entities correctly - }); - describe("PII masking with offset tracking", () => { test("masks PII in multimodal array content", () => { const messages: ChatMessage[] = [ @@ -51,7 +36,7 @@ describe("Multimodal content handling", () => { // Part 0 should have email masked expect(maskedContent[0].type).toBe("text"); - expect(maskedContent[0].text).toBe("My email is and"); + expect(maskedContent[0].text).toBe("My email is [[EMAIL_ADDRESS_1]] and"); expect(maskedContent[0].text).not.toContain("john@example.com"); // Part 1 should be unchanged (image) @@ -60,7 +45,7 @@ describe("Multimodal content handling", () => { // Part 2 should have phone masked expect(maskedContent[2].type).toBe("text"); - expect(maskedContent[2].text).toBe("my phone is "); + expect(maskedContent[2].text).toBe("my phone is [[PHONE_NUMBER_1]]"); expect(maskedContent[2].text).not.toContain("555-1234"); }); @@ -89,8 +74,8 @@ describe("Multimodal content handling", () => { // Verify the text is actually masked (not the original) expect(maskedContent[0].text).not.toContain("Alice"); expect(maskedContent[0].text).not.toContain("alice@secret.com"); - expect(maskedContent[0].text).toContain(""); - expect(maskedContent[0].text).toContain(""); + expect(maskedContent[0].text).toContain("[[PERSON_1]]"); + expect(maskedContent[0].text).toContain("[[EMAIL_ADDRESS_1]]"); }); test("handles entities spanning multiple parts with proper offsets", () => { @@ -115,8 +100,8 @@ describe("Multimodal content handling", () => { const maskedContent = masked[0].content as ContentPart[]; // Both parts should be affected by the email entity - // Part 0: "First part with in two parts" or similar + // Part 0: "First part with [[EMAIL" or similar + // Part 1: "ADDRESS_1]] in two parts" or similar // The exact split depends on how the masking handles cross-boundary entities // At minimum, verify that the entity is masked 
somewhere @@ -125,7 +110,7 @@ describe("Multimodal content handling", () => { .map((p) => p.text) .join("\n"); - expect(fullMasked).toContain("", - log_detected_types: true, -}; - const sampleSecret = "sk-proj-abc123def456ghi789jkl012mno345pqr678stu901vwx"; describe("redactSecrets", () => { test("returns original text when no redactions", () => { const text = "Hello world"; - const result = redactSecrets(text, [], defaultConfig); + const result = redactSecrets(text, []); expect(result.redacted).toBe("Hello world"); expect(Object.keys(result.context.mapping)).toHaveLength(0); }); @@ -35,10 +25,10 @@ describe("redactSecrets", () => { const redactions: SecretsRedaction[] = [ { start: 14, end: 14 + sampleSecret.length, type: "API_KEY_OPENAI" }, ]; - const result = redactSecrets(text, redactions, defaultConfig); + const result = redactSecrets(text, redactions); - expect(result.redacted).toBe("My API key is "); - expect(result.context.mapping[""]).toBe(sampleSecret); + expect(result.redacted).toBe("My API key is [[SECRET_REDACTED_API_KEY_OPENAI_1]]"); + expect(result.context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"]).toBe(sampleSecret); }); test("redacts multiple secrets of same type", () => { @@ -51,11 +41,11 @@ describe("redactSecrets", () => { type: "API_KEY_OPENAI", }, ]; - const result = redactSecrets(text, redactions, defaultConfig); + const result = redactSecrets(text, redactions); // Same secret value should get same placeholder expect(result.redacted).toBe( - "Key1: Key2: ", + "Key1: [[SECRET_REDACTED_API_KEY_OPENAI_1]] Key2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]", ); expect(Object.keys(result.context.mapping)).toHaveLength(1); }); @@ -71,10 +61,10 @@ describe("redactSecrets", () => { type: "API_KEY_AWS", }, ]; - const result = redactSecrets(text, redactions, defaultConfig); + const result = redactSecrets(text, redactions); - expect(result.redacted).toContain(""); - expect(result.redacted).toContain(""); + 
expect(result.redacted).toContain("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); + expect(result.redacted).toContain("[[SECRET_REDACTED_API_KEY_AWS_1]]"); expect(Object.keys(result.context.mapping)).toHaveLength(2); }); @@ -84,33 +74,19 @@ describe("redactSecrets", () => { const redactions1: SecretsRedaction[] = [ { start: 5, end: 5 + sampleSecret.length, type: "API_KEY_OPENAI" }, ]; - redactSecrets(text1, redactions1, defaultConfig, context); + redactSecrets(text1, redactions1, context); const anotherSecret = "sk-proj-xyz789abc123def456ghi789jkl012mno345pqr678"; const text2 = `Another: ${anotherSecret}`; const redactions2: SecretsRedaction[] = [ { start: 9, end: 9 + anotherSecret.length, type: "API_KEY_OPENAI" }, ]; - const result2 = redactSecrets(text2, redactions2, defaultConfig, context); + const result2 = redactSecrets(text2, redactions2, context); // Second secret should get incremented counter - expect(result2.redacted).toBe("Another: "); + expect(result2.redacted).toBe("Another: [[SECRET_REDACTED_API_KEY_OPENAI_2]]"); expect(Object.keys(context.mapping)).toHaveLength(2); }); - - test("handles custom placeholder format", () => { - const customConfig: SecretsDetectionConfig = { - ...defaultConfig, - redact_placeholder: "[REDACTED:{N}]", - }; - const text = `Key: ${sampleSecret}`; - const redactions: SecretsRedaction[] = [ - { start: 5, end: 5 + sampleSecret.length, type: "API_KEY_OPENAI" }, - ]; - const result = redactSecrets(text, redactions, customConfig); - - expect(result.redacted).toBe("Key: [REDACTED:API_KEY_OPENAI_1]"); - }); }); describe("unredactSecrets", () => { @@ -123,9 +99,9 @@ describe("unredactSecrets", () => { test("restores single secret", () => { const context = createRedactionContext(); - context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - const text = "My API key is "; + const text = "My API key is [[SECRET_REDACTED_API_KEY_OPENAI_1]]"; const result = unredactSecrets(text, context); 
expect(result).toBe(`My API key is ${sampleSecret}`); @@ -134,10 +110,11 @@ describe("unredactSecrets", () => { test("restores multiple secrets", () => { const context = createRedactionContext(); const awsKey = "AKIAIOSFODNN7EXAMPLE"; - context.mapping[""] = sampleSecret; - context.mapping[""] = awsKey; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_AWS_1]]"] = awsKey; - const text = "OpenAI: AWS: "; + const text = + "OpenAI: [[SECRET_REDACTED_API_KEY_OPENAI_1]] AWS: [[SECRET_REDACTED_API_KEY_AWS_1]]"; const result = unredactSecrets(text, context); expect(result).toBe(`OpenAI: ${sampleSecret} AWS: ${awsKey}`); @@ -145,10 +122,10 @@ describe("unredactSecrets", () => { test("restores repeated placeholders", () => { const context = createRedactionContext(); - context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; const text = - "Key1: Key2: "; + "Key1: [[SECRET_REDACTED_API_KEY_OPENAI_1]] Key2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]"; const result = unredactSecrets(text, context); expect(result).toBe(`Key1: ${sampleSecret} Key2: ${sampleSecret}`); @@ -170,11 +147,11 @@ Please store them securely. }, ]; - const { redacted, context } = redactSecrets(originalText, redactions, defaultConfig); + const { redacted, context } = redactSecrets(originalText, redactions); // Verify secret is not in redacted text expect(redacted).not.toContain(sampleSecret); - expect(redacted).toContain(""); + expect(redacted).toContain("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); // Unredact and verify original is restored const restored = unredactSecrets(redacted, context); @@ -183,7 +160,7 @@ Please store them securely. 
test("handles empty redactions array", () => { const text = "No secrets here"; - const { redacted, context } = redactSecrets(text, [], defaultConfig); + const { redacted, context } = redactSecrets(text, []); const restored = unredactSecrets(redacted, context); expect(restored).toBe(text); }); @@ -200,13 +177,9 @@ describe("redactMessagesSecrets", () => { [], ]; - const { redacted, context } = redactMessagesSecrets( - messages, - redactionsByMessage, - defaultConfig, - ); + const { redacted, context } = redactMessagesSecrets(messages, redactionsByMessage); - expect(redacted[0].content).toContain(""); + expect(redacted[0].content).toContain("[[SECRET_REDACTED_API_KEY_OPENAI_1]]"); expect(redacted[0].content).not.toContain(sampleSecret); expect(redacted[1].content).toBe("I'll help you with that."); expect(Object.keys(context.mapping)).toHaveLength(1); @@ -222,7 +195,7 @@ describe("redactMessagesSecrets", () => { [{ start: 5, end: 5 + sampleSecret.length, type: "API_KEY_OPENAI" }], ]; - const { redacted } = redactMessagesSecrets(messages, redactionsByMessage, defaultConfig); + const { redacted } = redactMessagesSecrets(messages, redactionsByMessage); expect(redacted[0].role).toBe("system"); expect(redacted[1].role).toBe("user"); @@ -238,15 +211,11 @@ describe("redactMessagesSecrets", () => { [{ start: 6, end: 6 + sampleSecret.length, type: "API_KEY_OPENAI" }], ]; - const { redacted, context } = redactMessagesSecrets( - messages, - redactionsByMessage, - defaultConfig, - ); + const { redacted, context } = redactMessagesSecrets(messages, redactionsByMessage); // Same secret should get same placeholder across messages - expect(redacted[0].content).toBe("Key1: "); - expect(redacted[1].content).toBe("Key2: "); + expect(redacted[0].content).toBe("Key1: [[SECRET_REDACTED_API_KEY_OPENAI_1]]"); + expect(redacted[1].content).toBe("Key2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]"); expect(Object.keys(context.mapping)).toHaveLength(1); }); }); @@ -254,11 +223,11 @@ 
describe("redactMessagesSecrets", () => { describe("streaming unredact", () => { test("unredacts complete placeholder in chunk", () => { const context = createRedactionContext(); - context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; const { output, remainingBuffer } = unredactStreamChunk( "", - "Key: end", + "Key: [[SECRET_REDACTED_API_KEY_OPENAI_1]] end", context, ); @@ -268,21 +237,21 @@ describe("streaming unredact", () => { test("buffers partial placeholder", () => { const context = createRedactionContext(); - context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; - const { output, remainingBuffer } = unredactStreamChunk("", "Key: { const context = createRedactionContext(); - context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; const { output, remainingBuffer } = unredactStreamChunk( - " done", + "[[SECRET_RED", + "ACTED_API_KEY_OPENAI_1]] done", context, ); @@ -301,7 +270,7 @@ describe("streaming unredact", () => { test("flushes remaining buffer", () => { const context = createRedactionContext(); - context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; const result = flushRedactionBuffer(" { describe("unredactResponse", () => { test("unredacts all choices in response", () => { const context = createRedactionContext(); - context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; const response = { id: "test", @@ -329,7 +298,7 @@ describe("unredactResponse", () => { index: 0, message: { role: "assistant" as const, - content: "Your key is ", + content: "Your key is [[SECRET_REDACTED_API_KEY_OPENAI_1]]", }, finish_reason: "stop" as const, }, @@ -342,7 +311,7 @@ describe("unredactResponse", () => { test("handles multiple choices", () => { const context = createRedactionContext(); - 
context.mapping[""] = sampleSecret; + context.mapping["[[SECRET_REDACTED_API_KEY_OPENAI_1]]"] = sampleSecret; const response = { id: "test", @@ -354,7 +323,7 @@ describe("unredactResponse", () => { index: 0, message: { role: "assistant" as const, - content: "Choice 1: ", + content: "Choice 1: [[SECRET_REDACTED_API_KEY_OPENAI_1]]", }, finish_reason: "stop" as const, }, @@ -362,7 +331,7 @@ describe("unredactResponse", () => { index: 1, message: { role: "assistant" as const, - content: "Choice 2: ", + content: "Choice 2: [[SECRET_REDACTED_API_KEY_OPENAI_1]]", }, finish_reason: "stop" as const, }, diff --git a/src/secrets/redact.ts b/src/secrets/redact.ts index 636f2fc..e4545c5 100644 --- a/src/secrets/redact.ts +++ b/src/secrets/redact.ts @@ -1,4 +1,4 @@ -import type { SecretsDetectionConfig } from "../config"; +import { findPartialPlaceholderStart, generateSecretPlaceholder } from "../constants/placeholders"; import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client"; import { extractTextContent } from "../utils/content"; import type { SecretsRedaction } from "./detect"; @@ -33,22 +33,15 @@ export function createRedactionContext(): RedactionContext { } /** - * Generates a placeholder for a secret type using configured format + * Generates a placeholder for a secret type * - * Format: configurable via `redact_placeholder`, default "" - * {N} is replaced with sequential number + * Format: [[SECRET_REDACTED_{TYPE}_{N}]] e.g. 
[[SECRET_REDACTED_API_KEY_OPENAI_1]] */ -function generatePlaceholder( - secretType: string, - context: RedactionContext, - config: SecretsDetectionConfig, -): string { +function generatePlaceholder(secretType: string, context: RedactionContext): string { const count = (context.counters[secretType] || 0) + 1; context.counters[secretType] = count; - // Use configured placeholder format, replace {N} with count - // Include type in the placeholder to make it unique per type - return config.redact_placeholder.replace("{N}", `${secretType}_${count}`); + return generateSecretPlaceholder(secretType, count); } /** @@ -59,13 +52,11 @@ function generatePlaceholder( * * @param text - The text to redact secrets from * @param redactions - Array of redaction positions (sorted by start position descending) - * @param config - Secrets detection configuration * @param context - Optional existing context to reuse (for multiple messages) */ export function redactSecrets( text: string, redactions: SecretsRedaction[], - config: SecretsDetectionConfig, context?: RedactionContext, ): RedactionResult { const ctx = context || createRedactionContext(); @@ -86,7 +77,7 @@ export function redactSecrets( let placeholder = ctx.reverseMapping[originalValue]; if (!placeholder) { - placeholder = generatePlaceholder(redaction.type, ctx, config); + placeholder = generatePlaceholder(redaction.type, ctx); ctx.mapping[placeholder] = originalValue; ctx.reverseMapping[originalValue] = placeholder; } @@ -133,19 +124,17 @@ export function unredactSecrets(text: string, context: RedactionContext): string * * @param messages - Chat messages to redact * @param redactionsByMessage - Redactions for each message (indexed by message position) - * @param config - Secrets detection configuration */ export function redactMessagesSecrets( messages: ChatMessage[], redactionsByMessage: SecretsRedaction[][], - config: SecretsDetectionConfig, ): { redacted: ChatMessage[]; context: RedactionContext } { const context = 
createRedactionContext(); const redacted = messages.map((msg, i) => { const redactions = redactionsByMessage[i] || []; const text = extractTextContent(msg.content); - const { redacted: redactedContent } = redactSecrets(text, redactions, config, context); + const { redacted: redactedContent } = redactSecrets(text, redactions, context); // If original content was a string, return redacted string // Otherwise return original content (arrays are handled in proxy.ts) @@ -168,24 +157,10 @@ export function unredactStreamChunk( ): { output: string; remainingBuffer: string } { const combined = buffer + newChunk; - // Find the last safe position to unredact (before any potential partial placeholder) - // Look for the start of any potential placeholder pattern - const placeholderStart = combined.lastIndexOf("<"); - - if (placeholderStart === -1) { - // No potential placeholder, safe to unredact everything - return { - output: unredactSecrets(combined, context), - remainingBuffer: "", - }; - } - - // Check if there's a complete placeholder after the last < - const afterStart = combined.slice(placeholderStart); - const hasCompletePlaceholder = afterStart.includes(">"); + const partialStart = findPartialPlaceholderStart(combined); - if (hasCompletePlaceholder) { - // The placeholder is complete, safe to unredact everything + if (partialStart === -1) { + // No partial placeholder, safe to unredact everything return { output: unredactSecrets(combined, context), remainingBuffer: "", @@ -193,8 +168,8 @@ export function unredactStreamChunk( } // Partial placeholder detected, buffer it - const safeToProcess = combined.slice(0, placeholderStart); - const toBuffer = combined.slice(placeholderStart); + const safeToProcess = combined.slice(0, partialStart); + const toBuffer = combined.slice(partialStart); return { output: unredactSecrets(safeToProcess, context), diff --git a/src/services/masking.test.ts b/src/services/masking.test.ts index 590b509..427fff3 100644 --- 
a/src/services/masking.test.ts +++ b/src/services/masking.test.ts @@ -36,8 +36,8 @@ describe("mask", () => { const result = mask("Contact: john@example.com please", entities); - expect(result.masked).toBe("Contact: please"); - expect(result.context.mapping[""]).toBe("john@example.com"); + expect(result.masked).toBe("Contact: [[EMAIL_ADDRESS_1]] please"); + expect(result.context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("john@example.com"); }); test("masks multiple entities of same type", () => { @@ -49,9 +49,9 @@ describe("mask", () => { const result = mask(text, entities); - expect(result.masked).toBe("Emails: and "); - expect(result.context.mapping[""]).toBe("a@b.com"); - expect(result.context.mapping[""]).toBe("c@d.com"); + expect(result.masked).toBe("Emails: [[EMAIL_ADDRESS_1]] and [[EMAIL_ADDRESS_2]]"); + expect(result.context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("a@b.com"); + expect(result.context.mapping["[[EMAIL_ADDRESS_2]]"]).toBe("c@d.com"); }); test("masks multiple entity types", () => { @@ -63,9 +63,9 @@ describe("mask", () => { const result = mask(text, entities); - expect(result.masked).toBe(": "); - expect(result.context.mapping[""]).toBe("Hans Müller"); - expect(result.context.mapping[""]).toBe("hans@firma.de"); + expect(result.masked).toBe("[[PERSON_1]]: [[EMAIL_ADDRESS_1]]"); + expect(result.context.mapping["[[PERSON_1]]"]).toBe("Hans Müller"); + expect(result.context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("hans@firma.de"); }); test("reuses placeholder for duplicate values", () => { @@ -78,7 +78,7 @@ describe("mask", () => { const result = mask(text, entities); // Same value should get same placeholder - expect(result.masked).toBe(" and again "); + expect(result.masked).toBe("[[EMAIL_ADDRESS_1]] and again [[EMAIL_ADDRESS_1]]"); expect(Object.keys(result.context.mapping)).toHaveLength(1); }); @@ -91,7 +91,7 @@ describe("mask", () => { const result = mask(text, entities); - expect(result.masked).toBe(""); + 
expect(result.masked).toBe("[[PERSON_1]][[PERSON_2]]"); }); test("preserves context across calls", () => { @@ -103,7 +103,7 @@ describe("mask", () => { context, ); - expect(result1.masked).toBe("Email: "); + expect(result1.masked).toBe("Email: [[EMAIL_ADDRESS_1]]"); const result2 = mask( "Another: c@d.com", @@ -112,9 +112,9 @@ describe("mask", () => { ); // Should continue numbering - expect(result2.masked).toBe("Another: "); - expect(context.mapping[""]).toBe("a@b.com"); - expect(context.mapping[""]).toBe("c@d.com"); + expect(result2.masked).toBe("Another: [[EMAIL_ADDRESS_2]]"); + expect(context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("a@b.com"); + expect(context.mapping["[[EMAIL_ADDRESS_2]]"]).toBe("c@d.com"); }); }); @@ -127,19 +127,19 @@ describe("unmask", () => { test("restores single placeholder", () => { const context = createMaskingContext(); - context.mapping[""] = "john@example.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "john@example.com"; - const result = unmask("Reply to ", context, defaultConfig); + const result = unmask("Reply to [[EMAIL_ADDRESS_1]]", context, defaultConfig); expect(result).toBe("Reply to john@example.com"); }); test("restores multiple placeholders", () => { const context = createMaskingContext(); - context.mapping[""] = "Hans Müller"; - context.mapping[""] = "hans@firma.de"; + context.mapping["[[PERSON_1]]"] = "Hans Müller"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "hans@firma.de"; const result = unmask( - "Hello , your email is confirmed", + "Hello [[PERSON_1]], your email [[EMAIL_ADDRESS_1]] is confirmed", context, defaultConfig, ); @@ -148,26 +148,26 @@ describe("unmask", () => { test("restores repeated placeholders", () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - const result = unmask(" and ", context, defaultConfig); + const result = unmask("[[EMAIL_ADDRESS_1]] and [[EMAIL_ADDRESS_1]]", context, defaultConfig); 
expect(result).toBe("test@test.com and test@test.com"); }); test("adds markers when configured", () => { const context = createMaskingContext(); - context.mapping[""] = "john@example.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "john@example.com"; - const result = unmask("Email: ", context, configWithMarkers); + const result = unmask("Email: [[EMAIL_ADDRESS_1]]", context, configWithMarkers); expect(result).toBe("Email: [protected]john@example.com"); }); test("handles partial placeholder (no match)", () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - const result = unmask("Text with ", context, defaultConfig); - expect(result).toBe("Text with "); // No match, unchanged + const result = unmask("Text with [[EMAIL_ADDRESS_2]]", context, defaultConfig); + expect(result).toBe("Text with [[EMAIL_ADDRESS_2]]"); // No match, unchanged }); }); @@ -188,7 +188,7 @@ describe("mask -> unmask roundtrip", () => { expect(masked).not.toContain("+49123456789"); // Simulate LLM response that echoes placeholders - const llmResponse = `I see your contact info: ${masked.match(//)?.[0]}, email ${masked.match(//)?.[0]}`; + const llmResponse = `I see your contact info: ${masked.match(/\[\[PERSON_1\]\]/)?.[0]}, email ${masked.match(/\[\[EMAIL_ADDRESS_1\]\]/)?.[0]}`; const unmasked = unmask(llmResponse, context, defaultConfig); @@ -221,12 +221,12 @@ describe("maskMessages", () => { const { masked, context } = maskMessages(messages, entitiesByMessage); - expect(masked[0].content).toBe("My email is "); + expect(masked[0].content).toBe("My email is [[EMAIL_ADDRESS_1]]"); expect(masked[1].content).toBe("Got it"); - expect(masked[2].content).toBe("Also "); + expect(masked[2].content).toBe("Also [[EMAIL_ADDRESS_2]]"); - expect(context.mapping[""]).toBe("test@example.com"); - expect(context.mapping[""]).toBe("john@test.com"); + 
expect(context.mapping["[[EMAIL_ADDRESS_1]]"]).toBe("test@example.com"); + expect(context.mapping["[[EMAIL_ADDRESS_2]]"]).toBe("john@test.com"); }); test("preserves message roles", () => { @@ -245,11 +245,11 @@ describe("maskMessages", () => { describe("streaming unmask", () => { test("unmasks complete placeholder in chunk", () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; const { output, remainingBuffer } = unmaskStreamChunk( "", - "Hello !", + "Hello [[EMAIL_ADDRESS_1]]!", context, defaultConfig, ); @@ -260,26 +260,26 @@ describe("streaming unmask", () => { test("buffers partial placeholder", () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; const { output, remainingBuffer } = unmaskStreamChunk( "", - "Hello { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; const { output, remainingBuffer } = unmaskStreamChunk( - " there", + "[[EMAIL_ADD", + "RESS_1]] there", context, defaultConfig, ); @@ -304,21 +304,21 @@ describe("streaming unmask", () => { test("flushes remaining buffer", () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; // Partial that never completes - const flushed = flushStreamBuffer(" { test("unmasks all choices in response", () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; - context.mapping[""] = "John Doe"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; + context.mapping["[[PERSON_1]]"] = "John Doe"; const response = { id: "chatcmpl-123", @@ -330,7 +330,7 @@ describe("unmaskResponse", () => { index: 0, message: { role: "assistant" as const, - content: "Contact at ", + content: "Contact [[PERSON_1]] at 
[[EMAIL_ADDRESS_1]]", }, finish_reason: "stop" as const, }, @@ -351,7 +351,7 @@ describe("unmaskResponse", () => { test("handles multiple choices", () => { const context = createMaskingContext(); - context.mapping[""] = "a@b.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "a@b.com"; const response = { id: "chatcmpl-456", @@ -361,12 +361,12 @@ describe("unmaskResponse", () => { choices: [ { index: 0, - message: { role: "assistant" as const, content: "First: " }, + message: { role: "assistant" as const, content: "First: [[EMAIL_ADDRESS_1]]" }, finish_reason: "stop" as const, }, { index: 1, - message: { role: "assistant" as const, content: "Second: " }, + message: { role: "assistant" as const, content: "Second: [[EMAIL_ADDRESS_1]]" }, finish_reason: "stop" as const, }, ], @@ -411,7 +411,7 @@ describe("edge cases", () => { const entities: PIIEntity[] = [{ entity_type: "PERSON", start: 9, end: 24, score: 0.9 }]; const { masked, context } = mask(text, entities); - expect(masked).toBe("Kontakt: "); + expect(masked).toBe("Kontakt: [[PERSON_1]]"); const unmasked = unmask(masked, context, defaultConfig); expect(unmasked).toBe("Kontakt: François Müller"); @@ -425,9 +425,145 @@ describe("edge cases", () => { test("handles placeholder-like text that is not a real placeholder", () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - const result = unmask("Use format", context, defaultConfig); - expect(result).toBe("Use format"); + const result = unmask("Use [[UNKNOWN_1]] format", context, defaultConfig); + expect(result).toBe("Use [[UNKNOWN_1]] format"); + }); +}); + +describe("HTML context handling (issue #36)", () => { + test("unmasks placeholders in HTML without encoding issues", () => { + // With [[]] format, placeholders are not affected by HTML encoding + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "Dr. 
Sarah Chen"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "sarah.chen@hospital.org"; + + // [[]] brackets don't get HTML-encoded, so they work directly + const htmlResponse = `

Contact [[PERSON_1]] at [[EMAIL_ADDRESS_1]]

`; + + const result = unmask(htmlResponse, context, defaultConfig); + + expect(result).toBe("

Contact Dr. Sarah Chen at sarah.chen@hospital.org

"); + }); + + test("unmasks placeholders in HTML title attributes", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "Jane Smith"; + + // [[]] works in HTML attributes without encoding + const htmlWithAttr = `Click here`; + + const result = unmask(htmlWithAttr, context, defaultConfig); + + expect(result).toBe(`Click here`); + }); + + test("unmasks placeholders in mailto links", () => { + const context = createMaskingContext(); + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@example.com"; + + const mailtoHtml = `Send email`; + + const result = unmask(mailtoHtml, context, defaultConfig); + + expect(result).toBe(`Send email`); + }); + + test("handles multiple occurrences of same placeholder in HTML", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "Alice"; + + const response = `

[[PERSON_1]] said hello.

[[PERSON_1]] waved goodbye.

`; + + const result = unmask(response, context, defaultConfig); + + expect(result).toBe("

Alice said hello.

Alice waved goodbye.

"); + }); + + test("works with complex HTML structures", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "Dr. Sarah Chen"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "sarah@hospital.org"; + context.mapping["[[PHONE_NUMBER_1]]"] = "+1-555-0123"; + + const complexHtml = ` +
+

[[PERSON_1]]

+ [[EMAIL_ADDRESS_1]] + Call: [[PHONE_NUMBER_1]] +
+ `; + + const result = unmask(complexHtml, context, defaultConfig); + + expect(result).toContain("Dr. Sarah Chen"); + expect(result).toContain("sarah@hospital.org"); + expect(result).toContain("+1-555-0123"); + expect(result).not.toContain("[["); + expect(result).not.toContain("]]"); + }); +}); + +describe("streaming with [[]] placeholders (issue #36)", () => { + test("handles complete placeholder in chunk", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "John Doe"; + + const { output, remainingBuffer } = unmaskStreamChunk( + "", + "Hello [[PERSON_1]]!", + context, + defaultConfig, + ); + + expect(output).toBe("Hello John Doe!"); + expect(remainingBuffer).toBe(""); + }); + + test("buffers partial placeholder at end of chunk", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "John Doe"; + + // Partial placeholder at end: [[PERS + const { output, remainingBuffer } = unmaskStreamChunk( + "", + "Hello [[PERS", + context, + defaultConfig, + ); + + expect(output).toBe("Hello "); + expect(remainingBuffer).toBe("[[PERS"); + }); + + test("completes buffered placeholder across chunks", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "John Doe"; + + const { output, remainingBuffer } = unmaskStreamChunk( + "[[PERS", + "ON_1]] there", + context, + defaultConfig, + ); + + expect(output).toBe("John Doe there"); + expect(remainingBuffer).toBe(""); + }); + + test("handles placeholder split at closing brackets", () => { + const context = createMaskingContext(); + context.mapping["[[PERSON_1]]"] = "John Doe"; + + // First chunk ends with incomplete closing + const result1 = unmaskStreamChunk("", "Hello [[PERSON_1]", context, defaultConfig); + expect(result1.output).toBe("Hello "); + expect(result1.remainingBuffer).toBe("[[PERSON_1]"); + + // Second chunk completes it + const result2 = unmaskStreamChunk(result1.remainingBuffer, "] world", context, defaultConfig); + 
expect(result2.output).toBe("John Doe world"); + expect(result2.remainingBuffer).toBe(""); }); }); diff --git a/src/services/masking.ts b/src/services/masking.ts index 2b10031..a2dd308 100644 --- a/src/services/masking.ts +++ b/src/services/masking.ts @@ -1,4 +1,9 @@ import type { MaskingConfig } from "../config"; +import { + findPartialPlaceholderStart, + generatePlaceholder as generatePlaceholderFromFormat, + PII_PLACEHOLDER_FORMAT, +} from "../constants/placeholders"; import { extractTextContent } from "../utils/content"; import type { ChatCompletionResponse, ChatMessage } from "./llm-client"; import type { PIIEntity } from "./pii-detector"; @@ -25,8 +30,6 @@ export function createMaskingContext(): MaskingContext { }; } -const PLACEHOLDER_FORMAT = "<{TYPE}_{N}>"; - /** * Generates a placeholder for a PII entity type */ @@ -34,7 +37,7 @@ function generatePlaceholder(entityType: string, context: MaskingContext): strin const count = (context.counters[entityType] || 0) + 1; context.counters[entityType] = count; - return PLACEHOLDER_FORMAT.replace("{TYPE}", entityType).replace("{N}", String(count)); + return generatePlaceholderFromFormat(PII_PLACEHOLDER_FORMAT, entityType, count); } /** @@ -183,24 +186,10 @@ export function unmaskStreamChunk( ): { output: string; remainingBuffer: string } { const combined = buffer + newChunk; - // Find the last safe position to unmask (before any potential partial placeholder) - // Look for the start of any potential placeholder pattern - const placeholderStart = combined.lastIndexOf("<"); - - if (placeholderStart === -1) { - // No potential placeholder, safe to unmask everything - return { - output: unmask(combined, context, config), - remainingBuffer: "", - }; - } - - // Check if there's a complete placeholder after the last < - const afterStart = combined.slice(placeholderStart); - const hasCompletePlaceholder = afterStart.includes(">"); + const partialStart = findPartialPlaceholderStart(combined); - if (hasCompletePlaceholder) { 
- // The placeholder is complete, safe to unmask everything + if (partialStart === -1) { + // No partial placeholder, safe to unmask everything return { output: unmask(combined, context, config), remainingBuffer: "", @@ -208,8 +197,8 @@ export function unmaskStreamChunk( } // Partial placeholder detected, buffer it - const safeToProcess = combined.slice(0, placeholderStart); - const toBuffer = combined.slice(placeholderStart); + const safeToProcess = combined.slice(0, partialStart); + const toBuffer = combined.slice(partialStart); return { output: unmask(safeToProcess, context, config), diff --git a/src/services/stream-transformer.test.ts b/src/services/stream-transformer.test.ts index 0e43ef2..b79cc56 100644 --- a/src/services/stream-transformer.test.ts +++ b/src/services/stream-transformer.test.ts @@ -47,9 +47,9 @@ async function consumeStream(stream: ReadableStream): Promise { test("unmasks complete placeholder in single chunk", async () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; - const sseData = `data: {"choices":[{"delta":{"content":"Hello !"}}]}\n\n`; + const sseData = `data: {"choices":[{"delta":{"content":"Hello [[EMAIL_ADDRESS_1]]!"}}]}\n\n`; const source = createSSEStream([sseData]); const unmaskedStream = createUnmaskingStream(source, context, defaultConfig); @@ -84,12 +84,12 @@ describe("createUnmaskingStream", () => { test("buffers partial placeholder across chunks", async () => { const context = createMaskingContext(); - context.mapping[""] = "a@b.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "a@b.com"; // Split placeholder across chunks const chunks = [ - `data: {"choices":[{"delta":{"content":"Hello world"}}]}\n\n`, + `data: {"choices":[{"delta":{"content":"Hello [[EMAIL_"}}]}\n\n`, + `data: {"choices":[{"delta":{"content":"ADDRESS_1]] world"}}]}\n\n`, ]; const source = createSSEStream(chunks); @@ -102,10 +102,10 @@ 
describe("createUnmaskingStream", () => { test("flushes remaining buffer on stream end", async () => { const context = createMaskingContext(); - context.mapping[""] = "test@test.com"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "test@test.com"; // Partial placeholder that completes only on flush - const chunks = [`data: {"choices":[{"delta":{"content":"Contact "}}]}\n\n`]; + const chunks = [`data: {"choices":[{"delta":{"content":"Contact [[EMAIL_ADDRESS_1]]"}}]}\n\n`]; const source = createSSEStream(chunks); const unmaskedStream = createUnmaskingStream(source, context, defaultConfig); @@ -116,10 +116,10 @@ describe("createUnmaskingStream", () => { test("handles multiple placeholders in stream", async () => { const context = createMaskingContext(); - context.mapping[""] = "John"; - context.mapping[""] = "john@test.com"; + context.mapping["[[PERSON_1]]"] = "John"; + context.mapping["[[EMAIL_ADDRESS_1]]"] = "john@test.com"; - const sseData = `data: {"choices":[{"delta":{"content":": "}}]}\n\n`; + const sseData = `data: {"choices":[{"delta":{"content":"[[PERSON_1]]: [[EMAIL_ADDRESS_1]]"}}]}\n\n`; const source = createSSEStream([sseData]); const unmaskedStream = createUnmaskingStream(source, context, defaultConfig); diff --git a/src/views/dashboard/page.tsx b/src/views/dashboard/page.tsx index a0c7e3f..d1eb1ad 100644 --- a/src/views/dashboard/page.tsx +++ b/src/views/dashboard/page.tsx @@ -525,7 +525,7 @@ function formatMaskedPreview(maskedContent, entities) { .replace(/&/g, '&') .replace(//g, '>') - .replace(/<([A-Z_]+_\\d+)>/g, '<$1>'); + .replace(/\\[\\[([A-Z_]+_\\d+)\\]\\]/g, '[[$1]]'); } if (!entities || entities.length === 0) { return 'No PII detected in this request';