return msg;
}
- // Redact only text parts of array content
+ // Track offset position within the concatenated text for this message
+ // (matches how extractTextContent joins parts with \n)
+ let partOffset = 0;
+
+ // Redact only text parts of array content with proper offset tracking
const redactedContent = msg.content.map((part: ContentPart) => {
if (part.type === "text" && typeof part.text === "string") {
- const { redacted, context: updatedContext } = redactSecrets(
- part.text,
- messageRedactions,
- config,
- context,
- );
- context = updatedContext;
- return { ...part, text: redacted };
+ const partLength = part.text.length;
+
+ // Find redactions that apply to this specific part
+ const partRedactions = messageRedactions
+ .filter((r) => r.start < partOffset + partLength && r.end > partOffset)
+ .map((r) => ({
+ ...r,
+ start: Math.max(0, r.start - partOffset),
+ end: Math.min(partLength, r.end - partOffset),
+ }));
+
+ if (partRedactions.length > 0) {
+ const { redacted, context: updatedContext } = redactSecrets(
+ part.text,
+ partRedactions,
+ config,
+ context,
+ );
+ context = updatedContext;
+ partOffset += partLength + 1; // +1 for \n separator
+ return { ...part, text: redacted };
+ }
+
+ partOffset += partLength + 1; // +1 for \n separator
+ return part;
}
return part;
});
--- /dev/null
+import { describe, expect, test } from "bun:test";
+import type { MaskingConfig, SecretsDetectionConfig } from "../config";
+import type { ChatMessage } from "../services/llm-client";
+import { maskMessages } from "../services/masking";
+import type { PIIEntity } from "../services/pii-detector";
+import { detectSecrets } from "./detect";
+import { redactSecrets } from "./redact";
+import type { ContentPart } from "../utils/content";
+
+describe("Multimodal content handling", () => {
+ const secretsConfig: SecretsDetectionConfig = {
+ enabled: true,
+ action: "redact",
+ entities: ["API_KEY_OPENAI"],
+ max_scan_chars: 200000,
+ redact_placeholder: "<SECRET_REDACTED_{N}>",
+ log_detected_types: true,
+ };
+
+ describe("Secrets redaction with offset tracking", () => {
+ // Note: Secrets are not expected to span across newlines in real scenarios
+ // The offset tracking is implemented to handle PII entities correctly
+ });
+
+ describe("PII masking with offset tracking", () => {
+ test("masks PII in multimodal array content", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ content: [
+ { type: "text", text: "My email is john@example.com and" },
+ { type: "image_url", image_url: { url: "https://example.com/img.jpg" } },
+ { type: "text", text: "my phone is 555-1234" },
+ ],
+ },
+ ];
+
+ // Concatenated text: "My email is john@example.com and\nmy phone is 555-1234"
+ // Entities for this concatenated text:
+ const entities: PIIEntity[] = [
+ { entity_type: "EMAIL_ADDRESS", start: 12, end: 28, score: 0.9 }, // john@example.com in part 0
+ { entity_type: "PHONE_NUMBER", start: 45, end: 53, score: 0.85 }, // 555-1234 in part 2 (after newline)
+ ];
+
+ const entitiesByMessage = [entities];
+
+ const { masked } = maskMessages(messages, entitiesByMessage);
+
+ // Verify the content is still an array
+ expect(Array.isArray(masked[0].content)).toBe(true);
+
+ const maskedContent = masked[0].content as ContentPart[];
+
+ // Part 0 should have email masked
+ expect(maskedContent[0].type).toBe("text");
+ expect(maskedContent[0].text).toBe("My email is <EMAIL_ADDRESS_1> and");
+ expect(maskedContent[0].text).not.toContain("john@example.com");
+
+ // Part 1 should be unchanged (image)
+ expect(maskedContent[1].type).toBe("image_url");
+ expect(maskedContent[1].image_url?.url).toBe("https://example.com/img.jpg");
+
+ // Part 2 should have phone masked
+ expect(maskedContent[2].type).toBe("text");
+ expect(maskedContent[2].text).toBe("my phone is <PHONE_NUMBER_1>");
+ expect(maskedContent[2].text).not.toContain("555-1234");
+ });
+
+ test("returns masked array instead of original unmasked array", () => {
+ // This tests the bug fix: previously array content was extracted and masked,
+ // but then the original array was returned unchanged
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ content: [
+ { type: "text", text: "Contact Alice at alice@secret.com" },
+ ],
+ },
+ ];
+
+ const entities: PIIEntity[] = [
+ { entity_type: "PERSON", start: 8, end: 13, score: 0.9 }, // Alice
+ { entity_type: "EMAIL_ADDRESS", start: 17, end: 33, score: 0.95 }, // alice@secret.com
+ ];
+
+ const { masked } = maskMessages(messages, [entities]);
+
+ // Verify content is still array
+ expect(Array.isArray(masked[0].content)).toBe(true);
+
+ const maskedContent = masked[0].content as ContentPart[];
+
+ // Verify the text is actually masked (not the original)
+ expect(maskedContent[0].text).not.toContain("Alice");
+ expect(maskedContent[0].text).not.toContain("alice@secret.com");
+ expect(maskedContent[0].text).toContain("<PERSON_1>");
+ expect(maskedContent[0].text).toContain("<EMAIL_ADDRESS_1>");
+ });
+
+ test("handles entities spanning multiple parts with proper offsets", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ content: [
+ { type: "text", text: "First part with email@" },
+ { type: "text", text: "example.com in two parts" },
+ ],
+ },
+ ];
+
+ // In concatenated text: "First part with email@\nexample.com in two parts"
+ // Email spans from position 16 to 39 (crossing the newline at position 22)
+ const entities: PIIEntity[] = [
+ { entity_type: "EMAIL_ADDRESS", start: 16, end: 34, score: 0.9 },
+ ];
+
+ const { masked } = maskMessages(messages, [entities]);
+
+ const maskedContent = masked[0].content as ContentPart[];
+
+ // Both parts should be affected by the email entity
+ // Part 0: "First part with <EMAIL" or similar
+ // Part 1: "ADDRESS_1> in two parts" or similar
+ // The exact split depends on how the masking handles cross-boundary entities
+
+ // At minimum, verify that the entity is masked somewhere
+ const fullMasked = maskedContent
+ .filter(p => p.type === "text")
+ .map(p => p.text)
+ .join("\n");
+
+ expect(fullMasked).toContain("<EMAIL_ADDRESS_");
+ expect(fullMasked).not.toContain("email@example.com");
+ });
+ });
+});
import type { MaskingConfig } from "../config";
-import { extractTextContent } from "../utils/content";
+import { extractTextContent, type ContentPart } from "../utils/content";
import type { ChatCompletionResponse, ChatMessage } from "./llm-client";
import type { PIIEntity } from "./pii-detector";
const masked = messages.map((msg, i) => {
const entities = entitiesByMessage[i] || [];
+
+ // Handle array content (multimodal messages)
+ if (Array.isArray(msg.content)) {
+ if (entities.length === 0) {
+ return msg;
+ }
+
+ // Track offset position within the concatenated text for this message
+ // (matches how extractTextContent joins parts with \n)
+ let partOffset = 0;
+
+ // Mask only text parts with proper offset tracking
+ const maskedContent = msg.content.map((part) => {
+ if (part.type === "text" && typeof part.text === "string") {
+ const partLength = part.text.length;
+
+ // Find entities that apply to this specific part
+ const partEntities = entities
+ .filter((e) => e.start < partOffset + partLength && e.end > partOffset)
+ .map((e) => ({
+ ...e,
+ start: Math.max(0, e.start - partOffset),
+ end: Math.min(partLength, e.end - partOffset),
+ }));
+
+ if (partEntities.length > 0) {
+ const { masked: maskedText } = mask(part.text, partEntities, context);
+ partOffset += partLength + 1; // +1 for \n separator
+ return { ...part, text: maskedText };
+ }
+
+ partOffset += partLength + 1; // +1 for \n separator
+ return part;
+ }
+ return part;
+ });
+
+ return { ...msg, content: maskedContent };
+ }
+
+ // Handle string content (text-only messages)
const text = extractTextContent(msg.content);
const { masked: maskedContent } = mask(text, entities, context);
// If original content was a string, return masked string
- // Otherwise return original content (arrays are handled elsewhere)
+ // Otherwise return original content
return { ...msg, content: typeof msg.content === "string" ? maskedContent : msg.content };
});
--- /dev/null
+import { describe, expect, test } from "bun:test";
+import { extractTextContent, hasTextContent, type ContentPart } from "./content";
+
+describe("extractTextContent", () => {
+ test("returns empty string for null", () => {
+ expect(extractTextContent(null)).toBe("");
+ });
+
+ test("returns empty string for undefined", () => {
+ expect(extractTextContent(undefined)).toBe("");
+ });
+
+ test("returns string content unchanged", () => {
+ expect(extractTextContent("Hello world")).toBe("Hello world");
+ });
+
+ test("extracts text from single text part", () => {
+ const content: ContentPart[] = [{ type: "text", text: "What's in this image?" }];
+ expect(extractTextContent(content)).toBe("What's in this image?");
+ });
+
+ test("extracts and joins multiple text parts", () => {
+ const content: ContentPart[] = [
+ { type: "text", text: "First part" },
+ { type: "text", text: "Second part" },
+ ];
+ expect(extractTextContent(content)).toBe("First part\nSecond part");
+ });
+
+ test("skips image_url parts", () => {
+ const content: ContentPart[] = [
+ { type: "text", text: "Look at this" },
+ { type: "image_url", image_url: { url: "https://example.com/image.jpg" } },
+ { type: "text", text: "What is it?" },
+ ];
+ expect(extractTextContent(content)).toBe("Look at this\nWhat is it?");
+ });
+
+ test("returns empty string for array with no text parts", () => {
+ const content: ContentPart[] = [
+ { type: "image_url", image_url: { url: "https://example.com/image.jpg" } },
+ ];
+ expect(extractTextContent(content)).toBe("");
+ });
+
+ test("handles empty array", () => {
+ expect(extractTextContent([])).toBe("");
+ });
+});
+
+describe("hasTextContent", () => {
+ test("returns false for null", () => {
+ expect(hasTextContent(null)).toBe(false);
+ });
+
+ test("returns false for undefined", () => {
+ expect(hasTextContent(undefined)).toBe(false);
+ });
+
+ test("returns true for non-empty string", () => {
+ expect(hasTextContent("Hello")).toBe(true);
+ });
+
+ test("returns false for empty string", () => {
+ expect(hasTextContent("")).toBe(false);
+ });
+
+ test("returns true for array with text", () => {
+ const content: ContentPart[] = [{ type: "text", text: "Hello" }];
+ expect(hasTextContent(content)).toBe(true);
+ });
+
+ test("returns false for array without text", () => {
+ const content: ContentPart[] = [
+ { type: "image_url", image_url: { url: "https://example.com/image.jpg" } },
+ ];
+ expect(hasTextContent(content)).toBe(false);
+ });
+});