From: Stefan Gasser Date: Fri, 16 Jan 2026 14:09:32 +0000 (+0100) Subject: Fix overlapping entity masking (#33) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=c8cc3cfc3449336bf8005b1445a9ed1a677ddcee;p=sgasser-llm-shield.git Fix overlapping entity masking (#33) Presidio can return overlapping PII detections (e.g., both "Eric" and "Eric's" as separate PERSON entities). When both were masked, the string positions became invalid, corrupting the output. Added removeOverlappingEntities() to filter overlaps before masking: - Sort by start position (longer first if same start) - Keep only non-overlapping entities --- diff --git a/src/secrets/redact.ts b/src/secrets/redact.ts index e4545c5..4fba7b1 100644 --- a/src/secrets/redact.ts +++ b/src/secrets/redact.ts @@ -1,6 +1,7 @@ import { findPartialPlaceholderStart, generateSecretPlaceholder } from "../constants/placeholders"; import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client"; import { extractTextContent } from "../utils/content"; +import { removeOverlappingEntities } from "../utils/entities"; import type { SecretsRedaction } from "./detect"; /** @@ -65,8 +66,11 @@ export function redactSecrets( return { redacted: text, context: ctx }; } + // Remove overlapping redactions to prevent text corruption + const nonOverlapping = removeOverlappingEntities(redactions); + // First pass: sort by start position ascending to assign placeholders in order of appearance - const sortedByStart = [...redactions].sort((a, b) => a.start - b.start); + const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start); // Assign placeholders in order of appearance const redactionPlaceholders = new Map(); @@ -86,8 +90,7 @@ export function redactSecrets( } // Second pass: replace from end to start to maintain correct string positions - // Redactions should already be sorted by start descending, but re-sort to be safe - const sortedByEnd = [...redactions].sort((a, b) => b.start - a.start); + const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start); let result = text; for (const redaction of sortedByEnd) { diff --git a/src/services/masking.test.ts b/src/services/masking.test.ts index 427fff3..1194d02 100644 --- a/src/services/masking.test.ts +++ b/src/services/masking.test.ts @@ -567,3 +567,89 @@ describe("streaming with [[]] placeholders (issue #36)", () => { expect(result2.remainingBuffer).toBe(""); }); }); + +describe("overlapping entities (issue #33)", () => { + test("handles overlapping entities with same start - keeps longer", () => { + // Bug: Presidio returns both "Eric" and "Eric's" as separate PERSON entities + const text = "Given Eric's feedback"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 6, end: 10, score: 0.85 }, // "Eric" + { entity_type: "PERSON", start: 6, end: 12, score: 0.8 }, // "Eric's" + ]; + + const { masked, context } = mask(text, entities); + + // Longer span wins when same start position + expect(masked).toBe("Given feedback"); + expect(context.mapping[""]).toBe("Eric's"); + }); + + test("handles partially overlapping entities - keeps first", () => { + const text = "Contact John Smith Jones please"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 8, end: 18, score: 0.9 }, // "John Smith" + { entity_type: "PERSON", start: 13, end: 25, score: 0.7 }, // "Smith Jones" + ]; + + const { masked } = mask(text, entities); + + // First by position wins + expect(masked).toBe("Contact Jones please"); + }); + + test("handles nested entities - keeps outer (starts first)", () => { + const text = "Dr. John Smith is here"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 0, end: 14, score: 0.9 }, // "Dr. John Smith" + { entity_type: "PERSON", start: 4, end: 8, score: 0.85 }, // "John" + ]; + + const { masked } = mask(text, entities); + + expect(masked).toBe(" is here"); + }); + + test("keeps adjacent non-overlapping entities", () => { + const text = "HansMüller"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, // "Hans" + { entity_type: "PERSON", start: 4, end: 10, score: 0.9 }, // "Müller" + ]; + + const { masked } = mask(text, entities); + + expect(masked).toBe(""); + }); + + test("handles multiple independent overlap groups", () => { + const text = "Laura Smith met Eric's friend Bob Jones Jr"; + const entities: PIIEntity[] = [ + // Group 1: same start - longer wins + { entity_type: "PERSON", start: 0, end: 5, score: 0.85 }, // "Laura" + { entity_type: "PERSON", start: 0, end: 11, score: 0.9 }, // "Laura Smith" + // Group 2: same start - longer wins + { entity_type: "PERSON", start: 16, end: 20, score: 0.85 }, // "Eric" + { entity_type: "PERSON", start: 16, end: 22, score: 0.8 }, // "Eric's" + // Group 3: same start - longer wins + { entity_type: "PERSON", start: 30, end: 33, score: 0.7 }, // "Bob" + { entity_type: "PERSON", start: 30, end: 42, score: 0.9 }, // "Bob Jones Jr" + ]; + + const { masked } = mask(text, entities); + + expect(masked).toBe(" met friend "); + }); + + test("entity consistency - same value gets same placeholder", () => { + const text = "Eric met Eric again"; + const entities: PIIEntity[] = [ + { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, // "Eric" + { entity_type: "PERSON", start: 9, end: 13, score: 0.9 }, // "Eric" + ]; + + const { masked, context } = mask(text, entities); + + expect(masked).toBe(" met again"); + expect(Object.keys(context.mapping)).toHaveLength(1); + }); +}); diff --git a/src/services/masking.ts b/src/services/masking.ts index a2dd308..d07bd4b 100644 --- a/src/services/masking.ts +++ b/src/services/masking.ts @@ -5,6 +5,7 @@ import { PII_PLACEHOLDER_FORMAT, } from "../constants/placeholders"; import { extractTextContent } from "../utils/content"; +import { removeOverlappingEntities } from "../utils/entities"; import type { ChatCompletionResponse, ChatMessage } from "./llm-client"; import type { PIIEntity } from "./pii-detector"; @@ -53,8 +54,12 @@ export function mask(text: string, entities: PIIEntity[], context?: MaskingConte return { masked: text, context: ctx }; } + // Remove overlapping entities to prevent text corruption + // Presidio can return both "Eric" and "Eric's" as separate entities + const nonOverlapping = removeOverlappingEntities(entities); + // First pass: sort by start position ascending to assign placeholders in order - const sortedByStart = [...entities].sort((a, b) => a.start - b.start); + const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start); // Assign placeholders in order of appearance const entityPlaceholders = new Map(); @@ -75,7 +80,7 @@ export function mask(text: string, entities: PIIEntity[], context?: MaskingConte // Second pass: sort by start position descending for replacement // This ensures string indices remain valid as we replace - const sortedByEnd = [...entities].sort((a, b) => b.start - a.start); + const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start); let result = text; for (const entity of sortedByEnd) { diff --git a/src/utils/entities.test.ts b/src/utils/entities.test.ts new file mode 100644 index 0000000..34cc04b --- /dev/null +++ b/src/utils/entities.test.ts @@ -0,0 +1,84 @@ +import { describe, expect, test } from "bun:test"; +import { removeOverlappingEntities } from "./entities"; + +describe("removeOverlappingEntities", () => { + test("returns empty array for empty input", () => { + expect(removeOverlappingEntities([])).toEqual([]); + }); + + test("returns single entity unchanged", () => { + const entities = [{ start: 0, end: 5 }]; + expect(removeOverlappingEntities(entities)).toEqual(entities); + }); + + test("keeps non-overlapping entities", () => { + const entities = [ + { start: 0, end: 5 }, + { start: 10, end: 15 }, + ]; + expect(removeOverlappingEntities(entities)).toEqual(entities); + }); + + test("keeps adjacent entities (not overlapping)", () => { + const entities = [ + { start: 0, end: 4 }, + { start: 4, end: 9 }, + ]; + expect(removeOverlappingEntities(entities)).toEqual(entities); + }); + + test("removes overlapping entity with same start - keeps longer", () => { + // "Eric" vs "Eric's" - both start at 6 + const entities = [ + { start: 6, end: 10 }, // "Eric" + { start: 6, end: 12 }, // "Eric's" + ]; + const result = removeOverlappingEntities(entities); + expect(result).toHaveLength(1); + expect(result[0].end).toBe(12); // longer one kept + }); + + test("removes partially overlapping entity", () => { + const entities = [ + { start: 0, end: 10 }, + { start: 5, end: 15 }, + ]; + const result = removeOverlappingEntities(entities); + expect(result).toHaveLength(1); + expect(result[0].start).toBe(0); + }); + + test("removes nested entity", () => { + const entities = [ + { start: 0, end: 14 }, // "Dr. John Smith" + { start: 4, end: 8 }, // "John" + ]; + const result = removeOverlappingEntities(entities); + expect(result).toHaveLength(1); + expect(result[0].end).toBe(14); // outer kept + }); + + test("handles multiple overlap groups", () => { + const entities = [ + { start: 0, end: 5 }, + { start: 2, end: 7 }, // overlaps with first + { start: 20, end: 25 }, + { start: 22, end: 28 }, // overlaps with third + ]; + const result = removeOverlappingEntities(entities); + expect(result).toHaveLength(2); + expect(result[0].start).toBe(0); + expect(result[1].start).toBe(20); + }); + + test("preserves additional entity properties", () => { + const entities = [ + { start: 0, end: 5, entity_type: "PERSON", score: 0.9 }, + { start: 2, end: 7, entity_type: "PERSON", score: 0.8 }, + ]; + const result = removeOverlappingEntities(entities); + expect(result).toHaveLength(1); + expect(result[0].entity_type).toBe("PERSON"); + expect(result[0].score).toBe(0.9); + }); +}); diff --git a/src/utils/entities.ts b/src/utils/entities.ts new file mode 100644 index 0000000..edcdbd2 --- /dev/null +++ b/src/utils/entities.ts @@ -0,0 +1,32 @@ +/** + * Removes overlapping entities to prevent text corruption during masking + * + * When entities overlap (e.g., "Eric" and "Eric's" both detected), applying + * both replacements corrupts the text. This function keeps only non-overlapping + * entities, preferring longer spans when they share the same start position. + */ +export function removeOverlappingEntities( + entities: T[], +): T[] { + if (entities.length <= 1) return entities; + + // Sort by start position, then by length descending (longer = more specific) + const sorted = [...entities].sort((a, b) => { + if (a.start !== b.start) return a.start - b.start; + return b.end - b.start - (a.end - a.start); + }); + + const result: T[] = [sorted[0]]; + + for (let i = 1; i < sorted.length; i++) { + const current = sorted[i]; + const last = result[result.length - 1]; + + // Keep only if no overlap with last kept entity + if (current.start >= last.end) { + result.push(current); + } + } + + return result; +}