From: Stefan Gasser Date: Fri, 16 Jan 2026 16:11:50 +0000 (+0100) Subject: Fix overlapping entity conflict resolution (#33) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=dbb221a2130e1a97518d7a39abc75d4733873758;p=sgasser-llm-shield.git Fix overlapping entity conflict resolution (#33) Implement Presidio-style conflict resolution for overlapping PII entities. Problem: Presidio returns overlapping entities (e.g., both "Eric" and "Eric's") which caused text corruption during masking. Solution: Two-phase algorithm matching Presidio's Anonymizer logic: 1. Merge overlapping entities of same type (expand boundaries, keep highest score) 2. Remove conflicting entities of different types (contained or lower score loses) - Add conflict-resolver.ts with resolveConflicts() and resolveConflictsSimple() - Replace old removeOverlappingEntities with new conflict resolver - Add 18 tests covering all conflict scenarios - Delete old entities.ts (replaced by conflict-resolver.ts) --- diff --git a/src/secrets/redact.ts b/src/secrets/redact.ts index 4fba7b1..d460a62 100644 --- a/src/secrets/redact.ts +++ b/src/secrets/redact.ts @@ -1,7 +1,7 @@ import { findPartialPlaceholderStart, generateSecretPlaceholder } from "../constants/placeholders"; import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client"; +import { resolveConflictsSimple } from "../utils/conflict-resolver"; import { extractTextContent } from "../utils/content"; -import { removeOverlappingEntities } from "../utils/entities"; import type { SecretsRedaction } from "./detect"; /** @@ -66,11 +66,11 @@ export function redactSecrets( return { redacted: text, context: ctx }; } - // Remove overlapping redactions to prevent text corruption - const nonOverlapping = removeOverlappingEntities(redactions); + // Resolve conflicts between overlapping redactions + const resolved = resolveConflictsSimple(redactions); // First pass: sort by start position ascending to assign placeholders in order of appearance - const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start); + const sortedByStart = [...resolved].sort((a, b) => a.start - b.start); // Assign placeholders in order of appearance const redactionPlaceholders = new Map(); @@ -90,7 +90,7 @@ export function redactSecrets( } // Second pass: replace from end to start to maintain correct string positions - const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start); + const sortedByEnd = [...resolved].sort((a, b) => b.start - a.start); let result = text; for (const redaction of sortedByEnd) { diff --git a/src/services/masking.test.ts b/src/services/masking.test.ts index 1194d02..bbfa58e 100644 --- a/src/services/masking.test.ts +++ b/src/services/masking.test.ts @@ -580,11 +580,11 @@ describe("overlapping entities (issue #33)", () => { const { masked, context } = mask(text, entities); // Longer span wins when same start position - expect(masked).toBe("Given feedback"); - expect(context.mapping[""]).toBe("Eric's"); + expect(masked).toBe("Given [[PERSON_1]] feedback"); + expect(context.mapping["[[PERSON_1]]"]).toBe("Eric's"); }); - test("handles partially overlapping entities - keeps first", () => { + test("handles partially overlapping entities of same type - merges them", () => { const text = "Contact John Smith Jones please"; const entities: PIIEntity[] = [ { entity_type: "PERSON", start: 8, end: 18, score: 0.9 }, // "John Smith" @@ -593,8 +593,9 @@ describe("overlapping entities (issue #33)", () => { const { masked } = mask(text, entities); - // First by position wins - expect(masked).toBe("Contact Jones please"); + // Presidio behavior: same-type overlapping entities are MERGED + // Merged entity spans 8-25 ("John Smith Jones"), keeps highest score + expect(masked).toBe("Contact [[PERSON_1]]please"); }); test("handles nested entities - keeps outer (starts first)", () => { @@ -606,7 +607,7 @@ describe("overlapping entities (issue #33)", () => { const { masked } = mask(text, entities); - expect(masked).toBe(" is here"); + expect(masked).toBe("[[PERSON_1]] is here"); }); test("keeps adjacent non-overlapping entities", () => { @@ -618,7 +619,7 @@ describe("overlapping entities (issue #33)", () => { const { masked } = mask(text, entities); - expect(masked).toBe(""); + expect(masked).toBe("[[PERSON_1]][[PERSON_2]]"); }); test("handles multiple independent overlap groups", () => { @@ -637,7 +638,7 @@ describe("overlapping entities (issue #33)", () => { const { masked } = mask(text, entities); - expect(masked).toBe(" met friend "); + expect(masked).toBe("[[PERSON_1]] met [[PERSON_2]] friend [[PERSON_3]]"); }); test("entity consistency - same value gets same placeholder", () => { @@ -649,7 +650,7 @@ describe("overlapping entities (issue #33)", () => { const { masked, context } = mask(text, entities); - expect(masked).toBe(" met again"); + expect(masked).toBe("[[PERSON_1]] met [[PERSON_1]] again"); expect(Object.keys(context.mapping)).toHaveLength(1); }); }); diff --git a/src/services/masking.ts b/src/services/masking.ts index d07bd4b..b9dbeb9 100644 --- a/src/services/masking.ts +++ b/src/services/masking.ts @@ -4,8 +4,8 @@ import { generatePlaceholder as generatePlaceholderFromFormat, PII_PLACEHOLDER_FORMAT, } from "../constants/placeholders"; +import { resolveConflicts } from "../utils/conflict-resolver"; import { extractTextContent } from "../utils/content"; -import { removeOverlappingEntities } from "../utils/entities"; import type { ChatCompletionResponse, ChatMessage } from "./llm-client"; import type { PIIEntity } from "./pii-detector"; @@ -54,12 +54,12 @@ export function mask(text: string, entities: PIIEntity[], context?: MaskingConte return { masked: text, context: ctx }; } - // Remove overlapping entities to prevent text corruption - // Presidio can return both "Eric" and "Eric's" as separate entities - const nonOverlapping = removeOverlappingEntities(entities); + // Resolve conflicts between overlapping entities using Presidio's algorithm + // Presidio can return overlapping entities (e.g., "Eric" and "Eric's") + const resolved = resolveConflicts(entities); // First pass: sort by start position ascending to assign placeholders in order - const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start); + const sortedByStart = [...resolved].sort((a, b) => a.start - b.start); // Assign placeholders in order of appearance const entityPlaceholders = new Map(); @@ -80,7 +80,7 @@ export function mask(text: string, entities: PIIEntity[], context?: MaskingConte // Second pass: sort by start position descending for replacement // This ensures string indices remain valid as we replace - const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start); + const sortedByEnd = [...resolved].sort((a, b) => b.start - a.start); let result = text; for (const entity of sortedByEnd) { diff --git a/src/utils/conflict-resolver.test.ts b/src/utils/conflict-resolver.test.ts new file mode 100644 index 0000000..f7c75cf --- /dev/null +++ b/src/utils/conflict-resolver.test.ts @@ -0,0 +1,182 @@ +import { describe, expect, test } from "bun:test"; +import { + type EntityWithScore, + resolveConflicts, + resolveConflictsSimple, +} from "./conflict-resolver"; + +describe("resolveConflicts (Presidio-style)", () => { + test("returns empty array for empty input", () => { + expect(resolveConflicts([])).toEqual([]); + }); + + test("returns single entity unchanged", () => { + const entities = [{ start: 0, end: 5, score: 0.9, entity_type: "PERSON" }]; + expect(resolveConflicts(entities)).toEqual(entities); + }); + + test("keeps non-overlapping entities", () => { + const entities = [ + { start: 0, end: 5, score: 0.9, entity_type: "PERSON" }, + { start: 10, end: 15, score: 0.8, entity_type: "PERSON" }, + ]; + expect(resolveConflicts(entities)).toHaveLength(2); + }); + + test("keeps adjacent entities (not overlapping)", () => { + const entities = [ + { start: 0, end: 4, score: 0.9, entity_type: "PERSON" }, + { start: 4, end: 9, score: 0.8, entity_type: "PERSON" }, + ]; + expect(resolveConflicts(entities)).toHaveLength(2); + }); + + // Presidio behavior: same type overlapping -> merge + test("merges overlapping entities of SAME type", () => { + // "Eric" (0-4) and "Eric's" (0-6) both PERSON -> merge to (0-6) + const entities = [ + { start: 0, end: 4, score: 0.85, entity_type: "PERSON" }, + { start: 0, end: 6, score: 0.8, entity_type: "PERSON" }, + ]; + const result = resolveConflicts(entities); + expect(result).toHaveLength(1); + expect(result[0].start).toBe(0); + expect(result[0].end).toBe(6); + expect(result[0].score).toBe(0.85); // keeps highest score + }); + + // Presidio behavior: different type, one contained -> remove contained + test("removes contained entity of DIFFERENT type (keeps larger)", () => { + // "123" detected as PHONE (0-10) and SSN (2-8) -> keep larger + const entities = [ + { start: 0, end: 10, score: 0.7, entity_type: "PHONE_NUMBER" }, + { start: 2, end: 8, score: 0.9, entity_type: "US_SSN" }, + ]; + const result = resolveConflicts(entities); + expect(result).toHaveLength(1); + expect(result[0].entity_type).toBe("PHONE_NUMBER"); + }); + + // Presidio behavior: same indices, different type -> higher score wins + test("keeps higher score when same indices different types", () => { + const entities = [ + { start: 0, end: 10, score: 0.6, entity_type: "URL" }, + { start: 0, end: 10, score: 0.9, entity_type: "EMAIL_ADDRESS" }, + ]; + const result = resolveConflicts(entities); + expect(result).toHaveLength(1); + expect(result[0].entity_type).toBe("EMAIL_ADDRESS"); + }); + + // The original bug case: "Eric" vs "Eric's" + test("handles Eric vs Eric's case correctly", () => { + // Given Eric's feedback -> Presidio returns both "Eric" and "Eric's" + const entities = [ + { start: 6, end: 10, score: 0.85, entity_type: "PERSON" }, // "Eric" + { start: 6, end: 12, score: 0.8, entity_type: "PERSON" }, // "Eric's" + ]; + const result = resolveConflicts(entities); + expect(result).toHaveLength(1); + // Should merge to cover full span with highest score + expect(result[0].start).toBe(6); + expect(result[0].end).toBe(12); + expect(result[0].score).toBe(0.85); + }); + + test("handles multiple overlap groups", () => { + const entities = [ + { start: 0, end: 5, score: 0.9, entity_type: "PERSON" }, + { start: 2, end: 7, score: 0.8, entity_type: "PERSON" }, // overlaps with first + { start: 20, end: 25, score: 0.9, entity_type: "PERSON" }, + { start: 22, end: 28, score: 0.85, entity_type: "PERSON" }, // overlaps with third + ]; + const result = resolveConflicts(entities); + // Each group should merge into one + expect(result).toHaveLength(2); + }); + + test("preserves additional entity properties", () => { + interface ExtendedEntity extends EntityWithScore { + extra: string; + } + const entities: ExtendedEntity[] = [ + { start: 0, end: 5, score: 0.9, entity_type: "PERSON", extra: "data" }, + ]; + const result = resolveConflicts(entities); + expect(result[0].extra).toBe("data"); + }); + + test("does not mutate input entities", () => { + const entities = [ + { start: 0, end: 4, score: 0.85, entity_type: "PERSON" }, + { start: 0, end: 6, score: 0.8, entity_type: "PERSON" }, + ]; + // Save original values + const originalStart = entities[0].start; + const originalEnd = entities[0].end; + + resolveConflicts(entities); + + // Original should be unchanged + expect(entities[0].start).toBe(originalStart); + expect(entities[0].end).toBe(originalEnd); + expect(entities).toHaveLength(2); // Array not modified + }); +}); + +describe("resolveConflictsSimple (for secrets without scores)", () => { + test("returns empty array for empty input", () => { + expect(resolveConflictsSimple([])).toEqual([]); + }); + + test("returns single entity unchanged", () => { + const entities = [{ start: 0, end: 5 }]; + expect(resolveConflictsSimple(entities)).toEqual(entities); + }); + + test("keeps non-overlapping entities", () => { + const entities = [ + { start: 0, end: 5 }, + { start: 10, end: 15 }, + ]; + expect(resolveConflictsSimple(entities)).toEqual(entities); + }); + + test("keeps adjacent entities", () => { + const entities = [ + { start: 0, end: 4 }, + { start: 4, end: 9 }, + ]; + expect(resolveConflictsSimple(entities)).toEqual(entities); + }); + + test("keeps longer when same start position", () => { + const entities = [ + { start: 6, end: 10 }, + { start: 6, end: 12 }, + ]; + const result = resolveConflictsSimple(entities); + expect(result).toHaveLength(1); + expect(result[0].end).toBe(12); + }); + + test("removes overlapping entity", () => { + const entities = [ + { start: 0, end: 10 }, + { start: 5, end: 15 }, + ]; + const result = resolveConflictsSimple(entities); + expect(result).toHaveLength(1); + expect(result[0].start).toBe(0); + }); + + test("removes nested entity", () => { + const entities = [ + { start: 0, end: 14 }, + { start: 4, end: 8 }, + ]; + const result = resolveConflictsSimple(entities); + expect(result).toHaveLength(1); + expect(result[0].end).toBe(14); + }); +}); diff --git a/src/utils/conflict-resolver.ts b/src/utils/conflict-resolver.ts new file mode 100644 index 0000000..e4b4138 --- /dev/null +++ b/src/utils/conflict-resolver.ts @@ -0,0 +1,146 @@ +/** + * Conflict resolution for overlapping entities + * + * Based on Microsoft Presidio's conflict resolution logic: + * https://github.com/microsoft/presidio/blob/main/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py + */ + +export interface EntityWithScore { + start: number; + end: number; + score: number; + entity_type: string; +} + +interface Interval { + start: number; + end: number; +} + +function overlaps(a: Interval, b: Interval): boolean { + return a.start < b.end && b.start < a.end; +} + +function isContainedIn(a: Interval, b: Interval): boolean { + return b.start <= a.start && b.end >= a.end; +} + +function groupBy(items: T[], keyFn: (item: T) => string): Map { + const groups = new Map(); + for (const item of items) { + const key = keyFn(item); + const group = groups.get(key) ?? []; + group.push(item); + groups.set(key, group); + } + return groups; +} + +/** + * Merge overlapping intervals. Returns new array (does not mutate input). + */ +function mergeOverlapping( + intervals: T[], + merge: (a: T, b: T) => T, +): T[] { + if (intervals.length <= 1) return [...intervals]; + + const sorted = [...intervals].sort((a, b) => a.start - b.start); + const result: T[] = [sorted[0]]; + + for (let i = 1; i < sorted.length; i++) { + const current = sorted[i]; + const last = result[result.length - 1]; + + if (overlaps(current, last)) { + // Replace last with merged interval + result[result.length - 1] = merge(last, current); + } else { + result.push(current); + } + } + + return result; +} + +/** + * Remove entities that are contained in another or have same indices with lower score. + */ +function removeConflicting(entities: T[]): T[] { + if (entities.length <= 1) return [...entities]; + + // Sort by start, then by score descending (higher score first) + const sorted = [...entities].sort((a, b) => { + if (a.start !== b.start) return a.start - b.start; + if (a.end !== b.end) return a.end - b.end; + return b.score - a.score; + }); + + const result: T[] = []; + + for (const entity of sorted) { + const hasConflict = result.some((kept) => { + if (entity.start === kept.start && entity.end === kept.end) { + return true; + } + return isContainedIn(entity, kept); + }); + + if (!hasConflict) { + result.push(entity); + } + } + + return result; +} + +/** + * Resolve conflicts between overlapping entities using Presidio's algorithm. + * + * Phase 1: Merge overlapping entities of the same type (expand boundaries, keep highest score) + * Phase 2: Remove conflicting entities of different types (contained or same indices with lower score) + */ +export function resolveConflicts(entities: T[]): T[] { + if (entities.length <= 1) return [...entities]; + + const byType = groupBy(entities, (e) => e.entity_type); + const afterMerge: T[] = []; + + for (const group of byType.values()) { + const merged = mergeOverlapping(group, (a, b) => ({ + ...a, + start: Math.min(a.start, b.start), + end: Math.max(a.end, b.end), + score: Math.max(a.score, b.score), + })); + afterMerge.push(...merged); + } + + return removeConflicting(afterMerge); +} + +/** + * Simple overlap resolution for entities without scores. + * Uses length as tiebreaker (longer wins). For secrets detection. + */ +export function resolveConflictsSimple(entities: T[]): T[] { + if (entities.length <= 1) return [...entities]; + + const sorted = [...entities].sort((a, b) => { + if (a.start !== b.start) return a.start - b.start; + return b.end - b.start - (a.end - a.start); + }); + + const result: T[] = [sorted[0]]; + + for (let i = 1; i < sorted.length; i++) { + const current = sorted[i]; + const last = result[result.length - 1]; + + if (current.start >= last.end) { + result.push(current); + } + } + + return result; +} diff --git a/src/utils/entities.test.ts b/src/utils/entities.test.ts deleted file mode 100644 index 34cc04b..0000000 --- a/src/utils/entities.test.ts +++ /dev/null @@ -1,84 +0,0 @@ -import { describe, expect, test } from "bun:test"; -import { removeOverlappingEntities } from "./entities"; - -describe("removeOverlappingEntities", () => { - test("returns empty array for empty input", () => { - expect(removeOverlappingEntities([])).toEqual([]); - }); - - test("returns single entity unchanged", () => { - const entities = [{ start: 0, end: 5 }]; - expect(removeOverlappingEntities(entities)).toEqual(entities); - }); - - test("keeps non-overlapping entities", () => { - const entities = [ - { start: 0, end: 5 }, - { start: 10, end: 15 }, - ]; - expect(removeOverlappingEntities(entities)).toEqual(entities); - }); - - test("keeps adjacent entities (not overlapping)", () => { - const entities = [ - { start: 0, end: 4 }, - { start: 4, end: 9 }, - ]; - expect(removeOverlappingEntities(entities)).toEqual(entities); - }); - - test("removes overlapping entity with same start - keeps longer", () => { - // "Eric" vs "Eric's" - both start at 6 - const entities = [ - { start: 6, end: 10 }, // "Eric" - { start: 6, end: 12 }, // "Eric's" - ]; - const result = removeOverlappingEntities(entities); - expect(result).toHaveLength(1); - expect(result[0].end).toBe(12); // longer one kept - }); - - test("removes partially overlapping entity", () => { - const entities = [ - { start: 0, end: 10 }, - { start: 5, end: 15 }, - ]; - const result = removeOverlappingEntities(entities); - expect(result).toHaveLength(1); - expect(result[0].start).toBe(0); - }); - - test("removes nested entity", () => { - const entities = [ - { start: 0, end: 14 }, // "Dr. John Smith" - { start: 4, end: 8 }, // "John" - ]; - const result = removeOverlappingEntities(entities); - expect(result).toHaveLength(1); - expect(result[0].end).toBe(14); // outer kept - }); - - test("handles multiple overlap groups", () => { - const entities = [ - { start: 0, end: 5 }, - { start: 2, end: 7 }, // overlaps with first - { start: 20, end: 25 }, - { start: 22, end: 28 }, // overlaps with third - ]; - const result = removeOverlappingEntities(entities); - expect(result).toHaveLength(2); - expect(result[0].start).toBe(0); - expect(result[1].start).toBe(20); - }); - - test("preserves additional entity properties", () => { - const entities = [ - { start: 0, end: 5, entity_type: "PERSON", score: 0.9 }, - { start: 2, end: 7, entity_type: "PERSON", score: 0.8 }, - ]; - const result = removeOverlappingEntities(entities); - expect(result).toHaveLength(1); - expect(result[0].entity_type).toBe("PERSON"); - expect(result[0].score).toBe(0.9); - }); -}); diff --git a/src/utils/entities.ts b/src/utils/entities.ts deleted file mode 100644 index edcdbd2..0000000 --- a/src/utils/entities.ts +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Removes overlapping entities to prevent text corruption during masking - * - * When entities overlap (e.g., "Eric" and "Eric's" both detected), applying - * both replacements corrupts the text. This function keeps only non-overlapping - * entities, preferring longer spans when they share the same start position. - */ -export function removeOverlappingEntities( - entities: T[], -): T[] { - if (entities.length <= 1) return entities; - - // Sort by start position, then by length descending (longer = more specific) - const sorted = [...entities].sort((a, b) => { - if (a.start !== b.start) return a.start - b.start; - return b.end - b.start - (a.end - a.start); - }); - - const result: T[] = [sorted[0]]; - - for (let i = 1; i < sorted.length; i++) { - const current = sorted[i]; - const last = result[result.length - 1]; - - // Keep only if no overlap with last kept entity - if (current.start >= last.end) { - result.push(current); - } - } - - return result; -}