Implement Presidio-style conflict resolution for overlapping PII entities.
Problem: Presidio returns overlapping entities (e.g., both "Eric" and "Eric's")
which caused text corruption during masking.
Solution: Two-phase algorithm matching Presidio's Anonymizer logic:
1. Merge overlapping entities of same type (expand boundaries, keep highest score)
2. Remove conflicting entities of different types (contained or lower score loses)
- Add conflict-resolver.ts with resolveConflicts() and resolveConflictsSimple()
- Replace old removeOverlappingEntities with new conflict resolver
- Add 18 tests covering all conflict scenarios
- Delete old entities.ts (replaced by conflict-resolver.ts)
import { findPartialPlaceholderStart, generateSecretPlaceholder } from "../constants/placeholders";
import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client";
+import { resolveConflictsSimple } from "../utils/conflict-resolver";
import { extractTextContent } from "../utils/content";
-import { removeOverlappingEntities } from "../utils/entities";
import type { SecretsRedaction } from "./detect";
/**
return { redacted: text, context: ctx };
}
- // Remove overlapping redactions to prevent text corruption
- const nonOverlapping = removeOverlappingEntities(redactions);
+ // Resolve conflicts between overlapping redactions
+ const resolved = resolveConflictsSimple(redactions);
// First pass: sort by start position ascending to assign placeholders in order of appearance
- const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start);
+ const sortedByStart = [...resolved].sort((a, b) => a.start - b.start);
// Assign placeholders in order of appearance
const redactionPlaceholders = new Map<SecretsRedaction, string>();
}
// Second pass: replace from end to start to maintain correct string positions
- const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start);
+ const sortedByEnd = [...resolved].sort((a, b) => b.start - a.start);
let result = text;
for (const redaction of sortedByEnd) {
const { masked, context } = mask(text, entities);
// Longer span wins when same start position
- expect(masked).toBe("Given <PERSON_1> feedback");
- expect(context.mapping["<PERSON_1>"]).toBe("Eric's");
+ expect(masked).toBe("Given [[PERSON_1]] feedback");
+ expect(context.mapping["[[PERSON_1]]"]).toBe("Eric's");
});
- test("handles partially overlapping entities - keeps first", () => {
+ test("handles partially overlapping entities of same type - merges them", () => {
const text = "Contact John Smith Jones please";
const entities: PIIEntity[] = [
{ entity_type: "PERSON", start: 8, end: 18, score: 0.9 }, // "John Smith"
const { masked } = mask(text, entities);
- // First by position wins
- expect(masked).toBe("Contact <PERSON_1> Jones please");
+ // Presidio behavior: same-type overlapping entities are MERGED
+ // Merged entity spans 8-25 ("John Smith Jones"), keeps highest score
+ expect(masked).toBe("Contact [[PERSON_1]]please");
});
test("handles nested entities - keeps outer (starts first)", () => {
const { masked } = mask(text, entities);
- expect(masked).toBe("<PERSON_1> is here");
+ expect(masked).toBe("[[PERSON_1]] is here");
});
test("keeps adjacent non-overlapping entities", () => {
const { masked } = mask(text, entities);
- expect(masked).toBe("<PERSON_1><PERSON_2>");
+ expect(masked).toBe("[[PERSON_1]][[PERSON_2]]");
});
test("handles multiple independent overlap groups", () => {
const { masked } = mask(text, entities);
- expect(masked).toBe("<PERSON_1> met <PERSON_2> friend <PERSON_3>");
+ expect(masked).toBe("[[PERSON_1]] met [[PERSON_2]] friend [[PERSON_3]]");
});
test("entity consistency - same value gets same placeholder", () => {
const { masked, context } = mask(text, entities);
- expect(masked).toBe("<PERSON_1> met <PERSON_1> again");
+ expect(masked).toBe("[[PERSON_1]] met [[PERSON_1]] again");
expect(Object.keys(context.mapping)).toHaveLength(1);
});
});
generatePlaceholder as generatePlaceholderFromFormat,
PII_PLACEHOLDER_FORMAT,
} from "../constants/placeholders";
+import { resolveConflicts } from "../utils/conflict-resolver";
import { extractTextContent } from "../utils/content";
-import { removeOverlappingEntities } from "../utils/entities";
import type { ChatCompletionResponse, ChatMessage } from "./llm-client";
import type { PIIEntity } from "./pii-detector";
return { masked: text, context: ctx };
}
- // Remove overlapping entities to prevent text corruption
- // Presidio can return both "Eric" and "Eric's" as separate entities
- const nonOverlapping = removeOverlappingEntities(entities);
+ // Resolve conflicts between overlapping entities using Presidio's algorithm
+ // Presidio can return overlapping entities (e.g., "Eric" and "Eric's")
+ const resolved = resolveConflicts(entities);
// First pass: sort by start position ascending to assign placeholders in order
- const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start);
+ const sortedByStart = [...resolved].sort((a, b) => a.start - b.start);
// Assign placeholders in order of appearance
const entityPlaceholders = new Map<PIIEntity, string>();
// Second pass: sort by start position descending for replacement
// This ensures string indices remain valid as we replace
- const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start);
+ const sortedByEnd = [...resolved].sort((a, b) => b.start - a.start);
let result = text;
for (const entity of sortedByEnd) {
--- /dev/null
+import { describe, expect, test } from "bun:test";
+import {
+ type EntityWithScore,
+ resolveConflicts,
+ resolveConflictsSimple,
+} from "./conflict-resolver";
+
+describe("resolveConflicts (Presidio-style)", () => {
+ test("returns empty array for empty input", () => {
+ expect(resolveConflicts([])).toEqual([]);
+ });
+
+ test("returns single entity unchanged", () => {
+ const entities = [{ start: 0, end: 5, score: 0.9, entity_type: "PERSON" }];
+ expect(resolveConflicts(entities)).toEqual(entities);
+ });
+
+ test("keeps non-overlapping entities", () => {
+ const entities = [
+ { start: 0, end: 5, score: 0.9, entity_type: "PERSON" },
+ { start: 10, end: 15, score: 0.8, entity_type: "PERSON" },
+ ];
+ expect(resolveConflicts(entities)).toHaveLength(2);
+ });
+
+ test("keeps adjacent entities (not overlapping)", () => {
+ const entities = [
+ { start: 0, end: 4, score: 0.9, entity_type: "PERSON" },
+ { start: 4, end: 9, score: 0.8, entity_type: "PERSON" },
+ ];
+ expect(resolveConflicts(entities)).toHaveLength(2);
+ });
+
+ // Presidio behavior: same type overlapping -> merge
+ test("merges overlapping entities of SAME type", () => {
+ // "Eric" (0-4) and "Eric's" (0-6) both PERSON -> merge to (0-6)
+ const entities = [
+ { start: 0, end: 4, score: 0.85, entity_type: "PERSON" },
+ { start: 0, end: 6, score: 0.8, entity_type: "PERSON" },
+ ];
+ const result = resolveConflicts(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].start).toBe(0);
+ expect(result[0].end).toBe(6);
+ expect(result[0].score).toBe(0.85); // keeps highest score
+ });
+
+ // Presidio behavior: different type, one contained -> remove contained
+ test("removes contained entity of DIFFERENT type (keeps larger)", () => {
+ // "123" detected as PHONE (0-10) and SSN (2-8) -> keep larger
+ const entities = [
+ { start: 0, end: 10, score: 0.7, entity_type: "PHONE_NUMBER" },
+ { start: 2, end: 8, score: 0.9, entity_type: "US_SSN" },
+ ];
+ const result = resolveConflicts(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].entity_type).toBe("PHONE_NUMBER");
+ });
+
+ // Presidio behavior: same indices, different type -> higher score wins
+ test("keeps higher score when same indices different types", () => {
+ const entities = [
+ { start: 0, end: 10, score: 0.6, entity_type: "URL" },
+ { start: 0, end: 10, score: 0.9, entity_type: "EMAIL_ADDRESS" },
+ ];
+ const result = resolveConflicts(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].entity_type).toBe("EMAIL_ADDRESS");
+ });
+
+ // The original bug case: "Eric" vs "Eric's"
+ test("handles Eric vs Eric's case correctly", () => {
+ // Given Eric's feedback -> Presidio returns both "Eric" and "Eric's"
+ const entities = [
+ { start: 6, end: 10, score: 0.85, entity_type: "PERSON" }, // "Eric"
+ { start: 6, end: 12, score: 0.8, entity_type: "PERSON" }, // "Eric's"
+ ];
+ const result = resolveConflicts(entities);
+ expect(result).toHaveLength(1);
+ // Should merge to cover full span with highest score
+ expect(result[0].start).toBe(6);
+ expect(result[0].end).toBe(12);
+ expect(result[0].score).toBe(0.85);
+ });
+
+ test("handles multiple overlap groups", () => {
+ const entities = [
+ { start: 0, end: 5, score: 0.9, entity_type: "PERSON" },
+ { start: 2, end: 7, score: 0.8, entity_type: "PERSON" }, // overlaps with first
+ { start: 20, end: 25, score: 0.9, entity_type: "PERSON" },
+ { start: 22, end: 28, score: 0.85, entity_type: "PERSON" }, // overlaps with third
+ ];
+ const result = resolveConflicts(entities);
+ // Each group should merge into one
+ expect(result).toHaveLength(2);
+ });
+
+ test("preserves additional entity properties", () => {
+ interface ExtendedEntity extends EntityWithScore {
+ extra: string;
+ }
+ const entities: ExtendedEntity[] = [
+ { start: 0, end: 5, score: 0.9, entity_type: "PERSON", extra: "data" },
+ ];
+ const result = resolveConflicts(entities);
+ expect(result[0].extra).toBe("data");
+ });
+
+ test("does not mutate input entities", () => {
+ const entities = [
+ { start: 0, end: 4, score: 0.85, entity_type: "PERSON" },
+ { start: 0, end: 6, score: 0.8, entity_type: "PERSON" },
+ ];
+ // Save original values
+ const originalStart = entities[0].start;
+ const originalEnd = entities[0].end;
+
+ resolveConflicts(entities);
+
+ // Original should be unchanged
+ expect(entities[0].start).toBe(originalStart);
+ expect(entities[0].end).toBe(originalEnd);
+ expect(entities).toHaveLength(2); // Array not modified
+ });
+});
+
+describe("resolveConflictsSimple (for secrets without scores)", () => {
+ test("returns empty array for empty input", () => {
+ expect(resolveConflictsSimple([])).toEqual([]);
+ });
+
+ test("returns single entity unchanged", () => {
+ const entities = [{ start: 0, end: 5 }];
+ expect(resolveConflictsSimple(entities)).toEqual(entities);
+ });
+
+ test("keeps non-overlapping entities", () => {
+ const entities = [
+ { start: 0, end: 5 },
+ { start: 10, end: 15 },
+ ];
+ expect(resolveConflictsSimple(entities)).toEqual(entities);
+ });
+
+ test("keeps adjacent entities", () => {
+ const entities = [
+ { start: 0, end: 4 },
+ { start: 4, end: 9 },
+ ];
+ expect(resolveConflictsSimple(entities)).toEqual(entities);
+ });
+
+ test("keeps longer when same start position", () => {
+ const entities = [
+ { start: 6, end: 10 },
+ { start: 6, end: 12 },
+ ];
+ const result = resolveConflictsSimple(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].end).toBe(12);
+ });
+
+ test("removes overlapping entity", () => {
+ const entities = [
+ { start: 0, end: 10 },
+ { start: 5, end: 15 },
+ ];
+ const result = resolveConflictsSimple(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].start).toBe(0);
+ });
+
+ test("removes nested entity", () => {
+ const entities = [
+ { start: 0, end: 14 },
+ { start: 4, end: 8 },
+ ];
+ const result = resolveConflictsSimple(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].end).toBe(14);
+ });
+});
--- /dev/null
+/**
+ * Conflict resolution for overlapping entities
+ *
+ * Based on Microsoft Presidio's conflict resolution logic:
+ * https://github.com/microsoft/presidio/blob/main/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py
+ */
+
+export interface EntityWithScore {
+ start: number;
+ end: number;
+ score: number;
+ entity_type: string;
+}
+
+interface Interval {
+ start: number;
+ end: number;
+}
+
+function overlaps(a: Interval, b: Interval): boolean {
+ return a.start < b.end && b.start < a.end;
+}
+
+function isContainedIn(a: Interval, b: Interval): boolean {
+ return b.start <= a.start && b.end >= a.end;
+}
+
+function groupBy<T>(items: T[], keyFn: (item: T) => string): Map<string, T[]> {
+ const groups = new Map<string, T[]>();
+ for (const item of items) {
+ const key = keyFn(item);
+ const group = groups.get(key) ?? [];
+ group.push(item);
+ groups.set(key, group);
+ }
+ return groups;
+}
+
+/**
+ * Merge overlapping intervals. Returns new array (does not mutate input).
+ */
+function mergeOverlapping<T extends Interval>(
+ intervals: T[],
+ merge: (a: T, b: T) => T,
+): T[] {
+ if (intervals.length <= 1) return [...intervals];
+
+ const sorted = [...intervals].sort((a, b) => a.start - b.start);
+ const result: T[] = [sorted[0]];
+
+ for (let i = 1; i < sorted.length; i++) {
+ const current = sorted[i];
+ const last = result[result.length - 1];
+
+ if (overlaps(current, last)) {
+ // Replace last with merged interval
+ result[result.length - 1] = merge(last, current);
+ } else {
+ result.push(current);
+ }
+ }
+
+ return result;
+}
+
+/**
+ * Remove entities that are contained in another or have same indices with lower score.
+ */
+function removeConflicting<T extends EntityWithScore>(entities: T[]): T[] {
+ if (entities.length <= 1) return [...entities];
+
+ // Sort by start, then by score descending (higher score first)
+ const sorted = [...entities].sort((a, b) => {
+ if (a.start !== b.start) return a.start - b.start;
+ if (a.end !== b.end) return a.end - b.end;
+ return b.score - a.score;
+ });
+
+ const result: T[] = [];
+
+ for (const entity of sorted) {
+ const hasConflict = result.some((kept) => {
+ if (entity.start === kept.start && entity.end === kept.end) {
+ return true;
+ }
+ return isContainedIn(entity, kept);
+ });
+
+ if (!hasConflict) {
+ result.push(entity);
+ }
+ }
+
+ return result;
+}
+
+/**
+ * Resolve conflicts between overlapping entities using Presidio's algorithm.
+ *
+ * Phase 1: Merge overlapping entities of the same type (expand boundaries, keep highest score)
+ * Phase 2: Remove conflicting entities of different types (contained or same indices with lower score)
+ */
+export function resolveConflicts<T extends EntityWithScore>(entities: T[]): T[] {
+ if (entities.length <= 1) return [...entities];
+
+ const byType = groupBy(entities, (e) => e.entity_type);
+ const afterMerge: T[] = [];
+
+ for (const group of byType.values()) {
+ const merged = mergeOverlapping(group, (a, b) => ({
+ ...a,
+ start: Math.min(a.start, b.start),
+ end: Math.max(a.end, b.end),
+ score: Math.max(a.score, b.score),
+ }));
+ afterMerge.push(...merged);
+ }
+
+ return removeConflicting(afterMerge);
+}
+
+/**
+ * Simple overlap resolution for entities without scores.
+ * Uses length as tiebreaker (longer wins). For secrets detection.
+ */
+export function resolveConflictsSimple<T extends Interval>(entities: T[]): T[] {
+ if (entities.length <= 1) return [...entities];
+
+ const sorted = [...entities].sort((a, b) => {
+ if (a.start !== b.start) return a.start - b.start;
+ return b.end - b.start - (a.end - a.start);
+ });
+
+ const result: T[] = [sorted[0]];
+
+ for (let i = 1; i < sorted.length; i++) {
+ const current = sorted[i];
+ const last = result[result.length - 1];
+
+ if (current.start >= last.end) {
+ result.push(current);
+ }
+ }
+
+ return result;
+}
+++ /dev/null
-import { describe, expect, test } from "bun:test";
-import { removeOverlappingEntities } from "./entities";
-
-describe("removeOverlappingEntities", () => {
- test("returns empty array for empty input", () => {
- expect(removeOverlappingEntities([])).toEqual([]);
- });
-
- test("returns single entity unchanged", () => {
- const entities = [{ start: 0, end: 5 }];
- expect(removeOverlappingEntities(entities)).toEqual(entities);
- });
-
- test("keeps non-overlapping entities", () => {
- const entities = [
- { start: 0, end: 5 },
- { start: 10, end: 15 },
- ];
- expect(removeOverlappingEntities(entities)).toEqual(entities);
- });
-
- test("keeps adjacent entities (not overlapping)", () => {
- const entities = [
- { start: 0, end: 4 },
- { start: 4, end: 9 },
- ];
- expect(removeOverlappingEntities(entities)).toEqual(entities);
- });
-
- test("removes overlapping entity with same start - keeps longer", () => {
- // "Eric" vs "Eric's" - both start at 6
- const entities = [
- { start: 6, end: 10 }, // "Eric"
- { start: 6, end: 12 }, // "Eric's"
- ];
- const result = removeOverlappingEntities(entities);
- expect(result).toHaveLength(1);
- expect(result[0].end).toBe(12); // longer one kept
- });
-
- test("removes partially overlapping entity", () => {
- const entities = [
- { start: 0, end: 10 },
- { start: 5, end: 15 },
- ];
- const result = removeOverlappingEntities(entities);
- expect(result).toHaveLength(1);
- expect(result[0].start).toBe(0);
- });
-
- test("removes nested entity", () => {
- const entities = [
- { start: 0, end: 14 }, // "Dr. John Smith"
- { start: 4, end: 8 }, // "John"
- ];
- const result = removeOverlappingEntities(entities);
- expect(result).toHaveLength(1);
- expect(result[0].end).toBe(14); // outer kept
- });
-
- test("handles multiple overlap groups", () => {
- const entities = [
- { start: 0, end: 5 },
- { start: 2, end: 7 }, // overlaps with first
- { start: 20, end: 25 },
- { start: 22, end: 28 }, // overlaps with third
- ];
- const result = removeOverlappingEntities(entities);
- expect(result).toHaveLength(2);
- expect(result[0].start).toBe(0);
- expect(result[1].start).toBe(20);
- });
-
- test("preserves additional entity properties", () => {
- const entities = [
- { start: 0, end: 5, entity_type: "PERSON", score: 0.9 },
- { start: 2, end: 7, entity_type: "PERSON", score: 0.8 },
- ];
- const result = removeOverlappingEntities(entities);
- expect(result).toHaveLength(1);
- expect(result[0].entity_type).toBe("PERSON");
- expect(result[0].score).toBe(0.9);
- });
-});
+++ /dev/null
-/**
- * Removes overlapping entities to prevent text corruption during masking
- *
- * When entities overlap (e.g., "Eric" and "Eric's" both detected), applying
- * both replacements corrupts the text. This function keeps only non-overlapping
- * entities, preferring longer spans when they share the same start position.
- */
-export function removeOverlappingEntities<T extends { start: number; end: number }>(
- entities: T[],
-): T[] {
- if (entities.length <= 1) return entities;
-
- // Sort by start position, then by length descending (longer = more specific)
- const sorted = [...entities].sort((a, b) => {
- if (a.start !== b.start) return a.start - b.start;
- return b.end - b.start - (a.end - a.start);
- });
-
- const result: T[] = [sorted[0]];
-
- for (let i = 1; i < sorted.length; i++) {
- const current = sorted[i];
- const last = result[result.length - 1];
-
- // Keep only if no overlap with last kept entity
- if (current.start >= last.end) {
- result.push(current);
- }
- }
-
- return result;
-}