import { findPartialPlaceholderStart, generateSecretPlaceholder } from "../constants/placeholders";
import type { ChatCompletionResponse, ChatMessage } from "../services/llm-client";
import { extractTextContent } from "../utils/content";
+import { removeOverlappingEntities } from "../utils/entities";
import type { SecretsRedaction } from "./detect";
/**
return { redacted: text, context: ctx };
}
+ // Remove overlapping redactions to prevent text corruption
+ const nonOverlapping = removeOverlappingEntities(redactions);
+
// First pass: sort by start position ascending to assign placeholders in order of appearance
- const sortedByStart = [...redactions].sort((a, b) => a.start - b.start);
+ const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start);
// Assign placeholders in order of appearance
const redactionPlaceholders = new Map<SecretsRedaction, string>();
}
// Second pass: replace from end to start to maintain correct string positions
- // Redactions should already be sorted by start descending, but re-sort to be safe
- const sortedByEnd = [...redactions].sort((a, b) => b.start - a.start);
+ const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start);
let result = text;
for (const redaction of sortedByEnd) {
expect(result2.remainingBuffer).toBe("");
});
});
+
+describe("overlapping entities (issue #33)", () => {
+ test("handles overlapping entities with same start - keeps longer", () => {
+ // Bug: Presidio returns both "Eric" and "Eric's" as separate PERSON entities
+ const text = "Given Eric's feedback";
+ const entities: PIIEntity[] = [
+ { entity_type: "PERSON", start: 6, end: 10, score: 0.85 }, // "Eric"
+ { entity_type: "PERSON", start: 6, end: 12, score: 0.8 }, // "Eric's"
+ ];
+
+ const { masked, context } = mask(text, entities);
+
+ // Longer span wins when same start position
+ expect(masked).toBe("Given <PERSON_1> feedback");
+ expect(context.mapping["<PERSON_1>"]).toBe("Eric's");
+ });
+
+ test("handles partially overlapping entities - keeps first", () => {
+ const text = "Contact John Smith Jones please";
+ const entities: PIIEntity[] = [
+ { entity_type: "PERSON", start: 8, end: 18, score: 0.9 }, // "John Smith"
+ { entity_type: "PERSON", start: 13, end: 25, score: 0.7 }, // "Smith Jones"
+ ];
+
+ const { masked } = mask(text, entities);
+
+ // First by position wins
+ expect(masked).toBe("Contact <PERSON_1> Jones please");
+ });
+
+ test("handles nested entities - keeps outer (starts first)", () => {
+ const text = "Dr. John Smith is here";
+ const entities: PIIEntity[] = [
+ { entity_type: "PERSON", start: 0, end: 14, score: 0.9 }, // "Dr. John Smith"
+ { entity_type: "PERSON", start: 4, end: 8, score: 0.85 }, // "John"
+ ];
+
+ const { masked } = mask(text, entities);
+
+ expect(masked).toBe("<PERSON_1> is here");
+ });
+
+ test("keeps adjacent non-overlapping entities", () => {
+ const text = "HansMüller";
+ const entities: PIIEntity[] = [
+ { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, // "Hans"
+ { entity_type: "PERSON", start: 4, end: 10, score: 0.9 }, // "Müller"
+ ];
+
+ const { masked } = mask(text, entities);
+
+ expect(masked).toBe("<PERSON_1><PERSON_2>");
+ });
+
+ test("handles multiple independent overlap groups", () => {
+ const text = "Laura Smith met Eric's friend Bob Jones Jr";
+ const entities: PIIEntity[] = [
+ // Group 1: same start - longer wins
+ { entity_type: "PERSON", start: 0, end: 5, score: 0.85 }, // "Laura"
+ { entity_type: "PERSON", start: 0, end: 11, score: 0.9 }, // "Laura Smith"
+ // Group 2: same start - longer wins
+ { entity_type: "PERSON", start: 16, end: 20, score: 0.85 }, // "Eric"
+ { entity_type: "PERSON", start: 16, end: 22, score: 0.8 }, // "Eric's"
+ // Group 3: same start - longer wins
+ { entity_type: "PERSON", start: 30, end: 33, score: 0.7 }, // "Bob"
+ { entity_type: "PERSON", start: 30, end: 42, score: 0.9 }, // "Bob Jones Jr"
+ ];
+
+ const { masked } = mask(text, entities);
+
+ expect(masked).toBe("<PERSON_1> met <PERSON_2> friend <PERSON_3>");
+ });
+
+ test("entity consistency - same value gets same placeholder", () => {
+ const text = "Eric met Eric again";
+ const entities: PIIEntity[] = [
+ { entity_type: "PERSON", start: 0, end: 4, score: 0.9 }, // "Eric"
+ { entity_type: "PERSON", start: 9, end: 13, score: 0.9 }, // "Eric"
+ ];
+
+ const { masked, context } = mask(text, entities);
+
+ expect(masked).toBe("<PERSON_1> met <PERSON_1> again");
+ expect(Object.keys(context.mapping)).toHaveLength(1);
+ });
+});
PII_PLACEHOLDER_FORMAT,
} from "../constants/placeholders";
import { extractTextContent } from "../utils/content";
+import { removeOverlappingEntities } from "../utils/entities";
import type { ChatCompletionResponse, ChatMessage } from "./llm-client";
import type { PIIEntity } from "./pii-detector";
return { masked: text, context: ctx };
}
+ // Remove overlapping entities to prevent text corruption
+ // Presidio can return both "Eric" and "Eric's" as separate entities
+ const nonOverlapping = removeOverlappingEntities(entities);
+
// First pass: sort by start position ascending to assign placeholders in order
- const sortedByStart = [...entities].sort((a, b) => a.start - b.start);
+ const sortedByStart = [...nonOverlapping].sort((a, b) => a.start - b.start);
// Assign placeholders in order of appearance
const entityPlaceholders = new Map<PIIEntity, string>();
// Second pass: sort by start position descending for replacement
// This ensures string indices remain valid as we replace
- const sortedByEnd = [...entities].sort((a, b) => b.start - a.start);
+ const sortedByEnd = [...nonOverlapping].sort((a, b) => b.start - a.start);
let result = text;
for (const entity of sortedByEnd) {
--- /dev/null
+import { describe, expect, test } from "bun:test";
+import { removeOverlappingEntities } from "./entities";
+
+describe("removeOverlappingEntities", () => {
+ test("returns empty array for empty input", () => {
+ expect(removeOverlappingEntities([])).toEqual([]);
+ });
+
+ test("returns single entity unchanged", () => {
+ const entities = [{ start: 0, end: 5 }];
+ expect(removeOverlappingEntities(entities)).toEqual(entities);
+ });
+
+ test("keeps non-overlapping entities", () => {
+ const entities = [
+ { start: 0, end: 5 },
+ { start: 10, end: 15 },
+ ];
+ expect(removeOverlappingEntities(entities)).toEqual(entities);
+ });
+
+ test("keeps adjacent entities (not overlapping)", () => {
+ const entities = [
+ { start: 0, end: 4 },
+ { start: 4, end: 9 },
+ ];
+ expect(removeOverlappingEntities(entities)).toEqual(entities);
+ });
+
+ test("removes overlapping entity with same start - keeps longer", () => {
+ // "Eric" vs "Eric's" - both start at 6
+ const entities = [
+ { start: 6, end: 10 }, // "Eric"
+ { start: 6, end: 12 }, // "Eric's"
+ ];
+ const result = removeOverlappingEntities(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].end).toBe(12); // longer one kept
+ });
+
+ test("removes partially overlapping entity", () => {
+ const entities = [
+ { start: 0, end: 10 },
+ { start: 5, end: 15 },
+ ];
+ const result = removeOverlappingEntities(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].start).toBe(0);
+ });
+
+ test("removes nested entity", () => {
+ const entities = [
+ { start: 0, end: 14 }, // "Dr. John Smith"
+ { start: 4, end: 8 }, // "John"
+ ];
+ const result = removeOverlappingEntities(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].end).toBe(14); // outer kept
+ });
+
+ test("handles multiple overlap groups", () => {
+ const entities = [
+ { start: 0, end: 5 },
+ { start: 2, end: 7 }, // overlaps with first
+ { start: 20, end: 25 },
+ { start: 22, end: 28 }, // overlaps with third
+ ];
+ const result = removeOverlappingEntities(entities);
+ expect(result).toHaveLength(2);
+ expect(result[0].start).toBe(0);
+ expect(result[1].start).toBe(20);
+ });
+
+ test("preserves additional entity properties", () => {
+ const entities = [
+ { start: 0, end: 5, entity_type: "PERSON", score: 0.9 },
+ { start: 2, end: 7, entity_type: "PERSON", score: 0.8 },
+ ];
+ const result = removeOverlappingEntities(entities);
+ expect(result).toHaveLength(1);
+ expect(result[0].entity_type).toBe("PERSON");
+ expect(result[0].score).toBe(0.9);
+ });
+});
--- /dev/null
+/**
+ * Removes overlapping entities to prevent text corruption during masking
+ *
+ * When entities overlap (e.g., "Eric" and "Eric's" both detected), applying
+ * both replacements corrupts the text. This function keeps only non-overlapping
+ * entities, preferring longer spans when they share the same start position.
+ */
+export function removeOverlappingEntities<T extends { start: number; end: number }>(
+ entities: T[],
+): T[] {
+ if (entities.length <= 1) return entities;
+
+ // Sort by start position, then by length descending (longer = more specific)
+ const sorted = [...entities].sort((a, b) => {
+ if (a.start !== b.start) return a.start - b.start;
+ return b.end - b.start - (a.end - a.start);
+ });
+
+ const result: T[] = [sorted[0]];
+
+ for (let i = 1; i < sorted.length; i++) {
+ const current = sorted[i];
+ const last = result[result.length - 1];
+
+ // Keep only if no overlap with last kept entity
+ if (current.start >= last.end) {
+ result.push(current);
+ }
+ }
+
+ return result;
+}