Add PII accuracy benchmark with multi-language phone context (#1)

author Stefan Gasser <redacted>

Thu, 8 Jan 2026 16:15:59 +0000 (17:15 +0100)

committer GitHub <redacted>

Thu, 8 Jan 2026 16:15:59 +0000 (17:15 +0100)
author Stefan Gasser <redacted>
Thu, 8 Jan 2026 16:15:59 +0000 (17:15 +0100)
committer GitHub <redacted>
Thu, 8 Jan 2026 16:15:59 +0000 (17:15 +0100)
diff --git a/README.md b/README.md

index 1bd27935e995cc5c58b269a1ee80fa67c9422d39..619234c3d9c12f9f31b16c692feaa02a27e67883 100644 (file)
--- a/README.md
+++ b/README.md
@@ -229,6 +229,26 @@ bun test                                  # Run tests
  bun run check                             # Lint & format
  ```
  
+## Benchmarks
+
+PII detection accuracy benchmark with test cases for multiple languages:
+
+```bash
+# Start Presidio with all benchmark languages
+LANGUAGES=en,de,fr,it,es docker compose up presidio-analyzer -d
+
+# Run all tests
+bun run benchmarks/pii-accuracy/run.ts
+
+# Run specific languages only
+bun run benchmarks/pii-accuracy/run.ts --languages de,en
+
+# Verbose output
+bun run benchmarks/pii-accuracy/run.ts --verbose
+```
+
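+Test data lives in `benchmarks/pii-accuracy/test-data/`: one YAML file per language, plus `global.yaml` for language-independent pattern recognizers (email, IBAN, credit card, IP address).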
+
  ## License
  
  [Apache 2.0](LICENSE)
diff --git a/benchmarks/pii-accuracy/run.ts b/benchmarks/pii-accuracy/run.ts

new file mode 100644 (file)

index 0000000..b7ab343
--- /dev/null
+++ b/benchmarks/pii-accuracy/run.ts
@@ -0,0 +1,412 @@
+#!/usr/bin/env bun
+/**
+ * PII Detection Accuracy Benchmark
+ *
+ * Measures precision, recall, and F1 score of the PII detection system.
+ *
+ * Usage:
+ *   bun run benchmarks/pii-accuracy/run.ts
+ *   bun run benchmarks/pii-accuracy/run.ts --threshold 0.5
+ *   bun run benchmarks/pii-accuracy/run.ts --languages de,en
+ *   bun run benchmarks/pii-accuracy/run.ts --verbose
+ */
+
+import { parseArgs } from "util";
+import { Glob } from "bun";
+import { parse as parseYaml } from "yaml";
+import {
+  TestCaseSchema,
+  type AccuracyMetrics,
+  type DetectedEntity,
+  type ExpectedEntity,
+  type TestCase,
+  type TestResult,
+} from "./types";
+
+// Configuration: default score threshold, Presidio base URL, and test-data location
+const DEFAULT_THRESHOLD = 0.7; // used when --threshold is not passed
+const PRESIDIO_URL = process.env.PRESIDIO_URL || "http://localhost:5002"; // env override, else local default
+const TEST_DATA_DIR = import.meta.dir + "/test-data"; // "test-data" folder under import.meta.dir
+
+// Parse command line arguments (everything after the script path in Bun.argv)
+const { values: args } = parseArgs({
+  args: Bun.argv.slice(2),
+  options: {
+    threshold: { type: "string", short: "t" },
+    languages: { type: "string", short: "l" },
+    verbose: { type: "boolean", short: "v", default: false },
+    help: { type: "boolean", short: "h", default: false },
+  },
+});
+
+if (args.help) { // --help: print usage and exit with status 0 before doing any work
+  console.log(`
+PII Detection Accuracy Benchmark
+
+Usage:
+  bun run benchmarks/pii-accuracy/run.ts [options]
+
+Options:
+  -t, --threshold <value>    Score threshold (default: ${DEFAULT_THRESHOLD})
+  -l, --languages <langs>    Comma-separated languages to test (e.g., de,en)
+  -v, --verbose              Show detailed results for each test case
+  -h, --help                 Show this help message
+
+Examples:
+  bun run benchmarks/pii-accuracy/run.ts
+  bun run benchmarks/pii-accuracy/run.ts --threshold 0.5
+  bun run benchmarks/pii-accuracy/run.ts --languages de,en
+  bun run benchmarks/pii-accuracy/run.ts --verbose
+`);
+  process.exit(0);
+}
+
+const threshold = args.threshold ? Number.parseFloat(args.threshold) : DEFAULT_THRESHOLD; // NOTE(review): a non-numeric value yields NaN and is not validated
+const verbose = args.verbose ?? false;
+const languageFilter = args.languages?.split(",").map((l) => l.trim().toLowerCase()); // undefined when --languages is omitted (no filtering)
+
+// Entity types requested from Presidio and used for the per-type metrics breakdown
+const ENTITY_TYPES = [
+  "PERSON",
+  "EMAIL_ADDRESS",
+  "PHONE_NUMBER",
+  "CREDIT_CARD",
+  "IBAN_CODE",
+  "IP_ADDRESS",
+  "LOCATION",
+];
+
+/**
+ * Load test cases from YAML files in test-data directory
+ *
+ * Scans TEST_DATA_DIR for files matching "*.yaml", validates every entry of
+ * each file's `test_cases` array against TestCaseSchema, and keeps the entries
+ * whose language passes the optional --languages filter. Files without a
+ * `test_cases` array and entries that fail validation are skipped with a
+ * console warning rather than aborting the run.
+ *
+ * @returns The accepted test cases, sorted by `id`.
+ */
+async function loadTestCases(): Promise<TestCase[]> {
+  const testCases: TestCase[] = [];
+  const glob = new Glob("*.yaml");
+
+  for await (const file of glob.scan(TEST_DATA_DIR)) {
+    const filePath = `${TEST_DATA_DIR}/${file}`;
+    const content = await Bun.file(filePath).text();
+    const data = parseYaml(content) as { test_cases: unknown[] }; // NOTE(review): unchecked cast — an empty YAML file presumably parses to null and would throw below; confirm
+
+    if (!data.test_cases || !Array.isArray(data.test_cases)) {
+      console.warn(`Warning: ${file} has no test_cases array`);
+      continue;
+    }
+
+    for (const testCase of data.test_cases) {
+      const parsed = TestCaseSchema.safeParse(testCase);
+      if (parsed.success) {
+        // Filter by language if specified (filter values are lowercased; the test case language is compared as-is)
+        if (!languageFilter || languageFilter.includes(parsed.data.language)) {
+          testCases.push(parsed.data);
+        }
+      } else {
+        console.warn(`Warning: Invalid test case in ${file}:`, parsed.error.format());
+      }
+    }
+  }
+
+  // Sort by ID for deterministic output
+  return testCases.sort((a, b) => a.id.localeCompare(b.id));
+}
+
+/**
+ * Call Presidio analyzer API
+ *
+ * POSTs the text to `${PRESIDIO_URL}/analyze` with the given language, the
+ * ENTITY_TYPES list as `entities`, and the configured threshold as
+ * `score_threshold`. Empty text short-circuits to an empty result without
+ * making a request.
+ *
+ * @param text - Text to analyze.
+ * @param language - Language code forwarded to the analyzer.
+ * @returns The reported entities, each with the matched substring of `text`.
+ * @throws Error when the response status is not OK; the message contains the
+ *   status code and the response body.
+ */
+async function detectPII(text: string, language: string): Promise<DetectedEntity[]> {
+  if (!text) return [];
+
+  const response = await fetch(`${PRESIDIO_URL}/analyze`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      text,
+      language,
+      entities: ENTITY_TYPES,
+      score_threshold: threshold,
+    }),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Presidio error: ${response.status} ${await response.text()}`);
+  }
+
+  const entities = (await response.json()) as Array<{
+    entity_type: string;
+    start: number;
+    end: number;
+    score: number;
+  }>;
+
+  return entities.map((e) => ({
+    type: e.entity_type,
+    text: text.slice(e.start, e.end), // NOTE(review): assumes start/end are compatible with JS string indices — confirm for non-BMP characters
+    start: e.start,
+    end: e.end,
+    score: e.score,
+  }));
+}
+
+/**
+ * Check if Presidio is available
+ *
+ * @returns true when a GET to `${PRESIDIO_URL}/health` responds with an OK
+ *   status; false on a non-OK status or when the request itself throws
+ *   (the error is intentionally swallowed and reported as "unavailable").
+ */
+async function checkPresidio(): Promise<boolean> {
+  try {
+    const response = await fetch(`${PRESIDIO_URL}/health`);
+    return response.ok;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Run a single test case and compare results
+ *
+ * Detects entities in the test case text and pairs each expected entity, in
+ * order, with the first not-yet-used detection that has the same type and
+ * whose normalized text contains the normalized expected text. Each detection
+ * can satisfy at most one expected entity.
+ *
+ * NOTE(review): because matching is a substring test, an expected entity with
+ * empty text matches any unused detection of the same type.
+ *
+ * @param testCase - Test case with input text, language and expected entities.
+ * @returns The per-case result; `passed` is true only when there are neither
+ *   false negatives nor false positives.
+ */
+async function runTestCase(testCase: TestCase): Promise<TestResult> {
+  const detected = await detectPII(testCase.text, testCase.language);
+
+  // Match expected entities with detected ones
+  const truePositives: ExpectedEntity[] = [];
+  const falseNegatives: ExpectedEntity[] = [];
+  const matchedDetected = new Set<number>(); // indices into `detected` already paired with an expected entity
+
+  for (const expected of testCase.expected) {
+    // Find a matching detected entity
+    const matchIndex = detected.findIndex(
+      (d, i) =>
+        !matchedDetected.has(i) &&
+        d.type === expected.type &&
+        normalizeText(d.text).includes(normalizeText(expected.text)),
+    );
+
+    if (matchIndex !== -1) {
+      truePositives.push(expected);
+      matchedDetected.add(matchIndex);
+    } else {
+      falseNegatives.push(expected);
+    }
+  }
+
+  // Remaining detected entities are false positives
+  const falsePositives = detected.filter((_, i) => !matchedDetected.has(i));
+
+  const passed = falseNegatives.length === 0 && falsePositives.length === 0;
+
+  return {
+    id: testCase.id,
+    text: testCase.text,
+    language: testCase.language,
+    passed,
+    expected: testCase.expected,
+    detected,
+    falseNegatives,
+    falsePositives,
+    truePositives,
+  };
+}
+
+/**
+ * Normalize text for comparison (lowercase, trim)
+ *
+ * @param text - Raw expected or detected entity text.
+ * @returns The text lowercased with leading and trailing whitespace removed;
+ *   inner whitespace and punctuation are left untouched.
+ */
+function normalizeText(text: string): string {
+  return text.toLowerCase().trim();
+}
+
+/**
+ * Calculate accuracy metrics from results
+ *
+ * Sums true positives, false positives and false negatives across all
+ * results (micro-averaging) and derives precision, recall and F1 from the
+ * totals. Precision and recall default to 1 when their denominator is zero
+ * (nothing detected / nothing expected); F1 is 0 when precision + recall is 0.
+ *
+ * @param results - Per-test-case results to aggregate.
+ * @returns Aggregated counts and scores; `passed`/`failed` count test cases
+ *   by their `passed` flag, not individual entities.
+ */
+function calculateMetrics(results: TestResult[]): AccuracyMetrics {
+  let tp = 0;
+  let fp = 0;
+  let fn = 0;
+
+  for (const result of results) {
+    tp += result.truePositives.length;
+    fp += result.falsePositives.length;
+    fn += result.falseNegatives.length;
+  }
+
+  const precision = tp + fp > 0 ? tp / (tp + fp) : 1;
+  const recall = tp + fn > 0 ? tp / (tp + fn) : 1;
+  const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
+
+  return {
+    total: results.length,
+    passed: results.filter((r) => r.passed).length,
+    failed: results.filter((r) => !r.passed).length,
+    precision,
+    recall,
+    f1,
+    truePositives: tp,
+    falsePositives: fp,
+    falseNegatives: fn,
+  };
+}
+
+/**
+ * Format percentage for display
+ *
+ * @param value - Ratio to format (e.g. 0.875).
+ * @returns The value multiplied by 100 with one decimal place and a trailing
+ *   percent sign (e.g. "87.5%").
+ */
+function formatPercent(value: number): string {
+  return `${(value * 100).toFixed(1)}%`;
+}
+
+/**
+ * Print metrics in a formatted way
+ *
+ * Writes one aligned line with precision, recall and F1 followed by a status
+ * mark: a check mark when `metrics.failed` is 0, a warning sign otherwise.
+ *
+ * @param label - Row label, padded to 20 characters.
+ * @param metrics - Aggregated metrics to display.
+ */
+function printMetrics(label: string, metrics: AccuracyMetrics): void {
+  const status = metrics.failed === 0 ? "✓" : "⚠";
+  console.log(
+    `  ${label.padEnd(20)} P=${formatPercent(metrics.precision).padStart(6)}  ` +
+      `R=${formatPercent(metrics.recall).padStart(6)}  ` +
+      `F1=${formatPercent(metrics.f1).padStart(6)}  ${status}`,
+  );
+}
+
+/**
+ * Main benchmark execution
+ *
+ * Prints the banner and effective configuration, verifies that Presidio is
+ * reachable (exits with status 1 if not), loads the test cases (exits with
+ * status 1 if none are found), runs them one after another, and prints
+ * overall, per-entity-type and per-language metrics followed by the lists of
+ * false negatives and false positives. Exits with status 1 when at least one
+ * recorded test case failed.
+ *
+ * NOTE(review): a test case whose execution throws is only logged; it is not
+ * added to `results`, so it is absent from the metrics and does not affect
+ * the exit code.
+ *
+ * NOTE(review): the per-entity-type rows keep each result's original `passed`
+ * flag, so the pass/fail status printed for a type reflects whole test cases,
+ * not only entities of that type.
+ */
+async function main(): Promise<void> {
+  console.log("\n╔════════════════════════════════════════════════════════════╗");
+  console.log("║           PII Detection Accuracy Benchmark                 ║");
+  console.log("╚════════════════════════════════════════════════════════════╝\n");
+
+  // Check Presidio availability
+  console.log(`Presidio URL: ${PRESIDIO_URL}`);
+  console.log(`Threshold: ${threshold}`);
+  if (languageFilter) {
+    console.log(`Languages: ${languageFilter.join(", ")}`);
+  }
+
+  if (!(await checkPresidio())) {
+    console.error("\n✗ Presidio is not available. Start it with:");
+    console.error("  docker compose up presidio-analyzer -d\n");
+    process.exit(1);
+  }
+  console.log("Presidio: ✓ Connected\n");
+
+  // Load test cases from directory
+  const testCases = await loadTestCases();
+
+  if (testCases.length === 0) {
+    console.error("No test cases found in", TEST_DATA_DIR);
+    process.exit(1);
+  }
+
+  console.log(`Running ${testCases.length} test cases...\n`);
+
+  // Run all test cases sequentially
+  const results: TestResult[] = [];
+  for (const testCase of testCases) {
+    try {
+      const result = await runTestCase(testCase);
+      results.push(result);
+
+      if (verbose) {
+        const icon = result.passed ? "✓" : "✗";
+        console.log(`${icon} ${result.id}`);
+        if (!result.passed) {
+          if (result.falseNegatives.length > 0) {
+            console.log(`    Missed: ${result.falseNegatives.map((e) => `${e.type}:"${e.text}"`).join(", ")}`);
+          }
+          if (result.falsePositives.length > 0) {
+            console.log(
+              `    Wrong:  ${result.falsePositives.map((e) => `${e.type}:"${e.text}" (${e.score.toFixed(2)})`).join(", ")}`,
+            );
+          }
+        }
+      }
+    } catch (error) {
+      console.error(`Error in test ${testCase.id}:`, error);
+    }
+  }
+
+  // Calculate overall metrics
+  const overall = calculateMetrics(results);
+
+  // Calculate metrics by entity type
+  const byEntityType: Record<string, AccuracyMetrics> = {};
+  for (const entityType of ENTITY_TYPES) {
+    const filtered = results.map((r) => ({
+      ...r,
+      expected: r.expected.filter((e) => e.type === entityType),
+      truePositives: r.truePositives.filter((e) => e.type === entityType),
+      falseNegatives: r.falseNegatives.filter((e) => e.type === entityType),
+      falsePositives: r.falsePositives.filter((e) => e.type === entityType),
+    }));
+    // Only include types that were expected, or detected as a false positive, in at least one result
+    const hasTestCases = filtered.some((r) => r.expected.length > 0 || r.falsePositives.length > 0);
+    if (hasTestCases) {
+      byEntityType[entityType] = calculateMetrics(filtered);
+    }
+  }
+
+  // Calculate metrics by language
+  const languages = [...new Set(results.map((r) => r.language))];
+  const byLanguage: Record<string, AccuracyMetrics> = {};
+  for (const lang of languages) {
+    byLanguage[lang] = calculateMetrics(results.filter((r) => r.language === lang));
+  }
+
+  // Print report
+  console.log("\n────────────────────────────────────────────────────────────");
+  console.log("                         RESULTS");
+  console.log("────────────────────────────────────────────────────────────\n");
+
+  console.log(`Overall: ${overall.passed}/${overall.total} passed\n`);
+
+  console.log("Metrics (P=Precision, R=Recall, F1=F1-Score):\n");
+
+  console.log(
+    `  ${"OVERALL".padEnd(20)} P=${formatPercent(overall.precision).padStart(6)}  ` +
+      `R=${formatPercent(overall.recall).padStart(6)}  ` +
+      `F1=${formatPercent(overall.f1).padStart(6)}\n`,
+  );
+
+  console.log("By Entity Type:");
+  for (const [type, metrics] of Object.entries(byEntityType)) {
+    printMetrics(type, metrics);
+  }
+
+  console.log("\nBy Language:");
+  for (const [lang, metrics] of Object.entries(byLanguage)) {
+    printMetrics(lang.toUpperCase(), metrics);
+  }
+
+  // Print false negatives (missed PII)
+  const allFalseNegatives = results.flatMap((r) =>
+    r.falseNegatives.map((fn) => ({ testId: r.id, text: r.text, entity: fn })),
+  );
+  if (allFalseNegatives.length > 0) {
+    console.log("\n────────────────────────────────────────────────────────────");
+    console.log(`False Negatives (Missed PII): ${allFalseNegatives.length}`);
+    console.log("────────────────────────────────────────────────────────────");
+    for (const { testId, entity } of allFalseNegatives) {
+      console.log(`  ✗ "${entity.text}" (${entity.type}) in ${testId}`);
+    }
+  }
+
+  // Print false positives (wrong detections)
+  const allFalsePositives = results.flatMap((r) =>
+    r.falsePositives.map((fp) => ({ testId: r.id, text: r.text, entity: fp })),
+  );
+  if (allFalsePositives.length > 0) {
+    console.log("\n────────────────────────────────────────────────────────────");
+    console.log(`False Positives (Wrong Detections): ${allFalsePositives.length}`);
+    console.log("────────────────────────────────────────────────────────────");
+    for (const { testId, entity } of allFalsePositives) {
+      console.log(`  ✗ "${entity.text}" (${entity.type}, score=${entity.score.toFixed(2)}) in ${testId}`);
+    }
+  }
+
+  console.log("\n────────────────────────────────────────────────────────────\n");
+
+  // Exit with error code if tests failed
+  if (overall.failed > 0) {
+    process.exit(1);
+  }
+}
+
+main().catch((error) => { // entry point: any rejection escaping main() is logged and exits with status 1
+  console.error("Benchmark failed:", error);
+  process.exit(1);
+});
diff --git a/benchmarks/pii-accuracy/test-data/de.yaml b/benchmarks/pii-accuracy/test-data/de.yaml

new file mode 100644 (file)

index 0000000..530fd04
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/de.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs German context words)
+  - id: de_phone
+    text: "Telefon: +49 171 1234567"
+    language: de
+    expected:
+      - type: PHONE_NUMBER
+        text: "+49 171 1234567"
+
+  # Person (NER model)
+  - id: de_person
+    text: "Meeting mit Max Müller morgen um 10 Uhr"
+    language: de
+    expected:
+      - type: PERSON
+        text: "Max Müller"
+
+  # Location (NER model)
+  - id: de_location
+    text: "Das Meeting findet in München statt"
+    language: de
+    expected:
+      - type: LOCATION
+        text: "München"
+
+  # Mixed (real-world prompt)
+  - id: de_mixed
+    text: "Kontaktdaten von Hans Meier: hans.meier@example.com, Telefon 0171-1234567"
+    language: de
+    expected:
+      - type: PERSON
+        text: "Hans Meier"
+      - type: EMAIL_ADDRESS
+        text: "hans.meier@example.com"
+      - type: PHONE_NUMBER
+        text: "0171-1234567"
+
+  # False Positive
+  - id: de_fp
+    text: "Das Wetter ist heute sehr schön"
+    language: de
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/en.yaml b/benchmarks/pii-accuracy/test-data/en.yaml

new file mode 100644 (file)

index 0000000..13bd6aa
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/en.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs English context words)
+  - id: en_phone
+    text: "Phone: (555) 123-4567"
+    language: en
+    expected:
+      - type: PHONE_NUMBER
+        text: "(555) 123-4567"
+
+  # Person (NER model)
+  - id: en_person
+    text: "Schedule a meeting with John Smith tomorrow"
+    language: en
+    expected:
+      - type: PERSON
+        text: "John Smith"
+
+  # Location (NER model)
+  - id: en_location
+    text: "The conference is in New York next week"
+    language: en
+    expected:
+      - type: LOCATION
+        text: "New York"
+
+  # Mixed (real-world prompt)
+  - id: en_mixed
+    text: "Customer John Doe (john@example.com), phone (555) 987-6543"
+    language: en
+    expected:
+      - type: PERSON
+        text: "John Doe"
+      - type: EMAIL_ADDRESS
+        text: "john@example.com"
+      - type: PHONE_NUMBER
+        text: "(555) 987-6543"
+
+  # False Positive
+  - id: en_fp
+    text: "The weather is nice today"
+    language: en
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/es.yaml b/benchmarks/pii-accuracy/test-data/es.yaml

new file mode 100644 (file)

index 0000000..ac7b305
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/es.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs Spanish context words)
+  - id: es_phone
+    text: "Teléfono: +34 612 345 678"
+    language: es
+    expected:
+      - type: PHONE_NUMBER
+        text: "+34 612 345 678"
+
+  # Person (NER model)
+  - id: es_person
+    text: "Reunión con Ana Martínez mañana a las 10"
+    language: es
+    expected:
+      - type: PERSON
+        text: "Ana Martínez"
+
+  # Location (NER model)
+  - id: es_location
+    text: "La reunión será en Madrid la próxima semana"
+    language: es
+    expected:
+      - type: LOCATION
+        text: "Madrid"
+
+  # Mixed (real-world prompt)
+  - id: es_mixed
+    text: "Contacto de Carlos García: carlos.garcia@example.es, teléfono +34 698 765 432"
+    language: es
+    expected:
+      - type: PERSON
+        text: "Carlos García"
+      - type: EMAIL_ADDRESS
+        text: "carlos.garcia@example.es"
+      - type: PHONE_NUMBER
+        text: "+34 698 765 432"
+
+  # False Positive
+  - id: es_fp
+    text: "El clima está muy agradable hoy"
+    language: es
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/fr.yaml b/benchmarks/pii-accuracy/test-data/fr.yaml

new file mode 100644 (file)

index 0000000..e2423c8
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/fr.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs French context words)
+  - id: fr_phone
+    text: "Téléphone: 06 12 34 56 78"
+    language: fr
+    expected:
+      - type: PHONE_NUMBER
+        text: "06 12 34 56 78"
+
+  # Person (NER model)
+  - id: fr_person
+    text: "J'ai une réunion avec Marie Dubois demain à 10h"
+    language: fr
+    expected:
+      - type: PERSON
+        text: "Marie Dubois"
+
+  # Location (NER model)
+  - id: fr_location
+    text: "La conférence aura lieu à Lyon la semaine prochaine"
+    language: fr
+    expected:
+      - type: LOCATION
+        text: "Lyon"
+
+  # Mixed (real-world prompt)
+  - id: fr_mixed
+    text: "Contact de Jean Dupont: jean.dupont@example.fr, téléphone 06 98 76 54 32"
+    language: fr
+    expected:
+      - type: PERSON
+        text: "Jean Dupont"
+      - type: EMAIL_ADDRESS
+        text: "jean.dupont@example.fr"
+      - type: PHONE_NUMBER
+        text: "06 98 76 54 32"
+
+  # False Positive
+  - id: fr_fp
+    text: "Le temps est magnifique aujourd'hui"
+    language: fr
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/global.yaml b/benchmarks/pii-accuracy/test-data/global.yaml

new file mode 100644 (file)

index 0000000..fa8e8a4
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/global.yaml
@@ -0,0 +1,41 @@
+test_cases:
+  # Pattern-based recognizers - language independent
+  # Tested once since regex/checksum works the same for all languages
+
+  # Email
+  - id: global_email
+    text: "Contact me at john.doe@company.com"
+    language: en
+    expected:
+      - type: EMAIL_ADDRESS
+        text: "john.doe@company.com"
+
+  # IBAN
+  - id: global_iban
+    text: "Transfer to IBAN DE89370400440532013000"
+    language: en
+    expected:
+      - type: IBAN_CODE
+        text: "DE89370400440532013000"
+
+  # Credit Card
+  - id: global_credit_card
+    text: "Card number: 4111 1111 1111 1111"
+    language: en
+    expected:
+      - type: CREDIT_CARD
+        text: "4111 1111 1111 1111"
+
+  # IP Address
+  - id: global_ip
+    text: "Server IP is 8.8.8.8"
+    language: en
+    expected:
+      - type: IP_ADDRESS
+        text: "8.8.8.8"
+
+  # Empty text
+  - id: global_empty
+    text: ""
+    language: en
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/it.yaml b/benchmarks/pii-accuracy/test-data/it.yaml

new file mode 100644 (file)

index 0000000..cc6a37f
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/it.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs Italian context words)
+  - id: it_phone
+    text: "Telefono: 333 1234567"
+    language: it
+    expected:
+      - type: PHONE_NUMBER
+        text: "333 1234567"
+
+  # Person (NER model)
+  - id: it_person
+    text: "Riunione con Giuseppe Verdi domani alle 10"
+    language: it
+    expected:
+      - type: PERSON
+        text: "Giuseppe Verdi"
+
+  # Location (NER model)
+  - id: it_location
+    text: "L'evento si terrà a Milano il prossimo mese"
+    language: it
+    expected:
+      - type: LOCATION
+        text: "Milano"
+
+  # Mixed (real-world prompt)
+  - id: it_mixed
+    text: "Contatto di Marco Rossi: marco.rossi@example.it, telefono 333 9876543"
+    language: it
+    expected:
+      - type: PERSON
+        text: "Marco Rossi"
+      - type: EMAIL_ADDRESS
+        text: "marco.rossi@example.it"
+      - type: PHONE_NUMBER
+        text: "333 9876543"
+
+  # False Positive
+  - id: it_fp
+    text: "Il caffè italiano è il migliore del mondo"
+    language: it
+    expected: []
diff --git a/benchmarks/pii-accuracy/types.ts b/benchmarks/pii-accuracy/types.ts

new file mode 100644 (file)

index 0000000..a78bc5a
--- /dev/null
+++ b/benchmarks/pii-accuracy/types.ts
@@ -0,0 +1,53 @@
+import { z } from "zod";
+
+// Schema for one expected PII entity in the YAML test data
+export const ExpectedEntitySchema = z.object({
+  type: z.string(), // entity type label, e.g. "PHONE_NUMBER"
+  text: z.string(), // the PII text that should be found in the test case text
+});
+
+// Schema for a single test case entry of a test-data YAML file
+export const TestCaseSchema = z.object({
+  id: z.string(), // identifier shown in reports
+  text: z.string(), // input text sent for analysis (may be empty)
+  language: z.string(), // language code, e.g. "de"
+  expected: z.array(ExpectedEntitySchema), // entities that should be detected; empty for negative cases
+  description: z.string().optional(),
+});
+
+export type ExpectedEntity = z.infer<typeof ExpectedEntitySchema>; // static type derived from the schema
+export type TestCase = z.infer<typeof TestCaseSchema>; // static type derived from the schema
+
+// Result of running a single test case: its inputs plus the entity-level classification
+export interface TestResult {
+  id: string; // copied from the test case
+  text: string; // copied from the test case
+  language: string; // copied from the test case
+  passed: boolean; // whether the test case as a whole passed
+  expected: ExpectedEntity[]; // entities the test case expects
+  detected: DetectedEntity[]; // all detected entities, matched or not
+  falseNegatives: ExpectedEntity[]; // Expected but not detected
+  falsePositives: DetectedEntity[]; // Detected but not expected
+  truePositives: ExpectedEntity[]; // Correctly detected
+}
+
+export interface DetectedEntity { // one detected PII entity
+  type: string; // entity type label
+  text: string; // the detected text
+  start: number; // start offset of the detection
+  end: number; // end offset of the detection
+  score: number; // detection score
+}
+
+// Aggregated metrics over a set of test results
+export interface AccuracyMetrics {
+  total: number; // number of test-case results aggregated
+  passed: number; // test cases that passed
+  failed: number; // test cases that did not pass
+  precision: number; // TP / (TP + FP)
+  recall: number; // TP / (TP + FN)
+  f1: number; // 2 * (P * R) / (P + R)
+  truePositives: number; // entity-level count
+  falsePositives: number; // entity-level count
+  falseNegatives: number; // entity-level count
+}
diff --git a/package.json b/package.json

index 4921794866723bbcf994691c4ae0b714d5cfb80d..ab316a5950de9e20dc9a127c84f750accf57bcac 100644 (file)
--- a/package.json
+++ b/package.json
@@ -12,7 +12,8 @@
      "typecheck": "tsc --noEmit",
      "lint": "biome lint src",
      "format": "biome format src --write",
-    "check": "biome check src"
+    "check": "biome check src",
+    "benchmark:accuracy": "bun run benchmarks/pii-accuracy/run.ts"
    },
    "dependencies": {
      "@hono/zod-validator": "^0.7.6",
diff --git a/presidio/languages.yaml b/presidio/languages.yaml

index 266aa36bab346cb0d146d95eb5ddd5641b4284da..a241e49eaf2f2fd9ff7bfe75de55fb886a7b855a 100644 (file)
--- a/presidio/languages.yaml
+++ b/presidio/languages.yaml
@@ -8,216 +8,151 @@
  
  spacy_version: "3.8.0"
  
+# Phone context words per language (5-7 words each)
+# Covers: phone/telephone, number, mobile, call
+# Based on research of common usage and regional variants
+
  languages:
    # Catalan
    ca:
      name: Catalan
      model: ca_core_news_md
+    phone_context: [telèfon, número, mòbil, trucada, trucar]
  
    # Chinese
    zh:
      name: Chinese
      model: zh_core_web_md
+    phone_context: [电话, 手机, 号码, 打电话, 电话号码, 手机号码]
  
    # Croatian
    hr:
      name: Croatian
      model: hr_core_news_md
+    phone_context: [telefon, broj, mobitel, poziv, nazovi, zvati]
  
    # Danish
    da:
      name: Danish
      model: da_core_news_md
+    phone_context: [telefon, nummer, mobil, mobiltelefon, opkald, ringe]
  
    # Dutch
    nl:
      name: Dutch
      model: nl_core_news_md
-    phone_context:
-      - telefoon
-      - telefoonnummer
-      - mobiel
-      - bellen
-      - fax
-
-  # English
+    phone_context: [telefoon, nummer, mobiel, mobieltje, GSM, bellen]
+
+  # English (Presidio defaults)
    en:
      name: English
      model: en_core_web_lg
-    phone_context:
-      - phone
-      - telephone
-      - cell
-      - mobile
-      - call
-      - fax
+    phone_context: [phone, number, telephone, cell, cellphone, mobile, call]
  
    # Finnish
    fi:
      name: Finnish
      model: fi_core_news_md
+    phone_context: [puhelin, numero, kännykkä, matkapuhelin, soittaa, puhelinnumero]
  
    # French
    fr:
      name: French
      model: fr_core_news_md
-    phone_context:
-      - téléphone
-      - portable
-      - mobile
-      - numéro
-      - appeler
-      - fax
+    phone_context: [téléphone, numéro, portable, mobile, appeler, tél]
  
    # German
    de:
      name: German
-    model: de_core_news_md
-    phone_context:
-      - telefon
-      - telefonnummer
-      - handy
-      - mobil
-      - mobilnummer
-      - fax
-      - anrufen
+    model: de_core_news_lg
+    phone_context: [telefon, nummer, handy, mobiltelefon, anruf, rufnummer]
  
    # Greek
    el:
      name: Greek
      model: el_core_news_md
-    phone_context:
-      - τηλέφωνο
-      - κινητό
-      - φαξ
+    phone_context: [τηλέφωνο, αριθμός, κινητό, κλήση, τηλεφωνώ, καλώ]
  
    # Italian
    it:
      name: Italian
      model: it_core_news_md
-    phone_context:
-      - telefono
-      - cellulare
-      - mobile
-      - numero
-      - chiamare
-      - fax
+    phone_context: [telefono, numero, cellulare, telefonino, chiamare, chiamata]
  
    # Japanese
    ja:
      name: Japanese
      model: ja_core_news_md
-    phone_context:
-      - 電話
-      - 携帯
-      - モバイル
-      - ファックス
+    phone_context: [電話, 携帯, 番号, スマホ, ケータイ, 電話番号]
  
    # Korean
    ko:
      name: Korean
      model: ko_core_news_md
-    phone_context:
-      - 전화
-      - 휴대폰
-      - 모바일
-      - 팩스
+    phone_context: [전화, 휴대폰, 핸드폰, 번호, 전화번호, 통화]
  
    # Lithuanian
    lt:
      name: Lithuanian
      model: lt_core_news_md
+    phone_context: [telefonas, numeris, mobilusis, skambutis, skambinti]
  
    # Macedonian
    mk:
      name: Macedonian
      model: mk_core_news_md
+    phone_context: [телефон, број, мобилен, повик, звони]
  
    # Norwegian Bokmål
    nb:
      name: Norwegian
      model: nb_core_news_md
-    phone_context:
-      - telefon
-      - mobil
-      - ringe
-      - faks
+    phone_context: [telefon, nummer, mobil, mobiltelefon, samtale, ringe]
  
    # Polish
    pl:
      name: Polish
      model: pl_core_news_md
-    phone_context:
-      - telefon
-      - komórka
-      - dzwonić
-      - faks
+    phone_context: [telefon, numer, komórka, komórkowy, dzwoń, zadzwoń]
  
    # Portuguese
    pt:
      name: Portuguese
      model: pt_core_news_md
-    phone_context:
-      - telefone
-      - celular
-      - móvel
-      - ligar
-      - fax
+    phone_context: [telefone, número, celular, telemóvel, ligar, telefonar]
  
    # Romanian
    ro:
      name: Romanian
      model: ro_core_news_md
-    phone_context:
-      - telefon
-      - mobil
-      - apel
-      - fax
+    phone_context: [telefon, număr, mobil, apel, suna]
  
    # Russian
    ru:
      name: Russian
      model: ru_core_news_md
-    phone_context:
-      - телефон
-      - мобильный
-      - звонить
-      - факс
+    phone_context: [телефон, номер, мобильник, мобила, сотовый, звонок]
  
    # Slovenian
    sl:
      name: Slovenian
      model: sl_core_news_md
+    phone_context: [telefon, številka, mobilnik, mobilec, klic, pokliči]
  
    # Spanish
    es:
      name: Spanish
      model: es_core_news_md
-    phone_context:
-      - teléfono
-      - móvil
-      - celular
-      - número
-      - llamar
-      - fax
+    phone_context: [teléfono, número, móvil, celular, llamar, llamada]
  
    # Swedish
    sv:
      name: Swedish
      model: sv_core_news_md
-    phone_context:
-      - telefon
-      - mobil
-      - ringa
-      - fax
+    phone_context: [telefon, nummer, mobil, mobiltelefon, samtal, ringa]
  
    # Ukrainian
    uk:
      name: Ukrainian
      model: uk_core_news_md
-    phone_context:
-      - телефон
-      - мобільний
-      - дзвонити
-      - факс
-
+    phone_context: [телефон, номер, мобільний, мобілка, дзвінок, дзвони]
author	Stefan Gasser <redacted>
	Thu, 8 Jan 2026 16:15:59 +0000 (17:15 +0100)
committer	GitHub <redacted>
	Thu, 8 Jan 2026 16:15:59 +0000 (17:15 +0100)
README.md		patch \| blob \| history
benchmarks/pii-accuracy/run.ts	[new file with mode: 0644]	patch \| blob
benchmarks/pii-accuracy/test-data/de.yaml	[new file with mode: 0644]	patch \| blob
benchmarks/pii-accuracy/test-data/en.yaml	[new file with mode: 0644]	patch \| blob
benchmarks/pii-accuracy/test-data/es.yaml	[new file with mode: 0644]	patch \| blob
benchmarks/pii-accuracy/test-data/fr.yaml	[new file with mode: 0644]	patch \| blob
benchmarks/pii-accuracy/test-data/global.yaml	[new file with mode: 0644]	patch \| blob
benchmarks/pii-accuracy/test-data/it.yaml	[new file with mode: 0644]	patch \| blob
benchmarks/pii-accuracy/types.ts	[new file with mode: 0644]	patch \| blob
package.json		patch \| blob \| history
presidio/languages.yaml		patch \| blob \| history