From: Stefan Gasser <redacted>
Date: Thu, 8 Jan 2026 16:15:59 +0000 (+0100)
Subject: Add PII accuracy benchmark with multi-language phone context (#1)
X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=189e4361fc6b3bf30965de6ff23ce249a18215bf;p=sgasser-llm-shield.git

Add PII accuracy benchmark with multi-language phone context (#1)

- Add benchmark framework with precision/recall/F1 metrics
- Add 30 test cases across 5 languages (DE, EN, ES, FR, IT)
- Add phone_context words for all 24 supported languages
- Each language has 5-7 native words for: phone, number, mobile, call

Test with: bun run benchmark:accuracy
---

diff --git a/README.md b/README.md
index 1bd2793..619234c 100644
--- a/README.md
+++ b/README.md
@@ -229,6 +229,26 @@ bun test                                  # Run tests
 bun run check                             # Lint & format
 ```
 
+## Benchmarks
+
+PII detection accuracy benchmark with test cases for multiple languages:
+
+```bash
+# Start Presidio with all benchmark languages
+LANGUAGES=en,de,fr,it,es docker compose up presidio-analyzer -d
+
+# Run all tests
+bun run benchmarks/pii-accuracy/run.ts
+
+# Run specific languages only
+bun run benchmarks/pii-accuracy/run.ts --languages de,en
+
+# Verbose output
+bun run benchmarks/pii-accuracy/run.ts --verbose
+```
+
+Test data lives in `benchmarks/pii-accuracy/test-data/` (one YAML file per language, plus `global.yaml` for language-independent patterns).
+
 ## License
 
 [Apache 2.0](LICENSE)
diff --git a/benchmarks/pii-accuracy/run.ts b/benchmarks/pii-accuracy/run.ts
new file mode 100644
index 0000000..b7ab343
--- /dev/null
+++ b/benchmarks/pii-accuracy/run.ts
@@ -0,0 +1,412 @@
+#!/usr/bin/env bun
+/**
+ * PII Detection Accuracy Benchmark
+ *
+ * Measures precision, recall, and F1 score of the PII detection system.
+ *
+ * Usage:
+ *   bun run benchmarks/pii-accuracy/run.ts
+ *   bun run benchmarks/pii-accuracy/run.ts --threshold 0.5
+ *   bun run benchmarks/pii-accuracy/run.ts --languages de,en
+ *   bun run benchmarks/pii-accuracy/run.ts --verbose
+ */
+
+import { parseArgs } from "util";
+import { Glob } from "bun";
+import { parse as parseYaml } from "yaml";
+import {
+  TestCaseSchema,
+  type AccuracyMetrics,
+  type DetectedEntity,
+  type ExpectedEntity,
+  type TestCase,
+  type TestResult,
+} from "./types";
+
+// Configuration: default score threshold, Presidio base URL and test-data location
+const DEFAULT_THRESHOLD = 0.7; // used when --threshold is not given
+const PRESIDIO_URL = process.env.PRESIDIO_URL || "http://localhost:5002"; // unset or empty env var falls back to the local default
+const TEST_DATA_DIR = import.meta.dir + "/test-data";
+
+// Parse command line arguments (Bun.argv without its first two entries)
+const { values: args } = parseArgs({
+  args: Bun.argv.slice(2),
+  options: {
+    threshold: { type: "string", short: "t" },
+    languages: { type: "string", short: "l" },
+    verbose: { type: "boolean", short: "v", default: false },
+    help: { type: "boolean", short: "h", default: false },
+  },
+});
+
+if (args.help) { // --help: print usage and exit with status 0
+  console.log(`
+PII Detection Accuracy Benchmark
+
+Usage:
+  bun run benchmarks/pii-accuracy/run.ts [options]
+
+Options:
+  -t, --threshold <value>    Score threshold (default: ${DEFAULT_THRESHOLD})
+  -l, --languages <langs>    Comma-separated languages to test (e.g., de,en)
+  -v, --verbose              Show detailed results for each test case
+  -h, --help                 Show this help message
+
+Examples:
+  bun run benchmarks/pii-accuracy/run.ts
+  bun run benchmarks/pii-accuracy/run.ts --threshold 0.5
+  bun run benchmarks/pii-accuracy/run.ts --languages de,en
+  bun run benchmarks/pii-accuracy/run.ts --verbose
+`);
+  process.exit(0);
+}
+
+// Resolve CLI options. parseFloat returns NaN for non-numeric input, which JSON.stringify
+// would send to Presidio as a null score_threshold, so fail fast with a clear message.
+const threshold = args.threshold ? Number.parseFloat(args.threshold) : DEFAULT_THRESHOLD;
+if (Number.isNaN(threshold)) {
+  console.error(`Invalid --threshold value: "${args.threshold}" (expected a number, e.g. 0.5)`);
+  process.exit(1);
+}
+const verbose = args.verbose ?? false;
+// Lower-cased language codes to run; undefined means every language
+const languageFilter = args.languages?.split(",").map((l) => l.trim().toLowerCase());
+
+// Entity types we test for: sent to Presidio as the `entities` filter and used for the
+// per-type breakdown of the report
+const ENTITY_TYPES = ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "IBAN_CODE", "IP_ADDRESS", "LOCATION"];
+
+/**
+ * Load and validate test cases from every `*.yaml` file in the test-data directory.
+ * Files without a `test_cases` array (including files that do not parse to an object, such
+ * as an empty file) and entries that fail `TestCaseSchema` are skipped with a warning.
+ * @returns Valid test cases matching the `--languages` filter (if any), sorted by id
+ */
+async function loadTestCases(): Promise<TestCase[]> {
+  const testCases: TestCase[] = [];
+  const glob = new Glob("*.yaml");
+
+  for await (const file of glob.scan(TEST_DATA_DIR)) {
+    const content = await Bun.file(`${TEST_DATA_DIR}/${file}`).text();
+    // The parsed document may be null or a scalar, so probe it with optional chaining
+    const data = parseYaml(content) as { test_cases?: unknown } | null;
+    const rawCases = data?.test_cases;
+    if (!Array.isArray(rawCases)) {
+      console.warn(`Warning: ${file} has no test_cases array`);
+      continue;
+    }
+
+    for (const rawCase of rawCases) {
+      const parsed = TestCaseSchema.safeParse(rawCase);
+      if (!parsed.success) {
+        console.warn(`Warning: Invalid test case in ${file}:`, parsed.error.format());
+      } else if (!languageFilter || languageFilter.includes(parsed.data.language.toLowerCase())) {
+        testCases.push(parsed.data);
+      }
+    }
+  }
+
+  // Sort by ID for deterministic output
+  return testCases.sort((a, b) => a.id.localeCompare(b.id));
+}
+
+/**
+ * Ask the Presidio analyzer which PII entities it finds in `text`.
+ *
+ * Empty text short-circuits to an empty list without a request. Otherwise the text is POSTed
+ * to `${PRESIDIO_URL}/analyze` with the entity types under test and the score threshold.
+ *
+ * @param text - Text to analyze
+ * @param language - Language code passed through to Presidio
+ * @returns One entry per hit: type, offsets, score and the substring of `text` it covers
+ * @throws Error when the response is not OK; the message carries the status and body
+ */
+async function detectPII(text: string, language: string): Promise<DetectedEntity[]> {
+  if (!text) {
+    return [];
+  }
+
+  const payload = JSON.stringify({ text, language, entities: ENTITY_TYPES, score_threshold: threshold });
+  const response = await fetch(`${PRESIDIO_URL}/analyze`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: payload,
+  });
+  if (!response.ok) {
+    const details = await response.text();
+    throw new Error(`Presidio error: ${response.status} ${details}`);
+  }
+
+  // Shape of one element of the analyzer's JSON response
+  type PresidioHit = { entity_type: string; start: number; end: number; score: number };
+  const hits = (await response.json()) as PresidioHit[];
+  const detected: DetectedEntity[] = [];
+  for (const { entity_type, start, end, score } of hits) {
+    detected.push({ type: entity_type, text: text.slice(start, end), start, end, score });
+  }
+  return detected;
+}
+
+/** Report whether `${PRESIDIO_URL}/health` answers with an OK status; a failed request yields false. */
+async function checkPresidio(): Promise<boolean> {
+  let reachable = false;
+  try {
+    const { ok } = await fetch(`${PRESIDIO_URL}/health`);
+    reachable = ok;
+  } catch {
+    // Request could not be completed: leave `reachable` false
+  }
+  return reachable;
+}
+
+/**
+ * Run one test case against Presidio and classify the outcome.
+ * Expected entities are handled in order; each claims the first unclaimed detection of the
+ * same type whose normalized text contains the normalized expected text. Unclaimed
+ * expectations are false negatives and unclaimed detections are false positives.
+ */
+async function runTestCase(testCase: TestCase): Promise<TestResult> {
+  const { id, text, language, expected: expectedEntities } = testCase;
+  const detected = await detectPII(text, language);
+  const claimed = new Set<number>();
+  const truePositives: ExpectedEntity[] = [];
+  const falseNegatives: ExpectedEntity[] = [];
+
+  for (const expected of expectedEntities) {
+    const wanted = normalizeText(expected.text);
+    let hit = -1;
+    for (const [index, candidate] of detected.entries()) {
+      const sameType = candidate.type === expected.type;
+      if (!claimed.has(index) && sameType && normalizeText(candidate.text).includes(wanted)) {
+        hit = index;
+        break;
+      }
+    }
+    if (hit === -1) {
+      falseNegatives.push(expected);
+    } else {
+      claimed.add(hit);
+      truePositives.push(expected);
+    }
+  }
+
+  // Whatever was detected but never claimed by an expectation is a false positive
+  const falsePositives = detected.filter((_, index) => !claimed.has(index));
+  return {
+    id,
+    text,
+    language,
+    passed: falseNegatives.length === 0 && falsePositives.length === 0,
+    expected: expectedEntities,
+    detected,
+    falseNegatives,
+    falsePositives,
+    truePositives,
+  };
+}
+
+/** Normalize text for comparison: lower-case it, then strip leading and trailing whitespace. */
+function normalizeText(text: string): string {
+  const lowered = text.toLowerCase();
+  const trimmed = lowered.trim();
+  return trimmed;
+}
+
+/**
+ * Aggregate per-test outcomes into precision, recall and F1.
+ * Precision and recall fall back to 1 when their denominator is zero; F1 falls back to 0.
+ */
+function calculateMetrics(results: TestResult[]): AccuracyMetrics {
+  const sum = (count: (r: TestResult) => number): number => results.reduce((acc, r) => acc + count(r), 0);
+  const truePositives = sum((r) => r.truePositives.length);
+  const falsePositives = sum((r) => r.falsePositives.length);
+  const falseNegatives = sum((r) => r.falseNegatives.length);
+  const passed = results.filter((r) => r.passed).length;
+
+  const precision = truePositives + falsePositives > 0 ? truePositives / (truePositives + falsePositives) : 1;
+  const recall = truePositives + falseNegatives > 0 ? truePositives / (truePositives + falseNegatives) : 1;
+  let f1 = 0;
+  if (precision + recall > 0) {
+    f1 = (2 * precision * recall) / (precision + recall);
+  }
+
+  return {
+    total: results.length,
+    passed,
+    failed: results.length - passed,
+    precision,
+    recall,
+    f1,
+    truePositives,
+    falsePositives,
+    falseNegatives,
+  };
+}
+
+/** Render a ratio as a percentage with one decimal place, e.g. 0.5 becomes "50.0%". */
+function formatPercent(value: number): string {
+  const percentage = value * 100;
+  const rounded = percentage.toFixed(1);
+  return rounded + "%";
+}
+
+/**
+ * Log one aligned report row: the label padded to 20 columns, then precision, recall and F1
+ * as percentages right-aligned to 6 columns, then a status marker that depends on whether
+ * `metrics.failed` is zero.
+ */
+function printMetrics(label: string, metrics: AccuracyMetrics): void {
+  const column = (ratio: number): string => formatPercent(ratio).padStart(6);
+  const { precision, recall, f1, failed } = metrics;
+  const status = failed !== 0 ? "â " : "â";
+  console.log(`  ${label.padEnd(20)} P=${column(precision)}  R=${column(recall)}  F1=${column(f1)}  ${status}`);
+}
+
+/**
+ * Main benchmark execution: check Presidio, load and run the test cases, then print overall,
+ * per-entity-type and per-language metrics plus every false negative / false positive.
+ * Exits with code 1 when Presidio is unreachable, no test cases are found, any test case
+ * fails, or any test case could not be evaluated because running it threw.
+ */
+async function main(): Promise<void> {
+  console.log("\nââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ");
+  console.log("â           PII Detection Accuracy Benchmark                 â");
+  console.log("ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ\n");
+
+  // Check Presidio availability
+  console.log(`Presidio URL: ${PRESIDIO_URL}`);
+  console.log(`Threshold: ${threshold}`);
+  if (languageFilter) {
+    console.log(`Languages: ${languageFilter.join(", ")}`);
+  }
+
+  if (!(await checkPresidio())) {
+    console.error("\nâ Presidio is not available. Start it with:");
+    console.error("  docker compose up presidio-analyzer -d\n");
+    process.exit(1);
+  }
+  console.log("Presidio: â Connected\n");
+
+  // Load test cases from directory
+  const testCases = await loadTestCases();
+
+  if (testCases.length === 0) {
+    console.error("No test cases found in", TEST_DATA_DIR);
+    process.exit(1);
+  }
+
+  console.log(`Running ${testCases.length} test cases...\n`);
+
+  // Run all test cases; a case that throws is logged and counted so it cannot be silently dropped
+  const results: TestResult[] = [];
+  let errored = 0;
+  for (const testCase of testCases) {
+    try {
+      const result = await runTestCase(testCase);
+      results.push(result);
+
+      if (verbose) {
+        const icon = result.passed ? "â" : "â";
+        console.log(`${icon} ${result.id}`);
+        if (result.falseNegatives.length > 0) {
+          console.log(`    Missed: ${result.falseNegatives.map((e) => `${e.type}:"${e.text}"`).join(", ")}`);
+        }
+        if (result.falsePositives.length > 0) {
+          const wrong = result.falsePositives.map((e) => `${e.type}:"${e.text}" (${e.score.toFixed(2)})`);
+          console.log(`    Wrong:  ${wrong.join(", ")}`);
+        }
+      }
+    } catch (error) {
+      errored++;
+      console.error(`Error in test ${testCase.id}:`, error);
+    }
+  }
+
+  // Calculate overall metrics
+  const overall = calculateMetrics(results);
+
+  // Calculate metrics by entity type (pass/fail re-derived per type, not inherited from the whole case)
+  const byEntityType: Record<string, AccuracyMetrics> = {};
+  for (const entityType of ENTITY_TYPES) {
+    const ofType = (e: { type: string }): boolean => e.type === entityType;
+    const filtered = results.map((r) => ({
+      ...r,
+      passed: !r.falseNegatives.some(ofType) && !r.falsePositives.some(ofType),
+      expected: r.expected.filter(ofType),
+      truePositives: r.truePositives.filter(ofType),
+      falseNegatives: r.falseNegatives.filter(ofType),
+      falsePositives: r.falsePositives.filter(ofType),
+    }));
+    // Only include if there are test cases for this type
+    const hasTestCases = filtered.some((r) => r.expected.length > 0 || r.falsePositives.length > 0);
+    if (hasTestCases) {
+      byEntityType[entityType] = calculateMetrics(filtered);
+    }
+  }
+
+  // Calculate metrics by language
+  const languages = [...new Set(results.map((r) => r.language))];
+  const byLanguage: Record<string, AccuracyMetrics> = {};
+  for (const lang of languages) {
+    byLanguage[lang] = calculateMetrics(results.filter((r) => r.language === lang));
+  }
+
+  // Print report
+  console.log("\nââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ");
+  console.log("                         RESULTS");
+  console.log("ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ\n");
+
+  console.log(`Overall: ${overall.passed}/${overall.total} passed\n`);
+
+  console.log("Metrics (P=Precision, R=Recall, F1=F1-Score):\n");
+
+  const pct = (ratio: number): string => formatPercent(ratio).padStart(6);
+  console.log(`  ${"OVERALL".padEnd(20)} P=${pct(overall.precision)}  R=${pct(overall.recall)}  F1=${pct(overall.f1)}\n`);
+
+  console.log("By Entity Type:");
+  for (const [type, metrics] of Object.entries(byEntityType)) {
+    printMetrics(type, metrics);
+  }
+
+  console.log("\nBy Language:");
+  for (const [lang, metrics] of Object.entries(byLanguage)) {
+    printMetrics(lang.toUpperCase(), metrics);
+  }
+
+  // Print false negatives (missed PII)
+  const allFalseNegatives = results.flatMap((r) => r.falseNegatives.map((entity) => ({ testId: r.id, entity })));
+  if (allFalseNegatives.length > 0) {
+    console.log("\nââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ");
+    console.log(`False Negatives (Missed PII): ${allFalseNegatives.length}`);
+    console.log("ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ");
+    for (const { testId, entity } of allFalseNegatives) {
+      console.log(`  â "${entity.text}" (${entity.type}) in ${testId}`);
+    }
+  }
+
+  // Print false positives (wrong detections)
+  const allFalsePositives = results.flatMap((r) => r.falsePositives.map((entity) => ({ testId: r.id, entity })));
+  if (allFalsePositives.length > 0) {
+    console.log("\nââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ");
+    console.log(`False Positives (Wrong Detections): ${allFalsePositives.length}`);
+    console.log("ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ");
+    for (const { testId, entity } of allFalsePositives) {
+      console.log(`  â "${entity.text}" (${entity.type}, score=${entity.score.toFixed(2)}) in ${testId}`);
+    }
+  }
+
+  console.log("\nââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ\n");
+
+  // Exit with error code if tests failed or could not be evaluated at all
+  if (errored > 0) {
+    console.error(`${errored} test case(s) could not be evaluated (see errors above)`);
+  }
+  if (overall.failed > 0 || errored > 0) {
+    process.exit(1);
+  }
+}
+
+main().catch((error) => {
+  console.error("Benchmark failed:", error);
+  process.exit(1);
+});
diff --git a/benchmarks/pii-accuracy/test-data/de.yaml b/benchmarks/pii-accuracy/test-data/de.yaml
new file mode 100644
index 0000000..530fd04
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/de.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs German context words)
+  - id: de_phone
+    text: "Telefon: +49 171 1234567"
+    language: de
+    expected:
+      - type: PHONE_NUMBER
+        text: "+49 171 1234567"
+
+  # Person (NER model)
+  - id: de_person
+    text: "Meeting mit Max Müller morgen um 10 Uhr"
+    language: de
+    expected:
+      - type: PERSON
+        text: "Max Müller"
+
+  # Location (NER model)
+  - id: de_location
+    text: "Das Meeting findet in München statt"
+    language: de
+    expected:
+      - type: LOCATION
+        text: "München"
+
+  # Mixed (real-world prompt)
+  - id: de_mixed
+    text: "Kontaktdaten von Hans Meier: hans.meier@example.com, Telefon 0171-1234567"
+    language: de
+    expected:
+      - type: PERSON
+        text: "Hans Meier"
+      - type: EMAIL_ADDRESS
+        text: "hans.meier@example.com"
+      - type: PHONE_NUMBER
+        text: "0171-1234567"
+
+  # False Positive
+  - id: de_fp
+    text: "Das Wetter ist heute sehr schön"
+    language: de
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/en.yaml b/benchmarks/pii-accuracy/test-data/en.yaml
new file mode 100644
index 0000000..13bd6aa
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/en.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs English context words)
+  - id: en_phone
+    text: "Phone: (555) 123-4567"
+    language: en
+    expected:
+      - type: PHONE_NUMBER
+        text: "(555) 123-4567"
+
+  # Person (NER model)
+  - id: en_person
+    text: "Schedule a meeting with John Smith tomorrow"
+    language: en
+    expected:
+      - type: PERSON
+        text: "John Smith"
+
+  # Location (NER model)
+  - id: en_location
+    text: "The conference is in New York next week"
+    language: en
+    expected:
+      - type: LOCATION
+        text: "New York"
+
+  # Mixed (real-world prompt)
+  - id: en_mixed
+    text: "Customer John Doe (john@example.com), phone (555) 987-6543"
+    language: en
+    expected:
+      - type: PERSON
+        text: "John Doe"
+      - type: EMAIL_ADDRESS
+        text: "john@example.com"
+      - type: PHONE_NUMBER
+        text: "(555) 987-6543"
+
+  # False Positive
+  - id: en_fp
+    text: "The weather is nice today"
+    language: en
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/es.yaml b/benchmarks/pii-accuracy/test-data/es.yaml
new file mode 100644
index 0000000..ac7b305
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/es.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs Spanish context words)
+  - id: es_phone
+    text: "Teléfono: +34 612 345 678"
+    language: es
+    expected:
+      - type: PHONE_NUMBER
+        text: "+34 612 345 678"
+
+  # Person (NER model)
+  - id: es_person
+    text: "Reunión con Ana Martínez mañana a las 10"
+    language: es
+    expected:
+      - type: PERSON
+        text: "Ana Martínez"
+
+  # Location (NER model)
+  - id: es_location
+    text: "La reunión será en Madrid la próxima semana"
+    language: es
+    expected:
+      - type: LOCATION
+        text: "Madrid"
+
+  # Mixed (real-world prompt)
+  - id: es_mixed
+    text: "Contacto de Carlos García: carlos.garcia@example.es, teléfono +34 698 765 432"
+    language: es
+    expected:
+      - type: PERSON
+        text: "Carlos García"
+      - type: EMAIL_ADDRESS
+        text: "carlos.garcia@example.es"
+      - type: PHONE_NUMBER
+        text: "+34 698 765 432"
+
+  # False Positive
+  - id: es_fp
+    text: "El clima está muy agradable hoy"
+    language: es
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/fr.yaml b/benchmarks/pii-accuracy/test-data/fr.yaml
new file mode 100644
index 0000000..e2423c8
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/fr.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs French context words)
+  - id: fr_phone
+    text: "Téléphone: 06 12 34 56 78"
+    language: fr
+    expected:
+      - type: PHONE_NUMBER
+        text: "06 12 34 56 78"
+
+  # Person (NER model)
+  - id: fr_person
+    text: "J'ai une réunion avec Marie Dubois demain à 10h"
+    language: fr
+    expected:
+      - type: PERSON
+        text: "Marie Dubois"
+
+  # Location (NER model)
+  - id: fr_location
+    text: "La conférence aura lieu à Lyon la semaine prochaine"
+    language: fr
+    expected:
+      - type: LOCATION
+        text: "Lyon"
+
+  # Mixed (real-world prompt)
+  - id: fr_mixed
+    text: "Contact de Jean Dupont: jean.dupont@example.fr, téléphone 06 98 76 54 32"
+    language: fr
+    expected:
+      - type: PERSON
+        text: "Jean Dupont"
+      - type: EMAIL_ADDRESS
+        text: "jean.dupont@example.fr"
+      - type: PHONE_NUMBER
+        text: "06 98 76 54 32"
+
+  # False Positive
+  - id: fr_fp
+    text: "Le temps est magnifique aujourd'hui"
+    language: fr
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/global.yaml b/benchmarks/pii-accuracy/test-data/global.yaml
new file mode 100644
index 0000000..fa8e8a4
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/global.yaml
@@ -0,0 +1,41 @@
+test_cases:
+  # Pattern-based recognizers - language independent
+  # Tested once since regex/checksum works the same for all languages
+
+  # Email
+  - id: global_email
+    text: "Contact me at john.doe@company.com"
+    language: en
+    expected:
+      - type: EMAIL_ADDRESS
+        text: "john.doe@company.com"
+
+  # IBAN
+  - id: global_iban
+    text: "Transfer to IBAN DE89370400440532013000"
+    language: en
+    expected:
+      - type: IBAN_CODE
+        text: "DE89370400440532013000"
+
+  # Credit Card
+  - id: global_credit_card
+    text: "Card number: 4111 1111 1111 1111"
+    language: en
+    expected:
+      - type: CREDIT_CARD
+        text: "4111 1111 1111 1111"
+
+  # IP Address
+  - id: global_ip
+    text: "Server IP is 8.8.8.8"
+    language: en
+    expected:
+      - type: IP_ADDRESS
+        text: "8.8.8.8"
+
+  # Empty text
+  - id: global_empty
+    text: ""
+    language: en
+    expected: []
diff --git a/benchmarks/pii-accuracy/test-data/it.yaml b/benchmarks/pii-accuracy/test-data/it.yaml
new file mode 100644
index 0000000..cc6a37f
--- /dev/null
+++ b/benchmarks/pii-accuracy/test-data/it.yaml
@@ -0,0 +1,42 @@
+test_cases:
+  # Phone (needs Italian context words)
+  - id: it_phone
+    text: "Telefono: 333 1234567"
+    language: it
+    expected:
+      - type: PHONE_NUMBER
+        text: "333 1234567"
+
+  # Person (NER model)
+  - id: it_person
+    text: "Riunione con Giuseppe Verdi domani alle 10"
+    language: it
+    expected:
+      - type: PERSON
+        text: "Giuseppe Verdi"
+
+  # Location (NER model)
+  - id: it_location
+    text: "L'evento si terrà a Milano il prossimo mese"
+    language: it
+    expected:
+      - type: LOCATION
+        text: "Milano"
+
+  # Mixed (real-world prompt)
+  - id: it_mixed
+    text: "Contatto di Marco Rossi: marco.rossi@example.it, telefono 333 9876543"
+    language: it
+    expected:
+      - type: PERSON
+        text: "Marco Rossi"
+      - type: EMAIL_ADDRESS
+        text: "marco.rossi@example.it"
+      - type: PHONE_NUMBER
+        text: "333 9876543"
+
+  # False Positive
+  - id: it_fp
+    text: "Il caffè italiano è il migliore del mondo"
+    language: it
+    expected: []
diff --git a/benchmarks/pii-accuracy/types.ts b/benchmarks/pii-accuracy/types.ts
new file mode 100644
index 0000000..a78bc5a
--- /dev/null
+++ b/benchmarks/pii-accuracy/types.ts
@@ -0,0 +1,53 @@
+import { z } from "zod";
+
+// Schema for expected PII entity in test data
+export const ExpectedEntitySchema = z.object({
+  type: z.string(), // Entity type label, e.g. PHONE_NUMBER
+  text: z.string(), // Text a detection of that type must contain (the runner compares case-insensitively)
+});
+
+// Schema for a single test case
+export const TestCaseSchema = z.object({
+  id: z.string(), // Identifier shown in reports; the runner sorts test cases by it
+  text: z.string(), // Input text sent to the analyzer
+  language: z.string(), // Language code passed to the analyzer, e.g. "de"
+  expected: z.array(ExpectedEntitySchema), // Entities that must be found; [] means any detection is a false positive
+  description: z.string().optional(), // Optional free-form note
+});
+
+export type ExpectedEntity = z.infer<typeof ExpectedEntitySchema>;
+export type TestCase = z.infer<typeof TestCaseSchema>;
+
+// Result of running a single test case
+export interface TestResult {
+  id: string; // Copied from the test case
+  text: string; // Copied from the test case
+  language: string; // Copied from the test case
+  passed: boolean; // True when there are no false negatives and no false positives
+  expected: ExpectedEntity[]; // Expected entities of the test case
+  detected: DetectedEntity[]; // Everything the analyzer returned for the text
+  falseNegatives: ExpectedEntity[]; // Expected but not detected
+  falsePositives: DetectedEntity[]; // Detected but not expected
+  truePositives: ExpectedEntity[]; // Correctly detected
+}
+
+export interface DetectedEntity { // One entity reported by the analyzer
+  type: string; // Entity type reported by the analyzer
+  text: string; // Substring of the input between start and end
+  start: number; // Start offset into the input text (inclusive)
+  end: number; // End offset into the input text (exclusive)
+  score: number; // Score reported by the analyzer
+}
+
+// Aggregated metrics
+export interface AccuracyMetrics {
+  total: number; // Number of test results aggregated
+  passed: number; // Results whose `passed` flag is true
+  failed: number; // Results whose `passed` flag is false
+  precision: number; // TP / (TP + FP); the runner uses 1 when TP + FP is 0
+  recall: number; // TP / (TP + FN); the runner uses 1 when TP + FN is 0
+  f1: number; // 2 * (P * R) / (P + R); the runner uses 0 when P + R is 0
+  truePositives: number; // Summed over all aggregated results
+  falsePositives: number; // Summed over all aggregated results
+  falseNegatives: number; // Summed over all aggregated results
+}
diff --git a/package.json b/package.json
index 4921794..ab316a5 100644
--- a/package.json
+++ b/package.json
@@ -12,7 +12,8 @@
     "typecheck": "tsc --noEmit",
     "lint": "biome lint src",
     "format": "biome format src --write",
-    "check": "biome check src"
+    "check": "biome check src",
+    "benchmark:accuracy": "bun run benchmarks/pii-accuracy/run.ts"
   },
   "dependencies": {
     "@hono/zod-validator": "^0.7.6",
diff --git a/presidio/languages.yaml b/presidio/languages.yaml
index 266aa36..a241e49 100644
--- a/presidio/languages.yaml
+++ b/presidio/languages.yaml
@@ -8,216 +8,151 @@
 
 spacy_version: "3.8.0"
 
+# Phone context words per language (5-7 words each)
+# Covers: phone/telephone, number, mobile, call
+# Based on research of common usage and regional variants
+
 languages:
   # Catalan
   ca:
     name: Catalan
     model: ca_core_news_md
+    phone_context: [telèfon, número, mòbil, trucada, trucar]
 
   # Chinese
   zh:
     name: Chinese
     model: zh_core_web_md
+    phone_context: [çµè¯, ææº, å·ç , æçµè¯, çµè¯å·ç , ææºå·ç ]
 
   # Croatian
   hr:
     name: Croatian
     model: hr_core_news_md
+    phone_context: [telefon, broj, mobitel, poziv, nazovi, zvati]
 
   # Danish
   da:
     name: Danish
     model: da_core_news_md
+    phone_context: [telefon, nummer, mobil, mobiltelefon, opkald, ringe]
 
   # Dutch
   nl:
     name: Dutch
     model: nl_core_news_md
-    phone_context:
-      - telefoon
-      - telefoonnummer
-      - mobiel
-      - bellen
-      - fax
-
-  # English
+    phone_context: [telefoon, nummer, mobiel, mobieltje, GSM, bellen]
+
+  # English (Presidio defaults)
   en:
     name: English
     model: en_core_web_lg
-    phone_context:
-      - phone
-      - telephone
-      - cell
-      - mobile
-      - call
-      - fax
+    phone_context: [phone, number, telephone, cell, cellphone, mobile, call]
 
   # Finnish
   fi:
     name: Finnish
     model: fi_core_news_md
+    phone_context: [puhelin, numero, kännykkä, matkapuhelin, soittaa, puhelinnumero]
 
   # French
   fr:
     name: French
     model: fr_core_news_md
-    phone_context:
-      - tÃ©lÃ©phone
-      - portable
-      - mobile
-      - numÃ©ro
-      - appeler
-      - fax
+    phone_context: [téléphone, numéro, portable, mobile, appeler, tél]
 
   # German
   de:
     name: German
-    model: de_core_news_md
-    phone_context:
-      - telefon
-      - telefonnummer
-      - handy
-      - mobil
-      - mobilnummer
-      - fax
-      - anrufen
+    model: de_core_news_lg
+    phone_context: [telefon, nummer, handy, mobiltelefon, anruf, rufnummer]
 
   # Greek
   el:
     name: Greek
     model: el_core_news_md
-    phone_context:
-      - ÏÎ·Î»Î­ÏÏÎ½Î¿
-      - ÎºÎ¹Î½Î·ÏÏ
-      - ÏÎ±Î¾
+    phone_context: [ÏÎ·Î»Î­ÏÏÎ½Î¿, Î±ÏÎ¹Î¸Î¼ÏÏ, ÎºÎ¹Î½Î·ÏÏ, ÎºÎ»Î®ÏÎ·, ÏÎ·Î»ÎµÏÏÎ½Ï, ÎºÎ±Î»Ï]
 
   # Italian
   it:
     name: Italian
     model: it_core_news_md
-    phone_context:
-      - telefono
-      - cellulare
-      - mobile
-      - numero
-      - chiamare
-      - fax
+    phone_context: [telefono, numero, cellulare, telefonino, chiamare, chiamata]
 
   # Japanese
   ja:
     name: Japanese
     model: ja_core_news_md
-    phone_context:
-      - é»è©±
-      - æºå¸¯
-      - ã¢ãã¤ã«
-      - ãã¡ãã¯ã¹
+    phone_context: [é»è©±, æºå¸¯, çªå·, ã¹ãã, ã±ã¼ã¿ã¤, é»è©±çªå·]
 
   # Korean
   ko:
     name: Korean
     model: ko_core_news_md
-    phone_context:
-      - ì í
-      - í´ëí°
-      - ëª¨ë°ì¼
-      - í©ì¤
+    phone_context: [ì í, í´ëí°, í¸ëí°, ë²í¸, ì íë²í¸, íµí]
 
   # Lithuanian
   lt:
     name: Lithuanian
     model: lt_core_news_md
+    phone_context: [telefonas, numeris, mobilusis, skambutis, skambinti]
 
   # Macedonian
   mk:
     name: Macedonian
     model: mk_core_news_md
+    phone_context: [ÑÐµÐ»ÐµÑÐ¾Ð½, Ð±ÑÐ¾Ñ, Ð¼Ð¾Ð±Ð¸Ð»ÐµÐ½, Ð¿Ð¾Ð²Ð¸Ðº, Ð·Ð²Ð¾Ð½Ð¸]
 
   # Norwegian Bokmål
   nb:
     name: Norwegian
     model: nb_core_news_md
-    phone_context:
-      - telefon
-      - mobil
-      - ringe
-      - faks
+    phone_context: [telefon, nummer, mobil, mobiltelefon, samtale, ringe]
 
   # Polish
   pl:
     name: Polish
     model: pl_core_news_md
-    phone_context:
-      - telefon
-      - komÃ³rka
-      - dzwoniÄ
-      - faks
+    phone_context: [telefon, numer, komórka, komórkowy, dzwoń, zadzwoń]
 
   # Portuguese
   pt:
     name: Portuguese
     model: pt_core_news_md
-    phone_context:
-      - telefone
-      - celular
-      - mÃ³vel
-      - ligar
-      - fax
+    phone_context: [telefone, número, celular, telemóvel, ligar, telefonar]
 
   # Romanian
   ro:
     name: Romanian
     model: ro_core_news_md
-    phone_context:
-      - telefon
-      - mobil
-      - apel
-      - fax
+    phone_context: [telefon, număr, mobil, apel, suna]
 
   # Russian
   ru:
     name: Russian
     model: ru_core_news_md
-    phone_context:
-      - ÑÐµÐ»ÐµÑÐ¾Ð½
-      - Ð¼Ð¾Ð±Ð¸Ð»ÑÐ½ÑÐ¹
-      - Ð·Ð²Ð¾Ð½Ð¸ÑÑ
-      - ÑÐ°ÐºÑ
+    phone_context: [ÑÐµÐ»ÐµÑÐ¾Ð½, Ð½Ð¾Ð¼ÐµÑ, Ð¼Ð¾Ð±Ð¸Ð»ÑÐ½Ð¸Ðº, Ð¼Ð¾Ð±Ð¸Ð»Ð°, ÑÐ¾ÑÐ¾Ð²ÑÐ¹, Ð·Ð²Ð¾Ð½Ð¾Ðº]
 
   # Slovenian
   sl:
     name: Slovenian
     model: sl_core_news_md
+    phone_context: [telefon, številka, mobilnik, mobilec, klic, pokliči]
 
   # Spanish
   es:
     name: Spanish
     model: es_core_news_md
-    phone_context:
-      - telÃ©fono
-      - mÃ³vil
-      - celular
-      - nÃºmero
-      - llamar
-      - fax
+    phone_context: [teléfono, número, móvil, celular, llamar, llamada]
 
   # Swedish
   sv:
     name: Swedish
     model: sv_core_news_md
-    phone_context:
-      - telefon
-      - mobil
-      - ringa
-      - fax
+    phone_context: [telefon, nummer, mobil, mobiltelefon, samtal, ringa]
 
   # Ukrainian
   uk:
     name: Ukrainian
     model: uk_core_news_md
-    phone_context:
-      - ÑÐµÐ»ÐµÑÐ¾Ð½
-      - Ð¼Ð¾Ð±ÑÐ»ÑÐ½Ð¸Ð¹
-      - Ð´Ð·Ð²Ð¾Ð½Ð¸ÑÐ¸
-      - ÑÐ°ÐºÑ
-
+    phone_context: [ÑÐµÐ»ÐµÑÐ¾Ð½, Ð½Ð¾Ð¼ÐµÑ, Ð¼Ð¾Ð±ÑÐ»ÑÐ½Ð¸Ð¹, Ð¼Ð¾Ð±ÑÐ»ÐºÐ°, Ð´Ð·Ð²ÑÐ½Ð¾Ðº, Ð´Ð·Ð²Ð¾Ð½Ð¸]