From: Stefan Gasser Date: Thu, 8 Jan 2026 16:15:59 +0000 (+0100) Subject: Add PII accuracy benchmark with multi-language phone context (#1) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=189e4361fc6b3bf30965de6ff23ce249a18215bf;p=sgasser-llm-shield.git Add PII accuracy benchmark with multi-language phone context (#1) - Add benchmark framework with precision/recall/F1 metrics - Add 30 test cases across 5 languages (DE, EN, ES, FR, IT) - Add phone_context words for all 24 supported languages - Each language has 5-7 native words for: phone, number, mobile, call Test with: bun run benchmark:accuracy --- diff --git a/README.md b/README.md index 1bd2793..619234c 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,26 @@ bun test # Run tests bun run check # Lint & format ``` +## Benchmarks + +PII detection accuracy benchmark with test cases for multiple languages: + +```bash +# Start Presidio with all benchmark languages +LANGUAGES=en,de,fr,it,es docker compose up presidio-analyzer -d + +# Run all tests +bun run benchmarks/pii-accuracy/run.ts + +# Run specific languages only +bun run benchmarks/pii-accuracy/run.ts --languages de,en + +# Verbose output +bun run benchmarks/pii-accuracy/run.ts --verbose +``` + +Test data in `benchmarks/pii-accuracy/test-data/` (one file per language). + ## License [Apache 2.0](LICENSE) diff --git a/benchmarks/pii-accuracy/run.ts b/benchmarks/pii-accuracy/run.ts new file mode 100644 index 0000000..b7ab343 --- /dev/null +++ b/benchmarks/pii-accuracy/run.ts @@ -0,0 +1,412 @@ +#!/usr/bin/env bun +/** + * PII Detection Accuracy Benchmark + * + * Measures precision, recall, and F1 score of the PII detection system. 
+ * + * Usage: + * bun run benchmarks/pii-accuracy/run.ts + * bun run benchmarks/pii-accuracy/run.ts --threshold 0.5 + * bun run benchmarks/pii-accuracy/run.ts --languages de,en + * bun run benchmarks/pii-accuracy/run.ts --verbose + */ + +import { parseArgs } from "util"; +import { Glob } from "bun"; +import { parse as parseYaml } from "yaml"; +import { + TestCaseSchema, + type AccuracyMetrics, + type DetectedEntity, + type ExpectedEntity, + type TestCase, + type TestResult, +} from "./types"; + +// Configuration +const DEFAULT_THRESHOLD = 0.7; +const PRESIDIO_URL = process.env.PRESIDIO_URL || "http://localhost:5002"; +const TEST_DATA_DIR = import.meta.dir + "/test-data"; + +// Parse command line arguments +const { values: args } = parseArgs({ + args: Bun.argv.slice(2), + options: { + threshold: { type: "string", short: "t" }, + languages: { type: "string", short: "l" }, + verbose: { type: "boolean", short: "v", default: false }, + help: { type: "boolean", short: "h", default: false }, + }, +}); + +if (args.help) { + console.log(` +PII Detection Accuracy Benchmark + +Usage: + bun run benchmarks/pii-accuracy/run.ts [options] + +Options: + -t, --threshold Score threshold (default: ${DEFAULT_THRESHOLD}) + -l, --languages Comma-separated languages to test (e.g., de,en) + -v, --verbose Show detailed results for each test case + -h, --help Show this help message + +Examples: + bun run benchmarks/pii-accuracy/run.ts + bun run benchmarks/pii-accuracy/run.ts --threshold 0.5 + bun run benchmarks/pii-accuracy/run.ts --languages de,en + bun run benchmarks/pii-accuracy/run.ts --verbose +`); + process.exit(0); +} + +const threshold = args.threshold ? Number.parseFloat(args.threshold) : DEFAULT_THRESHOLD; +const verbose = args.verbose ?? 
false; +const languageFilter = args.languages?.split(",").map((l) => l.trim().toLowerCase()); + +// Entity types we test for +const ENTITY_TYPES = [ + "PERSON", + "EMAIL_ADDRESS", + "PHONE_NUMBER", + "CREDIT_CARD", + "IBAN_CODE", + "IP_ADDRESS", + "LOCATION", +]; + +/** + * Load test cases from YAML files in test-data directory + */ +async function loadTestCases(): Promise<TestCase[]> { + const testCases: TestCase[] = []; + const glob = new Glob("*.yaml"); + + for await (const file of glob.scan(TEST_DATA_DIR)) { + const filePath = `${TEST_DATA_DIR}/${file}`; + const content = await Bun.file(filePath).text(); + const data = parseYaml(content) as { test_cases: unknown[] }; + + if (!data.test_cases || !Array.isArray(data.test_cases)) { + console.warn(`Warning: ${file} has no test_cases array`); + continue; + } + + for (const testCase of data.test_cases) { + const parsed = TestCaseSchema.safeParse(testCase); + if (parsed.success) { + // Filter by language if specified + if (!languageFilter || languageFilter.includes(parsed.data.language)) { + testCases.push(parsed.data); + } + } else { + console.warn(`Warning: Invalid test case in ${file}:`, parsed.error.format()); + } + } + } + + // Sort by ID for deterministic output + return testCases.sort((a, b) => a.id.localeCompare(b.id)); +} + +/** + * Call Presidio analyzer API + */ +async function detectPII(text: string, language: string): Promise<DetectedEntity[]> { + if (!text) return []; + + const response = await fetch(`${PRESIDIO_URL}/analyze`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + text, + language, + entities: ENTITY_TYPES, + score_threshold: threshold, + }), + }); + + if (!response.ok) { + throw new Error(`Presidio error: ${response.status} ${await response.text()}`); + } + + const entities = (await response.json()) as Array<{ + entity_type: string; + start: number; + end: number; + score: number; + }>; + + return entities.map((e) => ({ + type: e.entity_type, + text: text.slice(e.start, 
e.end), + start: e.start, + end: e.end, + score: e.score, + })); +} + +/** + * Check if Presidio is available + */ +async function checkPresidio(): Promise<boolean> { + try { + const response = await fetch(`${PRESIDIO_URL}/health`); + return response.ok; + } catch { + return false; + } +} + +/** + * Run a single test case and compare results + */ +async function runTestCase(testCase: TestCase): Promise<TestResult> { + const detected = await detectPII(testCase.text, testCase.language); + + // Match expected entities with detected ones + const truePositives: ExpectedEntity[] = []; + const falseNegatives: ExpectedEntity[] = []; + const matchedDetected = new Set<number>(); + + for (const expected of testCase.expected) { + // Find a matching detected entity + const matchIndex = detected.findIndex( + (d, i) => + !matchedDetected.has(i) && + d.type === expected.type && + normalizeText(d.text).includes(normalizeText(expected.text)), + ); + + if (matchIndex !== -1) { + truePositives.push(expected); + matchedDetected.add(matchIndex); + } else { + falseNegatives.push(expected); + } + } + + // Remaining detected entities are false positives + const falsePositives = detected.filter((_, i) => !matchedDetected.has(i)); + + const passed = falseNegatives.length === 0 && falsePositives.length === 0; + + return { + id: testCase.id, + text: testCase.text, + language: testCase.language, + passed, + expected: testCase.expected, + detected, + falseNegatives, + falsePositives, + truePositives, + }; +} + +/** + * Normalize text for comparison (lowercase, trim) + */ +function normalizeText(text: string): string { + return text.toLowerCase().trim(); +} + +/** + * Calculate accuracy metrics from results + */ +function calculateMetrics(results: TestResult[]): AccuracyMetrics { + let tp = 0; + let fp = 0; + let fn = 0; + + for (const result of results) { + tp += result.truePositives.length; + fp += result.falsePositives.length; + fn += result.falseNegatives.length; + } + + const precision = tp + fp > 0 ? 
tp / (tp + fp) : 1; + const recall = tp + fn > 0 ? tp / (tp + fn) : 1; + const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0; + + return { + total: results.length, + passed: results.filter((r) => r.passed).length, + failed: results.filter((r) => !r.passed).length, + precision, + recall, + f1, + truePositives: tp, + falsePositives: fp, + falseNegatives: fn, + }; +} + +/** + * Format percentage for display + */ +function formatPercent(value: number): string { + return `${(value * 100).toFixed(1)}%`; +} + +/** + * Print metrics in a formatted way + */ +function printMetrics(label: string, metrics: AccuracyMetrics): void { + const status = metrics.failed === 0 ? "✓" : "⚠"; + console.log( + ` ${label.padEnd(20)} P=${formatPercent(metrics.precision).padStart(6)} ` + + `R=${formatPercent(metrics.recall).padStart(6)} ` + + `F1=${formatPercent(metrics.f1).padStart(6)} ${status}`, + ); +} + +/** + * Main benchmark execution + */ +async function main(): Promise<void> { + console.log("\n╔════════════════════════════════════════════════════════════╗"); + console.log("║ PII Detection Accuracy Benchmark ║"); + console.log("╚════════════════════════════════════════════════════════════╝\n"); + + // Check Presidio availability + console.log(`Presidio URL: ${PRESIDIO_URL}`); + console.log(`Threshold: ${threshold}`); + if (languageFilter) { + console.log(`Languages: ${languageFilter.join(", ")}`); + } + + if (!(await checkPresidio())) { + console.error("\n✗ Presidio is not available. 
Start it with:"); + console.error(" docker compose up presidio-analyzer -d\n"); + process.exit(1); + } + console.log("Presidio: ✓ Connected\n"); + + // Load test cases from directory + const testCases = await loadTestCases(); + + if (testCases.length === 0) { + console.error("No test cases found in", TEST_DATA_DIR); + process.exit(1); + } + + console.log(`Running ${testCases.length} test cases...\n`); + + // Run all test cases + const results: TestResult[] = []; + for (const testCase of testCases) { + try { + const result = await runTestCase(testCase); + results.push(result); + + if (verbose) { + const icon = result.passed ? "✓" : "✗"; + console.log(`${icon} ${result.id}`); + if (!result.passed) { + if (result.falseNegatives.length > 0) { + console.log(` Missed: ${result.falseNegatives.map((e) => `${e.type}:"${e.text}"`).join(", ")}`); + } + if (result.falsePositives.length > 0) { + console.log( + ` Wrong: ${result.falsePositives.map((e) => `${e.type}:"${e.text}" (${e.score.toFixed(2)})`).join(", ")}`, + ); + } + } + } + } catch (error) { + console.error(`Error in test ${testCase.id}:`, error); + } + } + + // Calculate overall metrics + const overall = calculateMetrics(results); + + // Calculate metrics by entity type + const byEntityType: Record<string, AccuracyMetrics> = {}; + for (const entityType of ENTITY_TYPES) { + const filtered = results.map((r) => ({ + ...r, + expected: r.expected.filter((e) => e.type === entityType), + truePositives: r.truePositives.filter((e) => e.type === entityType), + falseNegatives: r.falseNegatives.filter((e) => e.type === entityType), + falsePositives: r.falsePositives.filter((e) => e.type === entityType), + })); + // Only include if there are test cases for this type + const hasTestCases = filtered.some((r) => r.expected.length > 0 || r.falsePositives.length > 0); + if (hasTestCases) { + byEntityType[entityType] = calculateMetrics(filtered); + } + } + + // Calculate metrics by language + const languages = [...new Set(results.map((r) => r.language))]; + 
const byLanguage: Record<string, AccuracyMetrics> = {}; + for (const lang of languages) { + byLanguage[lang] = calculateMetrics(results.filter((r) => r.language === lang)); + } + + // Print report + console.log("\n────────────────────────────────────────────────────────────"); + console.log(" RESULTS"); + console.log("────────────────────────────────────────────────────────────\n"); + + console.log(`Overall: ${overall.passed}/${overall.total} passed\n`); + + console.log("Metrics (P=Precision, R=Recall, F1=F1-Score):\n"); + + console.log( + ` ${"OVERALL".padEnd(20)} P=${formatPercent(overall.precision).padStart(6)} ` + + `R=${formatPercent(overall.recall).padStart(6)} ` + + `F1=${formatPercent(overall.f1).padStart(6)}\n`, + ); + + console.log("By Entity Type:"); + for (const [type, metrics] of Object.entries(byEntityType)) { + printMetrics(type, metrics); + } + + console.log("\nBy Language:"); + for (const [lang, metrics] of Object.entries(byLanguage)) { + printMetrics(lang.toUpperCase(), metrics); + } + + // Print false negatives (missed PII) + const allFalseNegatives = results.flatMap((r) => + r.falseNegatives.map((fn) => ({ testId: r.id, text: r.text, entity: fn })), + ); + if (allFalseNegatives.length > 0) { + console.log("\n────────────────────────────────────────────────────────────"); + console.log(`False Negatives (Missed PII): ${allFalseNegatives.length}`); + console.log("────────────────────────────────────────────────────────────"); + for (const { testId, entity } of allFalseNegatives) { + console.log(` ✗ "${entity.text}" (${entity.type}) in ${testId}`); + } + } + + // Print false positives (wrong detections) + const allFalsePositives = results.flatMap((r) => + r.falsePositives.map((fp) => ({ testId: r.id, text: r.text, entity: fp })), + ); + if (allFalsePositives.length > 0) { + console.log("\n────────────────────────────────────────────────────────────"); + console.log(`False Positives (Wrong Detections): ${allFalsePositives.length}`); + 
console.log("────────────────────────────────────────────────────────────"); + for (const { testId, entity } of allFalsePositives) { + console.log(` ✗ "${entity.text}" (${entity.type}, score=${entity.score.toFixed(2)}) in ${testId}`); + } + } + + console.log("\n────────────────────────────────────────────────────────────\n"); + + // Exit with error code if tests failed + if (overall.failed > 0) { + process.exit(1); + } +} + +main().catch((error) => { + console.error("Benchmark failed:", error); + process.exit(1); +}); diff --git a/benchmarks/pii-accuracy/test-data/de.yaml b/benchmarks/pii-accuracy/test-data/de.yaml new file mode 100644 index 0000000..530fd04 --- /dev/null +++ b/benchmarks/pii-accuracy/test-data/de.yaml @@ -0,0 +1,42 @@ +test_cases: + # Phone (needs German context words) + - id: de_phone + text: "Telefon: +49 171 1234567" + language: de + expected: + - type: PHONE_NUMBER + text: "+49 171 1234567" + + # Person (NER model) + - id: de_person + text: "Meeting mit Max Müller morgen um 10 Uhr" + language: de + expected: + - type: PERSON + text: "Max Müller" + + # Location (NER model) + - id: de_location + text: "Das Meeting findet in München statt" + language: de + expected: + - type: LOCATION + text: "München" + + # Mixed (real-world prompt) + - id: de_mixed + text: "Kontaktdaten von Hans Meier: hans.meier@example.com, Telefon 0171-1234567" + language: de + expected: + - type: PERSON + text: "Hans Meier" + - type: EMAIL_ADDRESS + text: "hans.meier@example.com" + - type: PHONE_NUMBER + text: "0171-1234567" + + # False Positive + - id: de_fp + text: "Das Wetter ist heute sehr schön" + language: de + expected: [] diff --git a/benchmarks/pii-accuracy/test-data/en.yaml b/benchmarks/pii-accuracy/test-data/en.yaml new file mode 100644 index 0000000..13bd6aa --- /dev/null +++ b/benchmarks/pii-accuracy/test-data/en.yaml @@ -0,0 +1,42 @@ +test_cases: + # Phone (needs English context words) + - id: en_phone + text: "Phone: (555) 123-4567" + language: en + expected: 
+ - type: PHONE_NUMBER + text: "(555) 123-4567" + + # Person (NER model) + - id: en_person + text: "Schedule a meeting with John Smith tomorrow" + language: en + expected: + - type: PERSON + text: "John Smith" + + # Location (NER model) + - id: en_location + text: "The conference is in New York next week" + language: en + expected: + - type: LOCATION + text: "New York" + + # Mixed (real-world prompt) + - id: en_mixed + text: "Customer John Doe (john@example.com), phone (555) 987-6543" + language: en + expected: + - type: PERSON + text: "John Doe" + - type: EMAIL_ADDRESS + text: "john@example.com" + - type: PHONE_NUMBER + text: "(555) 987-6543" + + # False Positive + - id: en_fp + text: "The weather is nice today" + language: en + expected: [] diff --git a/benchmarks/pii-accuracy/test-data/es.yaml b/benchmarks/pii-accuracy/test-data/es.yaml new file mode 100644 index 0000000..ac7b305 --- /dev/null +++ b/benchmarks/pii-accuracy/test-data/es.yaml @@ -0,0 +1,42 @@ +test_cases: + # Phone (needs Spanish context words) + - id: es_phone + text: "Teléfono: +34 612 345 678" + language: es + expected: + - type: PHONE_NUMBER + text: "+34 612 345 678" + + # Person (NER model) + - id: es_person + text: "Reunión con Ana Martínez mañana a las 10" + language: es + expected: + - type: PERSON + text: "Ana Martínez" + + # Location (NER model) + - id: es_location + text: "La reunión será en Madrid la próxima semana" + language: es + expected: + - type: LOCATION + text: "Madrid" + + # Mixed (real-world prompt) + - id: es_mixed + text: "Contacto de Carlos García: carlos.garcia@example.es, teléfono +34 698 765 432" + language: es + expected: + - type: PERSON + text: "Carlos García" + - type: EMAIL_ADDRESS + text: "carlos.garcia@example.es" + - type: PHONE_NUMBER + text: "+34 698 765 432" + + # False Positive + - id: es_fp + text: "El clima está muy agradable hoy" + language: es + expected: [] diff --git a/benchmarks/pii-accuracy/test-data/fr.yaml 
b/benchmarks/pii-accuracy/test-data/fr.yaml new file mode 100644 index 0000000..e2423c8 --- /dev/null +++ b/benchmarks/pii-accuracy/test-data/fr.yaml @@ -0,0 +1,42 @@ +test_cases: + # Phone (needs French context words) + - id: fr_phone + text: "Téléphone: 06 12 34 56 78" + language: fr + expected: + - type: PHONE_NUMBER + text: "06 12 34 56 78" + + # Person (NER model) + - id: fr_person + text: "J'ai une réunion avec Marie Dubois demain à 10h" + language: fr + expected: + - type: PERSON + text: "Marie Dubois" + + # Location (NER model) + - id: fr_location + text: "La conférence aura lieu à Lyon la semaine prochaine" + language: fr + expected: + - type: LOCATION + text: "Lyon" + + # Mixed (real-world prompt) + - id: fr_mixed + text: "Contact de Jean Dupont: jean.dupont@example.fr, téléphone 06 98 76 54 32" + language: fr + expected: + - type: PERSON + text: "Jean Dupont" + - type: EMAIL_ADDRESS + text: "jean.dupont@example.fr" + - type: PHONE_NUMBER + text: "06 98 76 54 32" + + # False Positive + - id: fr_fp + text: "Le temps est magnifique aujourd'hui" + language: fr + expected: [] diff --git a/benchmarks/pii-accuracy/test-data/global.yaml b/benchmarks/pii-accuracy/test-data/global.yaml new file mode 100644 index 0000000..fa8e8a4 --- /dev/null +++ b/benchmarks/pii-accuracy/test-data/global.yaml @@ -0,0 +1,41 @@ +test_cases: + # Pattern-based recognizers - language independent + # Tested once since regex/checksum works the same for all languages + + # Email + - id: global_email + text: "Contact me at john.doe@company.com" + language: en + expected: + - type: EMAIL_ADDRESS + text: "john.doe@company.com" + + # IBAN + - id: global_iban + text: "Transfer to IBAN DE89370400440532013000" + language: en + expected: + - type: IBAN_CODE + text: "DE89370400440532013000" + + # Credit Card + - id: global_credit_card + text: "Card number: 4111 1111 1111 1111" + language: en + expected: + - type: CREDIT_CARD + text: "4111 1111 1111 1111" + + # IP Address + - id: global_ip + text: 
"Server IP is 8.8.8.8" + language: en + expected: + - type: IP_ADDRESS + text: "8.8.8.8" + + # Empty text + - id: global_empty + text: "" + language: en + expected: [] diff --git a/benchmarks/pii-accuracy/test-data/it.yaml b/benchmarks/pii-accuracy/test-data/it.yaml new file mode 100644 index 0000000..cc6a37f --- /dev/null +++ b/benchmarks/pii-accuracy/test-data/it.yaml @@ -0,0 +1,42 @@ +test_cases: + # Phone (needs Italian context words) + - id: it_phone + text: "Telefono: 333 1234567" + language: it + expected: + - type: PHONE_NUMBER + text: "333 1234567" + + # Person (NER model) + - id: it_person + text: "Riunione con Giuseppe Verdi domani alle 10" + language: it + expected: + - type: PERSON + text: "Giuseppe Verdi" + + # Location (NER model) + - id: it_location + text: "L'evento si terrà a Milano il prossimo mese" + language: it + expected: + - type: LOCATION + text: "Milano" + + # Mixed (real-world prompt) + - id: it_mixed + text: "Contatto di Marco Rossi: marco.rossi@example.it, telefono 333 9876543" + language: it + expected: + - type: PERSON + text: "Marco Rossi" + - type: EMAIL_ADDRESS + text: "marco.rossi@example.it" + - type: PHONE_NUMBER + text: "333 9876543" + + # False Positive + - id: it_fp + text: "Il caffè italiano è il migliore del mondo" + language: it + expected: [] diff --git a/benchmarks/pii-accuracy/types.ts b/benchmarks/pii-accuracy/types.ts new file mode 100644 index 0000000..a78bc5a --- /dev/null +++ b/benchmarks/pii-accuracy/types.ts @@ -0,0 +1,53 @@ +import { z } from "zod"; + +// Schema for expected PII entity in test data +export const ExpectedEntitySchema = z.object({ + type: z.string(), + text: z.string(), +}); + +// Schema for a single test case +export const TestCaseSchema = z.object({ + id: z.string(), + text: z.string(), + language: z.string(), + expected: z.array(ExpectedEntitySchema), + description: z.string().optional(), +}); + +export type ExpectedEntity = z.infer<typeof ExpectedEntitySchema>; +export type TestCase = z.infer<typeof TestCaseSchema>; + +// Result of running a 
single test case +export interface TestResult { + id: string; + text: string; + language: string; + passed: boolean; + expected: ExpectedEntity[]; + detected: DetectedEntity[]; + falseNegatives: ExpectedEntity[]; // Expected but not detected + falsePositives: DetectedEntity[]; // Detected but not expected + truePositives: ExpectedEntity[]; // Correctly detected +} + +export interface DetectedEntity { + type: string; + text: string; + start: number; + end: number; + score: number; +} + +// Aggregated metrics +export interface AccuracyMetrics { + total: number; + passed: number; + failed: number; + precision: number; // TP / (TP + FP) + recall: number; // TP / (TP + FN) + f1: number; // 2 * (P * R) / (P + R) + truePositives: number; + falsePositives: number; + falseNegatives: number; +} diff --git a/package.json b/package.json index 4921794..ab316a5 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,8 @@ "typecheck": "tsc --noEmit", "lint": "biome lint src", "format": "biome format src --write", - "check": "biome check src" + "check": "biome check src", + "benchmark:accuracy": "bun run benchmarks/pii-accuracy/run.ts" }, "dependencies": { "@hono/zod-validator": "^0.7.6", diff --git a/presidio/languages.yaml b/presidio/languages.yaml index 266aa36..a241e49 100644 --- a/presidio/languages.yaml +++ b/presidio/languages.yaml @@ -8,216 +8,151 @@ spacy_version: "3.8.0" +# Phone context words per language (5-7 words each) +# Covers: phone/telephone, number, mobile, call +# Based on research of common usage and regional variants + languages: # Catalan ca: name: Catalan model: ca_core_news_md + phone_context: [telèfon, número, mòbil, trucada, trucar] # Chinese zh: name: Chinese model: zh_core_web_md + phone_context: [电话, 手机, 号码, 打电话, 电话号码, 手机号码] # Croatian hr: name: Croatian model: hr_core_news_md + phone_context: [telefon, broj, mobitel, poziv, nazovi, zvati] # Danish da: name: Danish model: da_core_news_md + phone_context: [telefon, nummer, mobil, mobiltelefon, 
opkald, ringe] # Dutch nl: name: Dutch model: nl_core_news_md - phone_context: - - telefoon - - telefoonnummer - - mobiel - - bellen - - fax - - # English + phone_context: [telefoon, nummer, mobiel, mobieltje, GSM, bellen] + + # English (Presidio defaults) en: name: English model: en_core_web_lg - phone_context: - - phone - - telephone - - cell - - mobile - - call - - fax + phone_context: [phone, number, telephone, cell, cellphone, mobile, call] # Finnish fi: name: Finnish model: fi_core_news_md + phone_context: [puhelin, numero, kännykkä, matkapuhelin, soittaa, puhelinnumero] # French fr: name: French model: fr_core_news_md - phone_context: - - téléphone - - portable - - mobile - - numéro - - appeler - - fax + phone_context: [téléphone, numéro, portable, mobile, appeler, tél] # German de: name: German - model: de_core_news_md - phone_context: - - telefon - - telefonnummer - - handy - - mobil - - mobilnummer - - fax - - anrufen + model: de_core_news_lg + phone_context: [telefon, nummer, handy, mobiltelefon, anruf, rufnummer] # Greek el: name: Greek model: el_core_news_md - phone_context: - - τηλέφωνο - - κινητό - - φαξ + phone_context: [τηλέφωνο, αριθμός, κινητό, κλήση, τηλεφωνώ, καλώ] # Italian it: name: Italian model: it_core_news_md - phone_context: - - telefono - - cellulare - - mobile - - numero - - chiamare - - fax + phone_context: [telefono, numero, cellulare, telefonino, chiamare, chiamata] # Japanese ja: name: Japanese model: ja_core_news_md - phone_context: - - 電話 - - 携帯 - - モバイル - - ファックス + phone_context: [電話, 携帯, 番号, スマホ, ケータイ, 電話番号] # Korean ko: name: Korean model: ko_core_news_md - phone_context: - - 전화 - - 휴대폰 - - 모바일 - - 팩스 + phone_context: [전화, 휴대폰, 핸드폰, 번호, 전화번호, 통화] # Lithuanian lt: name: Lithuanian model: lt_core_news_md + phone_context: [telefonas, numeris, mobilusis, skambutis, skambinti] # Macedonian mk: name: Macedonian model: mk_core_news_md + phone_context: [телефон, број, мобилен, повик, звони] # Norwegian Bokmål nb: name: Norwegian 
model: nb_core_news_md - phone_context: - - telefon - - mobil - - ringe - - faks + phone_context: [telefon, nummer, mobil, mobiltelefon, samtale, ringe] # Polish pl: name: Polish model: pl_core_news_md - phone_context: - - telefon - - komórka - - dzwonić - - faks + phone_context: [telefon, numer, komórka, komórkowy, dzwoń, zadzwoń] # Portuguese pt: name: Portuguese model: pt_core_news_md - phone_context: - - telefone - - celular - - móvel - - ligar - - fax + phone_context: [telefone, número, celular, telemóvel, ligar, telefonar] # Romanian ro: name: Romanian model: ro_core_news_md - phone_context: - - telefon - - mobil - - apel - - fax + phone_context: [telefon, număr, mobil, apel, suna] # Russian ru: name: Russian model: ru_core_news_md - phone_context: - - телефон - - мобильный - - звонить - - факс + phone_context: [телефон, номер, мобильник, мобила, сотовый, звонок] # Slovenian sl: name: Slovenian model: sl_core_news_md + phone_context: [telefon, številka, mobilnik, mobilec, klic, pokliči] # Spanish es: name: Spanish model: es_core_news_md - phone_context: - - teléfono - - móvil - - celular - - número - - llamar - - fax + phone_context: [teléfono, número, móvil, celular, llamar, llamada] # Swedish sv: name: Swedish model: sv_core_news_md - phone_context: - - telefon - - mobil - - ringa - - fax + phone_context: [telefon, nummer, mobil, mobiltelefon, samtal, ringa] # Ukrainian uk: name: Ukrainian model: uk_core_news_md - phone_context: - - телефон - - мобільний - - дзвонити - - факс - + phone_context: [телефон, номер, мобільний, мобілка, дзвінок, дзвони]