From: Stefan Gasser Date: Mon, 9 Feb 2026 08:05:13 +0000 (+0100) Subject: Fix missing Presidio recognizers for URL, US_SSN, CRYPTO, etc. (#69) X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=5871fa526a7a63c4c91af2d598736ef0cb6e9236;p=sgasser-llm-shield.git Fix missing Presidio recognizers for URL, US_SSN, CRYPTO, etc. (#69) The config generator only included 6 recognizers, missing standard ones like UrlRecognizer, UsSsnRecognizer, CryptoRecognizer. This caused detection failures when users enabled these entity types. Changes: - Add GLOBAL_RECOGNIZERS for pattern-based detection (7 recognizers) - Add LANGUAGE_RECOGNIZERS for language-specific detection - Only load language-specific recognizers when that language is configured - EN: US + UK recognizers (8) - ES: Spanish NIF/NIE (2) - IT: Italian documents (5) - PL: Polish PESEL (1) - KO: Korean RRN (1) Fixes #67 --- diff --git a/docker/presidio/generate-configs.py b/docker/presidio/generate-configs.py index 2444c5f..6a12998 100644 --- a/docker/presidio/generate-configs.py +++ b/docker/presidio/generate-configs.py @@ -101,10 +101,54 @@ def generate_analyzer_config(languages: list[str]) -> dict: return {"supported_languages": languages, "default_score_threshold": 0} +# Global recognizers - pattern-based, work for any language +GLOBAL_RECOGNIZERS = [ + "CreditCardRecognizer", + "CryptoRecognizer", + "DateRecognizer", + "EmailRecognizer", + "IbanRecognizer", + "IpRecognizer", + "UrlRecognizer", +] + +# Language-specific recognizers - only loaded when that language is configured +LANGUAGE_RECOGNIZERS = { + "en": [ + # US + "UsSsnRecognizer", + "UsPassportRecognizer", + "UsItinRecognizer", + "UsBankRecognizer", + "UsLicenseRecognizer", + "MedicalLicenseRecognizer", + # UK + "UkNinoRecognizer", + "NhsRecognizer", + ], + "es": [ + "EsNifRecognizer", + "EsNieRecognizer", + ], + "it": [ + "ItDriverLicenseRecognizer", + "ItFiscalCodeRecognizer", + "ItVatCodeRecognizer", + "ItIdentityCardRecognizer", + "ItPassportRecognizer", + ], + "pl": [ + "PlPeselRecognizer", + ], + "ko": [ + "KrRrnRecognizer", + ], +} + + def generate_recognizers_config(languages: list[str], registry: dict) -> dict: """Generate recognizers-config.yaml content.""" - # Build language entries for each recognizer type - spacy_langs = [{"language": lang} for lang in languages] + all_langs = [{"language": lang} for lang in languages] # Phone recognizer needs context words per language phone_langs = [] @@ -115,41 +159,42 @@ def generate_recognizers_config(languages: list[str], registry: dict) -> dict: entry["context"] = lang_config["phone_context"] phone_langs.append(entry) + recognizers = [ + { + "name": "SpacyRecognizer", + "supported_languages": all_langs, + "type": "predefined", + }, + { + "name": "PhoneRecognizer", + "supported_languages": phone_langs, + "type": "predefined", + }, + ] + + # Add global recognizers for all configured languages + for name in GLOBAL_RECOGNIZERS: + recognizers.append({ + "name": name, + "supported_languages": all_langs, + "type": "predefined", + }) + + # Add language-specific recognizers only if that language is configured + for lang in languages: + if lang in LANGUAGE_RECOGNIZERS: + lang_entry = [{"language": lang}] + for name in LANGUAGE_RECOGNIZERS[lang]: + recognizers.append({ + "name": name, + "supported_languages": lang_entry, + "type": "predefined", + }) + return { "supported_languages": languages, "global_regex_flags": 26, - "recognizers": [ - { - "name": "SpacyRecognizer", - "supported_languages": spacy_langs, - "type": "predefined", - }, - { - "name": "EmailRecognizer", - "supported_languages": spacy_langs, - "type": "predefined", - }, - { - "name": "PhoneRecognizer", - "supported_languages": phone_langs, - "type": "predefined", - }, - { - "name": "CreditCardRecognizer", - "supported_languages": spacy_langs, - "type": "predefined", - }, - { - "name": "IbanRecognizer", - "supported_languages": spacy_langs, - "type": "predefined", - }, - { - "name": "IpRecognizer", - "supported_languages": spacy_langs, - "type": "predefined", - }, - ], + "recognizers": recognizers, }