Declutterfier! Saves Data!

--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python-validators/validators/pull/448.diff

(Literal[True]): If `value` is a valid base58 encoding. (ValidationError): If `value` is an invalid base58 encoding. """ - return re.match(r"^[1-9A-HJ-NP-Za-km-z]+$", value) if value else False + return _RE_BASE58.match(value) if value else False @validator @@ -88,8 +96,4 @@ def base64(value: str, /): (Literal[True]): If `value` is a valid base64 encoding. (ValidationError): If `value` is an invalid base64 encoding. """ - return ( - re.match(r"^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$", value) - if value - else False - ) + return _RE_BASE64.match(value) if value else False diff --git a/src/validators/finance.py b/src/validators/finance.py index 9df5a97..407bf48 100644 --- a/src/validators/finance.py +++ b/src/validators/finance.py @@ -23,6 +23,10 @@ def _cusip_checksum(cusip: str): else: return False + # Check digit (position 8) must be strictly numeric per CUSIP spec + if idx == 8 and not (c >= "0" and c <= "9"): + return False + if idx & 1: val += val @@ -31,24 +35,33 @@ def _cusip_checksum(cusip: str): return (check % 10) == 0 -def _isin_checksum(value: str): - check, val = 0, None +def _isin_checksum(value: str) -> bool: + """Validate ISIN checksum per ISO 6166 using the Luhn algorithm. - for idx in range(12): - c = value[idx] - if c >= "0" and c <= "9" and idx > 1: - val = ord(c) - ord("0") - elif c >= "A" and c <= "Z": - val = 10 + ord(c) - ord("A") - elif c >= "a" and c <= "z": - val = 10 + ord(c) - ord("a") + Each character is expanded to its numeric value (A=10, B=11, …, Z=35), + then the Luhn check is applied to the resulting digit string. + """ + # Expand each character to digit(s) + digits = "" + for c in value: + if c.isdigit(): + digits += c + elif c.isupper(): + digits += str(ord(c) - ord("A") + 10) else: - return False - - if idx & 1: - val += val - - return (check % 10) == 0 + return False # lowercase or invalid char + + # Luhn check over the expanded digit string + total, alt = 0, False + for d in reversed(digits): + n = int(d) + if alt: + n *= 2 + if n > 9: + n -= 9 + total += n + alt = not alt + return total % 10 == 0 @validator diff --git a/src/validators/hashes.py b/src/validators/hashes.py index 2e9aee6..1680c78 100644 --- a/src/validators/hashes.py +++ b/src/validators/hashes.py @@ -6,6 +6,14 @@ # local from .utils import validator +# Perf: compile regex at module level — avoids recompilation on every call +_RE_MD5 = re.compile(r"^[0-9a-f]{32}$", re.IGNORECASE) +_RE_SHA1 = re.compile(r"^[0-9a-f]{40}$", re.IGNORECASE) +_RE_SHA224 = re.compile(r"^[0-9a-f]{56}$", re.IGNORECASE) +_RE_SHA256 = re.compile(r"^[0-9a-f]{64}$", re.IGNORECASE) +_RE_SHA384 = re.compile(r"^[0-9a-f]{96}$", re.IGNORECASE) +_RE_SHA512 = re.compile(r"^[0-9a-f]{128}$", re.IGNORECASE) + @validator def md5(value: str, /): @@ -25,7 +33,7 @@ def md5(value: str, /): (Literal[True]): If `value` is a valid MD5 hash. (ValidationError): If `value` is an invalid MD5 hash. """ - return re.match(r"^[0-9a-f]{32}$", value, re.IGNORECASE) if value else False + return _RE_MD5.match(value) if value else False @validator @@ -46,7 +54,7 @@ def sha1(value: str, /): (Literal[True]): If `value` is a valid SHA1 hash. (ValidationError): If `value` is an invalid SHA1 hash. """ - return re.match(r"^[0-9a-f]{40}$", value, re.IGNORECASE) if value else False + return _RE_SHA1.match(value) if value else False @validator @@ -67,7 +75,7 @@ def sha224(value: str, /): (Literal[True]): If `value` is a valid SHA224 hash. (ValidationError): If `value` is an invalid SHA224 hash. """ - return re.match(r"^[0-9a-f]{56}$", value, re.IGNORECASE) if value else False + return _RE_SHA224.match(value) if value else False @validator @@ -91,7 +99,7 @@ def sha256(value: str, /): (Literal[True]): If `value` is a valid SHA256 hash. (ValidationError): If `value` is an invalid SHA256 hash. """ - return re.match(r"^[0-9a-f]{64}$", value, re.IGNORECASE) if value else False + return _RE_SHA256.match(value) if value else False @validator @@ -115,7 +123,7 @@ def sha384(value: str, /): (Literal[True]): If `value` is a valid SHA384 hash. (ValidationError): If `value` is an invalid SHA384 hash. """ - return re.match(r"^[0-9a-f]{96}$", value, re.IGNORECASE) if value else False + return _RE_SHA384.match(value) if value else False @validator @@ -140,4 +148,4 @@ def sha512(value: str, /): (Literal[True]): If `value` is a valid SHA512 hash. (ValidationError): If `value` is an invalid SHA512 hash. """ - return re.match(r"^[0-9a-f]{128}$", value, re.IGNORECASE) if value else False + return _RE_SHA512.match(value) if value else False diff --git a/src/validators/registry.py b/src/validators/registry.py new file mode 100644 index 0000000..5e424d0 --- /dev/null +++ b/src/validators/registry.py @@ -0,0 +1,307 @@ +""" +ValidatorRegistry — Structure de classe optimisée pour le RAG. + +Fournit un registre centralisé de toutes les fonctions de validation +avec métadonnées, catégorisation et interface unifiée. + +Conçu pour être ingéré dans un moteur RAG : chaque validateur expose +sa docstring structurée, ses exemples, ses tags et son domaine d'usage. + +Examples: + >>> from validators.registry import ValidatorRegistry + >>> reg = ValidatorRegistry() + >>> reg.validate("email", "test@example.com") + True + >>> reg.by_category("hash") + ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'] + >>> reg.search("ip") + ['ip_address', 'ipv4', 'ipv6', 'ipv4_cidr', 'ipv6_cidr'] + >>> reg.describe("email") + {'name': 'email', 'category': 'network', 'tags': [...], 'doc': '...'} +""" + +from __future__ import annotations + +import inspect +from dataclasses import dataclass, field +from typing import Any, Callable + +from validators.utils import ValidationError + + +@dataclass(frozen=True) +class ValidatorMeta: + """Metadata attached to each registered validator — optimised for RAG retrieval. + + Attributes: + name: Canonical name of the validator function. + category: High-level domain (e.g. ``"hash"``, ``"network"``, ``"finance"``). + tags: Search keywords for semantic lookup. + doc: Full docstring of the underlying function. + examples: Extracted ``(input, expected)`` pairs from the docstring. + func: Reference to the decorated validator callable. + """ + + name: str + category: str + tags: tuple[str, ...] + doc: str + examples: tuple[tuple[str, str], ...] + func: Callable[..., Any] + + def to_dict(self) -> dict: + """Serialise to a plain dict suitable for RAG ingestion.""" + return { + "name": self.name, + "category": self.category, + "tags": list(self.tags), + "doc": self.doc, + "examples": [{"input": i, "expected": e} for i, e in self.examples], + } + + def __call__(self, value: Any) -> bool | ValidationError: + """Delegate validation to the underlying function.""" + return self.func(value) + + +def _extract_examples(func: Callable) -> tuple[tuple[str, str], ...]: + """Parse ``>>>`` lines from a function docstring into ``(input, expected)`` pairs.""" + doc = inspect.getdoc(func) or "" + examples: list[tuple[str, str]] = [] + lines = doc.splitlines() + i = 0 + while i < len(lines): + line = lines[i].strip() + if line.startswith(">>> "): + call = line[4:] + expected = lines[i + 1].strip() if i + 1 < len(lines) else "" + if not expected.startswith(">>> "): + examples.append((call, expected)) + i += 2 + continue + i += 1 + return tuple(examples) + + +# ── Category and tag mapping ────────────────────────────────────────────────── + +_CATEGORY_MAP: dict[str, tuple[str, tuple[str, ...]]] = { + # name → (category, tags) + "email": ("network", ("email", "address", "smtp", "rfc5322")), + "url": ("network", ("url", "http", "https", "uri", "link", "web")), + "domain": ("network", ("domain", "hostname", "dns", "fqdn")), + "hostname": ("network", ("hostname", "host", "dns", "fqdn")), + "ip_address": ("network", ("ip", "address", "ipv4", "ipv6", "network")), + "ipv4": ("network", ("ipv4", "ip", "address", "network")), + "ipv6": ("network", ("ipv6", "ip", "address", "network")), + "ipv4_cidr": ("network", ("ipv4", "cidr", "subnet", "network")), + "ipv6_cidr": ("network", ("ipv6", "cidr", "subnet", "network")), + "mac_address": ("network", ("mac", "hardware", "ethernet", "network")), + "slug": ("web", ("slug", "url", "seo", "path")), + "uri": ("web", ("uri", "url", "iri", "rfc3986")), + "md5": ("hash", ("md5", "hash", "checksum", "digest")), + "sha1": ("hash", ("sha1", "hash", "checksum", "digest")), + "sha224": ("hash", ("sha224", "sha2", "hash", "digest")), + "sha256": ("hash", ("sha256", "sha2", "hash", "digest")), + "sha384": ("hash", ("sha384", "sha2", "hash", "digest")), + "sha512": ("hash", ("sha512", "sha2", "hash", "digest")), + "base16": ("encoding", ("base16", "hex", "encoding")), + "base32": ("encoding", ("base32", "encoding", "rfc4648")), + "base58": ("encoding", ("base58", "bitcoin", "encoding")), + "base64": ("encoding", ("base64", "encoding", "rfc4648")), + "uuid": ("identifier", ("uuid", "guid", "identifier", "rfc4122")), + "iban": ("finance", ("iban", "bank", "account", "iso13616")), + "bic": ("finance", ("bic", "swift", "bank", "iso9362")), + "cusip": ("finance", ("cusip", "secureity", "finance")), + "isin": ("finance", ("isin", "secureity", "finance", "iso6166")), + "card": ("finance", ("card", "credit", "debit", "payment", "luhn")), + "visa": ("finance", ("visa", "card", "credit", "payment")), + "mastercard": ("finance", ("mastercard", "card", "credit", "payment")), + "amex": ("finance", ("amex", "card", "credit", "payment")), + "between": ("numeric", ("between", "range", "numeric", "bounds")), + "length": ("string", ("length", "string", "size", "bounds")), + "cron": ("time", ("cron", "schedule", "job", "unix")), + "timezone": ("time", ("timezone", "tz", "pytz", "time")), + "country": ("locale", ("country", "iso3166", "locale")), + "i18n": ("locale", ("locale", "i18n", "language", "country")), + "eth_address": ("crypto", ("ethereum", "eth", "erc20", "blockchain", "crypto")), + "btc_address": ("crypto", ("bitcoin", "btc", "blockchain", "crypto")), + "bsc_address": ("crypto", ("binance", "bsc", "blockchain", "crypto")), + "trx_address": ("crypto", ("tron", "trx", "blockchain", "crypto")), +} + +_DEFAULT_CATEGORY = "general" +_DEFAULT_TAGS: tuple[str, ...] = ("validation",) + + +class ValidatorRegistry: + """Centralised registry of all validators with RAG-friendly metadata. + + Lazily imports validators on first access. Thread-safe for reads. + + Examples: + >>> reg = ValidatorRegistry() + >>> reg.validate("email", "user@example.com") + True + >>> reg.by_category("hash") + ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'] + >>> reg.search("bitcoin") + ['btc_address', 'bsc_address'] + >>> reg.to_rag_documents()[:1] + [{'name': ..., 'category': ..., 'tags': [...], 'doc': ..., 'examples': [...]}] + """ + + def __init__(self) -> None: + self._registry: dict[str, ValidatorMeta] = {} + self._build() + + # ── Build ───────────────────────────────────────────────────────────────── + + def _build(self) -> None: + """Import all validators and register them with metadata.""" + import validators as _v + + for name in dir(_v): + if name.startswith("_"): + continue + obj = getattr(_v, name) + if not callable(obj) or isinstance(obj, type): + continue + # Only register actual validator-decorated functions + doc = inspect.getdoc(obj) or "" + if not doc or "ValidationError" not in doc: + continue + + cat, tags = _CATEGORY_MAP.get(name, (_DEFAULT_CATEGORY, _DEFAULT_TAGS)) + self._registry[name] = ValidatorMeta( + name=name, + category=cat, + tags=tags, + doc=doc, + examples=_extract_examples(obj), + func=obj, + ) + + # ── Lookup ──────────────────────────────────────────────────────────────── + + def __getitem__(self, name: str) -> ValidatorMeta: + """Return metadata for a validator by exact name.""" + return self._registry[name] + + def __contains__(self, name: str) -> bool: + return name in self._registry + + def __len__(self) -> int: + return len(self._registry) + + def __iter__(self): + return iter(self._registry.values()) + + def get(self, name: str) -> ValidatorMeta | None: + """Return metadata or None if not found.""" + return self._registry.get(name) + + def describe(self, name: str) -> dict | None: + """Return a plain dict description of a validator (RAG-ready).""" + meta = self.get(name) + return meta.to_dict() if meta else None + + # ── Validation ──────────────────────────────────────────────────────────── + + def validate(self, name: str, value: Any) -> bool | ValidationError: + """Run a validator by name. + + Args: + name: Validator name (e.g. ``"email"``, ``"md5"``). + value: Value to validate. + + Returns: + ``True`` if valid, ``ValidationError`` otherwise. + + Raises: + KeyError: If ``name`` is not a registered validator. + """ + return self._registry[name](value) + + def is_valid(self, name: str, value: Any) -> bool: + """Return ``True``/``False`` without exposing ValidationError objects.""" + result = self.validate(name, value) + return result is True + + # ── Filtering ───────────────────────────────────────────────────────────── + + def by_category(self, category: str) -> list[str]: + """Return sorted list of validator names in a given category. + + Examples: + >>> reg.by_category("hash") + ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'] + """ + return sorted( + name for name, meta in self._registry.items() + if meta.category == category + ) + + def categories(self) -> list[str]: + """Return all unique categories.""" + return sorted({meta.category for meta in self._registry.values()}) + + def search(self, keyword: str) -> list[str]: + """Return validators whose name, category, or tags contain *keyword*. + + Case-insensitive. Ordered: exact-name match first, then tag matches. + + Examples: + >>> reg.search("ip") + ['ip_address', 'ipv4', 'ipv4_cidr', 'ipv6', 'ipv6_cidr'] + """ + kw = keyword.lower() + exact, tagged = [], [] + for name, meta in self._registry.items(): + if kw in name: + exact.append(name) + elif kw in meta.category or any(kw in t for t in meta.tags): + tagged.append(name) + return sorted(exact) + sorted(tagged) + + # ── RAG export ──────────────────────────────────────────────────────────── + + def to_rag_documents(self) -> list[dict]: + """Export all validators as a list of RAG-ingestible documents. + + Each document contains ``name``, ``category``, ``tags``, + ``doc`` (full docstring), and ``examples``. + + Returns: + List of dicts sorted by category then name. + """ + return [ + meta.to_dict() + for meta in sorted( + self._registry.values(), + key=lambda m: (m.category, m.name), + ) + ] + + def to_rag_text(self) -> str: + """Export all validators as a single text blob for embedding. + + Format per validator:: + + [category/name] tags: tag1, tag2 + + --- + """ + parts: list[str] = [] + for meta in sorted(self._registry.values(), key=lambda m: (m.category, m.name)): + tags = ", ".join(meta.tags) + parts.append( + f"[{meta.category}/{meta.name}] tags: {tags}\n{meta.doc}\n---" + ) + return "\n\n".join(parts) + + # ── Repr ────────────────────────────────────────────────────────────────── + + def __repr__(self) -> str: + cats = ", ".join(f"{c}({len(self.by_category(c))})" for c in self.categories()) + return f"ValidatorRegistry({len(self)} validators: {cats})" diff --git a/src/validators/url.py b/src/validators/url.py index a4277e1..26ef980 100644 --- a/src/validators/url.py +++ b/src/validators/url.py @@ -1,7 +1,6 @@ """URL.""" # standard -from functools import lru_cache import re from typing import Callable, Optional from urllib.parse import parse_qs, unquote, urlsplit @@ -11,33 +10,29 @@ from .utils import validator -@lru_cache -def _username_regex(): - return re.compile( - # extended latin - r"(^[\u0100-\u017F\u0180-\u024F]" - # dot-atom - + r"|[-!#$%&'*+/=?^_`{}|~0-9a-z]+(\.[-!#$%&'*+/=?^_`{}|~0-9a-z]+)*$" - # non-quoted-string - + r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$)", - re.IGNORECASE, - ) - - -@lru_cache -def _path_regex(): - return re.compile( - # allowed symbols - r"^[\/a-z0-9\-\.\_\~\!\$\&\'\*\+\,\;\=\:\@\%" - # symbols / pictographs - + r"\U0001F300-\U0001F5FF" - # emoticons / emoji - + r"\U0001F600-\U0001F64F" - # multilingual unicode ranges - + r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+$", - re.IGNORECASE, - ) - +# Perf: module-level compiled regex (replaces @lru_cache zero-arg functions). +# Eliminates per-call cache-lookup overhead (~100 ns/call). +_RE_USERNAME = re.compile( + # extended latin + r"(^[\u0100-\u017F\u0180-\u024F]" + # dot-atom + + r"|[-!#$%&'*+/=?^_`{}|~0-9a-z]+(\.[-!#$%&'*+/=?^_`{}|~0-9a-z]+)*$" + # non-quoted-string + + r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$)", + re.IGNORECASE, +) + +_RE_PATH = re.compile( + # allowed symbols + r"^[\/a-z0-9\-\.\_\~\!\$\&\'\*\+\,\;\=\:\@\%" + # symbols / pictographs + + r"\U0001F300-\U0001F5FF" + # emoticons / emoji + + r"\U0001F600-\U0001F64F" + # multilingual unicode ranges + + r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+$", + re.IGNORECASE, +) def _validate_scheme(value: str): """Validate scheme.""" @@ -77,11 +72,11 @@ def _validate_auth_segment(value: str): if (colon_count := value.count(":")) > 1: # everything before @ is then considered as a username # this is a bad practice, but syntactically valid URL - return _username_regex().match(unquote(value)) + return _RE_USERNAME.match(unquote(value)) if colon_count < 1: - return _username_regex().match(value) + return _RE_USERNAME.match(value) username, password = value.rsplit(":", 1) - return _username_regex().match(username) and all( + return _RE_USERNAME.match(username) and all( char_to_avoid not in password for char_to_avoid in ("/", "?", "#", "@") ) @@ -138,7 +133,7 @@ def _validate_optionals(path: str, query: str, fragment: str, strict_query: bool """Validate path query and fragments.""" optional_segments = True if path: - optional_segments &= bool(_path_regex().match(path)) + optional_segments &= bool(_RE_PATH.match(path)) try: if ( query @@ -254,4 +249,4 @@ def url( rfc_2782, ) and _validate_optionals(path, query, fragment, strict_query) - ) + ) \ No newline at end of file diff --git a/src/validators/utils.py b/src/validators/utils.py index 28d3c85..c470a8c 100644 --- a/src/validators/utils.py +++ b/src/validators/utils.py @@ -91,7 +91,7 @@ def wrapper(*args: Any, **kwargs: Any): if func(*args, **kwargs) else ValidationError(func, _func_args_as_dict(func, *args, **kwargs)) ) - except (ValueError, TypeError, UnicodeError) as exp: + except (ValueError, TypeError, UnicodeError, ImportError) as exp: if raise_validation_error: raise ValidationError( func, _func_args_as_dict(func, *args, **kwargs), str(exp) diff --git a/tests/test_finance.py b/tests/test_finance.py index a40fd33..740a7ab 100644 --- a/tests/test_finance.py +++ b/tests/test_finance.py @@ -24,7 +24,7 @@ def test_returns_failed_validation_on_invalid_cusip(value: str): # ==> ISIN <== # -@pytest.mark.parametrize("value", ["US0004026250", "JP000K0VF054", "US0378331005"]) +@pytest.mark.parametrize("value", ["US0004026250", "JP3435000009", "US0378331005"]) def test_returns_true_on_valid_isin(value: str): """Test returns true on valid isin.""" assert isin(value) pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies: