Source code for bridge.pipelines.utils.cleaning

"""
Utilities for cleaning and canonicalizing objects.

The functions are intentionally conservative: they avoid guessing semantics
and focus only on reversible, mechanical clean-ups and canonical forms that
improve comparison and stability.
"""

import html
import re
from typing import Any
from urllib.parse import parse_qsl, quote, urlencode, urlparse, urlsplit, urlunparse, urlunsplit


def canonicalize_shields_url(url: str) -> str:
    """
    Return a comparison-friendly form of a shields.io badge URL.

    Badge URLs served from `img.shields.io` often differ only in cosmetic
    query parameters. This helper removes the `logo` parameter and puts the
    remaining parameters into a deterministic order so that two URLs that
    describe the same badge compare equal as strings.

    Behaviour:

    - A URL that does not contain the text `img.shields.io` anywhere is
      returned exactly as given.
    - Otherwise the query string is parsed (blank values are kept), every
      parameter whose name equals `logo` case-insensitively is dropped, the
      remaining (name, value) pairs are sorted, and the query is re-encoded.
      Scheme, host, path and fragment are carried over from the parsed URL.

    Parameters
    ----------
    url : str
        The URL to canonicalize.

    Returns
    -------
    str
        The canonicalized URL, or `url` itself for non-shields URLs.
    """
    if "img.shields.io" in url:
        scheme, netloc, path, raw_query, fragment = urlsplit(url)
        # Sorting the (name, value) tuples gives a stable parameter order.
        kept_pairs = sorted(
            pair
            for pair in parse_qsl(raw_query, keep_blank_values=True)
            if pair[0].lower() != "logo"
        )
        return urlunsplit((scheme, netloc, path, urlencode(kept_pairs), fragment))
    return url
def canonicalize_url(url: str) -> str:
    """
    Canonicalize a generic URL for stable comparison.

    This performs a minimal, well-defined normalization intended to make
    string-based URL comparisons less fragile without changing semantics for
    typical HTTP(S) URLs.

    Normalizations applied:

    - Lowercase the scheme.
    - Lowercase the host and port part of the netloc. Any userinfo
      (``user:password@``) is kept verbatim because credentials are
      case-sensitive.
    - Strip trailing slashes from the path, but ensure the path is at least
      "/".
    - Parse the query string into key/value pairs, sort them, and re-encode.
      Repeated keys are preserved, and so are parameters with blank values
      (``?flag=``), since dropping them could change the request's meaning.
    - Drop the fragment entirely (anything after '#').

    Parameters
    ----------
    url : str
        The URL to canonicalize.

    Returns
    -------
    str
        The canonicalized URL.
    """
    parsed = urlparse(url)

    # Only the host[:port] portion is case-insensitive. Everything before the
    # last "@" is userinfo and must not be altered.
    userinfo, at_sign, host_port = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{at_sign}{host_port.lower()}"

    # keep_blank_values=True so that "?a=&b=1" does not silently lose "a".
    pairs = sorted(parse_qsl(parsed.query, keep_blank_values=True))
    query = urlencode(pairs, doseq=True)

    path = parsed.path.rstrip("/") or "/"

    return urlunparse(
        parsed._replace(
            scheme=parsed.scheme.lower(),
            netloc=netloc,
            path=path,
            query=query,
            fragment="",
        )
    )
def escape_shields_part(value: str) -> str:
    """
    Escape a label or message for embedding in a Shields.io badge path.

    Shields.io gives some characters a special meaning inside the path part
    of a badge URL:

    - `-` separates segments, and `--` stands for a literal `-`
    - `_` stands for a space, and `__` stands for a literal `_`

    To embed free-form text without triggering that syntax, this function:

    - coerces the input to `str`,
    - doubles every `-` and every `_`,
    - percent-encodes the result, with `_` listed as the only explicitly
      safe character (so `/` is encoded as well).

    Parameters
    ----------
    value : str
        The label or message to escape.

    Returns
    -------
    str
        The escaped text, ready to be placed in a Shields.io path segment.
    """
    # One translation pass doubles both special characters; the two
    # replacements never interact, so this equals two chained replace() calls.
    doubling_table = str.maketrans({"-": "--", "_": "__"})
    doubled = str(value).translate(doubling_table)
    return quote(doubled, safe="_")
def normalize_color(value: str) -> str:
    """
    Turn a color value into a URL-safe token without a leading '#'.

    Behaviour:

    - The value is coerced to `str` and surrounding whitespace is stripped.
    - A single leading `#` is removed if present.
    - The remainder is percent-encoded with no characters marked as safe.

    Parameters
    ----------
    value : str
        The color to normalize (e.g. "#4c1", "brightgreen").

    Returns
    -------
    str
        The color without its leading '#', percent-encoded for use in URLs.
    """
    trimmed = str(value).strip()
    # removeprefix drops at most one "#", exactly like slicing off index 0.
    without_hash = trimmed.removeprefix("#")
    return quote(without_hash, safe="")
[docs] def normalize_text(value: str | None, normalize_multiline: bool = True) -> str | None: """ Clean and normalize free-text values. This is a general-purpose text scrubber intended to remove presentation artefacts (HTML entities/tags, box-drawing characters) and normalize whitespace so that strings are more suitable for storage, comparison, or inclusion in metadata formats. Steps performed: 1. Decode HTML entities (e.g. ``&lt;i&gt;`` → ``<i>``, ``&amp;`` → ``&``). 2. Strip all remaining HTML tags (e.g. ``<i>name</i>`` → ``name``). 3. Replace the box-drawing dash ``\u2500`` with a plain ASCII ``-``. 4. Remove non-printable characters. 5. Optionally collapse all whitespace, including newlines, into single spaces (`normalize_multiline=True`). 6. Strip leading and trailing whitespace. Parameters ---------- value : str | None The text to normalize. If ``None``, the function returns ``None``. normalize_multiline : bool, optional If True (default), newlines and runs of whitespace are collapsed into single spaces. If False, existing line breaks are preserved and only non-printable characters and HTML artefacts are removed. Returns ------- str | None The normalized text, or ``None`` if the input was ``None``. """ if value is None: return None # decode HTML entities: &lt;i&gt; → <i>, &amp; → &, etc. text = html.unescape(value) # strip any remaining HTML tags: <i>name</i> -> name tag_re = re.compile(r"<[^>]+>") text = tag_re.sub("", text) # replace box-drawing dash with a normal ASCII dash text = text.replace("\u2500", "-") # drop non-printable characters text = "".join(c for c in text if c.isprintable()) if normalize_multiline: # replace newlines and multiple spaces with a single space text = re.sub(r"\s+", " ", text) # strip outer whitespace return text.strip()
def _normalize_structure(obj: Any) -> Any:
    """
    Apply :func:`normalize_text` to every string found in a nested structure.

    Higher-level normalization helpers use this function to walk arbitrary
    values. Handling by type:

    - ``str`` / ``None``: returned via `normalize_text`.
    - ``dict``: a new dict is built; keys are kept as they are and values
      are normalized recursively.
    - ``list``, ``tuple``, ``set``: a new list/tuple/set is built from the
      recursively normalized elements.
    - Objects whose class exposes ``model_fields`` or ``__fields__``
      (Pydantic models): handed to `normalize_pydantic_model_strings`.
    - Everything else: returned untouched.

    Parameters
    ----------
    obj : Any
        The value to normalize.

    Returns
    -------
    Any
        The normalized counterpart of `obj`.
    """
    # Plain text (or a missing value) goes straight to the text scrubber.
    if obj is None or isinstance(obj, str):
        return normalize_text(obj)

    # Mappings: only the values are normalized.
    if isinstance(obj, dict):
        return {key: _normalize_structure(item) for key, item in obj.items()}

    # Sequences and sets: rebuild with the matching built-in constructor.
    for container_type in (list, tuple, set):
        if isinstance(obj, container_type):
            return container_type(_normalize_structure(item) for item in obj)

    # Pydantic models are recognized by the field registry on their class;
    # "model_fields" is probed first, then the older "__fields__".
    owner = obj.__class__
    if any(hasattr(owner, marker) for marker in ("model_fields", "__fields__")):
        return normalize_pydantic_model_strings(obj)

    return obj
def normalize_dict_strings(d: dict[str, Any]) -> dict[str, Any]:
    """
    Normalize every string-like value inside a dictionary, recursively.

    The whole dictionary is handed to :func:`_normalize_structure`, which
    walks nested containers and runs :func:`normalize_text` on each string
    or ``None`` value it finds. Keys are not modified, and non-string
    scalars (numbers, booleans, etc.) are passed through as they are.

    Parameters
    ----------
    d : dict[str, Any]
        The dictionary whose values (including nested structures) should be
        normalized.

    Returns
    -------
    dict[str, Any]
        A dictionary with the same keys and layout as `d`, holding the
        normalized values.
    """
    normalized: dict[str, Any] = _normalize_structure(d)
    return normalized
def normalize_pydantic_model_strings(model: Any) -> Any:
    """
    Normalize the string content of a Pydantic model's fields in-place.

    The model's class is inspected for a field registry: ``model_fields`` is
    tried first, then ``__fields__``. If neither attribute exists, the
    object is not treated as a Pydantic model and is returned unchanged.

    Otherwise, for each declared field name:

    - the current value is read from the instance (``None`` if the
      attribute is missing),
    - it is passed through :func:`_normalize_structure`, which cleans
      strings/``None`` with :func:`normalize_text` and recurses into dicts,
      lists, tuples, sets and nested models,
    - the result is assigned back to the same attribute.

    Parameters
    ----------
    model : Any
        The Pydantic model instance to normalize, or any other object.

    Returns
    -------
    Any
        The same ``model`` object that was passed in; its fields have been
        reassigned if it is a Pydantic model.
    """
    model_cls = model.__class__

    # Probe the registry attributes in priority order; the for/else falls
    # through to the early return when the class has neither of them.
    for registry_attr in ("model_fields", "__fields__"):
        if hasattr(model_cls, registry_attr):
            declared_fields = getattr(model_cls, registry_attr)
            break
    else:
        return model

    for name in declared_fields:
        current = getattr(model, name, None)
        setattr(model, name, _normalize_structure(current))

    return model