"""
Utilities for cleaning and canonicalizing objects.
The functions are intentionally conservative: they avoid guessing semantics
and focus only on reversible, mechanical clean-ups and canonical forms that
improve comparison and stability.
"""
import html
import re
from typing import Any
from urllib.parse import parse_qsl, quote, urlencode, urlparse, urlsplit, urlunparse, urlunsplit
[docs]
def canonicalize_shields_url(url: str) -> str:
    """
    Reduce a shields.io badge URL to a stable form for comparison.

    Only URLs whose text contains ``img.shields.io`` are touched; every
    other URL is handed back exactly as received. For badge URLs the query
    string is rebuilt without the cosmetic ``logo`` parameter (matched
    case-insensitively) and with the remaining key/value pairs in sorted
    order, so that two URLs differing only in logo choice or parameter
    order compare equal.

    Parameters
    ----------
    url : str
        The URL to canonicalize.

    Returns
    -------
    str
        The canonical URL, or ``url`` itself when it is not a shields.io
        badge URL.
    """
    if "img.shields.io" in url:
        scheme, netloc, path, raw_query, fragment = urlsplit(url)
        # Blank values are kept so that "?label=" survives the round trip.
        kept_pairs = sorted(
            pair
            for pair in parse_qsl(raw_query, keep_blank_values=True)
            if pair[0].lower() != "logo"
        )
        return urlunsplit((scheme, netloc, path, urlencode(kept_pairs), fragment))
    return url
[docs]
def canonicalize_url(url: str) -> str:
    """
    Canonicalize a generic URL for stable comparison.

    This performs a minimal, well-defined normalization intended to make
    string-based URL comparisons less fragile without changing semantics
    for typical HTTP(S) URLs.

    Normalizations applied:

    - Lowercase the scheme and the host/port part of the netloc. Any
      ``user:password@`` prefix is kept verbatim because credentials are
      case-sensitive.
    - Strip trailing slashes from the path, but ensure the path is at
      least "/".
    - Parse the query string into key/value pairs, sort them, and re-encode.
      Repeated keys and parameters with blank values (``?a=``) are kept.
    - Drop the fragment entirely (anything after '#').

    Parameters
    ----------
    url : str
        The URL to canonicalize.

    Returns
    -------
    str
        The canonicalized URL.
    """
    parsed = urlparse(url)

    # Split on the last "@" so only the host[:port] portion is lowercased;
    # without an "@" the whole netloc lands in ``hostport``.
    userinfo, at_sign, hostport = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{at_sign}{hostport.lower()}"

    # keep_blank_values=True: dropping "?flag=" would change the URL's meaning.
    pairs = parse_qsl(parsed.query, keep_blank_values=True)
    query = urlencode(sorted(pairs), doseq=True)

    path = parsed.path.rstrip("/") or "/"

    return urlunparse(
        parsed._replace(
            scheme=parsed.scheme.lower(),
            netloc=netloc,
            path=path,
            query=query,
            fragment="",
        )
    )
[docs]
def escape_shields_part(value: str) -> str:
    """
    Escape a label or message for a Shields.io badge path segment.

    Shields.io gives some characters special meaning inside the badge path:

    - ``-``  separates segments
    - ``--`` is a literal ``-``
    - ``_``  is a space
    - ``__`` is a literal ``_``

    To embed free-form text without triggering that syntax, this function
    doubles every ``-`` and every ``_`` and then percent-encodes the result.
    Underscores are not percent-encoded, while characters such as spaces and
    ``/`` are (``" "`` becomes ``%20``, ``"/"`` becomes ``%2F``).

    Parameters
    ----------
    value : str
        The label or message to escape. Non-string values are coerced with
        ``str()``.

    Returns
    -------
    str
        The escaped text, ready to be placed in a Shields.io URL path
        segment.
    """
    # Double both special characters in a single pass over the text.
    doubling = str.maketrans({"-": "--", "_": "__"})
    doubled = str(value).translate(doubling)
    return quote(doubled, safe="_")
[docs]
def normalize_color(value: str) -> str:
    """
    Turn a color value into a URL-safe token without a leading '#'.

    The value is coerced to ``str``, surrounding whitespace is trimmed, a
    single leading ``#`` is removed if present, and the remainder is
    percent-encoded with no characters treated as safe.

    Parameters
    ----------
    value : str
        The color to normalize (e.g. "#4c1", "brightgreen").

    Returns
    -------
    str
        The color without its leading '#', percent-encoded for use in URLs.
    """
    # removeprefix drops at most one '#', so "##ab" keeps its second '#'.
    bare_color = str(value).strip().removeprefix("#")
    return quote(bare_color, safe="")
[docs]
def normalize_text(value: str | None, normalize_multiline: bool = True) -> str | None:
"""
Clean and normalize free-text values.
This is a general-purpose text scrubber intended to remove presentation
artefacts (HTML entities/tags, box-drawing characters) and normalize
whitespace so that strings are more suitable for storage, comparison,
or inclusion in metadata formats.
Steps performed:
1. Decode HTML entities (e.g. ``<i>`` → ``<i>``, ``&`` → ``&``).
2. Strip all remaining HTML tags (e.g. ``<i>name</i>`` → ``name``).
3. Replace the box-drawing dash ``\u2500`` with a plain ASCII ``-``.
4. Remove non-printable characters.
5. Optionally collapse all whitespace, including newlines, into single
spaces (`normalize_multiline=True`).
6. Strip leading and trailing whitespace.
Parameters
----------
value : str | None
The text to normalize. If ``None``, the function returns ``None``.
normalize_multiline : bool, optional
If True (default), newlines and runs of whitespace are collapsed into
single spaces. If False, existing line breaks are preserved and only
non-printable characters and HTML artefacts are removed.
Returns
-------
str | None
The normalized text, or ``None`` if the input was ``None``.
"""
if value is None:
return None
# decode HTML entities: <i> → <i>, & → &, etc.
text = html.unescape(value)
# strip any remaining HTML tags: <i>name</i> -> name
tag_re = re.compile(r"<[^>]+>")
text = tag_re.sub("", text)
# replace box-drawing dash with a normal ASCII dash
text = text.replace("\u2500", "-")
# drop non-printable characters
text = "".join(c for c in text if c.isprintable())
if normalize_multiline:
# replace newlines and multiple spaces with a single space
text = re.sub(r"\s+", " ", text)
# strip outer whitespace
return text.strip()
def _normalize_structure(obj: Any) -> Any:
"""
Recursively normalize strings inside common container types.
This helper is used by higher-level normalization functions to apply
:func:`normalize_text` to all string-like values in nested structures.
Supported types:
- str / None: passed through `normalize_text`.
- dict: keys left as-is, values normalized recursively.
- list, tuple, set: elements normalized recursively, container type preserved.
- Pydantic models: delegated to `normalize_pydantic_model_strings`.
- Anything else: returned unchanged.
Parameters
----------
obj : Any
Arbitrary Python object to normalize.
Returns
-------
Any
A normalized version of `obj`, with the same overall structure.
"""
# scalar text
if isinstance(obj, str) or obj is None:
return normalize_text(obj)
# dict: normalize values recursively, keep keys untouched
if isinstance(obj, dict):
return {k: _normalize_structure(v) for k, v in obj.items()}
# lists: normalize each element, keep as list
if isinstance(obj, list):
return [_normalize_structure(v) for v in obj]
# tuples: normalize each element, keep as tuple
if isinstance(obj, tuple):
return tuple(_normalize_structure(v) for v in obj)
# sets: normalize each element, keep as set
if isinstance(obj, set):
return {_normalize_structure(v) for v in obj}
# Pydantic model: normalize fields in-place, return the same object
obj_cls = obj.__class__
if hasattr(obj_cls, "model_fields") or hasattr(obj_cls, "__fields__"):
return normalize_pydantic_model_strings(obj)
return obj
[docs]
def normalize_dict_strings(d: dict[str, Any]) -> dict[str, Any]:
    """
    Normalize every string-like value nested anywhere inside a dictionary.

    The whole structure of `d` is traversed by :func:`_normalize_structure`,
    which applies :func:`normalize_text` to each string or ``None`` value it
    finds. Keys are not modified, and non-string scalars (numbers, booleans,
    etc.) come through unchanged.

    Parameters
    ----------
    d : dict[str, Any]
        The dictionary whose string values (and nested structures) should
        be normalized.

    Returns
    -------
    dict[str, Any]
        A new dictionary shaped like `d` in which all nested string/None
        values have been normalized.
    """
    normalized = _normalize_structure(d)
    return normalized
[docs]
def normalize_pydantic_model_strings(model: Any) -> Any:
    """
    Normalize, in place, every string held by a Pydantic model's fields.

    The field names are read from ``model_fields`` on the model's class,
    falling back to ``__fields__`` when that attribute is absent. Each
    field's current value is read (a missing attribute reads as ``None``),
    passed through :func:`_normalize_structure`, and assigned back to the
    model. That covers plain strings and ``None``, containers (dict, list,
    tuple, set) and nested models.

    Objects whose class has neither ``model_fields`` nor ``__fields__`` are
    not treated as models and are returned unchanged.

    Parameters
    ----------
    model : Any
        The Pydantic model instance to normalize, or any other object.

    Returns
    -------
    Any
        The same ``model`` object, with its declared fields reassigned to
        their normalized values when it is a Pydantic model.
    """
    owner = model.__class__

    # Probe for the field registry, preferring ``model_fields``.
    for registry_attr in ("model_fields", "__fields__"):
        if hasattr(owner, registry_attr):
            declared = getattr(owner, registry_attr)
            break
    else:
        # Not a Pydantic model: nothing to do.
        return model

    for name in declared.keys():
        current = getattr(model, name, None)
        setattr(model, name, _normalize_structure(current))
    return model