# Source code for bridge.pipelines.gh2bt_for_meta.map_funcs.documentation
"""
Mapping functions for documentation metadata.
This module maps GitHub repository features (wiki, code of conduct, GitHub Pages)
to the bio.tools ``documentation`` field by adding appropriate
``DocumentationItem`` entries when they are not already present.
"""
import subprocess
from typing import Any
from urllib.parse import urljoin
from pydantic import AnyUrl
from bridge.core.biotools import DocumentationItem, TypeEnum1
from bridge.core.github_pages import GitHubPages
from bridge.core.github_repo import CodeOfConduct
from bridge.logging import get_user_logger
from bridge.pipelines.utils import canonicalize_url
logger = get_user_logger()
def _add_doc_if_not_exists(
    bt_documentation: list[DocumentationItem] | None, url: str, doc_type: TypeEnum1
) -> list[DocumentationItem]:
    """
    Append a documentation entry for ``url`` unless an equivalent one is present.

    Existing entries are compared against ``url`` after both sides have been
    passed through ``canonicalize_url``; the entry type is not part of the
    comparison.

    Parameters
    ----------
    bt_documentation : list[DocumentationItem] | None
        Current bio.tools documentation entries. ``None`` or an empty list
        is replaced by a fresh list.
    url : str
        Documentation URL to register.
    doc_type : TypeEnum1
        Documentation type stored on the newly created entry.

    Returns
    -------
    list[DocumentationItem]
        The documentation list, extended by one entry when ``url`` was new.
    """
    entries = bt_documentation if bt_documentation else []

    # Compare canonical forms so equivalent spellings of a URL count as one.
    wanted = canonicalize_url(url)
    for entry in entries:
        if canonicalize_url(str(entry.url.root)) == wanted:
            logger.exact(f"documentation URL '{url}' of type '{doc_type.value}' already exists, not adding.")
            return entries

    entries.append(DocumentationItem(url=url, type=[doc_type]))
    logger.added(f"documentation URL '{url}' as type '{doc_type.value}'.")
    return entries
def _wiki_repo_exists(gh_html_url: AnyUrl | None, timeout: float = 5.0) -> bool:
"""
Check if the GitHub wiki repository exists by probing with `git ls-remote`.
"""
if not gh_html_url:
return False
try:
wiki_git_url = str(gh_html_url).rstrip("/") + ".wiki.git"
result = subprocess.run(
["git", "ls-remote", wiki_git_url],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=timeout,
)
return result.returncode == 0
except subprocess.TimeoutExpired:
logger.warning(f"Timeout expired while checking wiki repository at '{wiki_git_url}'.")
return False
def _map_wiki(
    gh_html_url: AnyUrl | None, gh_has_wiki: bool | None, bt_documentation: list[DocumentationItem] | None
) -> list[DocumentationItem] | None:
    """
    Map a GitHub wiki to a bio.tools documentation entry.

    A ``TypeEnum1.General`` entry pointing at ``<repo_url>/wiki`` is added
    (when not already present) only if all of the following hold: the wiki
    flag is set, a repository URL is available, and ``_wiki_repo_exists``
    confirms the wiki repository.

    Parameters
    ----------
    gh_html_url : AnyUrl | None
        GitHub repository HTML URL (e.g. ``https://github.com/user/repo``).
    gh_has_wiki : bool | None
        Whether the repository has its wiki enabled.
    bt_documentation : list[DocumentationItem] | None
        Existing bio.tools documentation entries.

    Returns
    -------
    list[DocumentationItem] | None
        The documentation list after mapping; the input is returned as-is
        when no wiki is mapped.
    """
    # Guard first; the existence probe only runs when the cheap checks pass.
    if not (gh_has_wiki and gh_html_url and _wiki_repo_exists(gh_html_url)):
        logger.unchanged("no GitHub wiki found, nothing to map.")
        return bt_documentation

    # Force exactly one trailing slash so "wiki" is appended to the repo path
    # instead of replacing its last segment.
    base_url = str(gh_html_url).rstrip("/") + "/"
    wiki_url = canonicalize_url(urljoin(base_url, "wiki"))
    return _add_doc_if_not_exists(bt_documentation, wiki_url, TypeEnum1.General)
def _map_code_of_conduct(
    gh_code_of_conduct: CodeOfConduct | None, bt_documentation: list[DocumentationItem] | None
) -> list[DocumentationItem] | None:
    """
    Map a GitHub code of conduct to a bio.tools documentation entry.

    When the code of conduct object is present and has a truthy ``html_url``,
    a ``TypeEnum1.Code_of_conduct`` entry for the canonicalized URL is added
    unless it is already present.

    Parameters
    ----------
    gh_code_of_conduct : CodeOfConduct | None
        GitHub code of conduct metadata; its ``html_url`` attribute is read.
    bt_documentation : list[DocumentationItem] | None
        Existing bio.tools documentation entries.

    Returns
    -------
    list[DocumentationItem] | None
        The documentation list after mapping; the input is returned as-is
        when there is no code of conduct URL.
    """
    if not gh_code_of_conduct or not gh_code_of_conduct.html_url:
        logger.unchanged("no GitHub code of conduct found, nothing to map.")
        return bt_documentation

    canonical_url = canonicalize_url(str(gh_code_of_conduct.html_url))
    return _add_doc_if_not_exists(bt_documentation, canonical_url, TypeEnum1.Code_of_conduct)
def _map_github_pages(
    gh_pages: GitHubPages | None, bt_documentation: list[DocumentationItem] | None
) -> list[DocumentationItem] | None:
    """
    Map a GitHub Pages site to a bio.tools documentation entry.

    When the Pages object is present and has a truthy ``html_url``, a
    ``TypeEnum1.General`` entry for the canonicalized URL is added unless it
    is already present.

    Parameters
    ----------
    gh_pages : GitHubPages | None
        GitHub Pages information; its ``html_url`` attribute is read.
    bt_documentation : list[DocumentationItem] | None
        Existing bio.tools documentation entries.

    Returns
    -------
    list[DocumentationItem] | None
        The documentation list after mapping; the input is returned as-is
        when there is no Pages URL.
    """
    if not gh_pages or not gh_pages.html_url:
        logger.unchanged("no GitHub Pages site found, nothing to map.")
        return bt_documentation

    canonical_url = canonicalize_url(str(gh_pages.html_url))
    return _add_doc_if_not_exists(bt_documentation, canonical_url, TypeEnum1.General)
# [docs]
def map_documentation(
    gh_repo_data: dict[str, Any] | None, bt_documentation: list[DocumentationItem] | None
) -> list[DocumentationItem] | None:
    """
    Reconcile GitHub documentation-related metadata with the bio.tools
    documentation field.

    The mappings are applied one after another, each receiving the list
    produced by the previous one:

    1. Repository wiki
    2. Code of conduct
    3. GitHub Pages site

    Parameters
    ----------
    gh_repo_data : dict[str, Any] | None
        GitHub repository metadata. The keys read are ``"html_url"``,
        ``"has_wiki"``, ``"code_of_conduct"`` and ``"github_pages"``;
        a missing key is treated as ``None``.
    bt_documentation : list[DocumentationItem] | None
        Existing bio.tools documentation entries.

    Returns
    -------
    list[DocumentationItem] | None
        The documentation list after all mappings have run, or the input
        unchanged when ``gh_repo_data`` is empty or ``None``.
    """
    if not gh_repo_data:
        logger.unchanged("no GitHub repository documentation data found, nothing to map.")
        return bt_documentation

    # Pull every input out of the metadata dict before running any mapper.
    pages_info: GitHubPages | None = gh_repo_data.get("github_pages")
    conduct_info: CodeOfConduct | None = gh_repo_data.get("code_of_conduct")
    wiki_enabled: bool | None = gh_repo_data.get("has_wiki")
    repo_html_url: AnyUrl | None = gh_repo_data.get("html_url")

    # Each mapper feeds its result into the next one.
    documentation = _map_wiki(repo_html_url, wiki_enabled, bt_documentation)
    documentation = _map_code_of_conduct(conduct_info, documentation)
    return _map_github_pages(pages_info, documentation)