# Source code for bridge.pipelines.bt2gh_for_pr_issues.map_funcs.topics
"""
Functions for mapping bio.tools EDAM annotations to GitHub topics.
This module inspects EDAM-based annotations from bio.tools (topics and
function-level operation/input/output terms) and compares them to the
existing set of GitHub topics for a repository. When one or more EDAM
terms are missing from the GitHub topics, it proposes a GitHub issue
suggesting that these terms be added as topics, using a generic
bio.tools-on-top-of-GitHub additive policy.
"""
import json
import re
from typing import Any

from bridge.core.biotools import FunctionItem, TopicItem
from bridge.logging import get_user_logger
from bridge.pipelines.policies.bt2gh import reconcile_bt_ontop_gh
logger = get_user_logger()
def _normalize_edam_term(term: str) -> str:
    """
    Turn an EDAM term into a GitHub-topic-style slug.

    The term is lowercased, and every run of characters outside ``a-z`` /
    ``0-9`` (spaces, punctuation, existing hyphens, non-ASCII letters, ...)
    is reduced to a single hyphen. No hyphen is left at the start or the
    end of the result. The length of the result is not limited here.

    Parameters
    ----------
    term : str
        The EDAM term to normalize.

    Returns
    -------
    str
        The normalized term; an empty string if the term contains no
        ASCII letters or digits.
    """
    # Keeping only the alphanumeric runs and re-joining them with "-" is the
    # same as: replace everything else by "-", collapse repeats, strip the ends.
    alnum_runs = re.findall(r"[a-z0-9]+", term.lower())
    return "-".join(alnum_runs)
def _normalize_gh_topic(gh_topic: str) -> str:
    """
    Return the lowercase form of a GitHub topic.

    Only the case is changed; all other characters are left as they are.

    Parameters
    ----------
    gh_topic : str
        The GitHub topic to normalize.

    Returns
    -------
    str
        ``gh_topic`` converted to lowercase.
    """
    lowered = gh_topic.lower()
    return lowered
def _flatten_function(function: list[FunctionItem]) -> list[str]:
    """
    Collect the EDAM term strings found in bio.tools function annotations.

    For every function item the terms are gathered in this order:

    - function[].operation[].term
    - function[].input[].data.term, then function[].input[].format[].term
    - function[].output[].data.term, then function[].output[].format[].term

    Missing (falsy) containers and missing (falsy) terms are skipped.
    Duplicates are kept.

    Parameters
    ----------
    function : list[FunctionItem]
        List of bio.tools function annotations using EDAM ontology.

    Returns
    -------
    list[str]
        The collected term strings; empty if ``function`` is empty or ``None``.
    """
    collected: list[str] = []
    for entry in function or []:
        # operations first
        collected.extend(op.term for op in entry.operation or [] if op.term)
        # then all inputs followed by all outputs
        for io_group in (entry.input, entry.output):
            for io in io_group or []:
                data = io.data
                if data and data.term:
                    collected.append(data.term)
                collected.extend(fmt.term for fmt in io.format or [] if fmt.term)
    return collected
# [docs]
def map_topics(gh_topics: list[str] | None, bt_edam: dict[str, Any] | None) -> dict[str, str] | None:
    """
    Propose a GitHub issue to add missing EDAM-based topics from bio.tools.

    EDAM annotations from bio.tools (high-level topics and function-level
    operation/input/output terms) are normalized to GitHub-topic-style slugs
    and handed, together with the lowercased existing GitHub topics, to
    ``reconcile_bt_ontop_gh``. That policy function decides whether an issue
    is produced; its result is returned unchanged.

    Parameters
    ----------
    gh_topics : list[str] | None
        Current list of GitHub topics for the repository, or ``None`` if
        no topics are set.
    bt_edam : dict[str, Any] | None
        EDAM-related metadata from bio.tools. The keys read here are:

        - "topics" : list[TopicItem]
        - "functions" : list[FunctionItem]

    Returns
    -------
    dict[str, str] | None
        ``None`` if ``bt_edam`` is ``None``; otherwise whatever
        ``reconcile_bt_ontop_gh`` returns (presumably a mapping of issue
        title to issue body built by ``make_issue``, or ``None`` when no
        issue is needed -- confirm against the policy implementation).
    """
    if bt_edam is None:
        logger.unchanged("No bio.tools EDAM annotations found, nothing to map")
        return None

    topic_items: list[TopicItem] = bt_edam.get("topics") or []
    function_items: list[FunctionItem] = bt_edam.get("functions") or []

    raw_terms = [ti.term for ti in topic_items if ti.term]
    raw_terms.extend(_flatten_function(function_items))
    # A term consisting only of punctuation normalizes to "", which cannot be
    # a topic, so empty slugs are dropped instead of being proposed.
    bt_terms = {slug for slug in map(_normalize_edam_term, raw_terms) if slug}
    gh_terms = {_normalize_gh_topic(t) for t in gh_topics or []}

    def make_issue(missing: set[str]) -> dict[str, str]:
        """Build the ``{title: body}`` issue mapping for the given missing terms."""
        missing_sorted = sorted(missing)
        num_missing = len(missing_sorted)
        terms = "\n".join(missing_sorted)
        # adjust message based on singular/plural
        noun, verb, pronoun = ("term", "is", "it") if num_missing == 1 else ("terms", "are", "them")
        add_flags = " ".join(f"--add-topic {term}" for term in missing_sorted)
        # PUT /repos/{owner}/{repo}/topics REPLACES the whole topic list, so the
        # payload must carry the existing topics too; sending only the missing
        # terms would delete every topic the repository already has.
        payload = json.dumps({"names": sorted(gh_terms.union(missing_sorted))})
        body = (
            f"The bio.tools EDAM annotations contain {num_missing} EDAM {noun} "
            f"that {verb} not included in the GitHub topics:\n\n{terms}\n\n"
            f"Please consider adding {pronoun} to the GitHub repository.\n\n"
            "We recommend one of the following methods:\n\n"
            "1. _Use the GitHub CLI (one --add-topic flag per term):_\n"
            "```\n"
            f"gh repo edit <OWNER/REPO> {add_flags}\n"
            "```\n\n"
            "2. _Use the GitHub API via curl (requires token). This request replaces "
            "the complete topic list, so the existing topics are included:_\n"
            "```\n"
            "curl -L -X PUT \\\n"
            " -H 'Accept: application/vnd.github+json' \\\n"
            " -H 'Authorization: Bearer <YOUR-TOKEN>' \\\n"
            " -H 'X-GitHub-Api-Version: 2022-11-28' \\\n"
            " https://api.github.com/repos/OWNER/REPO/topics \\\n"
            f" -d '{payload}'\n"
            "```\n\n"
            "3. _Visit the GitHub web UI:_\n"
            "You can also add topics manually via the gear icon next to \"About\" "
            "on the repository main page:\n"
            "https://github.com/OWNER/REPO\n\n"
            "Please replace `<OWNER/REPO>` and `<YOUR-TOKEN>` with the appropriate values."
        )
        return {"Add EDAM annotations from bio.tools metadata": body}

    return reconcile_bt_ontop_gh(
        gh_norm=gh_terms,
        bt_norm=bt_terms,
        make_output=make_issue,
        log_label="EDAM terms",
    )
# TODO: add function for mapping to GitHub content (README) resulting in pull request. Compare citation.py
# def map_function2readme():