# Source code for bridge.pipelines.gh2bt_for_meta.map_funcs.biotools_id
"""
Map GitHub repository name to bio.tools ID.
This module reconciles the GitHub repository name with the bio.tools ID.
It applies a policy that prefers the GitHub name while ensuring uniqueness
within bio.tools, generating alternative IDs if necessary.
"""
import httpx
from bridge.builders import compose_biotools_metadata
from bridge.logging import get_user_logger
from bridge.pipelines.utils import normalize_text, str_contain_each_other
# Module-level logger obtained from bridge.logging; the functions below use it
# to report each mapping outcome (added / unchanged / exact / conflict / note).
logger = get_user_logger()
async def _matching_biotools_id_exists(biotools_id: str) -> bool:
    """
    Check if a bio.tools entry with the given ID already exists.

    The check is performed by attempting to fetch bio.tools metadata
    for the given ID through ``compose_biotools_metadata``. If the fetch
    raises an ``httpx.HTTPStatusError`` whose response status is 404,
    the ID does not exist. Any other error is treated as an indication
    that the ID may exist, to avoid false negatives.

    Parameters
    ----------
    biotools_id : str
        Candidate bio.tools ID to check.

    Returns
    -------
    bool
        True if a matching bio.tools entry exists, or if the lookup failed
        for any reason other than a 404 response; False otherwise.
    """
    try:
        matching_bt_metadata = await compose_biotools_metadata(identifier=biotools_id)
    except Exception as exc:
        # The broad catch is deliberate: only a 404 proves the ID is free.
        # Every other failure must err on the side of "taken" so that an
        # unverified ID is never handed out.
        if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 404:
            return False
        # Reported as a note rather than as "added": nothing was added here,
        # the lookup failed and the result is only an assumption.
        logger.note(f"Failed to check existing bio.tools ID '{biotools_id}': {exc}. Assuming it exists.")
        return True
    return matching_bt_metadata is not None
# [docs]
async def map_biotools_id(gh_name: str | None, bt_id: str | None) -> str | None:
    """
    Map and reconcile GitHub repository name to bio.tools ID.

    Policy:

    1. If no GitHub repo name is available (``None``, or a name that
       normalizes to an empty string), preserve the existing bio.tools ID
       (even if ``None``).
    2. If the existing bio.tools ID and GitHub repo name contain each other
       (compared after ``normalize_text``), preserve the existing bio.tools ID.
    3. If they do not contain each other, log a conflict but continue.
    4. If the normalized GitHub repo name is not already used as a
       bio.tools ID, use it as the new bio.tools ID.
    5. If it is already used as a bio.tools ID, attempt to generate a
       unique ID by appending suffixes ``-1``, ``-2``, ... up to ``-99``.
       If a unique ID is found, use it.
    6. If no unique ID can be generated, log a note and return ``None``,
       requiring manual intervention.

    Parameters
    ----------
    gh_name : str | None
        GitHub repository name.
    bt_id : str | None
        Existing bio.tools ID.

    Returns
    -------
    str | None
        Mapped bio.tools ID, or ``None`` if mapping failed.
    """
    # An empty normalized name is handled like a missing one: otherwise the
    # code below would look up the ID "" and then propose "-1", "-2", ...
    # as bio.tools IDs.
    gh_norm = normalize_text(gh_name) if gh_name is not None else ""
    if not gh_norm:
        logger.unchanged("No GitHub repo name found, nothing to map.")
        return bt_id

    if bt_id is not None:
        bt_norm = normalize_text(bt_id)
        if str_contain_each_other(gh_norm, bt_norm):
            logger.exact(f"bio.tools ID '{bt_id}' and GitHub repo name '{gh_name}' contain each other")
            return bt_id
        # The names disagree. This is reported, but mapping still proceeds
        # with the GitHub name (policy step 3).
        logger.conflict(f"bio.tools ID '{bt_id}' and GitHub repo name '{gh_name}' do not contain each other")

    if not await _matching_biotools_id_exists(gh_norm):
        logger.added(f"Using GitHub repo name '{gh_name}' as bio.tools ID")
        return gh_norm

    logger.conflict(
        f"GitHub repo name '{gh_name}' cannot be used as bio.tools ID because it matches an existing entry. "
        "Trying to generate a unique bio.tools ID."
    )
    # Probe "<name>-1" through "<name>-99" in order and take the first free one.
    for suffix in range(1, 100):
        candidate_id = f"{gh_norm}-{suffix}"
        if not await _matching_biotools_id_exists(candidate_id):
            logger.added(f"Using generated bio.tools ID '{candidate_id}'")
            return candidate_id

    logger.note(
        f"Failed to generate unique bio.tools ID based on GitHub repo name '{gh_name}'. "
        "Please set bio.tools ID manually."
    )
    return None