# Source code for bridge.pipelines.gh2bt_for_meta.map_funcs.language

"""
Map language metadata from GitHub to bio.tools.

This module aligns programming language information
between GitHub and bio.tools. It compares the languages
reported by GitHub for a repository with those listed
in the bio.tools metadata, and suggests updates when
discrepancies are found.
"""

from bridge.core.biotools import LanguageEnum
from bridge.core.github_languages import Language
from bridge.logging import get_user_logger
from bridge.pipelines.policies.gh2bt import reconcile_gh_over_bt
from bridge.pipelines.utils import find_matching_enum_member

logger = get_user_logger()


def _to_lang_set_gh(gh_languages: Language | None) -> set[str] | None:
    """
    Normalize the GitHub language set to a lowercased string set.

    Parameters
    ----------
    gh_languages : Language | None
        GitHub languages object, or ``None`` if no language data is present.

    Returns
    -------
    set[str] | None
        A set of lowercased language names as reported by GitHub, or ``None``
        if GitHub provides no language data.
    """
    if gh_languages is None or not gh_languages.root:
        return None
    return {lang.lower() for lang in gh_languages.root.keys()}


def _to_lang_set_bt(bt_languages: list[LanguageEnum] | None) -> set[str] | None:
    """
    Normalize the bio.tools language list to a lowercased string set.

    Parameters
    ----------
    bt_languages : list[LanguageEnum] | None
        Existing bio.tools language annotations, or ``None`` if unset.

    Returns
    -------
    set[str] | None
        A set of lowercased language names derived from the ``LanguageEnum``
        values, or ``None`` if no languages are recorded.
    """
    if not bt_languages:
        return None
    return {lang.value.lower() for lang in bt_languages}


def _cast_to_biotools_languages(languages: set[str]) -> list[LanguageEnum] | None:
    """
    Convert a set of GitHub language names to a list of bio.tools language enums.

    Each name in the input set is resolved to a ``LanguageEnum`` member with
    ``find_matching_enum_member``. Names that cannot be resolved are skipped,
    and a log message is emitted for each of them.

    Parameters
    ----------
    languages : set[str]
        Set of language names as reported by GitHub.

    Returns
    -------
    list[LanguageEnum] | None
        Successfully mapped ``LanguageEnum`` members, ordered by the sorted
        input names so that the result is the same on every run.
        Returns ``None`` if no languages could be mapped.
    """
    bt_languages: list[LanguageEnum] = []
    # Set iteration order for strings depends on hash randomization, so walk
    # the names in sorted order to keep the output (and the log) reproducible.
    for lang in sorted(languages):
        matched_lang = find_matching_enum_member(lang, LanguageEnum)
        if matched_lang:
            bt_languages.append(matched_lang)
        else:
            logger.note(f"Unknown language '{lang}' not found in bio.tools LanguageEnum.")
    return bt_languages or None


def map_language(
    gh_languages: Language | None, bt_languages: list[LanguageEnum] | None
) -> list[LanguageEnum] | None:
    """
    Map and reconcile GitHub and bio.tools programming languages.

    GitHub language keys and bio.tools ``LanguageEnum`` values are normalized
    to lowercased string sets for comparison and handed to the generic
    GitHub-over-bio.tools policy (``reconcile_gh_over_bt``). When the policy
    builds the bio.tools value from GitHub, the GitHub set is mapped back to
    ``LanguageEnum`` values; unknown languages are skipped with a log entry.

    Parameters
    ----------
    gh_languages : Language | None
        GitHub languages object, or ``None`` if no language data is available.
    bt_languages : list[LanguageEnum] | None
        Existing bio.tools language annotations for the tool, or ``None`` if
        no languages are currently recorded in bio.tools.

    Returns
    -------
    list[LanguageEnum] | None
        The reconciled list of bio.tools language enums following the policy.
        When GitHub provides no language data, ``bt_languages`` is returned
        unchanged, which may be ``None``.
    """
    gh_norm = _to_lang_set_gh(gh_languages)
    bt_norm = _to_lang_set_bt(bt_languages)

    # If GitHub has nothing, keep the existing bio.tools value. The
    # normalizer yields None for missing or empty data, so a truthiness
    # check covers both cases.
    if not gh_norm:
        logger.unchanged("No GitHub languages found, nothing to map.")
        return bt_languages

    # Use the generic reconciler on the *set* representation.
    return reconcile_gh_over_bt(
        gh_norm=gh_norm,
        bt_norm=bt_norm,
        bt_value=bt_languages,
        build_bt_from_gh=_cast_to_biotools_languages,
        log_label="languages",
    )