Source code for bridge.pipelines.shared.publications

"""
Shared publication utilities for pipelines.
"""

from collections.abc import Mapping
from typing import Any

from bridge.core import Publication
from bridge.core.biotools import PublicationItem
from bridge.pipelines.utils import normalize_dict_strings, normalize_text


def ref_ids(ref: Publication | PublicationItem | Mapping[str, Any]) -> set[str]:
    """
    Extract a normalized set of identifiers for a publication-like object.

    Each identifier is returned as a prefixed string (``"doi:..."``,
    ``"pmid:..."``, ``"pmcid:..."``, ``"title:..."``) so that values of
    different kinds cannot collide. The resulting set is meant to be compared
    by intersection when deduplicating references.

    Supports:
    - Europe PMC Publication model
    - bio.tools PublicationItem model
    - plain dict-like mappings

    Normalized identifiers include (when present):
    - DOI (passed through ``normalize_text``)
    - PMID (surrounding whitespace stripped)
    - PMCID (surrounding whitespace stripped)
    - title (passed through ``normalize_text``)

    A field is skipped when it is missing, falsy, or when its normalized
    value is empty (for example a whitespace-only PMID). Emitting a bare
    ``"pmid:"`` token would make unrelated records look like duplicates.

    Parameters
    ----------
    ref : Publication | PublicationItem | Mapping[str, Any]
        The model instance or mapping representing a publication. Mappings
        are read via the keys ``"doi"``, ``"pmid"``, ``"pmcid"`` and
        ``"title"``; models via attributes of the same names.

    Returns
    -------
    set[str]
        A set of normalized identifier strings. The set may be empty if no
        usable identifiers are present.

    Raises
    ------
    TypeError
        If `ref` is neither a `Publication`, a `PublicationItem`, nor a
        mapping.
    """
    field_names = ("doi", "pmid", "pmcid", "title")

    if isinstance(ref, Mapping):
        doi, pmid, pmcid, title = (ref.get(name) for name in field_names)
    elif isinstance(ref, (Publication, PublicationItem)):
        # Models may not define every field, hence the ``None`` default.
        doi, pmid, pmcid, title = (getattr(ref, name, None) for name in field_names)
    else:
        raise TypeError("ref must be a Publication, PublicationItem, or a Mapping")

    ids: set[str] = set()

    def add(prefix: str, value: str) -> None:
        # Only keep identifiers that still carry content after normalization.
        if value:
            ids.add(f"{prefix}:{value}")

    if doi:
        add("doi", normalize_text(str(doi)))
    if pmid:
        add("pmid", str(pmid).strip())
    if pmcid:
        add("pmcid", str(pmcid).strip())
    if title:
        add("title", normalize_text(str(title)))

    return ids
def deduplicate_references(
    references: list[Publication | PublicationItem | Mapping[str, Any]],
) -> list[Publication | PublicationItem | Mapping[str, Any]]:
    """
    Deduplicate a list of publication-like objects based on shared identifiers.

    Two references are considered duplicates if they share at least one
    normalized identifier (DOI, PMID, PMCID, or title) as produced by
    ``ref_ids``. The first occurrence in the input list is retained; all later
    duplicates are dropped.

    Identifiers of dropped duplicates are still remembered, so matching
    chains through them: if B duplicates A and C shares an identifier only
    with B, C is dropped as well.

    References without any identifiers can never be matched and are always
    kept.

    Parameters
    ----------
    references : list[Publication | PublicationItem | Mapping[str, Any]]
        List of references (models or dicts) to deduplicate.

    Returns
    -------
    list[Publication | PublicationItem | Mapping[str, Any]]
        Deduplicated list of references, preserving original order for the
        first occurrence of each logical publication.

    Raises
    ------
    TypeError
        Propagated from ``ref_ids`` when an element is of an unsupported type.
    """
    # One flat set is enough: a reference is a duplicate when it matches *any*
    # earlier group, and its identifiers are remembered either way, so keeping
    # the groups apart would not change the outcome.
    seen_ids: set[str] = set()
    deduplicated: list[Publication | PublicationItem | Mapping[str, Any]] = []

    for ref in references:
        current_ids = ref_ids(ref)

        # No identifiers at all: nothing to compare against, treat as unique.
        if not current_ids:
            deduplicated.append(ref)
            continue

        is_duplicate = not seen_ids.isdisjoint(current_ids)
        # Remember every identifier, including those that only appear on a
        # dropped duplicate, to strengthen future matching.
        seen_ids |= current_ids

        if not is_duplicate:
            deduplicated.append(ref)

    return deduplicated
[docs] def extract_cff_references( citation_cff: dict[str, Any] | None, ) -> tuple[list[dict[str, Any]], dict[str, Any] | None]: """ Extract publication information from CITATION.cff dictionary. This helper parses an existing CFF structure and returns: - the list of reference entries, and - the preferred-citation entry, if present. The preferred citation is ensured to be part of the references list: if it is not already present, it is appended. Parameters ---------- citation_cff : dict[str, Any] | None Parsed content of an existing CITATION.cff file, or ``None`` if no file exists. Returns ------- tuple[list[dict[str, Any]], dict[str, Any] | None] A tuple of: - A list of reference dictionaries extracted from the CFF. - The preferred-citation dictionary, or ``None`` if not present. """ if not citation_cff: return [], None citation_cff = normalize_dict_strings(citation_cff) references = citation_cff.get("references") or [] references = [r for r in references if isinstance(r, Mapping)] preferred = citation_cff.get("preferred-citation") if isinstance(preferred, Mapping): preferred = dict(preferred) # shallow copy else: preferred = None # ensure preferred-citation is included in references if present if preferred is not None: pref_ids = ref_ids(preferred) in_refs = any(ref_ids(r) & pref_ids for r in references) if not in_refs: references.append(preferred) return references, preferred