Source code for bridge.builders.europe_pmc.europe_pmc_transformer

"""
Transformer converting raw Europe PMC API JSON into a Publication model.
"""

import re

from bridge.builders.protocols import Transformer
from bridge.core import Publication
from bridge.core.publication import Author
from bridge.services import EuropePMCIngestor


class EuropePMCTransformer(Transformer):
    """
    Transform raw data from EuropePMCIngestor into a Publication model.

    Parameters
    ----------
    ingestor : EuropePMCIngestor
        An instance of EuropePMCIngestor to fetch raw publication data.

    Attributes
    ----------
    ingestor : EuropePMCIngestor
        The ingestor instance used to fetch raw publication data.
    """

    # "<start> <sep> <end>" where <sep> is a hyphen or an en dash; surrounding
    # whitespace is tolerated. Compiled once instead of on every call.
    _PAGE_RANGE_RE = re.compile(r"^\s*([\w\-]+)\s*[-–]\s*([\w\-]+)\s*$")

    def __init__(self, ingestor: EuropePMCIngestor):
        self.ingestor = ingestor

    async def transform(self) -> Publication:
        """
        Fetch raw Europe PMC data and transform it into a Publication model.

        The raw record is obtained by awaiting ``self.ingestor.fetch()``; this
        method takes no arguments of its own.

        Returns
        -------
        Publication
            The transformed Publication model.

        Raises
        ------
        TypeError
            If the raw record has no ``"pubYear"`` value (``int(None)``).
        ValueError
            If ``"pubYear"`` cannot be parsed as an integer.
        """
        raw_data = await self.ingestor.fetch()
        page_start, page_end = self._get_page_range(raw_data)

        return Publication(
            doi=raw_data.get("doi"),
            title=raw_data.get("title"),
            authors=self._get_authors(raw_data),
            year=int(raw_data.get("pubYear")),
            journal=raw_data.get("journalTitle"),
            volume=raw_data.get("journalVolume"),
            issue=raw_data.get("issue"),
            page_start=page_start,
            page_end=page_end,
        )

    def _get_authors(self, raw_data: dict) -> list[Author]:
        """
        Extract authors from raw Europe PMC data.

        Entries carrying a ``"lastName"`` or ``"firstName"`` become an Author
        built from those two fields; otherwise an entry with a ``"fullName"``
        becomes an Author built from that name. Entries with none of these
        fields are skipped.

        Parameters
        ----------
        raw_data : dict
            The raw data from Europe PMC.

        Returns
        -------
        list[Author]
            A list of Author models; empty when no author data is present.
        """
        # `or` (rather than a .get default) also covers keys that are present
        # but null, mirroring how _get_page_range treats "pageInfo".
        author_list = raw_data.get("authorList") or {}
        entries = author_list.get("author") or []

        authors = []
        for entry in entries:
            if entry.get("lastName") or entry.get("firstName"):
                authors.append(
                    Author(
                        first_name=entry.get("firstName"),
                        last_name=entry.get("lastName"),
                    )
                )
            elif entry.get("fullName"):
                authors.append(Author(name=entry["fullName"]))
        return authors

    def _get_page_range(self, raw_data: dict) -> tuple[str | None, str | None]:
        """
        Extract the page range from raw Europe PMC data.

        Parameters
        ----------
        raw_data : dict
            The raw data from Europe PMC.

        Returns
        -------
        tuple[str or None, str or None]
            A tuple containing the start and end pages. If the page
            information is missing or is not of the form ``start-end``,
            both values are None.
        """
        page_info = raw_data.get("pageInfo") or ""
        match = self._PAGE_RANGE_RE.match(page_info)
        if match is None:
            return (None, None)
        return (match.group(1), match.group(2))