# Source code for bridge.builders.europe_pmc.europe_pmc_transformer

"""
Transformer converting raw Europe PMC API JSON into a Publication model.
"""

import re

from bridge.builders.protocols import Transformer
from bridge.core import Publication
from bridge.core.publication import Author
from bridge.services import EuropePMCIngestor



# [docs]
class EuropePMCTransformer(Transformer):
    """
    Transform raw data from EuropePMCIngestor into a Publication model.

    Parameters
    ----------
    ingestor : EuropePMCIngestor
        An instance of EuropePMCIngestor to fetch raw publication data.

    Attributes
    ----------
    ingestor : EuropePMCIngestor
        The ingestor instance used to fetch raw publication data.
    """

    # "<start> <sep> <end>", where <sep> is an ASCII hyphen or an en dash.
    # Compiled once here instead of on every call.
    _PAGE_RANGE_RE = re.compile(r"^\s*([\w\-]+)\s*[-–]\s*([\w\-]+)\s*$")

    def __init__(self, ingestor: EuropePMCIngestor):
        self.ingestor = ingestor

    async def transform(self) -> Publication:
        """
        Fetch raw Europe PMC data and transform it into a Publication model.

        The raw record is obtained by awaiting ``self.ingestor.fetch()``;
        this method takes no arguments of its own.

        Returns
        -------
        Publication
            The transformed Publication model.

        Raises
        ------
        ValueError
            If ``pubYear`` is present and non-blank but is not an integer.
        """
        raw_data = await self.ingestor.fetch()

        page_start, page_end = self._get_page_range(raw_data)

        return Publication(
            doi=raw_data.get("doi"),
            title=raw_data.get("title"),
            authors=self._get_authors(raw_data),
            year=self._get_year(raw_data),
            journal=raw_data.get("journalTitle"),
            volume=raw_data.get("journalVolume"),
            issue=raw_data.get("issue"),
            page_start=page_start,
            page_end=page_end,
        )

    def _get_year(self, raw_data: dict) -> int | None:
        """
        Extract the publication year from raw Europe PMC data.

        Parameters
        ----------
        raw_data : dict
            The raw data from Europe PMC.

        Returns
        -------
        int or None
            The value of ``pubYear`` as an integer, or None when the field
            is missing, null, or blank.

        Raises
        ------
        ValueError
            If ``pubYear`` holds a non-blank value that is not an integer.
        """
        raw_year = raw_data.get("pubYear")
        # int(None) raises TypeError, so a record without a year used to
        # abort the whole transform. Report "unknown" instead.
        # NOTE(review): assumes Publication.year accepts None -- confirm
        # against the model definition.
        if raw_year is None or not str(raw_year).strip():
            return None
        # Malformed (non-numeric) values still raise, as before.
        return int(raw_year)

    def _get_authors(self, raw_data: dict) -> list[Author]:
        """
        Extract authors from raw Europe PMC data.

        Each entry under ``authorList.author`` becomes an Author built from
        ``firstName``/``lastName`` when either is set, otherwise from
        ``fullName``. Entries with none of these fields are skipped.

        Parameters
        ----------
        raw_data : dict
            The raw data from Europe PMC.

        Returns
        -------
        list[Author]
            A list of Author models, in the order they appear in the record.
        """
        # The `or` fallbacks also cover keys that are present but null,
        # which a plain .get(key, default) would pass through as None.
        author_entries = (raw_data.get("authorList") or {}).get("author") or []

        authors = []
        for entry in author_entries:
            first_name = entry.get("firstName")
            last_name = entry.get("lastName")
            if first_name or last_name:
                authors.append(Author(first_name=first_name, last_name=last_name))
            elif entry.get("fullName"):
                authors.append(Author(name=entry["fullName"]))
        return authors

    def _get_page_range(self, raw_data: dict) -> tuple[str | None, str | None]:
        """
        Extract the page range from raw Europe PMC data.

        Parameters
        ----------
        raw_data : dict
            The raw data from Europe PMC.

        Returns
        -------
        tuple[str or None, str or None]
            A tuple containing the start and end pages. If ``pageInfo`` is
            missing, is not a string, or is not of the form
            ``<start>-<end>``, both values are None.
        """
        page_info = raw_data.get("pageInfo")
        # Guard against non-string values, which would make the regex
        # match raise TypeError.
        if not isinstance(page_info, str):
            return (None, None)

        match = self._PAGE_RANGE_RE.match(page_info)
        if match is None:
            return (None, None)
        return (match.group(1), match.group(2))