# Source code for bridge.services.spdx.spdx_ingestor

"""
Async client for the SPDX license list.
Fetches a license by SPDX identifier and returns the full JSON payload,
including the canonical license text.
"""

import logging
from typing import Any
from urllib.parse import quote

import httpx

from bridge.config import settings
from bridge.services.protocols import Ingestor

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class SPDXLicenseNotFoundError(Exception):
    """Raised when no SPDX license matches the given SPDX identifier."""


class SPDXLicenseIngestor(Ingestor):
    """
    Ingest license metadata from the SPDX license list by SPDX identifier.

    This client fetches the JSON representation of a single license from the
    SPDX license list, which includes the canonical license text
    (``licenseText``), name, and other metadata.

    Parameters
    ----------
    spdx_id : str
        SPDX license identifier (e.g. ``"MIT"``, ``"GPL-3.0-only"``).
        Leading and trailing whitespace is stripped.

    Raises
    ------
    ValueError
        If ``spdx_id`` is not a string, or is empty or whitespace-only.
    """

    # Timeout (in seconds) handed to ``httpx.AsyncClient`` for each request.
    _TIMEOUT_SECONDS = 10

    def __init__(self, spdx_id: str):
        # A blank identifier can never name a license, so reject it up front
        # together with the empty string and non-string values.
        if not isinstance(spdx_id, str) or not spdx_id.strip():
            raise ValueError("SPDX identifier must be a non-empty string.")
        self.spdx_id = spdx_id.strip()

    async def _get(self) -> dict[str, Any]:
        """
        Perform async GET request to the SPDX license JSON endpoint.

        Returns
        -------
        dict
            JSON response for the given SPDX license.

        Raises
        ------
        httpx.RequestError
            For network-related issues.
        httpx.HTTPStatusError
            For non-2xx HTTP responses.
        """
        # SPDX per-license JSON: {base}/{ID}.json,
        # e.g. https://spdx.org/licenses/MIT.json
        # Drop any trailing slash on the configured base so the URL never
        # contains "//" before the identifier.
        base = str(settings.spdx_license_base).rstrip("/")
        # Percent-encode the identifier so characters such as "/", "?" or "#"
        # cannot alter the request path; "+" is kept literal because it is
        # used in SPDX identifiers such as "GPL-2.0+".
        encoded_id = quote(self.spdx_id, safe="+")
        url = f"{base}/{encoded_id}.json"

        try:
            logger.debug("Fetching SPDX license JSON: %s", url)
            async with httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS) as client:
                resp = await client.get(url)
                resp.raise_for_status()
                data = resp.json()
            logger.debug(
                "Fetched SPDX license data for ID %s successfully", self.spdx_id
            )
            return data
        except httpx.RequestError as e:
            logger.error("Network error while fetching %s: %s", url, e)
            raise
        except httpx.HTTPStatusError as e:
            logger.warning(
                "HTTP error from SPDX license endpoint: %s for %s",
                e.response.status_code,
                url,
            )
            raise

    async def fetch(self) -> dict[str, Any]:
        """
        Fetch the SPDX license record for the specified SPDX identifier.

        Returns
        -------
        dict
            JSON metadata for the SPDX license. The ``licenseId`` field is
            verified to be present and non-empty; ``name`` and ``licenseText``
            are expected but not checked here.

        Raises
        ------
        SPDXLicenseNotFoundError
            If the SPDX service does not return a valid license record
            (the payload is not a JSON object, or has no ``licenseId``).
        httpx.RequestError, httpx.HTTPStatusError
            For network/HTTP issues.
        """
        data = await self._get()

        # A JSON array/string/number is not a license record; report it as
        # "not found" rather than failing on ``.get`` with AttributeError.
        if not isinstance(data, dict):
            msg = (
                f"SPDX license JSON for ID {self.spdx_id!r} is not a JSON object"
            )
            logger.warning(msg)
            raise SPDXLicenseNotFoundError(msg)

        # Only the presence of a non-empty "licenseId" is required; its value
        # is not compared against the requested identifier.
        license_id = data.get("licenseId")
        if not license_id:
            msg = f"SPDX license JSON for ID {self.spdx_id!r} missing 'licenseId' field"
            logger.warning(msg)
            raise SPDXLicenseNotFoundError(msg)

        logger.info("Ingested SPDX license data for %s successfully", self.spdx_id)
        return data