Source code for bridge.services.github.github_ingestor

"""
Async client for the GitHub API.
Fetches repository metadata (repository object, latest release, GitHub Pages,
README and languages) and returns the raw JSON for each.
"""

import logging
from typing import Any

import httpx

from bridge.config import settings
from bridge.services.protocols import Ingestor

from .github_auth import get_github_headers

logger = logging.getLogger(__name__)


class GitHubIngestor(Ingestor):
    """
    Ingest GitHub repository metadata via the GitHub REST API (raw JSON).

    Parameters
    ----------
    owner : str
        Owner segment of the repository path (``/repos/{owner}/{repo}``).
    repo : str
        Repository name segment of the repository path.
    """

    # Version header sent with the optional-resource requests
    # (latest release, languages, pages, readme).
    _API_VERSION_HEADERS = {"X-GitHub-Api-Version": "2022-11-28"}

    def __init__(self, owner: str, repo: str):
        self.owner = owner
        self.repo = repo

    def _repo_url(self, suffix: str = "") -> str:
        """Build the API URL for this repository, optionally with a sub-resource suffix."""
        return f"{settings.github_api_base}/repos/{self.owner}/{self.repo}{suffix}"

    async def _get(
        self,
        url: str,
        *,
        params: dict | None = None,
        headers: dict | None = None,
    ) -> dict[str, Any]:
        """
        Issue a GET request and return the decoded JSON body.

        Parameters
        ----------
        url : str
            Absolute URL to request.
        params : dict | None
            Optional query-string parameters.
        headers : dict | None
            Extra headers layered over ``get_github_headers()``. Any ``Accept``
            value supplied here is replaced: the final ``Accept`` header is
            always derived from the base headers plus the topics media type.

        Returns
        -------
        dict
            Parsed JSON response.

        Raises
        ------
        httpx.RequestError
            On a transport-level failure (logged at error level, then re-raised).
        httpx.HTTPStatusError
            When ``raise_for_status()`` rejects the response (logged at warning
            level, then re-raised).
        """
        base_headers = get_github_headers()
        # Ask for topics in the repo payload as well (mercy preview still matters in practice).
        want_topics = "application/vnd.github.mercy-preview+json"
        accept = base_headers.get("Accept", "application/vnd.github+json")
        merged = {
            **base_headers,
            **(headers or {}),
            # Alternative media types are comma-separated; a semicolon would
            # introduce a media-type parameter and produce a malformed value.
            "Accept": f"{accept}, {want_topics}",
        }

        try:
            async with httpx.AsyncClient(timeout=15) as client:
                resp = await client.get(url, headers=merged, params=params)
                resp.raise_for_status()
                return resp.json()
        except httpx.RequestError as e:
            logger.error(f"Network error while fetching {url}: {e}")
            raise
        except httpx.HTTPStatusError as e:
            logger.warning(f"HTTP error {e.response.status_code} from {url}")
            raise

    async def _get_or_none(
        self, url: str, *, what: str, missing_msg: str
    ) -> dict[str, Any] | None:
        """
        GET an optional resource, mapping a 404 response to ``None``.

        Parameters
        ----------
        url : str
            Absolute URL to request.
        what : str
            Short label used in the debug log line before the request.
        missing_msg : str
            Message logged at info level when the resource does not exist.

        Returns
        -------
        dict | None
            Parsed JSON, or None when the server answers 404.

        Raises
        ------
        httpx.HTTPStatusError
            For any rejected status other than 404.
        httpx.RequestError
            On a transport-level failure.
        """
        logger.debug(f"Fetching {what}: {url}")
        try:
            return await self._get(url, headers=self._API_VERSION_HEADERS)
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                logger.info(missing_msg)
                return None
            raise

    async def fetch(self) -> dict[str, Any]:
        """
        Fetch all repository metadata this ingestor knows about.

        The individual endpoints are requested one after another, in the order
        repository, latest release, GitHub Pages, README, languages.

        Returns
        -------
        dict
            Mapping with the keys ``"repo"``, ``"latest_release"``,
            ``"github_pages"``, ``"readme"`` and ``"languages"``. Every value
            except ``"repo"`` may be None when the corresponding resource
            does not exist (404).
        """
        repo_data = await self.fetch_repo()
        latest_release_data = await self.fetch_latest_release()
        github_pages = await self.fetch_github_pages()
        readme = await self.fetch_readme()
        languages = await self.fetch_languages()

        return {
            "repo": repo_data,
            "latest_release": latest_release_data,
            "github_pages": github_pages,
            "readme": readme,
            "languages": languages,
        }

    async def fetch_repo(self) -> dict[str, Any]:
        """
        Fetch the full repository object (raw JSON).

        Returns
        -------
        dict
            Raw JSON for the repository from GET /repos/{owner}/{repo}.

        Raises
        ------
        httpx.HTTPStatusError
            For any rejected status, including 404 (the repository itself is
            not treated as optional).
        """
        url = self._repo_url()
        logger.debug(f"Fetching repository: {url}")
        data = await self._get(url)
        logger.info(f"Ingested data for {self.owner}/{self.repo} successfully")
        return data

    async def fetch_latest_release(self) -> dict[str, Any] | None:
        """
        Fetch the latest release (raw JSON) or return None if the repo has no releases.

        Returns
        -------
        dict | None
            Latest release JSON, or None when GitHub returns 404 (no releases).
        """
        return await self._get_or_none(
            self._repo_url("/releases/latest"),
            what="latest release",
            missing_msg=f"No releases for {self.owner}/{self.repo}",
        )

    async def fetch_languages(self) -> dict[str, Any] | None:
        """
        Fetch the programming languages used in the repository.

        Returns
        -------
        dict | None
            Raw JSON for the languages from GET /repos/{owner}/{repo}/languages,
            or None when GitHub returns 404.
        """
        return await self._get_or_none(
            self._repo_url("/languages"),
            what="languages",
            missing_msg=f"No languages found for {self.owner}/{self.repo}",
        )

    async def fetch_github_pages(self) -> dict[str, Any] | None:
        """
        Fetch the GitHub Pages information for the repository.

        Returns
        -------
        dict | None
            Raw JSON for the GitHub Pages from GET /repos/{owner}/{repo}/pages,
            or None when GitHub returns 404.
        """
        return await self._get_or_none(
            self._repo_url("/pages"),
            what="GitHub pages",
            missing_msg=f"No GitHub pages found for {self.owner}/{self.repo}",
        )

    async def fetch_readme(self) -> str | None:
        """
        Fetch the README content (decoded) or return None if not found.

        Returns
        -------
        str | None
            README text obtained by base64-decoding the ``content`` field of
            the response and decoding the bytes as UTF-8 (invalid sequences
            are replaced), or None when GitHub returns 404.
        """
        import base64

        data = await self._get_or_none(
            self._repo_url("/readme"),
            what="README",
            missing_msg=f"No README found for {self.owner}/{self.repo}",
        )
        if data is None:
            return None

        content_bytes = base64.b64decode(data.get("content", ""))
        content_str = content_bytes.decode("utf-8", errors="replace")
        logger.info(f"Ingested README for {self.owner}/{self.repo} successfully")
        return content_str

    async def get_user(self, username: str) -> dict[str, Any]:
        """
        Fetch a GitHub user by username.

        Parameters
        ----------
        username : str
            GitHub username.

        Returns
        -------
        dict
            Raw JSON for the user from GET /users/{username}.
        """
        url = f"{settings.github_api_base}/users/{username}"
        return await self._get(url)