Source code for bridge.services.huggingface.huggingface_provider
"""
LLM provider wrapping huggingface_hub’s InferenceClient for chat-style text generation used by pipelines.
"""
import asyncio
import logging
from typing import Literal

from huggingface_hub import InferenceClient

from bridge.config import settings
from bridge.services.protocols import ChatMessage, LLMProvider

logger = logging.getLogger(__name__)

# Type for Hugging Face inference providers.
type HF_Provider = Literal[
    "black-forest-labs",
    "cerebras",
    "clarifai",
    "cohere",
    "fal-ai",
    "featherless-ai",
    "fireworks-ai",
    "groq",
    "hf-inference",
    "hyperbolic",
    "nebius",
    "novita",
    "nscale",
    "openai",
    "publicai",
    "replicate",
    "sambanova",
    "scaleway",
    "together",
    "zai-org",
    "auto",
] | None


class HuggingFaceProvider(LLMProvider):
    """
    Hugging Face provider for chat-capable models using InferenceClient.

    Supports models compatible with the chat.completions API.

    Parameters
    ----------
    model : str
        The Hugging Face model identifier to use for chat generation.
        Default is "Qwen/Qwen3-8B" (https://huggingface.co/Qwen/Qwen3-8B).
    provider : HF_Provider
        The inference provider to use. Default is "featherless-ai".
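
    Examples
    --------
    A minimal construction sketch, assuming a Hugging Face token is already
    configured in `settings` (calls are skipped under doctest because they
    require credentials):

    >>> provider = HuggingFaceProvider()  # doctest: +SKIP
    >>> provider = HuggingFaceProvider(
    ...     model="Qwen/Qwen3-8B", provider="hf-inference"
    ... )  # doctest: +SKIP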
"""
    def __init__(self, model: str = "Qwen/Qwen3-8B", provider: HF_Provider = "featherless-ai"):
        settings.require_huggingface_token()
        self.model = model
        self._client = InferenceClient(provider=provider, model=model, token=settings.huggingface_token)

    async def generate(self, messages: list[ChatMessage], **kwargs) -> ChatMessage:
        """
        Generate a chat-based response from the model.

        Parameters
        ----------
        messages : list[ChatMessage]
            A list of chat messages forming the conversation history.
        **kwargs
            Additional generation parameters such as `max_new_tokens` and `temperature`.

        Returns
        -------
        ChatMessage
            The generated chat message (response) from the model.

        Raises
        ------
        ValueError
            If `messages` is empty.
        RuntimeError
            If the model response is missing required fields.
        Exception
            For any other errors during generation.
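
        Examples
        --------
        A single-turn call sketch, assuming a configured token and no event
        loop already running (skipped under doctest because it performs a
        network request):

        >>> provider = HuggingFaceProvider()  # doctest: +SKIP
        >>> reply = asyncio.run(
        ...     provider.generate([ChatMessage(role="user", content="Hello")])
        ... )  # doctest: +SKIP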
"""
        if not messages:
            raise ValueError("`messages` cannot be empty — provide a chat-style message list.")
        try:
            # InferenceClient is synchronous; run the blocking call in a worker
            # thread so the event loop is not blocked.
            response = await asyncio.to_thread(
                self._client.chat.completions.create,
                model=self.model,
                messages=[m.model_dump() for m in messages],
                max_tokens=kwargs.get("max_new_tokens", 500),
                temperature=kwargs.get("temperature", 0.7),
            )
            message = response.choices[0].message
            role = getattr(message, "role", None)
            content = getattr(message, "content", None)
            if not role:
                raise RuntimeError("Response from model missing 'role' field.")
            if not content:
                raise RuntimeError("Empty response received from model.")
            return ChatMessage(role=role, content=content)
        except Exception as e:
            logger.error("Chat generation failed for model '%s': %s", self.model, e)
            raise
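

# A minimal usage sketch showing how a pipeline might drive this provider.
# It assumes `settings.huggingface_token` is configured and that the default
# model is reachable through the default "featherless-ai" provider.
if __name__ == "__main__":

    async def _demo() -> None:
        provider = HuggingFaceProvider()
        reply = await provider.generate(
            [ChatMessage(role="user", content="Say hello in one sentence.")],
            max_new_tokens=100,
            temperature=0.2,
        )
        print(f"{reply.role}: {reply.content}")

    asyncio.run(_demo())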