diff --git a/backend/services/llm_providers/huggingface_provider.py b/backend/services/llm_providers/huggingface_provider.py index 0606aa0e..e7f926a0 100644 --- a/backend/services/llm_providers/huggingface_provider.py +++ b/backend/services/llm_providers/huggingface_provider.py @@ -1,62 +1,25 @@ """ -Hugging Face Provider Module for ALwrity +Hugging Face Provider Module for ALwrity. -This module provides functions for interacting with Hugging Face's Inference Providers API -using the Responses API (beta) which provides a unified interface for model interactions. - -Key Features: -- Text response generation with retry logic -- Structured JSON response generation with schema validation -- Comprehensive error handling and logging -- Automatic API key management -- Support for various Hugging Face models via Inference Providers - -Best Practices: -1. Use structured output for complex, multi-field responses -2. Keep schemas simple and flat to avoid truncation -3. Set appropriate token limits (8192 for complex outputs) -4. Use low temperature (0.1-0.3) for consistent structured output -5. Implement proper error handling in calling functions -6. Use the Responses API for better compatibility - -Usage Examples: - # Text response - result = huggingface_text_response(prompt, temperature=0.7, max_tokens=2048) - - # Structured JSON response - schema = { - "type": "object", - "properties": { - "tasks": { - "type": "array", - "items": {"type": "object", "properties": {...}} - } - } - } - result = huggingface_structured_json_response(prompt, schema, temperature=0.2, max_tokens=8192) - -Dependencies: -- openai (for Hugging Face Responses API) -- tenacity (for retry logic) -- logging (for debugging) -- json (for fallback parsing) - -Author: ALwrity Team -Version: 1.0 -Last Updated: January 2025 +Provides text and structured JSON generation through Hugging Face Router +(OpenAI-compatible API), with retry and explicit fallback controls. """ +<<<<<<< HEAD <<<<<<< HEAD import os ======= import hashlib >>>>>>> pr-419 +======= +>>>>>>> pr-437 import json import os import re <<<<<<< HEAD <<<<<<< HEAD from functools import lru_cache +<<<<<<< HEAD from typing import Optional, Dict, Any ======= from typing import Optional, Dict, Any, List, Iterable @@ -66,49 +29,38 @@ import time from threading import Lock from typing import Optional, Dict, Any >>>>>>> pr-419 +======= +from typing import Any, Dict, List, Optional + +from tenacity import retry, retry_if_exception, stop_after_attempt, wait_random_exponential +>>>>>>> pr-437 -from loguru import logger from utils.logger_utils import get_service_logger from .routing_policy import PREMIUM_DEFAULT_MODEL, SIF_LOW_COST_MODEL_DEFAULTS -# Use service-specific logger to avoid conflicts logger = get_service_logger("huggingface_provider") -<<<<<<< HEAD -from tenacity import ( - retry, - retry_if_exception, - stop_after_attempt, - wait_random_exponential, -) -======= ->>>>>>> pr-416 - try: - from openai import OpenAI - OPENAI_AVAILABLE = True -except ImportError: - OPENAI_AVAILABLE = False - logger.warn("OpenAI library not available. Install with: pip install openai") + from openai import NotFoundError, OpenAI + + OPENAI_AVAILABLE = True +except ImportError: # pragma: no cover - environment-dependent + OPENAI_AVAILABLE = False + OpenAI = None + NotFoundError = Exception + logger.warning("OpenAI library not available. Install with: pip install openai") -<<<<<<< HEAD HF_FALLBACK_MODELS = [ -<<<<<<< HEAD - "openai/gpt-oss-120b:cerebras", - "moonshotai/Kimi-K2-Instruct-0905:cerebras", - "meta-llama/Llama-3.1-8B-Instruct:cerebras", - "mistralai/Mistral-7B-Instruct-v0.3:cerebras", -======= PREMIUM_DEFAULT_MODEL, "moonshotai/Kimi-K2-Instruct-0905:groq", "meta-llama/Llama-3.1-8B-Instruct:groq", SIF_LOW_COST_MODEL_DEFAULTS[0], ->>>>>>> pr-417 ] _HF_CLIENT_CACHE: Dict[str, Any] = {} _HF_CLIENT_CACHE_LOCK = Lock() +<<<<<<< HEAD def _masked_key_id(api_key: str) -> str: return hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:12] @@ -134,14 +86,23 @@ def get_huggingface_client(api_key: str): def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True): """Yield model ids to try for a single logical model preference.""" +======= +def _candidate_model_variants(model: str): + """Yield model IDs to try for a single logical model preference.""" +>>>>>>> pr-437 if not model: return - # Try configured model first (supports provider suffixes like ":cerebras") + # Try configured model first (supports provider suffixes like ':groq'). yield model +<<<<<<< HEAD # Fallback to base repo id when provider suffix is not recognized by the router if allow_model_variant_fallback and ":" in model: +======= + # Fallback to base repo id when provider suffix isn't recognized. + if ":" in model: +>>>>>>> pr-437 base_model = model.split(":", 1)[0] if base_model: yield base_model @@ -149,12 +110,16 @@ def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = T <<<<<<< HEAD def _fallback_model_sequence(model: str, fallback_models: Optional[List[str]] = None): - # IMPORTANT: Do not apply implicit global fallback chains. - # Callers must explicitly provide fallback_models when they want multi-model retries. + """Yield unique model candidates preserving caller-defined order. + + IMPORTANT: no implicit global fallback chain is applied here; callers must + explicitly pass fallback_models if they want multi-model retries. + """ if fallback_models: sequence = [model] + fallback_models else: sequence = [model] +<<<<<<< HEAD ======= def _fallback_model_sequence( model: str, @@ -170,6 +135,9 @@ def _fallback_model_sequence( sequence = [model] + list(fallback_models) >>>>>>> pr-418 +======= + +>>>>>>> pr-437 seen = set() for preferred_model in sequence: for candidate in _candidate_model_variants( @@ -182,11 +150,9 @@ def _fallback_model_sequence( def _is_non_retryable_hf_error(exc: Exception) -> bool: - """Skip retries for deterministic HF failures (e.g., unknown model ids, billing).""" msg = str(exc).lower() status = getattr(exc, "status_code", None) - - # Non-retryable errors + if isinstance(exc, NotFoundError) or "not found" in msg or "404" in msg: return True if status == 402 or "402" in msg or "depleted" in msg or "credits" in msg: @@ -195,7 +161,6 @@ def _is_non_retryable_hf_error(exc: Exception) -> bool: return True if status == 403 or "forbidden" in msg or "403" in msg: return True - return False @@ -204,7 +169,6 @@ def _should_retry_hf_error(exc: Exception) -> bool: def _classify_hf_error(exc: Exception) -> str: - """Classify HF failures for actionable logs.""" msg = str(exc).lower() if any(token in msg for token in ["insufficient", "balance", "quota", "billing", "payment", "402"]): return "billing_or_quota" @@ -215,62 +179,30 @@ def _classify_hf_error(exc: Exception) -> str: return "unknown" -def _hf_error_details(exc: Exception) -> str: - """Return compact, actionable exception details for logs.""" - status = getattr(exc, "status_code", None) - err_type = type(exc).__name__ - message = str(exc) - raw_body = getattr(exc, "body", None) - details = f"type={err_type}" - if status is not None: - details += f", status={status}" - if message: - details += f", message={message}" - if raw_body: - details += f", body={raw_body}" - details += f", repr={repr(exc)}" - return details - -def get_huggingface_api_key() -> str: -======= - - -def _classify_hf_error(error: Exception) -> str: - message = str(error or "").lower() - if any(x in message for x in ["insufficient", "quota", "billing", "payment", "credits", "balance"]): - return "billing_or_quota" - if any(x in message for x in ["unauthorized", "forbidden", "permission", "invalid api key", "authentication"]): - return "auth_or_permission" - if ("not found" in message) or ("404" in message): - return "model_not_found" - return "other" - - -def _error_details(error: Exception) -> Dict[str, str]: +def _error_details(exc: Exception) -> Dict[str, str]: return { - "type": type(error).__name__, - "message": str(error), - "repr": repr(error), + "type": type(exc).__name__, + "message": str(exc), + "repr": repr(exc), } def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str: ->>>>>>> pr-416 - """Get Hugging Face API key with proper error handling.""" - api_key = explicit_api_key or os.getenv('HF_TOKEN') + """Get Hugging Face API key with basic validation.""" + api_key = explicit_api_key or os.getenv("HF_TOKEN") if not api_key: error_msg = "HF_TOKEN environment variable is not set. Please set it in your .env file." logger.error(error_msg) raise ValueError(error_msg) - - # Validate API key format (basic check) - if not api_key.startswith('hf_'): + + if not api_key.startswith("hf_"): error_msg = "HF_TOKEN appears to be invalid. It should start with 'hf_'." logger.error(error_msg) raise ValueError(error_msg) - + return api_key +<<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD @retry( @@ -279,11 +211,15 @@ def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str: stop=stop_after_attempt(6), ) ======= +======= + +>>>>>>> pr-437 @lru_cache(maxsize=16) def _get_hf_client(api_key: str): return OpenAI(base_url="https://router.huggingface.co/v1", api_key=api_key) +<<<<<<< HEAD >>>>>>> pr-416 ======= @retry( @@ -292,14 +228,17 @@ def _get_hf_client(api_key: str): reraise=True, ) >>>>>>> pr-419 +======= +@retry( + retry=retry_if_exception(_should_retry_hf_error), + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), +) +>>>>>>> pr-437 def huggingface_text_response( prompt: str, -<<<<<<< HEAD - model: str = "openai/gpt-oss-120b:cerebras", - fallback_models: Optional[List[str]] = None, -======= model: str = PREMIUM_DEFAULT_MODEL, ->>>>>>> pr-417 + fallback_models: Optional[List[str]] = None, temperature: float = 0.7, max_tokens: int = 2048, top_p: float = 0.9, @@ -311,48 +250,11 @@ def huggingface_text_response( allow_model_variant_fallback: bool = True, >>>>>>> pr-418 ) -> str: - """ - Generate text response using Hugging Face Inference Providers API. - - This function uses the Hugging Face Responses API which provides a unified interface - for model interactions with built-in retry logic and error handling. - - Args: - prompt (str): The input prompt for the AI model - model (str): Hugging Face model identifier (default: "openai/gpt-oss-120b:groq") - temperature (float): Controls randomness (0.0-1.0) - max_tokens (int): Maximum tokens in response - top_p (float): Nucleus sampling parameter (0.0-1.0) - system_prompt (str, optional): System instruction for the model - - Returns: - str: Generated text response - - Raises: - Exception: If API key is missing or API call fails - - Best Practices: - - Use appropriate temperature for your use case (0.7 for creative, 0.1-0.3 for factual) - - Set max_tokens based on expected response length - - Use system_prompt to guide model behavior - - Handle errors gracefully in calling functions - - Example: - result = huggingface_text_response( - prompt="Write a blog post about AI", -<<<<<<< HEAD - model="openai/gpt-oss-120b:cerebras", -======= - model=PREMIUM_DEFAULT_MODEL, ->>>>>>> pr-417 - temperature=0.7, - max_tokens=2048, - system_prompt="You are a professional content writer." - ) - """ + """Generate text with explicit fallback model sequence.""" try: if not OPENAI_AVAILABLE: raise ImportError("OpenAI library not available. Install with: pip install openai") +<<<<<<< HEAD # Get API key with proper error handling api_key = get_huggingface_api_key(api_key) @@ -376,23 +278,18 @@ def huggingface_text_response( client = get_huggingface_client(api_key) >>>>>>> pr-419 logger.info("✅ Hugging Face client initialized for text response") +======= +>>>>>>> pr-437 + + hf_api_key = get_huggingface_api_key(api_key) + client = _get_hf_client(hf_api_key) - # Prepare input for the API messages = [] - - # Add system prompt if provided if system_prompt: - messages.append({ - "role": "system", - "content": system_prompt - }) - - # Add user prompt - messages.append({ - "role": "user", - "content": prompt - }) + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) +<<<<<<< HEAD # Add debugging for API call logger.info( "Hugging Face text call | model={} | prompt_len={} | temp={} | top_p={} | max_tokens={}", @@ -496,20 +393,32 @@ def huggingface_text_response( logger.error(f"🔍 HF Error Diagnostics:") logger.error(f" - Status: {e.response.status_code}") logger.error(f" - Headers: {dict(e.response.headers)}") - try: - body_json = e.response.json() - logger.error(f" - Body JSON: {json.dumps(body_json, indent=2)}") - except Exception: - logger.error(f" - Body Raw: {e.response.text[:1000]}") - else: - logger.error(f"🔍 No HTTP response attached to exception object.") - ======= - details = _error_details(e) - logger.error("❌ Hugging Face text generation failed | error_class={} | type={} | message={} | repr={}", error_class, details["type"], details["message"], details["repr"]) ->>>>>>> pr-416 - raise Exception(f"Hugging Face text generation failed: {str(e)}") + response = None + last_error = None + for candidate_model in _fallback_model_sequence(model, fallback_models): +>>>>>>> pr-437 + try: + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + ) + if candidate_model != model: + logger.warning("HF text fallback model used: {}", candidate_model) + break + except NotFoundError as nf_err: + last_error = nf_err + logger.warning("HF text model not found: {}", candidate_model) + continue + except Exception as call_err: + last_error = call_err + logger.warning("HF text call failed for model {}: {}", candidate_model, _error_details(call_err)) + continue +<<<<<<< HEAD <<<<<<< HEAD ======= @retry( @@ -518,15 +427,38 @@ def huggingface_text_response( reraise=True, ) >>>>>>> pr-419 +======= + if response is None: + raise last_error or RuntimeError("All fallback models failed") + + generated_text = response.choices[0].message.content or "" + generated_text = re.sub(r"```[a-zA-Z]*\n?", "", generated_text) + generated_text = re.sub(r"```\n?", "", generated_text).strip() + return generated_text + + except Exception as exc: + details = _error_details(exc) + logger.error( + "❌ Hugging Face text generation failed | error_class={} | type={} | message={} | repr={}", + _classify_hf_error(exc), + details["type"], + details["message"], + details["repr"], + ) + raise Exception(f"Hugging Face text generation failed: {exc}") from exc + + +@retry( + retry=retry_if_exception(_should_retry_hf_error), + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), +) +>>>>>>> pr-437 def huggingface_structured_json_response( prompt: str, schema: Dict[str, Any], -<<<<<<< HEAD - model: str = "openai/gpt-oss-120b:cerebras", - fallback_models: Optional[List[str]] = None, -======= model: str = PREMIUM_DEFAULT_MODEL, ->>>>>>> pr-417 + fallback_models: Optional[List[str]] = None, temperature: float = 0.7, max_tokens: int = 8192, system_prompt: Optional[str] = None, @@ -537,54 +469,11 @@ def huggingface_structured_json_response( allow_model_variant_fallback: bool = True, >>>>>>> pr-418 ) -> Dict[str, Any]: - """ - Generate structured JSON response using Hugging Face Inference Providers API. - - This function uses the Hugging Face Responses API with structured output support - to generate JSON responses that match a provided schema. - - Args: - prompt (str): The input prompt for the AI model - schema (dict): JSON schema defining the expected output structure - model (str): Hugging Face model identifier (default: "openai/gpt-oss-120b:groq") - temperature (float): Controls randomness (0.0-1.0). Use 0.1-0.3 for structured output - max_tokens (int): Maximum tokens in response. Use 8192 for complex outputs - system_prompt (str, optional): System instruction for the model - - Returns: - dict: Parsed JSON response matching the provided schema - - Raises: - Exception: If API key is missing or API call fails - - Best Practices: - - Keep schemas simple and flat to avoid truncation - - Use low temperature (0.1-0.3) for consistent structured output - - Set max_tokens to 8192 for complex multi-field responses - - Avoid deeply nested schemas with many required fields - - Test with smaller outputs first, then scale up - - Example: - schema = { - "type": "object", - "properties": { - "tasks": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "description": {"type": "string"} - } - } - } - } - } - result = huggingface_structured_json_response(prompt, schema, temperature=0.2, max_tokens=8192) - """ + """Generate structured JSON with explicit fallback model sequence.""" try: if not OPENAI_AVAILABLE: raise ImportError("OpenAI library not available. Install with: pip install openai") +<<<<<<< HEAD # Get API key with proper error handling api_key = get_huggingface_api_key(api_key) @@ -609,25 +498,18 @@ def huggingface_structured_json_response( client = get_huggingface_client(api_key) >>>>>>> pr-419 logger.info("✅ Hugging Face client initialized for structured JSON response") +======= +>>>>>>> pr-437 + + hf_api_key = get_huggingface_api_key(api_key) + client = _get_hf_client(hf_api_key) - # Prepare input for the API messages = [] - - # Add system prompt if provided if system_prompt: - messages.append({ - "role": "system", - "content": system_prompt - }) - - # Add user prompt with JSON instruction - # For HF models, explicit JSON instruction in prompt is often better than response_format - json_instruction = "Please respond with valid JSON that matches the provided schema." - messages.append({ - "role": "user", - "content": f"{prompt}\n\n{json_instruction}" - }) + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) +<<<<<<< HEAD # Add debugging for API call logger.info( "Hugging Face structured call | model={} | prompt_len={} | schema_kind={} | temp={} | max_tokens={}", @@ -753,12 +635,37 @@ def huggingface_structured_json_response( fallback_attempt += 1 started_at = time.perf_counter() >>>>>>> pr-419 +======= + response = None + last_error = None + + for candidate_model in _fallback_model_sequence(model, fallback_models): + try: + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + response_format={"type": "json_object"}, + ) + if candidate_model != model: + logger.warning("HF structured fallback model used: {}", candidate_model) + break + except Exception as err: + last_error = err + if isinstance(err, NotFoundError): + logger.warning("HF structured model not found: {}", candidate_model) + continue + + msg = str(err).lower() + if "422" in msg or "not supported" in msg: +>>>>>>> pr-437 try: response = client.chat.completions.create( model=candidate_model, messages=messages, temperature=temperature, - max_tokens=max_tokens + max_tokens=max_tokens, ) elapsed_ms = (time.perf_counter() - started_at) * 1000 logger.debug( @@ -768,8 +675,9 @@ def huggingface_structured_json_response( elapsed_ms, ) if candidate_model != model: - logger.warning("HF structured no-response_format fallback model: {}", candidate_model) + logger.warning("HF structured fallback(no response_format) model: {}", candidate_model) break +<<<<<<< HEAD except NotFoundError as nf_err: last_error = nf_err elapsed_ms = (time.perf_counter() - started_at) * 1000 @@ -780,25 +688,16 @@ def huggingface_structured_json_response( elapsed_ms, ) logger.warning("HF structured model not found (no response_format path): {}", candidate_model) - continue ======= - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - response_format={"type": "json_object"} - ) - except Exception as e: - details = _error_details(e) - logger.error("❌ Hugging Face API call failed | error_class={} | type={} | message={} | repr={}", _classify_hf_error(e), details["type"], details["message"], details["repr"]) - raise ->>>>>>> pr-416 + except Exception as second_err: + last_error = second_err +>>>>>>> pr-437 + continue - response_text = response.choices[0].message.content + if response is None: + raise last_error or RuntimeError("All fallback models failed") - # Clean up response text if needed - response_text = response_text.strip() + response_text = (response.choices[0].message.content or "").strip() if response_text.startswith("```json"): response_text = response_text[7:] if response_text.endswith("```"): @@ -806,57 +705,37 @@ def huggingface_structured_json_response( response_text = response_text.strip() try: - parsed_json = json.loads(response_text) - logger.info("✅ Hugging Face structured JSON response parsed successfully") - return parsed_json - except json.JSONDecodeError as json_err: - logger.error(f"❌ JSON parsing failed: {json_err}") - logger.error(f"Raw response: {response_text}") - json_match = re.search(r'\{.*\}', response_text, re.DOTALL) + return json.loads(response_text) + except json.JSONDecodeError: + json_match = re.search(r"\{.*\}", response_text, re.DOTALL) if json_match: - try: - extracted_json = json.loads(json_match.group()) - logger.info("✅ JSON extracted using regex fallback") - return extracted_json - except json.JSONDecodeError: - pass + return json.loads(json_match.group()) return {"error": "Failed to parse JSON response", "raw_response": response_text} - - except Exception as e: - error_msg = str(e) if str(e) else repr(e) - error_type = type(e).__name__ - details = _error_details(e) - logger.error("❌ Hugging Face structured JSON generation failed | error_class={} | type={} | message={} | repr={}", _classify_hf_error(e), error_type, details["message"], details["repr"]) - logger.error(f"❌ Full exception details: {repr(e)}") - import traceback - logger.error(f"❌ Traceback: {traceback.format_exc()}") - raise Exception(f"Hugging Face structured JSON generation failed: {error_type}: {error_msg}") + + except Exception as exc: + details = _error_details(exc) + logger.error( + "❌ Hugging Face structured JSON generation failed | error_class={} | type={} | message={} | repr={}", + _classify_hf_error(exc), + details["type"], + details["message"], + details["repr"], + ) + raise Exception(f"Hugging Face structured JSON generation failed: {exc}") from exc + def get_available_models() -> list: - """ - Get list of available Hugging Face models for text generation. - - Returns: - list: List of available model identifiers - """ + """Get list of available Hugging Face models for text generation.""" return [ PREMIUM_DEFAULT_MODEL, "moonshotai/Kimi-K2-Instruct-0905:groq", "Qwen/Qwen2.5-VL-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct:groq", "microsoft/Phi-3-medium-4k-instruct:groq", - SIF_LOW_COST_MODEL_DEFAULTS[0] + SIF_LOW_COST_MODEL_DEFAULTS[0], ] + def validate_model(model: str) -> bool: - """ - Validate if a model identifier is supported. - - Args: - model (str): Model identifier to validate - - Returns: - bool: True if model is supported, False otherwise - """ - available_models = get_available_models() - return model in available_models + """Validate if a model identifier is supported.""" + return model in get_available_models()