diff --git a/backend/services/llm_providers/huggingface_provider.py b/backend/services/llm_providers/huggingface_provider.py index 7366c88c..0a0185e6 100644 --- a/backend/services/llm_providers/huggingface_provider.py +++ b/backend/services/llm_providers/huggingface_provider.py @@ -1,106 +1,50 @@ """ -Hugging Face Provider Module for ALwrity +Hugging Face Provider Module for ALwrity. -This module provides functions for interacting with Hugging Face's Inference Providers API -using the Responses API (beta) which provides a unified interface for model interactions. - -Key Features: -- Text response generation with retry logic -- Structured JSON response generation with schema validation -- Comprehensive error handling and logging -- Automatic API key management -- Support for various Hugging Face models via Inference Providers - -Best Practices: -1. Use structured output for complex, multi-field responses -2. Keep schemas simple and flat to avoid truncation -3. Set appropriate token limits (8192 for complex outputs) -4. Use low temperature (0.1-0.3) for consistent structured output -5. Implement proper error handling in calling functions -6. Use the Responses API for better compatibility - -Usage Examples: - # Text response - result = huggingface_text_response(prompt, temperature=0.7, max_tokens=2048) - - # Structured JSON response - schema = { - "type": "object", - "properties": { - "tasks": { - "type": "array", - "items": {"type": "object", "properties": {...}} - } - } - } - result = huggingface_structured_json_response(prompt, schema, temperature=0.2, max_tokens=8192) - -Dependencies: -- openai (for Hugging Face Responses API) -- tenacity (for retry logic) -- logging (for debugging) -- json (for fallback parsing) - -Author: ALwrity Team -Version: 1.0 -Last Updated: January 2025 +Provides text and structured JSON generation through Hugging Face Router +(OpenAI-compatible API), with retry and explicit fallback controls. """ -import os import json +import os import re from functools import lru_cache -from typing import Optional, Dict, Any +from typing import Any, Dict, List, Optional + +from tenacity import retry, retry_if_exception, stop_after_attempt, wait_random_exponential -from loguru import logger from utils.logger_utils import get_service_logger from .routing_policy import PREMIUM_DEFAULT_MODEL, SIF_LOW_COST_MODEL_DEFAULTS -# Use service-specific logger to avoid conflicts logger = get_service_logger("huggingface_provider") -<<<<<<< HEAD -from tenacity import ( - retry, - retry_if_exception, - stop_after_attempt, - wait_random_exponential, -) -======= ->>>>>>> pr-416 - try: - from openai import OpenAI - OPENAI_AVAILABLE = True -except ImportError: - OPENAI_AVAILABLE = False - logger.warn("OpenAI library not available. Install with: pip install openai") + from openai import NotFoundError, OpenAI + + OPENAI_AVAILABLE = True +except ImportError: # pragma: no cover - environment-dependent + OPENAI_AVAILABLE = False + OpenAI = None + NotFoundError = Exception + logger.warning("OpenAI library not available. Install with: pip install openai") -<<<<<<< HEAD HF_FALLBACK_MODELS = [ -<<<<<<< HEAD - "openai/gpt-oss-120b:cerebras", - "moonshotai/Kimi-K2-Instruct-0905:cerebras", - "meta-llama/Llama-3.1-8B-Instruct:cerebras", - "mistralai/Mistral-7B-Instruct-v0.3:cerebras", -======= PREMIUM_DEFAULT_MODEL, "moonshotai/Kimi-K2-Instruct-0905:groq", "meta-llama/Llama-3.1-8B-Instruct:groq", SIF_LOW_COST_MODEL_DEFAULTS[0], ->>>>>>> pr-417 ] def _candidate_model_variants(model: str): - """Yield model ids to try for a single logical model preference.""" + """Yield model IDs to try for a single logical model preference.""" if not model: return - # Try configured model first (supports provider suffixes like ":cerebras") + # Try configured model first (supports provider suffixes like ':groq'). yield model - # Fallback to base repo id when provider suffix is not recognized by the router + # Fallback to base repo id when provider suffix isn't recognized. if ":" in model: base_model = model.split(":", 1)[0] if base_model: @@ -108,12 +52,16 @@ def _candidate_model_variants(model: str): def _fallback_model_sequence(model: str, fallback_models: Optional[List[str]] = None): - # IMPORTANT: Do not apply implicit global fallback chains. - # Callers must explicitly provide fallback_models when they want multi-model retries. + """Yield unique model candidates preserving caller-defined order. + + IMPORTANT: no implicit global fallback chain is applied here; callers must + explicitly pass fallback_models if they want multi-model retries. + """ if fallback_models: sequence = [model] + fallback_models else: sequence = [model] + seen = set() for preferred_model in sequence: for candidate in _candidate_model_variants(preferred_model): @@ -123,11 +71,9 @@ def _fallback_model_sequence(model: str, fallback_models: Optional[List[str]] = def _is_non_retryable_hf_error(exc: Exception) -> bool: - """Skip retries for deterministic HF failures (e.g., unknown model ids, billing).""" msg = str(exc).lower() status = getattr(exc, "status_code", None) - - # Non-retryable errors + if isinstance(exc, NotFoundError) or "not found" in msg or "404" in msg: return True if status == 402 or "402" in msg or "depleted" in msg or "credits" in msg: @@ -136,7 +82,6 @@ def _is_non_retryable_hf_error(exc: Exception) -> bool: return True if status == 403 or "forbidden" in msg or "403" in msg: return True - return False @@ -145,7 +90,6 @@ def _should_retry_hf_error(exc: Exception) -> bool: def _classify_hf_error(exc: Exception) -> str: - """Classify HF failures for actionable logs.""" msg = str(exc).lower() if any(token in msg for token in ["insufficient", "balance", "quota", "billing", "payment", "402"]): return "billing_or_quota" @@ -156,445 +100,175 @@ def _classify_hf_error(exc: Exception) -> str: return "unknown" -def _hf_error_details(exc: Exception) -> str: - """Return compact, actionable exception details for logs.""" - status = getattr(exc, "status_code", None) - err_type = type(exc).__name__ - message = str(exc) - raw_body = getattr(exc, "body", None) - details = f"type={err_type}" - if status is not None: - details += f", status={status}" - if message: - details += f", message={message}" - if raw_body: - details += f", body={raw_body}" - details += f", repr={repr(exc)}" - return details - -def get_huggingface_api_key() -> str: -======= - - -def _classify_hf_error(error: Exception) -> str: - message = str(error or "").lower() - if any(x in message for x in ["insufficient", "quota", "billing", "payment", "credits", "balance"]): - return "billing_or_quota" - if any(x in message for x in ["unauthorized", "forbidden", "permission", "invalid api key", "authentication"]): - return "auth_or_permission" - if ("not found" in message) or ("404" in message): - return "model_not_found" - return "other" - - -def _error_details(error: Exception) -> Dict[str, str]: +def _error_details(exc: Exception) -> Dict[str, str]: return { - "type": type(error).__name__, - "message": str(error), - "repr": repr(error), + "type": type(exc).__name__, + "message": str(exc), + "repr": repr(exc), } def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str: ->>>>>>> pr-416 - """Get Hugging Face API key with proper error handling.""" - api_key = explicit_api_key or os.getenv('HF_TOKEN') + """Get Hugging Face API key with basic validation.""" + api_key = explicit_api_key or os.getenv("HF_TOKEN") if not api_key: error_msg = "HF_TOKEN environment variable is not set. Please set it in your .env file." logger.error(error_msg) raise ValueError(error_msg) - - # Validate API key format (basic check) - if not api_key.startswith('hf_'): + + if not api_key.startswith("hf_"): error_msg = "HF_TOKEN appears to be invalid. It should start with 'hf_'." logger.error(error_msg) raise ValueError(error_msg) - + return api_key -<<<<<<< HEAD -@retry( - retry=retry_if_exception(_should_retry_hf_error), - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), -) -======= + @lru_cache(maxsize=16) def _get_hf_client(api_key: str): return OpenAI(base_url="https://router.huggingface.co/v1", api_key=api_key) ->>>>>>> pr-416 +@retry( + retry=retry_if_exception(_should_retry_hf_error), + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), +) def huggingface_text_response( prompt: str, -<<<<<<< HEAD - model: str = "openai/gpt-oss-120b:cerebras", - fallback_models: Optional[List[str]] = None, -======= model: str = PREMIUM_DEFAULT_MODEL, ->>>>>>> pr-417 + fallback_models: Optional[List[str]] = None, temperature: float = 0.7, max_tokens: int = 2048, top_p: float = 0.9, system_prompt: Optional[str] = None, api_key: Optional[str] = None, ) -> str: - """ - Generate text response using Hugging Face Inference Providers API. - - This function uses the Hugging Face Responses API which provides a unified interface - for model interactions with built-in retry logic and error handling. - - Args: - prompt (str): The input prompt for the AI model - model (str): Hugging Face model identifier (default: "openai/gpt-oss-120b:groq") - temperature (float): Controls randomness (0.0-1.0) - max_tokens (int): Maximum tokens in response - top_p (float): Nucleus sampling parameter (0.0-1.0) - system_prompt (str, optional): System instruction for the model - - Returns: - str: Generated text response - - Raises: - Exception: If API key is missing or API call fails - - Best Practices: - - Use appropriate temperature for your use case (0.7 for creative, 0.1-0.3 for factual) - - Set max_tokens based on expected response length - - Use system_prompt to guide model behavior - - Handle errors gracefully in calling functions - - Example: - result = huggingface_text_response( - prompt="Write a blog post about AI", -<<<<<<< HEAD - model="openai/gpt-oss-120b:cerebras", -======= - model=PREMIUM_DEFAULT_MODEL, ->>>>>>> pr-417 - temperature=0.7, - max_tokens=2048, - system_prompt="You are a professional content writer." - ) - """ + """Generate text with explicit fallback model sequence.""" try: if not OPENAI_AVAILABLE: raise ImportError("OpenAI library not available. Install with: pip install openai") - - # Get API key with proper error handling - api_key = get_huggingface_api_key(api_key) - logger.info(f"🔑 Hugging Face API key loaded: {bool(api_key)} (length: {len(api_key) if api_key else 0})") - - if not api_key: - raise Exception("HF_TOKEN not found in environment variables") - - # Initialize Hugging Face client -<<<<<<< HEAD - client = OpenAI( - base_url="https://router.huggingface.co/v1", - api_key=api_key, - ) -======= - client = _get_hf_client(api_key) ->>>>>>> pr-416 - logger.info("✅ Hugging Face client initialized for text response") - # Prepare input for the API + hf_api_key = get_huggingface_api_key(api_key) + client = _get_hf_client(hf_api_key) + messages = [] - - # Add system prompt if provided if system_prompt: - messages.append({ - "role": "system", - "content": system_prompt - }) - - # Add user prompt - messages.append({ - "role": "user", - "content": prompt - }) + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) - # Add debugging for API call - logger.info( - "Hugging Face text call | model={} | prompt_len={} | temp={} | top_p={} | max_tokens={}", - model, - len(prompt) if isinstance(prompt, str) else '', - temperature, - top_p, - max_tokens, - ) - - logger.info("🚀 Making Hugging Face API call (chat completion)...") - -<<<<<<< HEAD - # Add rate limiting to prevent expensive API calls - import time - time.sleep(1) # 1 second delay between API calls - - # Call exactly the requested model; no retries, no fallbacks, no variants -======= ->>>>>>> pr-416 - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - top_p=top_p, - max_tokens=max_tokens - ) - - # Extract text from response - generated_text = response.choices[0].message.content - - # Clean up the response - if generated_text: - # Remove any markdown formatting if present - generated_text = re.sub(r'```[a-zA-Z]*\n?', '', generated_text) - generated_text = re.sub(r'```\n?', '', generated_text) - generated_text = generated_text.strip() - - logger.info("✅ Hugging Face text response generated successfully (length: {})", len(generated_text)) - return generated_text - - except Exception as e: - error_class = _classify_hf_error(e) -<<<<<<< HEAD - error_details = _hf_error_details(e) - logger.error(f"❌ Hugging Face text generation failed: {error_details}") - - # Extra diagnostics: try to capture raw response if available - if hasattr(e, 'response') and e.response is not None: - logger.error(f"🔍 HF Error Diagnostics:") - logger.error(f" - Status: {e.response.status_code}") - logger.error(f" - Headers: {dict(e.response.headers)}") + response = None + last_error = None + for candidate_model in _fallback_model_sequence(model, fallback_models): try: - body_json = e.response.json() - logger.error(f" - Body JSON: {json.dumps(body_json, indent=2)}") - except Exception: - logger.error(f" - Body Raw: {e.response.text[:1000]}") - else: - logger.error(f"🔍 No HTTP response attached to exception object.") - -======= - details = _error_details(e) - logger.error("❌ Hugging Face text generation failed | error_class={} | type={} | message={} | repr={}", error_class, details["type"], details["message"], details["repr"]) ->>>>>>> pr-416 - raise Exception(f"Hugging Face text generation failed: {str(e)}") + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + ) + if candidate_model != model: + logger.warning("HF text fallback model used: {}", candidate_model) + break + except NotFoundError as nf_err: + last_error = nf_err + logger.warning("HF text model not found: {}", candidate_model) + continue + except Exception as call_err: + last_error = call_err + logger.warning("HF text call failed for model {}: {}", candidate_model, _error_details(call_err)) + continue + if response is None: + raise last_error or RuntimeError("All fallback models failed") + + generated_text = response.choices[0].message.content or "" + generated_text = re.sub(r"```[a-zA-Z]*\n?", "", generated_text) + generated_text = re.sub(r"```\n?", "", generated_text).strip() + return generated_text + + except Exception as exc: + details = _error_details(exc) + logger.error( + "❌ Hugging Face text generation failed | error_class={} | type={} | message={} | repr={}", + _classify_hf_error(exc), + details["type"], + details["message"], + details["repr"], + ) + raise Exception(f"Hugging Face text generation failed: {exc}") from exc + + +@retry( + retry=retry_if_exception(_should_retry_hf_error), + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), +) def huggingface_structured_json_response( prompt: str, schema: Dict[str, Any], -<<<<<<< HEAD - model: str = "openai/gpt-oss-120b:cerebras", - fallback_models: Optional[List[str]] = None, -======= model: str = PREMIUM_DEFAULT_MODEL, ->>>>>>> pr-417 + fallback_models: Optional[List[str]] = None, temperature: float = 0.7, max_tokens: int = 8192, system_prompt: Optional[str] = None, api_key: Optional[str] = None, ) -> Dict[str, Any]: - """ - Generate structured JSON response using Hugging Face Inference Providers API. - - This function uses the Hugging Face Responses API with structured output support - to generate JSON responses that match a provided schema. - - Args: - prompt (str): The input prompt for the AI model - schema (dict): JSON schema defining the expected output structure - model (str): Hugging Face model identifier (default: "openai/gpt-oss-120b:groq") - temperature (float): Controls randomness (0.0-1.0). Use 0.1-0.3 for structured output - max_tokens (int): Maximum tokens in response. Use 8192 for complex outputs - system_prompt (str, optional): System instruction for the model - - Returns: - dict: Parsed JSON response matching the provided schema - - Raises: - Exception: If API key is missing or API call fails - - Best Practices: - - Keep schemas simple and flat to avoid truncation - - Use low temperature (0.1-0.3) for consistent structured output - - Set max_tokens to 8192 for complex multi-field responses - - Avoid deeply nested schemas with many required fields - - Test with smaller outputs first, then scale up - - Example: - schema = { - "type": "object", - "properties": { - "tasks": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "description": {"type": "string"} - } - } - } - } - } - result = huggingface_structured_json_response(prompt, schema, temperature=0.2, max_tokens=8192) - """ + """Generate structured JSON with explicit fallback model sequence.""" try: if not OPENAI_AVAILABLE: raise ImportError("OpenAI library not available. Install with: pip install openai") - - # Get API key with proper error handling - api_key = get_huggingface_api_key(api_key) - logger.info(f"🔑 Hugging Face API key loaded: {bool(api_key)} (length: {len(api_key) if api_key else 0})") - - if not api_key: - raise Exception("HF_TOKEN not found in environment variables") - - # Initialize OpenAI client with Hugging Face base URL - # Use standard Inference API endpoint -<<<<<<< HEAD - client = OpenAI( - base_url="https://router.huggingface.co/v1", - api_key=api_key, - ) -======= - client = _get_hf_client(api_key) ->>>>>>> pr-416 - logger.info("✅ Hugging Face client initialized for structured JSON response") - # Prepare input for the API + hf_api_key = get_huggingface_api_key(api_key) + client = _get_hf_client(hf_api_key) + messages = [] - - # Add system prompt if provided if system_prompt: - messages.append({ - "role": "system", - "content": system_prompt - }) - - # Add user prompt with JSON instruction - # For HF models, explicit JSON instruction in prompt is often better than response_format - json_instruction = "Please respond with valid JSON that matches the provided schema." - messages.append({ - "role": "user", - "content": f"{prompt}\n\n{json_instruction}" - }) + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) - # Add debugging for API call - logger.info( - "Hugging Face structured call | model={} | prompt_len={} | schema_kind={} | temp={} | max_tokens={}", - model, - len(prompt) if isinstance(prompt, str) else '', - type(schema).__name__, - temperature, - max_tokens, - ) - - logger.info("🚀 Making Hugging Face structured API call...") - - # Make the API call using standard Chat Completions - logger.info("🚀 Making Hugging Face API call (chat completion)...") - - # Add JSON schema to prompt for guidance - json_schema_str = json.dumps(schema, indent=2) - messages[-1]["content"] += f"\n\nJSON Schema:\n{json_schema_str}" - - try: -<<<<<<< HEAD - response = None - last_error = None - for candidate_model in _fallback_model_sequence(model, fallback_models): - try: - response = client.chat.completions.create( - model=candidate_model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - response_format={"type": "json_object"} # Try to enforce JSON mode if supported - ) - if candidate_model != model: - logger.warning("HF structured generation switched to fallback model: {}", candidate_model) - break - except NotFoundError as nf_err: - last_error = nf_err - logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model) + response = None + last_error = None + + for candidate_model in _fallback_model_sequence(model, fallback_models): + try: + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + response_format={"type": "json_object"}, + ) + if candidate_model != model: + logger.warning("HF structured fallback model used: {}", candidate_model) + break + except Exception as err: + last_error = err + if isinstance(err, NotFoundError): + logger.warning("HF structured model not found: {}", candidate_model) continue - if response is None: - raise last_error or Exception("Hugging Face structured generation failed: all fallback models failed") - - response_text = response.choices[0].message.content - - # Clean up response text if needed - response_text = response_text.strip() - if response_text.startswith("```json"): - response_text = response_text[7:] - if response_text.endswith("```"): - response_text = response_text[:-3] - response_text = response_text.strip() - - try: - parsed_json = json.loads(response_text) - logger.info("✅ Hugging Face structured JSON response parsed successfully") - return parsed_json - except json.JSONDecodeError as json_err: - logger.error(f"❌ JSON parsing failed: {json_err}") - logger.error(f"Raw response: {response_text}") - - # Try to extract JSON from the response using regex - json_match = re.search(r'\{.*\}', response_text, re.DOTALL) - if json_match: - try: - extracted_json = json.loads(json_match.group()) - logger.info("✅ JSON extracted using regex fallback") - return extracted_json - except json.JSONDecodeError: - pass - - return {"error": "Failed to parse JSON response", "raw_response": response_text} - - except Exception as e: - logger.error(f"❌ Hugging Face API call failed: {e}") - # If 422 Unprocessable Entity (often due to response_format not supported), retry without it - if "422" in str(e) or "not supported" in str(e).lower() or isinstance(e, NotFoundError): - logger.info("Retrying without response_format...") - response = None - last_error = None - for candidate_model in _fallback_model_sequence(model, fallback_models): + msg = str(err).lower() + if "422" in msg or "not supported" in msg: try: response = client.chat.completions.create( model=candidate_model, messages=messages, temperature=temperature, - max_tokens=max_tokens + max_tokens=max_tokens, ) if candidate_model != model: - logger.warning("HF structured no-response_format fallback model: {}", candidate_model) + logger.warning("HF structured fallback(no response_format) model: {}", candidate_model) break - except NotFoundError as nf_err: - last_error = nf_err - logger.warning("HF structured model not found (no response_format path): {}", candidate_model) + except Exception as second_err: + last_error = second_err continue -======= - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - response_format={"type": "json_object"} - ) - except Exception as e: - details = _error_details(e) - logger.error("❌ Hugging Face API call failed | error_class={} | type={} | message={} | repr={}", _classify_hf_error(e), details["type"], details["message"], details["repr"]) - raise ->>>>>>> pr-416 - response_text = response.choices[0].message.content + if response is None: + raise last_error or RuntimeError("All fallback models failed") - # Clean up response text if needed - response_text = response_text.strip() + response_text = (response.choices[0].message.content or "").strip() if response_text.startswith("```json"): response_text = response_text[7:] if response_text.endswith("```"): @@ -602,57 +276,37 @@ def huggingface_structured_json_response( response_text = response_text.strip() try: - parsed_json = json.loads(response_text) - logger.info("✅ Hugging Face structured JSON response parsed successfully") - return parsed_json - except json.JSONDecodeError as json_err: - logger.error(f"❌ JSON parsing failed: {json_err}") - logger.error(f"Raw response: {response_text}") - json_match = re.search(r'\{.*\}', response_text, re.DOTALL) + return json.loads(response_text) + except json.JSONDecodeError: + json_match = re.search(r"\{.*\}", response_text, re.DOTALL) if json_match: - try: - extracted_json = json.loads(json_match.group()) - logger.info("✅ JSON extracted using regex fallback") - return extracted_json - except json.JSONDecodeError: - pass + return json.loads(json_match.group()) return {"error": "Failed to parse JSON response", "raw_response": response_text} - - except Exception as e: - error_msg = str(e) if str(e) else repr(e) - error_type = type(e).__name__ - details = _error_details(e) - logger.error("❌ Hugging Face structured JSON generation failed | error_class={} | type={} | message={} | repr={}", _classify_hf_error(e), error_type, details["message"], details["repr"]) - logger.error(f"❌ Full exception details: {repr(e)}") - import traceback - logger.error(f"❌ Traceback: {traceback.format_exc()}") - raise Exception(f"Hugging Face structured JSON generation failed: {error_type}: {error_msg}") + + except Exception as exc: + details = _error_details(exc) + logger.error( + "❌ Hugging Face structured JSON generation failed | error_class={} | type={} | message={} | repr={}", + _classify_hf_error(exc), + details["type"], + details["message"], + details["repr"], + ) + raise Exception(f"Hugging Face structured JSON generation failed: {exc}") from exc + def get_available_models() -> list: - """ - Get list of available Hugging Face models for text generation. - - Returns: - list: List of available model identifiers - """ + """Get list of available Hugging Face models for text generation.""" return [ PREMIUM_DEFAULT_MODEL, "moonshotai/Kimi-K2-Instruct-0905:groq", "Qwen/Qwen2.5-VL-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct:groq", "microsoft/Phi-3-medium-4k-instruct:groq", - SIF_LOW_COST_MODEL_DEFAULTS[0] + SIF_LOW_COST_MODEL_DEFAULTS[0], ] + def validate_model(model: str) -> bool: - """ - Validate if a model identifier is supported. - - Args: - model (str): Model identifier to validate - - Returns: - bool: True if model is supported, False otherwise - """ - available_models = get_available_models() - return model in available_models + """Validate if a model identifier is supported.""" + return model in get_available_models()