From f36cd8eea91e0d745003c5016738e66eca410f8b Mon Sep 17 00:00:00 2001 From: ajaysi Date: Thu, 12 Mar 2026 16:40:53 +0530 Subject: [PATCH] "Recreate_huggingface_provider_clean_functional_version" --- .../llm_providers/huggingface_provider.py | 732 ++++++------------ .../huggingface_provider_clean.py | 501 ++++++++++++ 2 files changed, 747 insertions(+), 486 deletions(-) create mode 100644 backend/services/llm_providers/huggingface_provider_clean.py diff --git a/backend/services/llm_providers/huggingface_provider.py b/backend/services/llm_providers/huggingface_provider.py index e7f926a0..20e28bdd 100644 --- a/backend/services/llm_providers/huggingface_provider.py +++ b/backend/services/llm_providers/huggingface_provider.py @@ -1,54 +1,81 @@ """ -Hugging Face Provider Module for ALwrity. +Hugging Face Provider Module for ALwrity -Provides text and structured JSON generation through Hugging Face Router -(OpenAI-compatible API), with retry and explicit fallback controls. +This module provides functions for interacting with Hugging Face's Inference Providers API +using the Responses API (beta) which provides a unified interface for model interactions. + +Key Features: +- Text response generation with retry logic +- Structured JSON response generation with schema validation +- Comprehensive error handling and logging +- Automatic API key management +- Support for various Hugging Face models via Inference Providers +- Explicit fallback model sequences +- Client caching for performance + +Best Practices: +1. Use structured output for complex, multi-field responses +2. Keep schemas simple and flat to avoid truncation +3. Set appropriate token limits (8192 for complex outputs) +4. Use low temperature (0.1-0.3) for consistent structured output +5. Implement proper error handling in calling functions +6. 
Use the Responses API for better compatibility + +Usage Examples: + # Text response + result = huggingface_text_response(prompt, temperature=0.7, max_tokens=2048) + + # Structured JSON response + schema = { + "type": "object", + "properties": { + "tasks": { + "type": "array", + "items": {"type": "object", "properties": {...}} + } + } + } + result = huggingface_structured_json_response(prompt, schema, temperature=0.2, max_tokens=8192) + +Dependencies: +- openai (for Hugging Face Responses API) +- tenacity (for retry logic) +- logging (for debugging) +- json (for fallback parsing) + +Author: ALwrity Team +Version: 1.0 +Last Updated: January 2025 """ -<<<<<<< HEAD -<<<<<<< HEAD import os -======= -import hashlib ->>>>>>> pr-419 -======= ->>>>>>> pr-437 import json -import os import re -<<<<<<< HEAD -<<<<<<< HEAD from functools import lru_cache -<<<<<<< HEAD -from typing import Optional, Dict, Any -======= -from typing import Optional, Dict, Any, List, Iterable ->>>>>>> pr-418 -======= -import time -from threading import Lock -from typing import Optional, Dict, Any ->>>>>>> pr-419 -======= -from typing import Any, Dict, List, Optional - -from tenacity import retry, retry_if_exception, stop_after_attempt, wait_random_exponential ->>>>>>> pr-437 +from typing import Optional, Dict, Any, List +from loguru import logger from utils.logger_utils import get_service_logger from .routing_policy import PREMIUM_DEFAULT_MODEL, SIF_LOW_COST_MODEL_DEFAULTS +# Use service-specific logger to avoid conflicts logger = get_service_logger("huggingface_provider") -try: - from openai import NotFoundError, OpenAI +from tenacity import ( + retry, + retry_if_exception, + stop_after_attempt, + wait_random_exponential, +) +try: + from openai import OpenAI + from openai import NotFoundError OPENAI_AVAILABLE = True -except ImportError: # pragma: no cover - environment-dependent +except ImportError: OPENAI_AVAILABLE = False - OpenAI = None NotFoundError = Exception - logger.warning("OpenAI library 
not available. Install with: pip install openai") + logger.warn("OpenAI library not available. Install with: pip install openai") HF_FALLBACK_MODELS = [ PREMIUM_DEFAULT_MODEL, @@ -57,129 +84,44 @@ HF_FALLBACK_MODELS = [ SIF_LOW_COST_MODEL_DEFAULTS[0], ] -_HF_CLIENT_CACHE: Dict[str, Any] = {} -_HF_CLIENT_CACHE_LOCK = Lock() - -<<<<<<< HEAD - -def _masked_key_id(api_key: str) -> str: - return hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:12] - - -def get_huggingface_client(api_key: str): - """Get or create a cached Hugging Face/OpenAI-compatible client for the API key.""" - key_id = _masked_key_id(api_key) - with _HF_CLIENT_CACHE_LOCK: - cached_client = _HF_CLIENT_CACHE.get(key_id) - if cached_client is not None: - logger.debug("Reusing cached Hugging Face client for key_id={}", key_id) - return cached_client - - client = OpenAI( - base_url="https://router.huggingface.co/hf/v1", - api_key=api_key, - ) - _HF_CLIENT_CACHE[key_id] = client - logger.debug("Created new Hugging Face client for key_id={}", key_id) - return client - - -def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True): - """Yield model ids to try for a single logical model preference.""" -======= -def _candidate_model_variants(model: str): - """Yield model IDs to try for a single logical model preference.""" ->>>>>>> pr-437 - if not model: - return - - # Try configured model first (supports provider suffixes like ':groq'). - yield model - -<<<<<<< HEAD - # Fallback to base repo id when provider suffix is not recognized by the router - if allow_model_variant_fallback and ":" in model: -======= - # Fallback to base repo id when provider suffix isn't recognized. - if ":" in model: ->>>>>>> pr-437 - base_model = model.split(":", 1)[0] - if base_model: - yield base_model - - -<<<<<<< HEAD -def _fallback_model_sequence(model: str, fallback_models: Optional[List[str]] = None): - """Yield unique model candidates preserving caller-defined order. 
- - IMPORTANT: no implicit global fallback chain is applied here; callers must - explicitly pass fallback_models if they want multi-model retries. - """ - if fallback_models: - sequence = [model] + fallback_models - else: - sequence = [model] -<<<<<<< HEAD -======= -def _fallback_model_sequence( - model: str, - fallback_models: Optional[List[str]] = None, - allow_model_variant_fallback: bool = True, -): - sequence: Iterable[str] - if fallback_models is None: - # Safe default only when caller doesn't provide explicit policy. - sequence = [model] + HF_FALLBACK_MODELS - else: - # Caller owns fallback policy fully. Empty list means only requested model. - sequence = [model] + list(fallback_models) - ->>>>>>> pr-418 -======= - ->>>>>>> pr-437 - seen = set() - for preferred_model in sequence: - for candidate in _candidate_model_variants( - preferred_model, - allow_model_variant_fallback=allow_model_variant_fallback, - ): - if candidate and candidate not in seen: - seen.add(candidate) - yield candidate - - -def _is_non_retryable_hf_error(exc: Exception) -> bool: - msg = str(exc).lower() - status = getattr(exc, "status_code", None) - - if isinstance(exc, NotFoundError) or "not found" in msg or "404" in msg: - return True - if status == 402 or "402" in msg or "depleted" in msg or "credits" in msg: - return True - if status == 401 or "unauthorized" in msg or "401" in msg: - return True - if status == 403 or "forbidden" in msg or "403" in msg: - return True - return False - def _should_retry_hf_error(exc: Exception) -> bool: - return not _is_non_retryable_hf_error(exc) + """Determine if an error should trigger a retry based on error type and message.""" + if isinstance(exc, NotFoundError): + return False # Don't retry model not found errors + + msg = str(exc).lower() + # Don't retry authentication errors + if any(keyword in msg for keyword in ["unauthorized", "forbidden", "401", "403", "invalid api key"]): + return False + # Don't retry billing/quota errors + if any(keyword 
in msg for keyword in ["insufficient", "quota", "billing", "payment", "credits", "balance"]): + return False + # Retry rate limiting and server errors + if any(keyword in msg for keyword in ["rate limit", "429", "500", "502", "503", "504", "timeout"]): + return True + # Default to retry for unknown errors + return True def _classify_hf_error(exc: Exception) -> str: + """Classify Hugging Face errors for better error reporting.""" msg = str(exc).lower() - if any(token in msg for token in ["insufficient", "balance", "quota", "billing", "payment", "402"]): + if any(keyword in msg for keyword in ["insufficient", "quota", "billing", "payment", "credits", "balance"]): return "billing_or_quota" - if "unauthorized" in msg or "forbidden" in msg or "401" in msg or "403" in msg: + if any(keyword in msg for keyword in ["unauthorized", "forbidden", "401", "403"]): return "auth_or_permission" if "not found" in msg or "404" in msg: return "model_not_found" + if any(keyword in msg for keyword in ["rate limit", "429"]): + return "rate_limit" + if any(keyword in msg for keyword in ["timeout", "500", "502", "503", "504"]): + return "server_error" return "unknown" def _error_details(exc: Exception) -> Dict[str, str]: + """Extract error details for logging.""" return { "type": type(exc).__name__, "message": str(exc), @@ -187,54 +129,60 @@ def _error_details(exc: Exception) -> Dict[str, str]: } +def _candidate_model_variants(model: str): + """Yield model ids to try for a single logical model preference.""" + if not model: + return + + # Try configured model first (supports provider suffixes like ":groq") + yield model + + # Fallback to base repo id when provider suffix is not recognized by the router + if ":" in model: + base_model = model.split(":", 1)[0] + if base_model: + yield base_model + + +def _fallback_model_sequence(model: str, fallback_models: Optional[List[str]] = None): + """Generate a sequence of models to try as fallbacks.""" + sequence = [model] + (fallback_models or 
HF_FALLBACK_MODELS) + seen = set() + for preferred_model in sequence: + for candidate in _candidate_model_variants(preferred_model): + if candidate and candidate not in seen: + seen.add(candidate) + yield candidate + + def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str: - """Get Hugging Face API key with basic validation.""" - api_key = explicit_api_key or os.getenv("HF_TOKEN") + """Get Hugging Face API key with proper error handling.""" + api_key = explicit_api_key or os.getenv('HF_TOKEN') if not api_key: error_msg = "HF_TOKEN environment variable is not set. Please set it in your .env file." logger.error(error_msg) raise ValueError(error_msg) - - if not api_key.startswith("hf_"): + + # Validate API key format (basic check) + if not api_key.startswith('hf_'): error_msg = "HF_TOKEN appears to be invalid. It should start with 'hf_'." logger.error(error_msg) raise ValueError(error_msg) - + return api_key -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -@retry( - retry=retry_if_exception(_should_retry_hf_error), - wait=wait_random_exponential(min=1, max=60), - stop=stop_after_attempt(6), -) -======= -======= ->>>>>>> pr-437 @lru_cache(maxsize=16) def _get_hf_client(api_key: str): + """Get cached Hugging Face client for better performance.""" return OpenAI(base_url="https://router.huggingface.co/v1", api_key=api_key) -<<<<<<< HEAD ->>>>>>> pr-416 -======= -@retry( - wait=wait_random_exponential(min=0.5, max=8), - stop=stop_after_attempt(3), - reraise=True, -) ->>>>>>> pr-419 -======= @retry( retry=retry_if_exception(_should_retry_hf_error), wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), ) ->>>>>>> pr-437 def huggingface_text_response( prompt: str, model: str = PREMIUM_DEFAULT_MODEL, @@ -243,53 +191,64 @@ def huggingface_text_response( max_tokens: int = 2048, top_p: float = 0.9, system_prompt: Optional[str] = None, -<<<<<<< HEAD api_key: Optional[str] = None, -======= - fallback_models: Optional[List[str]] = None, - 
allow_model_variant_fallback: bool = True, ->>>>>>> pr-418 ) -> str: - """Generate text with explicit fallback model sequence.""" + """ + Generate text response using Hugging Face Inference Providers API. + + This function uses the Hugging Face Responses API which provides a unified interface + for model interactions with built-in retry logic and error handling. + + Args: + prompt (str): The input prompt for the AI model + model (str): Hugging Face model identifier (default: PREMIUM_DEFAULT_MODEL) + fallback_models (list, optional): Custom fallback models to try + temperature (float): Controls randomness (0.0-1.0) + max_tokens (int): Maximum tokens in response + top_p (float): Nucleus sampling parameter (0.0-1.0) + system_prompt (str, optional): System instruction for the model + api_key (str, optional): Explicit API key override + + Returns: + str: Generated text response + + Raises: + Exception: If API key is missing or API call fails + + Best Practices: + - Use appropriate temperature for your use case (0.7 for creative, 0.1-0.3 for factual) + - Set max_tokens based on expected response length + - Use system_prompt to guide model behavior + - Handle errors gracefully in calling functions + """ try: if not OPENAI_AVAILABLE: raise ImportError("OpenAI library not available. 
Install with: pip install openai") -<<<<<<< HEAD # Get API key with proper error handling - api_key = get_huggingface_api_key(api_key) - logger.info(f"🔑 Hugging Face API key loaded: {bool(api_key)} (length: {len(api_key) if api_key else 0})") - - if not api_key: - raise Exception("HF_TOKEN not found in environment variables") - -<<<<<<< HEAD - # Initialize Hugging Face client -<<<<<<< HEAD - client = OpenAI( - base_url="https://router.huggingface.co/v1", - api_key=api_key, - ) -======= - client = _get_hf_client(api_key) ->>>>>>> pr-416 -======= - # Initialize/reuse Hugging Face client - client = get_huggingface_client(api_key) ->>>>>>> pr-419 - logger.info("✅ Hugging Face client initialized for text response") -======= ->>>>>>> pr-437 - hf_api_key = get_huggingface_api_key(api_key) + logger.info(f"🔑 Hugging Face API key loaded: {bool(hf_api_key)} (length: {len(hf_api_key) if hf_api_key else 0})") + + # Initialize Hugging Face client client = _get_hf_client(hf_api_key) + logger.info("✅ Hugging Face client initialized for text response") + # Prepare input for the API messages = [] + + # Add system prompt if provided if system_prompt: - messages.append({"role": "system", "content": system_prompt}) - messages.append({"role": "user", "content": prompt}) + messages.append({ + "role": "system", + "content": system_prompt + }) + + # Add user prompt + messages.append({ + "role": "user", + "content": prompt + }) -<<<<<<< HEAD # Add debugging for API call logger.info( "Hugging Face text call | model={} | prompt_len={} | temp={} | top_p={} | max_tokens={}", @@ -302,39 +261,9 @@ def huggingface_text_response( logger.info("🚀 Making Hugging Face API call (chat completion)...") -<<<<<<< HEAD -<<<<<<< HEAD - # Add rate limiting to prevent expensive API calls - import time - time.sleep(1) # 1 second delay between API calls - -<<<<<<< HEAD - # Call exactly the requested model; no retries, no fallbacks, no variants -======= ->>>>>>> pr-416 - response = client.chat.completions.create( 
- model=model, - messages=messages, - temperature=temperature, - top_p=top_p, - max_tokens=max_tokens - ) -======= response = None last_error = None - for candidate_model in _fallback_model_sequence( - model=model, - fallback_models=fallback_models, - allow_model_variant_fallback=allow_model_variant_fallback, - ): -======= - response = None - last_error = None - fallback_attempt = 0 - for candidate_model in _fallback_model_sequence(model): - fallback_attempt += 1 - started_at = time.perf_counter() ->>>>>>> pr-419 + for candidate_model in _fallback_model_sequence(model, fallback_models): try: response = client.chat.completions.create( model=candidate_model, @@ -343,69 +272,6 @@ def huggingface_text_response( top_p=top_p, max_tokens=max_tokens ) - elapsed_ms = (time.perf_counter() - started_at) * 1000 - logger.debug( - "HF text attempt={} model={} elapsed_ms={:.2f}", - fallback_attempt, - candidate_model, - elapsed_ms, - ) - if candidate_model != model: - logger.warning("HF text generation switched to fallback model: {}", candidate_model) - break - except NotFoundError as nf_err: - last_error = nf_err - elapsed_ms = (time.perf_counter() - started_at) * 1000 - logger.debug( - "HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found", - fallback_attempt, - candidate_model, - elapsed_ms, - ) - logger.warning("HF model not found: {}. 
Trying fallback model.", candidate_model) - continue - - if response is None: - raise last_error or Exception("Hugging Face text generation failed: all fallback models failed") ->>>>>>> pr-418 - - # Extract text from response - generated_text = response.choices[0].message.content - - # Clean up the response - if generated_text: - # Remove any markdown formatting if present - generated_text = re.sub(r'```[a-zA-Z]*\n?', '', generated_text) - generated_text = re.sub(r'```\n?', '', generated_text) - generated_text = generated_text.strip() - - logger.info("✅ Hugging Face text response generated successfully (length: {})", len(generated_text)) - return generated_text - - except Exception as e: - error_class = _classify_hf_error(e) -<<<<<<< HEAD - error_details = _hf_error_details(e) - logger.error(f"❌ Hugging Face text generation failed: {error_details}") - - # Extra diagnostics: try to capture raw response if available - if hasattr(e, 'response') and e.response is not None: - logger.error(f"🔍 HF Error Diagnostics:") - logger.error(f" - Status: {e.response.status_code}") - logger.error(f" - Headers: {dict(e.response.headers)}") -======= - response = None - last_error = None - for candidate_model in _fallback_model_sequence(model, fallback_models): ->>>>>>> pr-437 - try: - response = client.chat.completions.create( - model=candidate_model, - messages=messages, - temperature=temperature, - top_p=top_p, - max_tokens=max_tokens, - ) if candidate_model != model: logger.warning("HF text fallback model used: {}", candidate_model) break @@ -418,24 +284,20 @@ def huggingface_text_response( logger.warning("HF text call failed for model {}: {}", candidate_model, _error_details(call_err)) continue -<<<<<<< HEAD -<<<<<<< HEAD -======= -@retry( - wait=wait_random_exponential(min=0.5, max=8), - stop=stop_after_attempt(3), - reraise=True, -) ->>>>>>> pr-419 -======= if response is None: raise last_error or RuntimeError("All fallback models failed") - + + # Extract text from response 
generated_text = response.choices[0].message.content or "" - generated_text = re.sub(r"```[a-zA-Z]*\n?", "", generated_text) - generated_text = re.sub(r"```\n?", "", generated_text).strip() + + # Clean up the response + generated_text = re.sub(r'```[a-zA-Z]*\n?', '', generated_text) + generated_text = re.sub(r'```\n?', '', generated_text) + generated_text = generated_text.strip() + + logger.info(f"✅ Hugging Face text response generated successfully (length: {len(generated_text)})") return generated_text - + except Exception as exc: details = _error_details(exc) logger.error( @@ -453,7 +315,6 @@ def huggingface_text_response( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), ) ->>>>>>> pr-437 def huggingface_structured_json_response( prompt: str, schema: Dict[str, Any], @@ -462,54 +323,66 @@ def huggingface_structured_json_response( temperature: float = 0.7, max_tokens: int = 8192, system_prompt: Optional[str] = None, -<<<<<<< HEAD api_key: Optional[str] = None, -======= - fallback_models: Optional[List[str]] = None, - allow_model_variant_fallback: bool = True, ->>>>>>> pr-418 ) -> Dict[str, Any]: - """Generate structured JSON with explicit fallback model sequence.""" + """ + Generate structured JSON response using Hugging Face Inference Providers API. + + This function uses the Hugging Face Responses API with structured output support + to generate JSON responses that match a provided schema. + + Args: + prompt (str): The input prompt for the AI model + schema (dict): JSON schema defining the expected output structure + model (str): Hugging Face model identifier (default: PREMIUM_DEFAULT_MODEL) + fallback_models (list, optional): Custom fallback models to try + temperature (float): Controls randomness (0.0-1.0). Use 0.1-0.3 for structured output + max_tokens (int): Maximum tokens in response. 
Use 8192 for complex outputs + system_prompt (str, optional): System instruction for the model + api_key (str, optional): Explicit API key override + + Returns: + dict: Parsed JSON response matching the provided schema + + Raises: + Exception: If API key is missing or API call fails + + Best Practices: + - Keep schemas simple and flat to avoid truncation + - Use low temperature (0.1-0.3) for consistent structured output + - Set max_tokens to 8192 for complex multi-field responses + - Avoid deeply nested schemas with many required fields + - Test with smaller outputs first, then scale up + """ try: if not OPENAI_AVAILABLE: raise ImportError("OpenAI library not available. Install with: pip install openai") -<<<<<<< HEAD # Get API key with proper error handling - api_key = get_huggingface_api_key(api_key) - logger.info(f"🔑 Hugging Face API key loaded: {bool(api_key)} (length: {len(api_key) if api_key else 0})") - - if not api_key: - raise Exception("HF_TOKEN not found in environment variables") - -<<<<<<< HEAD - # Initialize OpenAI client with Hugging Face base URL - # Use standard Inference API endpoint -<<<<<<< HEAD - client = OpenAI( - base_url="https://router.huggingface.co/v1", - api_key=api_key, - ) -======= - client = _get_hf_client(api_key) ->>>>>>> pr-416 -======= - # Initialize/reuse OpenAI client with Hugging Face base URL - client = get_huggingface_client(api_key) ->>>>>>> pr-419 - logger.info("✅ Hugging Face client initialized for structured JSON response") -======= ->>>>>>> pr-437 - hf_api_key = get_huggingface_api_key(api_key) + logger.info(f"🔑 Hugging Face API key loaded: {bool(hf_api_key)} (length: {len(hf_api_key) if hf_api_key else 0})") + + # Initialize OpenAI client with Hugging Face base URL client = _get_hf_client(hf_api_key) + logger.info("✅ Hugging Face client initialized for structured JSON response") + # Prepare input for the API messages = [] + + # Add system prompt if provided if system_prompt: - messages.append({"role": "system", 
"content": system_prompt}) - messages.append({"role": "user", "content": prompt}) + messages.append({ + "role": "system", + "content": system_prompt + }) + + # Add user prompt with JSON instruction + json_instruction = "Please respond with valid JSON that matches the provided schema." + messages.append({ + "role": "user", + "content": f"{prompt}\n\n{json_instruction}" + }) -<<<<<<< HEAD # Add debugging for API call logger.info( "Hugging Face structured call | model={} | prompt_len={} | schema_kind={} | temp={} | max_tokens={}", @@ -522,120 +395,10 @@ def huggingface_structured_json_response( logger.info("🚀 Making Hugging Face structured API call...") - # Make the API call using standard Chat Completions - logger.info("🚀 Making Hugging Face API call (chat completion)...") - # Add JSON schema to prompt for guidance json_schema_str = json.dumps(schema, indent=2) messages[-1]["content"] += f"\n\nJSON Schema:\n{json_schema_str}" - try: -<<<<<<< HEAD - response = None - last_error = None -<<<<<<< HEAD -<<<<<<< HEAD - for candidate_model in _fallback_model_sequence(model, fallback_models): -======= - for candidate_model in _fallback_model_sequence( - model=model, - fallback_models=fallback_models, - allow_model_variant_fallback=allow_model_variant_fallback, - ): ->>>>>>> pr-418 -======= - fallback_attempt = 0 - for candidate_model in _fallback_model_sequence(model): - fallback_attempt += 1 - started_at = time.perf_counter() ->>>>>>> pr-419 - try: - response = client.chat.completions.create( - model=candidate_model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - response_format={"type": "json_object"} # Try to enforce JSON mode if supported - ) - elapsed_ms = (time.perf_counter() - started_at) * 1000 - logger.debug( - "HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object", - fallback_attempt, - candidate_model, - elapsed_ms, - ) - if candidate_model != model: - logger.warning("HF structured generation switched to 
fallback model: {}", candidate_model) - break - except NotFoundError as nf_err: - last_error = nf_err - elapsed_ms = (time.perf_counter() - started_at) * 1000 - logger.debug( - "HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object", - fallback_attempt, - candidate_model, - elapsed_ms, - ) - logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model) - continue - - if response is None: - raise last_error or Exception("Hugging Face structured generation failed: all fallback models failed") - - response_text = response.choices[0].message.content - - # Clean up response text if needed - response_text = response_text.strip() - if response_text.startswith("```json"): - response_text = response_text[7:] - if response_text.endswith("```"): - response_text = response_text[:-3] - response_text = response_text.strip() - - try: - parsed_json = json.loads(response_text) - logger.info("✅ Hugging Face structured JSON response parsed successfully") - return parsed_json - except json.JSONDecodeError as json_err: - logger.error(f"❌ JSON parsing failed: {json_err}") - logger.error(f"Raw response: {response_text}") - - # Try to extract JSON from the response using regex - json_match = re.search(r'\{.*\}', response_text, re.DOTALL) - if json_match: - try: - extracted_json = json.loads(json_match.group()) - logger.info("✅ JSON extracted using regex fallback") - return extracted_json - except json.JSONDecodeError: - pass - - return {"error": "Failed to parse JSON response", "raw_response": response_text} - - except Exception as e: - logger.error(f"❌ Hugging Face API call failed: {e}") - # If 422 Unprocessable Entity (often due to response_format not supported), retry without it - if "422" in str(e) or "not supported" in str(e).lower() or isinstance(e, NotFoundError): - logger.info("Retrying without response_format...") - response = None - last_error = None -<<<<<<< HEAD -<<<<<<< HEAD - for candidate_model 
in _fallback_model_sequence(model, fallback_models): -======= - for candidate_model in _fallback_model_sequence( - model=model, - fallback_models=fallback_models, - allow_model_variant_fallback=allow_model_variant_fallback, - ): ->>>>>>> pr-418 -======= - fallback_attempt = 0 - for candidate_model in _fallback_model_sequence(model): - fallback_attempt += 1 - started_at = time.perf_counter() ->>>>>>> pr-419 -======= response = None last_error = None @@ -646,7 +409,7 @@ def huggingface_structured_json_response( messages=messages, temperature=temperature, max_tokens=max_tokens, - response_format={"type": "json_object"}, + response_format={"type": "json_object"} ) if candidate_model != model: logger.warning("HF structured fallback model used: {}", candidate_model) @@ -659,7 +422,6 @@ def huggingface_structured_json_response( msg = str(err).lower() if "422" in msg or "not supported" in msg: ->>>>>>> pr-437 try: response = client.chat.completions.create( model=candidate_model, @@ -667,37 +429,19 @@ def huggingface_structured_json_response( temperature=temperature, max_tokens=max_tokens, ) - elapsed_ms = (time.perf_counter() - started_at) * 1000 - logger.debug( - "HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none", - fallback_attempt, - candidate_model, - elapsed_ms, - ) if candidate_model != model: logger.warning("HF structured fallback(no response_format) model: {}", candidate_model) break -<<<<<<< HEAD - except NotFoundError as nf_err: - last_error = nf_err - elapsed_ms = (time.perf_counter() - started_at) * 1000 - logger.debug( - "HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none", - fallback_attempt, - candidate_model, - elapsed_ms, - ) - logger.warning("HF structured model not found (no response_format path): {}", candidate_model) -======= except Exception as second_err: last_error = second_err ->>>>>>> pr-437 continue if response is None: raise last_error or RuntimeError("All fallback models 
failed") - + response_text = (response.choices[0].message.content or "").strip() + + # Clean up response text if needed if response_text.startswith("```json"): response_text = response_text[7:] if response_text.endswith("```"): @@ -705,13 +449,15 @@ def huggingface_structured_json_response( response_text = response_text.strip() try: - return json.loads(response_text) + parsed_json = json.loads(response_text) + logger.info("✅ Hugging Face structured JSON response parsed successfully") + return parsed_json except json.JSONDecodeError: json_match = re.search(r"\{.*\}", response_text, re.DOTALL) if json_match: return json.loads(json_match.group()) return {"error": "Failed to parse JSON response", "raw_response": response_text} - + except Exception as exc: details = _error_details(exc) logger.error( @@ -725,17 +471,31 @@ def huggingface_structured_json_response( def get_available_models() -> list: - """Get list of available Hugging Face models for text generation.""" + """ + Get list of available Hugging Face models for text generation. + + Returns: + list: List of available model identifiers + """ return [ PREMIUM_DEFAULT_MODEL, "moonshotai/Kimi-K2-Instruct-0905:groq", "Qwen/Qwen2.5-VL-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct:groq", "microsoft/Phi-3-medium-4k-instruct:groq", - SIF_LOW_COST_MODEL_DEFAULTS[0], + SIF_LOW_COST_MODEL_DEFAULTS[0] ] def validate_model(model: str) -> bool: - """Validate if a model identifier is supported.""" - return model in get_available_models() + """ + Validate if a model identifier is supported. 
# backend/services/llm_providers/huggingface_provider_clean.py
"""
Hugging Face Provider Module for ALwrity

This module provides functions for interacting with Hugging Face's Inference
Providers router through its OpenAI-compatible chat-completions endpoint
(``https://router.huggingface.co/v1``), using the ``openai`` client library.

Key Features:
- Text response generation with retry logic
- Structured JSON response generation (JSON mode, with the schema embedded in
  the prompt; the reply is parsed but NOT validated against the schema)
- Error classification and logging
- API key lookup from an explicit argument or the ``HF_TOKEN`` env variable
- Explicit fallback model sequences
- Client caching for performance

Best Practices:
1. Use structured output for complex, multi-field responses
2. Keep schemas simple and flat to avoid truncation
3. Set appropriate token limits (8192 for complex outputs)
4. Use low temperature (0.1-0.3) for consistent structured output
5. Implement proper error handling in calling functions

Usage Examples:
    # Text response
    result = huggingface_text_response(prompt, temperature=0.7, max_tokens=2048)

    # Structured JSON response
    schema = {
        "type": "object",
        "properties": {
            "tasks": {
                "type": "array",
                "items": {"type": "object", "properties": {...}}
            }
        }
    }
    result = huggingface_structured_json_response(prompt, schema, temperature=0.2, max_tokens=8192)

Dependencies:
- openai (client for the OpenAI-compatible Hugging Face router)
- tenacity (for retry logic)
- loguru / utils.logger_utils (for logging)
- json (for response parsing)

Author: ALwrity Team
Version: 1.0
Last Updated: January 2025
"""

import os
import json
import re
from functools import lru_cache
from typing import Optional, Dict, Any, List, Iterable, Iterator

from loguru import logger  # noqa: F401  (rebound to the service logger below)
from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_random_exponential,
)

from utils.logger_utils import get_service_logger
from .routing_policy import PREMIUM_DEFAULT_MODEL, SIF_LOW_COST_MODEL_DEFAULTS

# Use service-specific logger to avoid conflicts
logger = get_service_logger("huggingface_provider")

try:
    from openai import OpenAI
    from openai import NotFoundError
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    # Keep both names defined so the rest of the module can reference them.
    OpenAI = None
    NotFoundError = Exception
    # `warning`, not `warn`: loguru loggers do not provide a `warn` alias.
    logger.warning("OpenAI library not available. Install with: pip install openai")

# Models tried, in order, after the caller's preferred model fails.
HF_FALLBACK_MODELS = [
    PREMIUM_DEFAULT_MODEL,
    "moonshotai/Kimi-K2-Instruct-0905:groq",
    "meta-llama/Llama-3.1-8B-Instruct:groq",
    SIF_LOW_COST_MODEL_DEFAULTS[0],
]

# Substrings (matched against the lower-cased error text) used to decide
# whether to retry and how to label an error in the logs.
_AUTH_KEYWORDS = ("unauthorized", "forbidden", "401", "403")
_BILLING_KEYWORDS = ("insufficient", "quota", "billing", "payment", "credits", "balance")
_RATE_LIMIT_KEYWORDS = ("rate limit", "429")
_SERVER_ERROR_KEYWORDS = ("timeout", "500", "502", "503", "504")

# Deterministic local failures (missing library, missing/invalid token):
# retrying them can never succeed.
_NON_RETRYABLE_TYPES = (ImportError, ValueError)

# Leading ```lang fence or trailing ``` fence around a model reply.
_JSON_FENCE_RE = re.compile(r"^```[a-zA-Z]*\s*|\s*```$")


def _contains_any(text: str, keywords: Iterable[str]) -> bool:
    """Return True if any of *keywords* occurs as a substring of *text*."""
    return any(keyword in text for keyword in keywords)


def _exception_chain(exc: Optional[BaseException]) -> Iterator[BaseException]:
    """Yield *exc* followed by each exception linked through ``__cause__``."""
    visited = set()
    while exc is not None and id(exc) not in visited:
        visited.add(id(exc))
        yield exc
        exc = exc.__cause__


def _should_retry_hf_error(exc: Exception) -> bool:
    """
    Decide whether tenacity should retry after *exc*.

    The public functions in this module re-raise every failure as a generic
    ``Exception(...) from original``, so the original exception is inspected
    through the ``__cause__`` chain rather than on *exc* alone.

    Args:
        exc: The exception raised by the decorated call.

    Returns:
        bool: False for model-not-found, local configuration errors
        (``ImportError`` / ``ValueError``), authentication and billing/quota
        errors; True for everything else (rate limits, server errors and
        unknown errors).
    """
    for link in _exception_chain(exc):
        # NotFoundError is resolved at call time; it is `Exception` when the
        # openai package is missing, in which case nothing is retried.
        if isinstance(link, NotFoundError) or isinstance(link, _NON_RETRYABLE_TYPES):
            return False

    msg = str(exc).lower()
    # Don't retry authentication errors
    if _contains_any(msg, _AUTH_KEYWORDS + ("invalid api key",)):
        return False
    # Don't retry billing/quota errors
    if _contains_any(msg, _BILLING_KEYWORDS):
        return False
    # Rate limiting, server errors and anything unrecognised are retried.
    return True


def _classify_hf_error(exc: Exception) -> str:
    """
    Map an error to a coarse category label for log output.

    Classification is purely textual (substring match on ``str(exc)``) and the
    first matching category wins, in this order: billing, auth, not found,
    rate limit, server error.

    Returns:
        str: One of ``"billing_or_quota"``, ``"auth_or_permission"``,
        ``"model_not_found"``, ``"rate_limit"``, ``"server_error"``,
        ``"unknown"``.
    """
    msg = str(exc).lower()
    categories = (
        ("billing_or_quota", _BILLING_KEYWORDS),
        ("auth_or_permission", _AUTH_KEYWORDS),
        ("model_not_found", ("not found", "404")),
        ("rate_limit", _RATE_LIMIT_KEYWORDS),
        ("server_error", _SERVER_ERROR_KEYWORDS),
    )
    for label, keywords in categories:
        if _contains_any(msg, keywords):
            return label
    return "unknown"


def _error_details(exc: Exception) -> Dict[str, str]:
    """Return the type name, message and repr of *exc* for logging."""
    return {
        "type": type(exc).__name__,
        "message": str(exc),
        "repr": repr(exc),
    }


def _candidate_model_variants(model: str) -> Iterator[str]:
    """
    Yield model ids to try for a single logical model preference.

    The configured id is yielded first (it may carry a provider suffix such
    as ``":groq"``); if it contains a colon, the part before the first colon
    is yielded as well, as a fallback for when the suffix is not accepted.
    Nothing is yielded for an empty *model*.
    """
    if not model:
        return

    yield model

    base_model, separator, _ = model.partition(":")
    if separator and base_model:
        yield base_model


def _fallback_model_sequence(
    model: str,
    fallback_models: Optional[Iterable[str]] = None,
) -> Iterator[str]:
    """
    Yield the de-duplicated sequence of model ids to attempt, in order.

    Args:
        model: Preferred model id; always tried first.
        fallback_models: Optional custom fallbacks (any iterable). When it is
            ``None`` or empty, ``HF_FALLBACK_MODELS`` is used instead.

    Yields:
        str: Each candidate id (including suffix-less variants) exactly once.
    """
    # Materialise first so tuples/generators work as well as lists.
    custom = list(fallback_models) if fallback_models is not None else []
    sequence = [model, *(custom or HF_FALLBACK_MODELS)]

    seen = set()
    for preferred_model in sequence:
        for candidate in _candidate_model_variants(preferred_model):
            if candidate and candidate not in seen:
                seen.add(candidate)
                yield candidate


def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str:
    """
    Return the Hugging Face API key to use.

    Args:
        explicit_api_key: Key supplied by the caller; takes precedence over
            the ``HF_TOKEN`` environment variable.

    Returns:
        str: The key with surrounding whitespace removed.

    Raises:
        ValueError: If no key is available or it does not start with ``hf_``.
    """
    # Strip stray whitespace/newlines that commonly sneak in via .env files.
    api_key = (explicit_api_key or "").strip() or (os.getenv('HF_TOKEN') or "").strip()
    if not api_key:
        error_msg = "HF_TOKEN environment variable is not set. Please set it in your .env file."
        logger.error(error_msg)
        raise ValueError(error_msg)

    # Validate API key format (basic check)
    if not api_key.startswith('hf_'):
        error_msg = "HF_TOKEN appears to be invalid. It should start with 'hf_'."
        logger.error(error_msg)
        raise ValueError(error_msg)

    return api_key


@lru_cache(maxsize=16)
def _get_hf_client(api_key: str):
    """
    Return an OpenAI client bound to the Hugging Face router.

    Clients are cached per API key (up to 16 distinct keys).

    Raises:
        ImportError: If the ``openai`` package is not installed.
    """
    if OpenAI is None:
        raise ImportError("OpenAI library not available. Install with: pip install openai")
    return OpenAI(base_url="https://router.huggingface.co/v1", api_key=api_key)


def _build_messages(user_content: str, system_prompt: Optional[str]) -> List[Dict[str, str]]:
    """Build the chat message list: optional system message, then the user message."""
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_content})
    return messages


def _parse_json_payload(raw_text: str) -> Any:
    """
    Parse a model reply as JSON, tolerating Markdown fences and extra text.

    Steps: strip a leading/trailing code fence, try ``json.loads`` on the
    whole text, then on the outermost ``{...}`` span. If both attempts fail,
    an error dict is returned instead of raising.

    Returns:
        The parsed JSON value, or
        ``{"error": "Failed to parse JSON response", "raw_response": <text>}``.
    """
    text = _JSON_FENCE_RE.sub("", raw_text.strip()).strip()

    try:
        parsed = json.loads(text)
        logger.info("✅ Hugging Face structured JSON response parsed successfully")
        return parsed
    except json.JSONDecodeError:
        pass

    json_match = re.search(r"\{.*\}", text, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            # Fall through to the error payload rather than propagating.
            pass

    return {"error": "Failed to parse JSON response", "raw_response": text}


@retry(
    retry=retry_if_exception(_should_retry_hf_error),
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
)
def huggingface_text_response(
    prompt: str,
    model: str = PREMIUM_DEFAULT_MODEL,
    fallback_models: Optional[List[str]] = None,
    temperature: float = 0.7,
    max_tokens: int = 2048,
    top_p: float = 0.9,
    system_prompt: Optional[str] = None,
    api_key: Optional[str] = None,
) -> str:
    """
    Generate a text response through the Hugging Face router (chat completion).

    Each attempt walks the fallback sequence (preferred model, then
    ``fallback_models`` or ``HF_FALLBACK_MODELS``) until one call succeeds.
    The whole function is retried by tenacity (up to 6 attempts, random
    exponential backoff between 1 and 60 seconds) when
    ``_should_retry_hf_error`` considers the failure transient.

    Args:
        prompt (str): The input prompt for the AI model
        model (str): Hugging Face model identifier (default: PREMIUM_DEFAULT_MODEL)
        fallback_models (list, optional): Custom fallback models to try
        temperature (float): Controls randomness (0.0-1.0)
        max_tokens (int): Maximum tokens in response
        top_p (float): Nucleus sampling parameter (0.0-1.0)
        system_prompt (str, optional): System instruction for the model
        api_key (str, optional): Explicit API key override

    Returns:
        str: Generated text with Markdown code-fence markers removed and
        surrounding whitespace stripped

    Raises:
        Exception: Wraps any failure (missing library, missing/invalid API
            key, or all candidate models failing); the original error is
            attached as ``__cause__``. Once the retry budget is exhausted
            tenacity raises its own ``RetryError`` (``reraise`` is not set).
    """
    try:
        if not OPENAI_AVAILABLE:
            raise ImportError("OpenAI library not available. Install with: pip install openai")

        # Get API key with proper error handling
        hf_api_key = get_huggingface_api_key(api_key)
        logger.info(f"🔑 Hugging Face API key loaded: {bool(hf_api_key)} (length: {len(hf_api_key) if hf_api_key else 0})")

        # Initialize Hugging Face client
        client = _get_hf_client(hf_api_key)
        logger.info("✅ Hugging Face client initialized for text response")

        messages = _build_messages(prompt, system_prompt)

        logger.info(
            "Hugging Face text call | model={} | prompt_len={} | temp={} | top_p={} | max_tokens={}",
            model,
            len(prompt) if isinstance(prompt, str) else '',
            temperature,
            top_p,
            max_tokens,
        )

        logger.info("🚀 Making Hugging Face API call (chat completion)...")

        response = None
        last_error = None
        for candidate_model in _fallback_model_sequence(model, fallback_models):
            try:
                response = client.chat.completions.create(
                    model=candidate_model,
                    messages=messages,
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_tokens,
                )
            except NotFoundError as nf_err:
                last_error = nf_err
                logger.warning("HF text model not found: {}", candidate_model)
                continue
            except Exception as call_err:
                last_error = call_err
                logger.warning("HF text call failed for model {}: {}", candidate_model, _error_details(call_err))
                continue

            if candidate_model != model:
                logger.warning("HF text fallback model used: {}", candidate_model)
            break

        if response is None:
            raise last_error or RuntimeError("All fallback models failed")

        # Extract text from response
        generated_text = response.choices[0].message.content or ""

        # Remove Markdown fence markers (``` optionally followed by a language
        # tag and a newline); the fenced content itself is kept.
        generated_text = re.sub(r'```[a-zA-Z]*\n?', '', generated_text)
        generated_text = generated_text.strip()

        logger.info(f"✅ Hugging Face text response generated successfully (length: {len(generated_text)})")
        return generated_text

    except Exception as exc:
        details = _error_details(exc)
        logger.error(
            "❌ Hugging Face text generation failed | error_class={} | type={} | message={} | repr={}",
            _classify_hf_error(exc),
            details["type"],
            details["message"],
            details["repr"],
        )
        raise Exception(f"Hugging Face text generation failed: {exc}") from exc


@retry(
    retry=retry_if_exception(_should_retry_hf_error),
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
)
def huggingface_structured_json_response(
    prompt: str,
    schema: Dict[str, Any],
    model: str = PREMIUM_DEFAULT_MODEL,
    fallback_models: Optional[List[str]] = None,
    temperature: float = 0.7,
    max_tokens: int = 8192,
    system_prompt: Optional[str] = None,
    api_key: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Generate a structured JSON response through the Hugging Face router.

    The schema is serialised into the user message as guidance and the call
    is made with ``response_format={"type": "json_object"}``. If a model
    rejects that option (error text containing "422" or "not supported") the
    same model is retried once without it. The reply is parsed as JSON but is
    not validated against *schema*.

    The whole function is retried by tenacity (up to 6 attempts, random
    exponential backoff between 1 and 60 seconds) when
    ``_should_retry_hf_error`` considers the failure transient.

    Args:
        prompt (str): The input prompt for the AI model
        schema (dict): JSON schema describing the expected output structure
        model (str): Hugging Face model identifier (default: PREMIUM_DEFAULT_MODEL)
        fallback_models (list, optional): Custom fallback models to try
        temperature (float): Controls randomness (0.0-1.0). Use 0.1-0.3 for structured output
        max_tokens (int): Maximum tokens in response. Use 8192 for complex outputs
        system_prompt (str, optional): System instruction for the model
        api_key (str, optional): Explicit API key override

    Returns:
        dict: The parsed JSON reply, or
        ``{"error": "Failed to parse JSON response", "raw_response": ...}``
        when the reply cannot be parsed.

    Raises:
        Exception: Wraps any failure (missing library, missing/invalid API
            key, or all candidate models failing); the original error is
            attached as ``__cause__``. Once the retry budget is exhausted
            tenacity raises its own ``RetryError`` (``reraise`` is not set).

    Best Practices:
        - Keep schemas simple and flat to avoid truncation
        - Use low temperature (0.1-0.3) for consistent structured output
        - Set max_tokens to 8192 for complex multi-field responses
        - Avoid deeply nested schemas with many required fields
        - Test with smaller outputs first, then scale up
    """
    try:
        if not OPENAI_AVAILABLE:
            raise ImportError("OpenAI library not available. Install with: pip install openai")

        # Get API key with proper error handling
        hf_api_key = get_huggingface_api_key(api_key)
        logger.info(f"🔑 Hugging Face API key loaded: {bool(hf_api_key)} (length: {len(hf_api_key) if hf_api_key else 0})")

        # Initialize OpenAI client with Hugging Face base URL
        client = _get_hf_client(hf_api_key)
        logger.info("✅ Hugging Face client initialized for structured JSON response")

        json_instruction = "Please respond with valid JSON that matches the provided schema."

        logger.info(
            "Hugging Face structured call | model={} | prompt_len={} | schema_kind={} | temp={} | max_tokens={}",
            model,
            len(prompt) if isinstance(prompt, str) else '',
            type(schema).__name__,
            temperature,
            max_tokens,
        )

        logger.info("🚀 Making Hugging Face structured API call...")

        # User message = prompt + JSON instruction + the schema as guidance.
        json_schema_str = json.dumps(schema, indent=2)
        user_content = f"{prompt}\n\n{json_instruction}\n\nJSON Schema:\n{json_schema_str}"
        messages = _build_messages(user_content, system_prompt)

        response = None
        last_error = None

        for candidate_model in _fallback_model_sequence(model, fallback_models):
            try:
                response = client.chat.completions.create(
                    model=candidate_model,
                    messages=messages,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    response_format={"type": "json_object"},
                )
                if candidate_model != model:
                    logger.warning("HF structured fallback model used: {}", candidate_model)
                break
            except Exception as err:
                last_error = err
                if isinstance(err, NotFoundError):
                    logger.warning("HF structured model not found: {}", candidate_model)
                    continue

                msg = str(err).lower()
                if "422" not in msg and "not supported" not in msg:
                    logger.warning(
                        "HF structured call failed for model {}: {}",
                        candidate_model,
                        _error_details(err),
                    )
                    continue

                # The model rejected JSON mode: retry once without it.
                try:
                    response = client.chat.completions.create(
                        model=candidate_model,
                        messages=messages,
                        temperature=temperature,
                        max_tokens=max_tokens,
                    )
                    if candidate_model != model:
                        logger.warning("HF structured fallback(no response_format) model: {}", candidate_model)
                    break
                except Exception as second_err:
                    last_error = second_err
                    logger.warning(
                        "HF structured call failed for model {}: {}",
                        candidate_model,
                        _error_details(second_err),
                    )
                    continue

        if response is None:
            raise last_error or RuntimeError("All fallback models failed")

        response_text = (response.choices[0].message.content or "").strip()
        return _parse_json_payload(response_text)

    except Exception as exc:
        details = _error_details(exc)
        logger.error(
            "❌ Hugging Face structured JSON generation failed | error_class={} | type={} | message={} | repr={}",
            _classify_hf_error(exc),
            details["type"],
            details["message"],
            details["repr"],
        )
        raise Exception(f"Hugging Face structured JSON generation failed: {exc}") from exc


def get_available_models() -> List[str]:
    """
    Get the list of model identifiers this provider advertises.

    Returns:
        list: Model identifiers (a fresh list on every call)
    """
    return [
        PREMIUM_DEFAULT_MODEL,
        "moonshotai/Kimi-K2-Instruct-0905:groq",
        "Qwen/Qwen2.5-VL-7B-Instruct",
        "meta-llama/Llama-3.1-8B-Instruct:groq",
        "microsoft/Phi-3-medium-4k-instruct:groq",
        SIF_LOW_COST_MODEL_DEFAULTS[0],
    ]


def validate_model(model: str) -> bool:
    """
    Check whether a model identifier is in the advertised model list.

    Args:
        model (str): Model identifier to validate

    Returns:
        bool: True if *model* exactly matches an entry of
        ``get_available_models()``, False otherwise
    """
    return model in get_available_models()