diff --git a/backend/services/llm_providers/huggingface_provider.py b/backend/services/llm_providers/huggingface_provider.py index c3573f47..0606aa0e 100644 --- a/backend/services/llm_providers/huggingface_provider.py +++ b/backend/services/llm_providers/huggingface_provider.py @@ -46,15 +46,26 @@ Version: 1.0 Last Updated: January 2025 """ +<<<<<<< HEAD import os +======= +import hashlib +>>>>>>> pr-419 import json +import os import re <<<<<<< HEAD +<<<<<<< HEAD from functools import lru_cache from typing import Optional, Dict, Any ======= from typing import Optional, Dict, Any, List, Iterable >>>>>>> pr-418 +======= +import time +from threading import Lock +from typing import Optional, Dict, Any +>>>>>>> pr-419 from loguru import logger from utils.logger_utils import get_service_logger @@ -95,6 +106,31 @@ HF_FALLBACK_MODELS = [ >>>>>>> pr-417 ] +_HF_CLIENT_CACHE: Dict[str, Any] = {} +_HF_CLIENT_CACHE_LOCK = Lock() + + +def _masked_key_id(api_key: str) -> str: + return hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:12] + + +def get_huggingface_client(api_key: str): + """Get or create a cached Hugging Face/OpenAI-compatible client for the API key.""" + key_id = _masked_key_id(api_key) + with _HF_CLIENT_CACHE_LOCK: + cached_client = _HF_CLIENT_CACHE.get(key_id) + if cached_client is not None: + logger.debug("Reusing cached Hugging Face client for key_id={}", key_id) + return cached_client + + client = OpenAI( + base_url="https://router.huggingface.co/hf/v1", + api_key=api_key, + ) + _HF_CLIENT_CACHE[key_id] = client + logger.debug("Created new Hugging Face client for key_id={}", key_id) + return client + def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True): """Yield model ids to try for a single logical model preference.""" @@ -235,6 +271,7 @@ def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str: return api_key +<<<<<<< HEAD <<<<<<< HEAD @retry( retry=retry_if_exception(_should_retry_hf_error), @@ -248,6 +285,13 @@ def _get_hf_client(api_key: str): >>>>>>> pr-416 +======= +@retry( + wait=wait_random_exponential(min=0.5, max=8), + stop=stop_after_attempt(3), + reraise=True, +) +>>>>>>> pr-419 def huggingface_text_response( prompt: str, <<<<<<< HEAD @@ -317,6 +361,7 @@ def huggingface_text_response( if not api_key: raise Exception("HF_TOKEN not found in environment variables") +<<<<<<< HEAD # Initialize Hugging Face client <<<<<<< HEAD client = OpenAI( @@ -326,6 +371,10 @@ def huggingface_text_response( ======= client = _get_hf_client(api_key) >>>>>>> pr-416 +======= + # Initialize/reuse Hugging Face client + client = get_huggingface_client(api_key) +>>>>>>> pr-419 logger.info("✅ Hugging Face client initialized for text response") # Prepare input for the API @@ -356,6 +405,7 @@ def huggingface_text_response( logger.info("🚀 Making Hugging Face API call (chat completion)...") +<<<<<<< HEAD <<<<<<< HEAD # Add rate limiting to prevent expensive API calls import time @@ -380,6 +430,14 @@ def huggingface_text_response( fallback_models=fallback_models, allow_model_variant_fallback=allow_model_variant_fallback, ): +======= + response = None + last_error = None + fallback_attempt = 0 + for candidate_model in _fallback_model_sequence(model): + fallback_attempt += 1 + started_at = time.perf_counter() +>>>>>>> pr-419 try: response = client.chat.completions.create( model=candidate_model, @@ -388,11 +446,25 @@ def huggingface_text_response( top_p=top_p, max_tokens=max_tokens ) + elapsed_ms = (time.perf_counter() - started_at) * 1000 + logger.debug( + "HF text attempt={} model={} elapsed_ms={:.2f}", + fallback_attempt, + candidate_model, + elapsed_ms, + ) if candidate_model != model: logger.warning("HF text generation switched to fallback model: {}", candidate_model) break except NotFoundError as nf_err: last_error = nf_err + elapsed_ms = (time.perf_counter() - started_at) * 1000 + logger.debug( + "HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found", + fallback_attempt, + candidate_model, + elapsed_ms, + ) logger.warning("HF model not found: {}. Trying fallback model.", candidate_model) continue @@ -438,6 +510,14 @@ def huggingface_text_response( >>>>>>> pr-416 raise Exception(f"Hugging Face text generation failed: {str(e)}") +<<<<<<< HEAD +======= +@retry( + wait=wait_random_exponential(min=0.5, max=8), + stop=stop_after_attempt(3), + reraise=True, +) +>>>>>>> pr-419 def huggingface_structured_json_response( prompt: str, schema: Dict[str, Any], @@ -513,6 +593,7 @@ def huggingface_structured_json_response( if not api_key: raise Exception("HF_TOKEN not found in environment variables") +<<<<<<< HEAD # Initialize OpenAI client with Hugging Face base URL # Use standard Inference API endpoint <<<<<<< HEAD @@ -523,6 +604,10 @@ def huggingface_structured_json_response( ======= client = _get_hf_client(api_key) >>>>>>> pr-416 +======= + # Initialize/reuse OpenAI client with Hugging Face base URL + client = get_huggingface_client(api_key) +>>>>>>> pr-419 logger.info("✅ Hugging Face client initialized for structured JSON response") # Prepare input for the API @@ -566,6 +651,7 @@ def huggingface_structured_json_response( <<<<<<< HEAD response = None last_error = None +<<<<<<< HEAD <<<<<<< HEAD for candidate_model in _fallback_model_sequence(model, fallback_models): ======= @@ -575,6 +661,12 @@ def huggingface_structured_json_response( allow_model_variant_fallback=allow_model_variant_fallback, ): >>>>>>> pr-418 +======= + fallback_attempt = 0 + for candidate_model in _fallback_model_sequence(model): + fallback_attempt += 1 + started_at = time.perf_counter() +>>>>>>> pr-419 try: response = client.chat.completions.create( model=candidate_model, @@ -583,11 +675,25 @@ def huggingface_structured_json_response( max_tokens=max_tokens, response_format={"type": "json_object"} # Try to enforce JSON mode if supported ) + elapsed_ms = (time.perf_counter() - started_at) * 1000 + logger.debug( + "HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object", + fallback_attempt, + candidate_model, + elapsed_ms, + ) if candidate_model != model: logger.warning("HF structured generation switched to fallback model: {}", candidate_model) break except NotFoundError as nf_err: last_error = nf_err + elapsed_ms = (time.perf_counter() - started_at) * 1000 + logger.debug( + "HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object", + fallback_attempt, + candidate_model, + elapsed_ms, + ) logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model) continue @@ -631,6 +737,7 @@ def huggingface_structured_json_response( logger.info("Retrying without response_format...") response = None last_error = None +<<<<<<< HEAD <<<<<<< HEAD for candidate_model in _fallback_model_sequence(model, fallback_models): ======= @@ -640,6 +747,12 @@ def huggingface_structured_json_response( allow_model_variant_fallback=allow_model_variant_fallback, ): >>>>>>> pr-418 +======= + fallback_attempt = 0 + for candidate_model in _fallback_model_sequence(model): + fallback_attempt += 1 + started_at = time.perf_counter() +>>>>>>> pr-419 try: response = client.chat.completions.create( model=candidate_model, @@ -647,11 +760,25 @@ def huggingface_structured_json_response( temperature=temperature, max_tokens=max_tokens ) + elapsed_ms = (time.perf_counter() - started_at) * 1000 + logger.debug( + "HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none", + fallback_attempt, + candidate_model, + elapsed_ms, + ) if candidate_model != model: logger.warning("HF structured no-response_format fallback model: {}", candidate_model) break except NotFoundError as nf_err: last_error = nf_err + elapsed_ms = (time.perf_counter() - started_at) * 1000 + logger.debug( + "HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none", + fallback_attempt, + candidate_model, + elapsed_ms, + ) logger.warning("HF structured model not found (no response_format path): {}", candidate_model) continue =======