Refine Hugging Face provider retries and client reuse

This commit is contained in:
ي
2026-03-12 15:04:16 +05:30
parent b410ece4ca
commit 7df7d870e5

View File

@@ -46,28 +46,14 @@ Version: 1.0
Last Updated: January 2025
"""
import os
import sys
from pathlib import Path
import hashlib
import json
import os
import re
import time
from threading import Lock
from typing import Optional, Dict, Any
from dotenv import load_dotenv
# Fix the environment loading path - load from backend directory
current_dir = Path(__file__).parent.parent # services directory
backend_dir = current_dir.parent # backend directory
env_path = backend_dir / '.env'
if env_path.exists():
load_dotenv(env_path)
print(f"Loaded .env from: {env_path}")
else:
# Fallback to current directory
load_dotenv()
print(f"No .env found at {env_path}, using current directory")
from loguru import logger
from utils.logger_utils import get_service_logger
@@ -96,6 +82,31 @@ HF_FALLBACK_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3:groq",
]
# Module-level cache of Hugging Face/OpenAI-compatible client objects so that
# get_huggingface_client() can hand back an existing client instead of building
# a new one on every call. Keys are hash-derived identifiers of the API key,
# not the raw key itself.
# NOTE(review): no eviction or size bound is visible in this view — confirm the
# number of distinct API keys stays small.
_HF_CLIENT_CACHE: Dict[str, Any] = {}
# Held by get_huggingface_client() around its lookup-and-insert on
# _HF_CLIENT_CACHE.
_HF_CLIENT_CACHE_LOCK = Lock()
def _masked_key_id(api_key: str) -> str:
return hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:12]
def get_huggingface_client(api_key: str):
    """Get or create a cached Hugging Face/OpenAI-compatible client for the API key.

    The first call for a given key builds an ``OpenAI`` client pointed at the
    Hugging Face router and stores it in ``_HF_CLIENT_CACHE``; later calls with
    the same key return that same client object.

    Args:
        api_key: Hugging Face token passed to the client as ``api_key``.

    Returns:
        The cached (or newly created) ``OpenAI`` client for ``api_key``.
    """
    # Short id used ONLY in log lines. It is a 48-bit truncation of the digest,
    # so it must not decide which client a caller receives.
    key_id = _masked_key_id(api_key)
    # Cache identity is the full SHA-256 digest: two different API keys can no
    # longer be handed each other's client through a truncated-hash collision,
    # and the raw key is still never used as a dictionary key.
    cache_key = hashlib.sha256(api_key.encode("utf-8")).hexdigest()

    # Lookup and insert happen under one lock acquisition so concurrent callers
    # with the same key do not each construct their own client.
    with _HF_CLIENT_CACHE_LOCK:
        client = _HF_CLIENT_CACHE.get(cache_key)
        if client is not None:
            logger.debug("Reusing cached Hugging Face client for key_id={}", key_id)
            return client

        client = OpenAI(
            base_url="https://router.huggingface.co/hf/v1",
            api_key=api_key,
        )
        _HF_CLIENT_CACHE[cache_key] = client
        logger.debug("Created new Hugging Face client for key_id={}", key_id)
        return client
def _candidate_model_variants(model: str):
"""Yield model ids to try for a single logical model preference."""
@@ -137,7 +148,11 @@ def get_huggingface_api_key() -> str:
return api_key
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@retry(
wait=wait_random_exponential(min=0.5, max=8),
stop=stop_after_attempt(3),
reraise=True,
)
def huggingface_text_response(
prompt: str,
model: str = "openai/gpt-oss-120b:groq",
@@ -192,11 +207,8 @@ def huggingface_text_response(
if not api_key:
raise Exception("HF_TOKEN not found in environment variables")
# Initialize Hugging Face client
client = OpenAI(
base_url=f"https://router.huggingface.co/hf/v1",
api_key=api_key,
)
# Initialize/reuse Hugging Face client
client = get_huggingface_client(api_key)
logger.info("✅ Hugging Face client initialized for text response")
# Prepare input for the API
@@ -227,13 +239,12 @@ def huggingface_text_response(
logger.info("🚀 Making Hugging Face API call (chat completion)...")
# Add rate limiting to prevent expensive API calls
import time
time.sleep(1) # 1 second delay between API calls
response = None
last_error = None
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
try:
response = client.chat.completions.create(
model=candidate_model,
@@ -242,11 +253,25 @@ def huggingface_text_response(
top_p=top_p,
max_tokens=max_tokens
)
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF text attempt={} model={} elapsed_ms={:.2f}",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model:
logger.warning("HF text generation switched to fallback model: {}", candidate_model)
break
except NotFoundError as nf_err:
last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
continue
@@ -270,7 +295,11 @@ def huggingface_text_response(
logger.error(f"❌ Hugging Face text generation failed: {str(e)}")
raise Exception(f"Hugging Face text generation failed: {str(e)}")
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@retry(
wait=wait_random_exponential(min=0.5, max=8),
stop=stop_after_attempt(3),
reraise=True,
)
def huggingface_structured_json_response(
prompt: str,
schema: Dict[str, Any],
@@ -335,12 +364,8 @@ def huggingface_structured_json_response(
if not api_key:
raise Exception("HF_TOKEN not found in environment variables")
# Initialize OpenAI client with Hugging Face base URL
# Use standard Inference API endpoint
client = OpenAI(
base_url=f"https://router.huggingface.co/hf/v1",
api_key=api_key,
)
# Initialize/reuse OpenAI client with Hugging Face base URL
client = get_huggingface_client(api_key)
logger.info("✅ Hugging Face client initialized for structured JSON response")
# Prepare input for the API
@@ -380,14 +405,13 @@ def huggingface_structured_json_response(
json_schema_str = json.dumps(schema, indent=2)
messages[-1]["content"] += f"\n\nJSON Schema:\n{json_schema_str}"
# Add rate limiting to prevent expensive API calls
import time
time.sleep(1) # 1 second delay between API calls
try:
response = None
last_error = None
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
try:
response = client.chat.completions.create(
model=candidate_model,
@@ -396,11 +420,25 @@ def huggingface_structured_json_response(
max_tokens=max_tokens,
response_format={"type": "json_object"} # Try to enforce JSON mode if supported
)
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model:
logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
break
except NotFoundError as nf_err:
last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
continue
@@ -444,7 +482,10 @@ def huggingface_structured_json_response(
logger.info("Retrying without response_format...")
response = None
last_error = None
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
try:
response = client.chat.completions.create(
model=candidate_model,
@@ -452,11 +493,25 @@ def huggingface_structured_json_response(
temperature=temperature,
max_tokens=max_tokens
)
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model:
logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
break
except NotFoundError as nf_err:
last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
continue