Refine Hugging Face provider retries and client reuse

This commit is contained in:
ي
2026-03-12 15:04:16 +05:30
parent b410ece4ca
commit 7df7d870e5

View File

@@ -46,28 +46,14 @@ Version: 1.0
Last Updated: January 2025 Last Updated: January 2025
""" """
import os import hashlib
import sys
from pathlib import Path
import json import json
import os
import re import re
import time
from threading import Lock
from typing import Optional, Dict, Any from typing import Optional, Dict, Any
from dotenv import load_dotenv
# Fix the environment loading path - load from backend directory
current_dir = Path(__file__).parent.parent # services directory
backend_dir = current_dir.parent # backend directory
env_path = backend_dir / '.env'
if env_path.exists():
load_dotenv(env_path)
print(f"Loaded .env from: {env_path}")
else:
# Fallback to current directory
load_dotenv()
print(f"No .env found at {env_path}, using current directory")
from loguru import logger from loguru import logger
from utils.logger_utils import get_service_logger from utils.logger_utils import get_service_logger
@@ -96,6 +82,31 @@ HF_FALLBACK_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3:groq", "mistralai/Mistral-7B-Instruct-v0.3:groq",
] ]
# Module-level cache of Hugging Face client objects. Entries are stored under a
# SHA-256-derived identifier of the API key, so the raw key is never used as a
# dictionary key.
_HF_CLIENT_CACHE: Dict[str, Any] = {}
# Held by get_huggingface_client() while it reads and updates the cache.
_HF_CLIENT_CACHE_LOCK = Lock()
def _masked_key_id(api_key: str) -> str:
return hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:12]
def get_huggingface_client(api_key: str):
    """Get or create a cached Hugging Face/OpenAI-compatible client for the API key.

    Clients are cached per API key so repeated calls with the same key reuse
    one client object instead of constructing a new one each time.

    Args:
        api_key: Hugging Face token handed to the client as ``api_key``.

    Returns:
        The cached ``OpenAI`` client for this key, or a newly created client
        configured with the Hugging Face router base URL.
    """
    # Key the cache on the FULL SHA-256 digest. The 12-character id is a
    # truncated hash meant for log output; using it as the cache key would let
    # two different API keys that share a prefix receive the same client, and
    # therefore the wrong credentials.
    cache_key = hashlib.sha256(api_key.encode("utf-8")).hexdigest()
    # Short identifier used in log lines only, so the raw key is never logged.
    key_id = _masked_key_id(api_key)

    # Lookup, creation and insertion all happen under the lock so concurrent
    # callers with the same key end up sharing a single client instance.
    with _HF_CLIENT_CACHE_LOCK:
        cached_client = _HF_CLIENT_CACHE.get(cache_key)
        if cached_client is not None:
            logger.debug("Reusing cached Hugging Face client for key_id={}", key_id)
            return cached_client

        client = OpenAI(
            base_url="https://router.huggingface.co/hf/v1",
            api_key=api_key,
        )
        _HF_CLIENT_CACHE[cache_key] = client
        logger.debug("Created new Hugging Face client for key_id={}", key_id)
        return client
def _candidate_model_variants(model: str): def _candidate_model_variants(model: str):
"""Yield model ids to try for a single logical model preference.""" """Yield model ids to try for a single logical model preference."""
@@ -137,7 +148,11 @@ def get_huggingface_api_key() -> str:
return api_key return api_key
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) @retry(
wait=wait_random_exponential(min=0.5, max=8),
stop=stop_after_attempt(3),
reraise=True,
)
def huggingface_text_response( def huggingface_text_response(
prompt: str, prompt: str,
model: str = "openai/gpt-oss-120b:groq", model: str = "openai/gpt-oss-120b:groq",
@@ -192,11 +207,8 @@ def huggingface_text_response(
if not api_key: if not api_key:
raise Exception("HF_TOKEN not found in environment variables") raise Exception("HF_TOKEN not found in environment variables")
# Initialize Hugging Face client # Initialize/reuse Hugging Face client
client = OpenAI( client = get_huggingface_client(api_key)
base_url=f"https://router.huggingface.co/hf/v1",
api_key=api_key,
)
logger.info("✅ Hugging Face client initialized for text response") logger.info("✅ Hugging Face client initialized for text response")
# Prepare input for the API # Prepare input for the API
@@ -227,13 +239,12 @@ def huggingface_text_response(
logger.info("🚀 Making Hugging Face API call (chat completion)...") logger.info("🚀 Making Hugging Face API call (chat completion)...")
# Add rate limiting to prevent expensive API calls
import time
time.sleep(1) # 1 second delay between API calls
response = None response = None
last_error = None last_error = None
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model): for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
try: try:
response = client.chat.completions.create( response = client.chat.completions.create(
model=candidate_model, model=candidate_model,
@@ -242,11 +253,25 @@ def huggingface_text_response(
top_p=top_p, top_p=top_p,
max_tokens=max_tokens max_tokens=max_tokens
) )
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF text attempt={} model={} elapsed_ms={:.2f}",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model: if candidate_model != model:
logger.warning("HF text generation switched to fallback model: {}", candidate_model) logger.warning("HF text generation switched to fallback model: {}", candidate_model)
break break
except NotFoundError as nf_err: except NotFoundError as nf_err:
last_error = nf_err last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model) logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
continue continue
@@ -270,7 +295,11 @@ def huggingface_text_response(
logger.error(f"❌ Hugging Face text generation failed: {str(e)}") logger.error(f"❌ Hugging Face text generation failed: {str(e)}")
raise Exception(f"Hugging Face text generation failed: {str(e)}") raise Exception(f"Hugging Face text generation failed: {str(e)}")
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) @retry(
wait=wait_random_exponential(min=0.5, max=8),
stop=stop_after_attempt(3),
reraise=True,
)
def huggingface_structured_json_response( def huggingface_structured_json_response(
prompt: str, prompt: str,
schema: Dict[str, Any], schema: Dict[str, Any],
@@ -335,12 +364,8 @@ def huggingface_structured_json_response(
if not api_key: if not api_key:
raise Exception("HF_TOKEN not found in environment variables") raise Exception("HF_TOKEN not found in environment variables")
# Initialize OpenAI client with Hugging Face base URL # Initialize/reuse OpenAI client with Hugging Face base URL
# Use standard Inference API endpoint client = get_huggingface_client(api_key)
client = OpenAI(
base_url=f"https://router.huggingface.co/hf/v1",
api_key=api_key,
)
logger.info("✅ Hugging Face client initialized for structured JSON response") logger.info("✅ Hugging Face client initialized for structured JSON response")
# Prepare input for the API # Prepare input for the API
@@ -380,14 +405,13 @@ def huggingface_structured_json_response(
json_schema_str = json.dumps(schema, indent=2) json_schema_str = json.dumps(schema, indent=2)
messages[-1]["content"] += f"\n\nJSON Schema:\n{json_schema_str}" messages[-1]["content"] += f"\n\nJSON Schema:\n{json_schema_str}"
# Add rate limiting to prevent expensive API calls
import time
time.sleep(1) # 1 second delay between API calls
try: try:
response = None response = None
last_error = None last_error = None
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model): for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
try: try:
response = client.chat.completions.create( response = client.chat.completions.create(
model=candidate_model, model=candidate_model,
@@ -396,11 +420,25 @@ def huggingface_structured_json_response(
max_tokens=max_tokens, max_tokens=max_tokens,
response_format={"type": "json_object"} # Try to enforce JSON mode if supported response_format={"type": "json_object"} # Try to enforce JSON mode if supported
) )
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model: if candidate_model != model:
logger.warning("HF structured generation switched to fallback model: {}", candidate_model) logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
break break
except NotFoundError as nf_err: except NotFoundError as nf_err:
last_error = nf_err last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model) logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
continue continue
@@ -444,7 +482,10 @@ def huggingface_structured_json_response(
logger.info("Retrying without response_format...") logger.info("Retrying without response_format...")
response = None response = None
last_error = None last_error = None
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model): for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
try: try:
response = client.chat.completions.create( response = client.chat.completions.create(
model=candidate_model, model=candidate_model,
@@ -452,11 +493,25 @@ def huggingface_structured_json_response(
temperature=temperature, temperature=temperature,
max_tokens=max_tokens max_tokens=max_tokens
) )
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model: if candidate_model != model:
logger.warning("HF structured no-response_format fallback model: {}", candidate_model) logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
break break
except NotFoundError as nf_err: except NotFoundError as nf_err:
last_error = nf_err last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF structured model not found (no response_format path): {}", candidate_model) logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
continue continue