Merge_PR_419_refine_hf_provider_retries_and_client_reuse

This commit is contained in:
ajaysi
2026-03-12 16:22:48 +05:30

View File

@@ -46,15 +46,26 @@ Version: 1.0
Last Updated: January 2025
"""
<<<<<<< HEAD
import os
=======
import hashlib
>>>>>>> pr-419
import json
import os
import re
<<<<<<< HEAD
<<<<<<< HEAD
from functools import lru_cache
from typing import Optional, Dict, Any
=======
from typing import Optional, Dict, Any, List, Iterable
>>>>>>> pr-418
=======
import time
from threading import Lock
from typing import Optional, Dict, Any
>>>>>>> pr-419
from loguru import logger
from utils.logger_utils import get_service_logger
@@ -95,6 +106,31 @@ HF_FALLBACK_MODELS = [
>>>>>>> pr-417
]
_HF_CLIENT_CACHE: Dict[str, Any] = {}
_HF_CLIENT_CACHE_LOCK = Lock()
def _masked_key_id(api_key: str) -> str:
    """Derive a short identifier for an API key without exposing the key itself.

    The identifier is the first 12 hexadecimal characters of the SHA-256
    digest of the UTF-8 encoded key.

    Args:
        api_key: The API key to derive an identifier from.

    Returns:
        A 12-character lowercase hexadecimal string.
    """
    hasher = hashlib.sha256()
    hasher.update(api_key.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:12]
def get_huggingface_client(api_key: str):
    """Get or create a cached Hugging Face/OpenAI-compatible client for the API key.

    Clients are stored in the module-level ``_HF_CLIENT_CACHE`` so repeated
    calls with the same key reuse one client. The lookup and the insert both
    happen while ``_HF_CLIENT_CACHE_LOCK`` is held, so concurrent callers with
    the same key end up sharing a single client.

    Args:
        api_key: Hugging Face token handed to the client as ``api_key``.

    Returns:
        The cached or newly created ``OpenAI`` client configured with the
        Hugging Face router base URL.
    """
    # Short id used only in log output so the raw key never reaches the logs.
    key_id = _masked_key_id(api_key)
    # The cache is keyed on the FULL SHA-256 digest rather than the 12-char
    # log id: a 48-bit prefix is not collision-safe, and a collision would
    # hand one caller a client bound to a different caller's token.
    cache_key = hashlib.sha256(api_key.encode("utf-8")).hexdigest()

    with _HF_CLIENT_CACHE_LOCK:
        cached_client = _HF_CLIENT_CACHE.get(cache_key)
        if cached_client is not None:
            logger.debug("Reusing cached Hugging Face client for key_id={}", key_id)
            return cached_client

        client = OpenAI(
            base_url="https://router.huggingface.co/hf/v1",
            api_key=api_key,
        )
        _HF_CLIENT_CACHE[cache_key] = client
        logger.debug("Created new Hugging Face client for key_id={}", key_id)
        return client
def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True):
"""Yield model ids to try for a single logical model preference."""
@@ -235,6 +271,7 @@ def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str:
return api_key
<<<<<<< HEAD
<<<<<<< HEAD
@retry(
retry=retry_if_exception(_should_retry_hf_error),
@@ -248,6 +285,13 @@ def _get_hf_client(api_key: str):
>>>>>>> pr-416
=======
@retry(
wait=wait_random_exponential(min=0.5, max=8),
stop=stop_after_attempt(3),
reraise=True,
)
>>>>>>> pr-419
def huggingface_text_response(
prompt: str,
<<<<<<< HEAD
@@ -317,6 +361,7 @@ def huggingface_text_response(
if not api_key:
raise Exception("HF_TOKEN not found in environment variables")
<<<<<<< HEAD
# Initialize Hugging Face client
<<<<<<< HEAD
client = OpenAI(
@@ -326,6 +371,10 @@ def huggingface_text_response(
=======
client = _get_hf_client(api_key)
>>>>>>> pr-416
=======
# Initialize/reuse Hugging Face client
client = get_huggingface_client(api_key)
>>>>>>> pr-419
logger.info("✅ Hugging Face client initialized for text response")
# Prepare input for the API
@@ -356,6 +405,7 @@ def huggingface_text_response(
logger.info("🚀 Making Hugging Face API call (chat completion)...")
<<<<<<< HEAD
<<<<<<< HEAD
# Add rate limiting to prevent expensive API calls
import time
@@ -380,6 +430,14 @@ def huggingface_text_response(
fallback_models=fallback_models,
allow_model_variant_fallback=allow_model_variant_fallback,
):
=======
response = None
last_error = None
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
>>>>>>> pr-419
try:
response = client.chat.completions.create(
model=candidate_model,
@@ -388,11 +446,25 @@ def huggingface_text_response(
top_p=top_p,
max_tokens=max_tokens
)
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF text attempt={} model={} elapsed_ms={:.2f}",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model:
logger.warning("HF text generation switched to fallback model: {}", candidate_model)
break
except NotFoundError as nf_err:
last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
continue
@@ -438,6 +510,14 @@ def huggingface_text_response(
>>>>>>> pr-416
raise Exception(f"Hugging Face text generation failed: {str(e)}")
<<<<<<< HEAD
=======
@retry(
wait=wait_random_exponential(min=0.5, max=8),
stop=stop_after_attempt(3),
reraise=True,
)
>>>>>>> pr-419
def huggingface_structured_json_response(
prompt: str,
schema: Dict[str, Any],
@@ -513,6 +593,7 @@ def huggingface_structured_json_response(
if not api_key:
raise Exception("HF_TOKEN not found in environment variables")
<<<<<<< HEAD
# Initialize OpenAI client with Hugging Face base URL
# Use standard Inference API endpoint
<<<<<<< HEAD
@@ -523,6 +604,10 @@ def huggingface_structured_json_response(
=======
client = _get_hf_client(api_key)
>>>>>>> pr-416
=======
# Initialize/reuse OpenAI client with Hugging Face base URL
client = get_huggingface_client(api_key)
>>>>>>> pr-419
logger.info("✅ Hugging Face client initialized for structured JSON response")
# Prepare input for the API
@@ -566,6 +651,7 @@ def huggingface_structured_json_response(
<<<<<<< HEAD
response = None
last_error = None
<<<<<<< HEAD
<<<<<<< HEAD
for candidate_model in _fallback_model_sequence(model, fallback_models):
=======
@@ -575,6 +661,12 @@ def huggingface_structured_json_response(
allow_model_variant_fallback=allow_model_variant_fallback,
):
>>>>>>> pr-418
=======
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
>>>>>>> pr-419
try:
response = client.chat.completions.create(
model=candidate_model,
@@ -583,11 +675,25 @@ def huggingface_structured_json_response(
max_tokens=max_tokens,
response_format={"type": "json_object"} # Try to enforce JSON mode if supported
)
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model:
logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
break
except NotFoundError as nf_err:
last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
continue
@@ -631,6 +737,7 @@ def huggingface_structured_json_response(
logger.info("Retrying without response_format...")
response = None
last_error = None
<<<<<<< HEAD
<<<<<<< HEAD
for candidate_model in _fallback_model_sequence(model, fallback_models):
=======
@@ -640,6 +747,12 @@ def huggingface_structured_json_response(
allow_model_variant_fallback=allow_model_variant_fallback,
):
>>>>>>> pr-418
=======
fallback_attempt = 0
for candidate_model in _fallback_model_sequence(model):
fallback_attempt += 1
started_at = time.perf_counter()
>>>>>>> pr-419
try:
response = client.chat.completions.create(
model=candidate_model,
@@ -647,11 +760,25 @@ def huggingface_structured_json_response(
temperature=temperature,
max_tokens=max_tokens
)
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none",
fallback_attempt,
candidate_model,
elapsed_ms,
)
if candidate_model != model:
logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
break
except NotFoundError as nf_err:
last_error = nf_err
elapsed_ms = (time.perf_counter() - started_at) * 1000
logger.debug(
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none",
fallback_attempt,
candidate_model,
elapsed_ms,
)
logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
continue
=======