Merge_PR_419_refine_hf_provider_retries_and_client_reuse
This commit is contained in:
@@ -46,15 +46,26 @@ Version: 1.0
|
||||
Last Updated: January 2025
|
||||
"""
|
||||
|
||||
<<<<<<< HEAD
|
||||
import os
|
||||
=======
|
||||
import hashlib
|
||||
>>>>>>> pr-419
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
<<<<<<< HEAD
|
||||
<<<<<<< HEAD
|
||||
from functools import lru_cache
|
||||
from typing import Optional, Dict, Any
|
||||
=======
|
||||
from typing import Optional, Dict, Any, List, Iterable
|
||||
>>>>>>> pr-418
|
||||
=======
|
||||
import time
|
||||
from threading import Lock
|
||||
from typing import Optional, Dict, Any
|
||||
>>>>>>> pr-419
|
||||
|
||||
from loguru import logger
|
||||
from utils.logger_utils import get_service_logger
|
||||
@@ -95,6 +106,31 @@ HF_FALLBACK_MODELS = [
|
||||
>>>>>>> pr-417
|
||||
]
|
||||
|
||||
_HF_CLIENT_CACHE: Dict[str, Any] = {}
|
||||
_HF_CLIENT_CACHE_LOCK = Lock()
|
||||
|
||||
|
||||
def _masked_key_id(api_key: str) -> str:
    """Return a short, non-reversible identifier for ``api_key``.

    The identifier is the first 12 hexadecimal characters of the SHA-256
    digest of the UTF-8 encoded key, so it can be written to logs without
    revealing the key itself.
    """
    digest = hashlib.sha256(api_key.encode("utf-8"))
    return digest.hexdigest()[:12]
|
||||
|
||||
|
||||
def get_huggingface_client(api_key: str):
    """Get or create a cached Hugging Face/OpenAI-compatible client for the API key.

    Clients are stored in ``_HF_CLIENT_CACHE`` under the masked id returned by
    ``_masked_key_id`` so the raw key is never used as a dictionary key or
    written to logs. Lookup and creation both happen while holding
    ``_HF_CLIENT_CACHE_LOCK``.

    Args:
        api_key: Hugging Face token used to authenticate the client.

    Returns:
        An ``OpenAI`` client pointed at the Hugging Face router endpoint.
    """
    key_id = _masked_key_id(api_key)
    with _HF_CLIENT_CACHE_LOCK:
        cached_client = _HF_CLIENT_CACHE.get(key_id)
        # key_id is a truncated digest, so two different keys could in
        # principle map to the same id. Only reuse the cached client when it
        # was really built for this key; if the client does not expose
        # ``api_key`` we fall back to trusting the id, as before.
        if (
            cached_client is not None
            and getattr(cached_client, "api_key", api_key) == api_key
        ):
            logger.debug("Reusing cached Hugging Face client for key_id={}", key_id)
            return cached_client

        client = OpenAI(
            base_url="https://router.huggingface.co/hf/v1",
            api_key=api_key,
        )
        _HF_CLIENT_CACHE[key_id] = client
        logger.debug("Created new Hugging Face client for key_id={}", key_id)
        return client
|
||||
|
||||
|
||||
def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True):
|
||||
"""Yield model ids to try for a single logical model preference."""
|
||||
@@ -235,6 +271,7 @@ def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str:
|
||||
|
||||
return api_key
|
||||
|
||||
<<<<<<< HEAD
|
||||
<<<<<<< HEAD
|
||||
@retry(
|
||||
retry=retry_if_exception(_should_retry_hf_error),
|
||||
@@ -248,6 +285,13 @@ def _get_hf_client(api_key: str):
|
||||
|
||||
|
||||
>>>>>>> pr-416
|
||||
=======
|
||||
@retry(
|
||||
wait=wait_random_exponential(min=0.5, max=8),
|
||||
stop=stop_after_attempt(3),
|
||||
reraise=True,
|
||||
)
|
||||
>>>>>>> pr-419
|
||||
def huggingface_text_response(
|
||||
prompt: str,
|
||||
<<<<<<< HEAD
|
||||
@@ -317,6 +361,7 @@ def huggingface_text_response(
|
||||
if not api_key:
|
||||
raise Exception("HF_TOKEN not found in environment variables")
|
||||
|
||||
<<<<<<< HEAD
|
||||
# Initialize Hugging Face client
|
||||
<<<<<<< HEAD
|
||||
client = OpenAI(
|
||||
@@ -326,6 +371,10 @@ def huggingface_text_response(
|
||||
=======
|
||||
client = _get_hf_client(api_key)
|
||||
>>>>>>> pr-416
|
||||
=======
|
||||
# Initialize/reuse Hugging Face client
|
||||
client = get_huggingface_client(api_key)
|
||||
>>>>>>> pr-419
|
||||
logger.info("✅ Hugging Face client initialized for text response")
|
||||
|
||||
# Prepare input for the API
|
||||
@@ -356,6 +405,7 @@ def huggingface_text_response(
|
||||
|
||||
logger.info("🚀 Making Hugging Face API call (chat completion)...")
|
||||
|
||||
<<<<<<< HEAD
|
||||
<<<<<<< HEAD
|
||||
# Add rate limiting to prevent expensive API calls
|
||||
import time
|
||||
@@ -380,6 +430,14 @@ def huggingface_text_response(
|
||||
fallback_models=fallback_models,
|
||||
allow_model_variant_fallback=allow_model_variant_fallback,
|
||||
):
|
||||
=======
|
||||
response = None
|
||||
last_error = None
|
||||
fallback_attempt = 0
|
||||
for candidate_model in _fallback_model_sequence(model):
|
||||
fallback_attempt += 1
|
||||
started_at = time.perf_counter()
|
||||
>>>>>>> pr-419
|
||||
try:
|
||||
response = client.chat.completions.create(
|
||||
model=candidate_model,
|
||||
@@ -388,11 +446,25 @@ def huggingface_text_response(
|
||||
top_p=top_p,
|
||||
max_tokens=max_tokens
|
||||
)
|
||||
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||
logger.debug(
|
||||
"HF text attempt={} model={} elapsed_ms={:.2f}",
|
||||
fallback_attempt,
|
||||
candidate_model,
|
||||
elapsed_ms,
|
||||
)
|
||||
if candidate_model != model:
|
||||
logger.warning("HF text generation switched to fallback model: {}", candidate_model)
|
||||
break
|
||||
except NotFoundError as nf_err:
|
||||
last_error = nf_err
|
||||
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||
logger.debug(
|
||||
"HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found",
|
||||
fallback_attempt,
|
||||
candidate_model,
|
||||
elapsed_ms,
|
||||
)
|
||||
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
|
||||
continue
|
||||
|
||||
@@ -438,6 +510,14 @@ def huggingface_text_response(
|
||||
>>>>>>> pr-416
|
||||
raise Exception(f"Hugging Face text generation failed: {str(e)}")
|
||||
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
@retry(
|
||||
wait=wait_random_exponential(min=0.5, max=8),
|
||||
stop=stop_after_attempt(3),
|
||||
reraise=True,
|
||||
)
|
||||
>>>>>>> pr-419
|
||||
def huggingface_structured_json_response(
|
||||
prompt: str,
|
||||
schema: Dict[str, Any],
|
||||
@@ -513,6 +593,7 @@ def huggingface_structured_json_response(
|
||||
if not api_key:
|
||||
raise Exception("HF_TOKEN not found in environment variables")
|
||||
|
||||
<<<<<<< HEAD
|
||||
# Initialize OpenAI client with Hugging Face base URL
|
||||
# Use standard Inference API endpoint
|
||||
<<<<<<< HEAD
|
||||
@@ -523,6 +604,10 @@ def huggingface_structured_json_response(
|
||||
=======
|
||||
client = _get_hf_client(api_key)
|
||||
>>>>>>> pr-416
|
||||
=======
|
||||
# Initialize/reuse OpenAI client with Hugging Face base URL
|
||||
client = get_huggingface_client(api_key)
|
||||
>>>>>>> pr-419
|
||||
logger.info("✅ Hugging Face client initialized for structured JSON response")
|
||||
|
||||
# Prepare input for the API
|
||||
@@ -566,6 +651,7 @@ def huggingface_structured_json_response(
|
||||
<<<<<<< HEAD
|
||||
response = None
|
||||
last_error = None
|
||||
<<<<<<< HEAD
|
||||
<<<<<<< HEAD
|
||||
for candidate_model in _fallback_model_sequence(model, fallback_models):
|
||||
=======
|
||||
@@ -575,6 +661,12 @@ def huggingface_structured_json_response(
|
||||
allow_model_variant_fallback=allow_model_variant_fallback,
|
||||
):
|
||||
>>>>>>> pr-418
|
||||
=======
|
||||
fallback_attempt = 0
|
||||
for candidate_model in _fallback_model_sequence(model):
|
||||
fallback_attempt += 1
|
||||
started_at = time.perf_counter()
|
||||
>>>>>>> pr-419
|
||||
try:
|
||||
response = client.chat.completions.create(
|
||||
model=candidate_model,
|
||||
@@ -583,11 +675,25 @@ def huggingface_structured_json_response(
|
||||
max_tokens=max_tokens,
|
||||
response_format={"type": "json_object"} # Try to enforce JSON mode if supported
|
||||
)
|
||||
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||
logger.debug(
|
||||
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object",
|
||||
fallback_attempt,
|
||||
candidate_model,
|
||||
elapsed_ms,
|
||||
)
|
||||
if candidate_model != model:
|
||||
logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
|
||||
break
|
||||
except NotFoundError as nf_err:
|
||||
last_error = nf_err
|
||||
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||
logger.debug(
|
||||
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object",
|
||||
fallback_attempt,
|
||||
candidate_model,
|
||||
elapsed_ms,
|
||||
)
|
||||
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
|
||||
continue
|
||||
|
||||
@@ -631,6 +737,7 @@ def huggingface_structured_json_response(
|
||||
logger.info("Retrying without response_format...")
|
||||
response = None
|
||||
last_error = None
|
||||
<<<<<<< HEAD
|
||||
<<<<<<< HEAD
|
||||
for candidate_model in _fallback_model_sequence(model, fallback_models):
|
||||
=======
|
||||
@@ -640,6 +747,12 @@ def huggingface_structured_json_response(
|
||||
allow_model_variant_fallback=allow_model_variant_fallback,
|
||||
):
|
||||
>>>>>>> pr-418
|
||||
=======
|
||||
fallback_attempt = 0
|
||||
for candidate_model in _fallback_model_sequence(model):
|
||||
fallback_attempt += 1
|
||||
started_at = time.perf_counter()
|
||||
>>>>>>> pr-419
|
||||
try:
|
||||
response = client.chat.completions.create(
|
||||
model=candidate_model,
|
||||
@@ -647,11 +760,25 @@ def huggingface_structured_json_response(
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens
|
||||
)
|
||||
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||
logger.debug(
|
||||
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none",
|
||||
fallback_attempt,
|
||||
candidate_model,
|
||||
elapsed_ms,
|
||||
)
|
||||
if candidate_model != model:
|
||||
logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
|
||||
break
|
||||
except NotFoundError as nf_err:
|
||||
last_error = nf_err
|
||||
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||
logger.debug(
|
||||
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none",
|
||||
fallback_attempt,
|
||||
candidate_model,
|
||||
elapsed_ms,
|
||||
)
|
||||
logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
|
||||
continue
|
||||
=======
|
||||
|
||||
Reference in New Issue
Block a user