Merge_PR_419_refine_hf_provider_retries_and_client_reuse
This commit is contained in:
@@ -46,15 +46,26 @@ Version: 1.0
|
|||||||
Last Updated: January 2025
|
Last Updated: January 2025
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
import os
|
import os
|
||||||
|
=======
|
||||||
|
import hashlib
|
||||||
|
>>>>>>> pr-419
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
|
<<<<<<< HEAD
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from typing import Optional, Dict, Any
|
from typing import Optional, Dict, Any
|
||||||
=======
|
=======
|
||||||
from typing import Optional, Dict, Any, List, Iterable
|
from typing import Optional, Dict, Any, List, Iterable
|
||||||
>>>>>>> pr-418
|
>>>>>>> pr-418
|
||||||
|
=======
|
||||||
|
import time
|
||||||
|
from threading import Lock
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
>>>>>>> pr-419
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from utils.logger_utils import get_service_logger
|
from utils.logger_utils import get_service_logger
|
||||||
@@ -95,6 +106,31 @@ HF_FALLBACK_MODELS = [
|
|||||||
>>>>>>> pr-417
|
>>>>>>> pr-417
|
||||||
]
|
]
|
||||||
|
|
||||||
|
_HF_CLIENT_CACHE: Dict[str, Any] = {}
|
||||||
|
_HF_CLIENT_CACHE_LOCK = Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _masked_key_id(api_key: str) -> str:
|
||||||
|
return hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:12]
|
||||||
|
|
||||||
|
|
||||||
|
def get_huggingface_client(api_key: str):
|
||||||
|
"""Get or create a cached Hugging Face/OpenAI-compatible client for the API key."""
|
||||||
|
key_id = _masked_key_id(api_key)
|
||||||
|
with _HF_CLIENT_CACHE_LOCK:
|
||||||
|
cached_client = _HF_CLIENT_CACHE.get(key_id)
|
||||||
|
if cached_client is not None:
|
||||||
|
logger.debug("Reusing cached Hugging Face client for key_id={}", key_id)
|
||||||
|
return cached_client
|
||||||
|
|
||||||
|
client = OpenAI(
|
||||||
|
base_url="https://router.huggingface.co/hf/v1",
|
||||||
|
api_key=api_key,
|
||||||
|
)
|
||||||
|
_HF_CLIENT_CACHE[key_id] = client
|
||||||
|
logger.debug("Created new Hugging Face client for key_id={}", key_id)
|
||||||
|
return client
|
||||||
|
|
||||||
|
|
||||||
def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True):
|
def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True):
|
||||||
"""Yield model ids to try for a single logical model preference."""
|
"""Yield model ids to try for a single logical model preference."""
|
||||||
@@ -235,6 +271,7 @@ def get_huggingface_api_key(explicit_api_key: Optional[str] = None) -> str:
|
|||||||
|
|
||||||
return api_key
|
return api_key
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
@retry(
|
@retry(
|
||||||
retry=retry_if_exception(_should_retry_hf_error),
|
retry=retry_if_exception(_should_retry_hf_error),
|
||||||
@@ -248,6 +285,13 @@ def _get_hf_client(api_key: str):
|
|||||||
|
|
||||||
|
|
||||||
>>>>>>> pr-416
|
>>>>>>> pr-416
|
||||||
|
=======
|
||||||
|
@retry(
|
||||||
|
wait=wait_random_exponential(min=0.5, max=8),
|
||||||
|
stop=stop_after_attempt(3),
|
||||||
|
reraise=True,
|
||||||
|
)
|
||||||
|
>>>>>>> pr-419
|
||||||
def huggingface_text_response(
|
def huggingface_text_response(
|
||||||
prompt: str,
|
prompt: str,
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
@@ -317,6 +361,7 @@ def huggingface_text_response(
|
|||||||
if not api_key:
|
if not api_key:
|
||||||
raise Exception("HF_TOKEN not found in environment variables")
|
raise Exception("HF_TOKEN not found in environment variables")
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
# Initialize Hugging Face client
|
# Initialize Hugging Face client
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
@@ -326,6 +371,10 @@ def huggingface_text_response(
|
|||||||
=======
|
=======
|
||||||
client = _get_hf_client(api_key)
|
client = _get_hf_client(api_key)
|
||||||
>>>>>>> pr-416
|
>>>>>>> pr-416
|
||||||
|
=======
|
||||||
|
# Initialize/reuse Hugging Face client
|
||||||
|
client = get_huggingface_client(api_key)
|
||||||
|
>>>>>>> pr-419
|
||||||
logger.info("✅ Hugging Face client initialized for text response")
|
logger.info("✅ Hugging Face client initialized for text response")
|
||||||
|
|
||||||
# Prepare input for the API
|
# Prepare input for the API
|
||||||
@@ -356,6 +405,7 @@ def huggingface_text_response(
|
|||||||
|
|
||||||
logger.info("🚀 Making Hugging Face API call (chat completion)...")
|
logger.info("🚀 Making Hugging Face API call (chat completion)...")
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
# Add rate limiting to prevent expensive API calls
|
# Add rate limiting to prevent expensive API calls
|
||||||
import time
|
import time
|
||||||
@@ -380,6 +430,14 @@ def huggingface_text_response(
|
|||||||
fallback_models=fallback_models,
|
fallback_models=fallback_models,
|
||||||
allow_model_variant_fallback=allow_model_variant_fallback,
|
allow_model_variant_fallback=allow_model_variant_fallback,
|
||||||
):
|
):
|
||||||
|
=======
|
||||||
|
response = None
|
||||||
|
last_error = None
|
||||||
|
fallback_attempt = 0
|
||||||
|
for candidate_model in _fallback_model_sequence(model):
|
||||||
|
fallback_attempt += 1
|
||||||
|
started_at = time.perf_counter()
|
||||||
|
>>>>>>> pr-419
|
||||||
try:
|
try:
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=candidate_model,
|
model=candidate_model,
|
||||||
@@ -388,11 +446,25 @@ def huggingface_text_response(
|
|||||||
top_p=top_p,
|
top_p=top_p,
|
||||||
max_tokens=max_tokens
|
max_tokens=max_tokens
|
||||||
)
|
)
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF text attempt={} model={} elapsed_ms={:.2f}",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
if candidate_model != model:
|
if candidate_model != model:
|
||||||
logger.warning("HF text generation switched to fallback model: {}", candidate_model)
|
logger.warning("HF text generation switched to fallback model: {}", candidate_model)
|
||||||
break
|
break
|
||||||
except NotFoundError as nf_err:
|
except NotFoundError as nf_err:
|
||||||
last_error = nf_err
|
last_error = nf_err
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
|
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -438,6 +510,14 @@ def huggingface_text_response(
|
|||||||
>>>>>>> pr-416
|
>>>>>>> pr-416
|
||||||
raise Exception(f"Hugging Face text generation failed: {str(e)}")
|
raise Exception(f"Hugging Face text generation failed: {str(e)}")
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
|
=======
|
||||||
|
@retry(
|
||||||
|
wait=wait_random_exponential(min=0.5, max=8),
|
||||||
|
stop=stop_after_attempt(3),
|
||||||
|
reraise=True,
|
||||||
|
)
|
||||||
|
>>>>>>> pr-419
|
||||||
def huggingface_structured_json_response(
|
def huggingface_structured_json_response(
|
||||||
prompt: str,
|
prompt: str,
|
||||||
schema: Dict[str, Any],
|
schema: Dict[str, Any],
|
||||||
@@ -513,6 +593,7 @@ def huggingface_structured_json_response(
|
|||||||
if not api_key:
|
if not api_key:
|
||||||
raise Exception("HF_TOKEN not found in environment variables")
|
raise Exception("HF_TOKEN not found in environment variables")
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
# Initialize OpenAI client with Hugging Face base URL
|
# Initialize OpenAI client with Hugging Face base URL
|
||||||
# Use standard Inference API endpoint
|
# Use standard Inference API endpoint
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
@@ -523,6 +604,10 @@ def huggingface_structured_json_response(
|
|||||||
=======
|
=======
|
||||||
client = _get_hf_client(api_key)
|
client = _get_hf_client(api_key)
|
||||||
>>>>>>> pr-416
|
>>>>>>> pr-416
|
||||||
|
=======
|
||||||
|
# Initialize/reuse OpenAI client with Hugging Face base URL
|
||||||
|
client = get_huggingface_client(api_key)
|
||||||
|
>>>>>>> pr-419
|
||||||
logger.info("✅ Hugging Face client initialized for structured JSON response")
|
logger.info("✅ Hugging Face client initialized for structured JSON response")
|
||||||
|
|
||||||
# Prepare input for the API
|
# Prepare input for the API
|
||||||
@@ -566,6 +651,7 @@ def huggingface_structured_json_response(
|
|||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
response = None
|
response = None
|
||||||
last_error = None
|
last_error = None
|
||||||
|
<<<<<<< HEAD
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
for candidate_model in _fallback_model_sequence(model, fallback_models):
|
for candidate_model in _fallback_model_sequence(model, fallback_models):
|
||||||
=======
|
=======
|
||||||
@@ -575,6 +661,12 @@ def huggingface_structured_json_response(
|
|||||||
allow_model_variant_fallback=allow_model_variant_fallback,
|
allow_model_variant_fallback=allow_model_variant_fallback,
|
||||||
):
|
):
|
||||||
>>>>>>> pr-418
|
>>>>>>> pr-418
|
||||||
|
=======
|
||||||
|
fallback_attempt = 0
|
||||||
|
for candidate_model in _fallback_model_sequence(model):
|
||||||
|
fallback_attempt += 1
|
||||||
|
started_at = time.perf_counter()
|
||||||
|
>>>>>>> pr-419
|
||||||
try:
|
try:
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=candidate_model,
|
model=candidate_model,
|
||||||
@@ -583,11 +675,25 @@ def huggingface_structured_json_response(
|
|||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
response_format={"type": "json_object"} # Try to enforce JSON mode if supported
|
response_format={"type": "json_object"} # Try to enforce JSON mode if supported
|
||||||
)
|
)
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
if candidate_model != model:
|
if candidate_model != model:
|
||||||
logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
|
logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
|
||||||
break
|
break
|
||||||
except NotFoundError as nf_err:
|
except NotFoundError as nf_err:
|
||||||
last_error = nf_err
|
last_error = nf_err
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
|
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -631,6 +737,7 @@ def huggingface_structured_json_response(
|
|||||||
logger.info("Retrying without response_format...")
|
logger.info("Retrying without response_format...")
|
||||||
response = None
|
response = None
|
||||||
last_error = None
|
last_error = None
|
||||||
|
<<<<<<< HEAD
|
||||||
<<<<<<< HEAD
|
<<<<<<< HEAD
|
||||||
for candidate_model in _fallback_model_sequence(model, fallback_models):
|
for candidate_model in _fallback_model_sequence(model, fallback_models):
|
||||||
=======
|
=======
|
||||||
@@ -640,6 +747,12 @@ def huggingface_structured_json_response(
|
|||||||
allow_model_variant_fallback=allow_model_variant_fallback,
|
allow_model_variant_fallback=allow_model_variant_fallback,
|
||||||
):
|
):
|
||||||
>>>>>>> pr-418
|
>>>>>>> pr-418
|
||||||
|
=======
|
||||||
|
fallback_attempt = 0
|
||||||
|
for candidate_model in _fallback_model_sequence(model):
|
||||||
|
fallback_attempt += 1
|
||||||
|
started_at = time.perf_counter()
|
||||||
|
>>>>>>> pr-419
|
||||||
try:
|
try:
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=candidate_model,
|
model=candidate_model,
|
||||||
@@ -647,11 +760,25 @@ def huggingface_structured_json_response(
|
|||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens
|
max_tokens=max_tokens
|
||||||
)
|
)
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
if candidate_model != model:
|
if candidate_model != model:
|
||||||
logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
|
logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
|
||||||
break
|
break
|
||||||
except NotFoundError as nf_err:
|
except NotFoundError as nf_err:
|
||||||
last_error = nf_err
|
last_error = nf_err
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
|
logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
|
||||||
continue
|
continue
|
||||||
=======
|
=======
|
||||||
|
|||||||
Reference in New Issue
Block a user