Refine Hugging Face provider retries and client reuse
This commit is contained in:
@@ -46,28 +46,14 @@ Version: 1.0
|
|||||||
Last Updated: January 2025
|
Last Updated: January 2025
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import hashlib
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
from threading import Lock
|
||||||
from typing import Optional, Dict, Any
|
from typing import Optional, Dict, Any
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
|
|
||||||
# Fix the environment loading path - load from backend directory
|
|
||||||
current_dir = Path(__file__).parent.parent # services directory
|
|
||||||
backend_dir = current_dir.parent # backend directory
|
|
||||||
env_path = backend_dir / '.env'
|
|
||||||
|
|
||||||
if env_path.exists():
|
|
||||||
load_dotenv(env_path)
|
|
||||||
print(f"Loaded .env from: {env_path}")
|
|
||||||
else:
|
|
||||||
# Fallback to current directory
|
|
||||||
load_dotenv()
|
|
||||||
print(f"No .env found at {env_path}, using current directory")
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from utils.logger_utils import get_service_logger
|
from utils.logger_utils import get_service_logger
|
||||||
|
|
||||||
@@ -96,6 +82,31 @@ HF_FALLBACK_MODELS = [
|
|||||||
"mistralai/Mistral-7B-Instruct-v0.3:groq",
|
"mistralai/Mistral-7B-Instruct-v0.3:groq",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Process-wide cache of Hugging Face/OpenAI-compatible clients, keyed by an
# identifier derived from the SHA-256 digest of the API key, so repeated calls
# with the same key can reuse one client instead of constructing a new one.
_HF_CLIENT_CACHE: Dict[str, Any] = {}
# Guards reads and writes of _HF_CLIENT_CACHE.
_HF_CLIENT_CACHE_LOCK = Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _masked_key_id(api_key: str) -> str:
    """Return a short identifier for ``api_key`` that does not reveal the key.

    The identifier is the first 12 hexadecimal characters of the SHA-256
    digest of the UTF-8 encoded key, so the same key always maps to the same
    id and the key can be referenced (e.g. in debug logs) without exposing
    the secret itself.

    Args:
        api_key: The raw API key/token.

    Returns:
        A 12-character lowercase hexadecimal string.
    """
    return hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:12]
|
||||||
|
|
||||||
|
|
||||||
|
def get_huggingface_client(api_key: str):
    """Get or create a cached Hugging Face/OpenAI-compatible client for the API key.

    Clients are stored in the module-level ``_HF_CLIENT_CACHE`` and looked up
    under ``_HF_CLIENT_CACHE_LOCK``. The lookup, construction and insertion
    all happen while the lock is held, so two concurrent callers with the
    same key do not both construct a client.

    The cache is keyed by the full SHA-256 digest of the key rather than the
    12-character masked id: the short id is only meant for log messages, and
    using it as the cache identity would let two different keys whose digests
    share a prefix receive the same authenticated client.

    Args:
        api_key: Hugging Face token used to authenticate the client.

    Returns:
        An ``OpenAI`` client configured with the Hugging Face router base URL.
    """
    # Full digest = cache identity; masked id = what appears in logs.
    cache_key = hashlib.sha256(api_key.encode("utf-8")).hexdigest()
    key_id = _masked_key_id(api_key)

    with _HF_CLIENT_CACHE_LOCK:
        cached_client = _HF_CLIENT_CACHE.get(cache_key)
        if cached_client is not None:
            logger.debug("Reusing cached Hugging Face client for key_id={}", key_id)
            return cached_client

        client = OpenAI(
            base_url="https://router.huggingface.co/hf/v1",
            api_key=api_key,
        )
        _HF_CLIENT_CACHE[cache_key] = client
        logger.debug("Created new Hugging Face client for key_id={}", key_id)
        return client
|
||||||
|
|
||||||
|
|
||||||
def _candidate_model_variants(model: str):
|
def _candidate_model_variants(model: str):
|
||||||
"""Yield model ids to try for a single logical model preference."""
|
"""Yield model ids to try for a single logical model preference."""
|
||||||
@@ -137,7 +148,11 @@ def get_huggingface_api_key() -> str:
|
|||||||
|
|
||||||
return api_key
|
return api_key
|
||||||
|
|
||||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
|
@retry(
|
||||||
|
wait=wait_random_exponential(min=0.5, max=8),
|
||||||
|
stop=stop_after_attempt(3),
|
||||||
|
reraise=True,
|
||||||
|
)
|
||||||
def huggingface_text_response(
|
def huggingface_text_response(
|
||||||
prompt: str,
|
prompt: str,
|
||||||
model: str = "openai/gpt-oss-120b:groq",
|
model: str = "openai/gpt-oss-120b:groq",
|
||||||
@@ -192,11 +207,8 @@ def huggingface_text_response(
|
|||||||
if not api_key:
|
if not api_key:
|
||||||
raise Exception("HF_TOKEN not found in environment variables")
|
raise Exception("HF_TOKEN not found in environment variables")
|
||||||
|
|
||||||
# Initialize Hugging Face client
|
# Initialize/reuse Hugging Face client
|
||||||
client = OpenAI(
|
client = get_huggingface_client(api_key)
|
||||||
base_url=f"https://router.huggingface.co/hf/v1",
|
|
||||||
api_key=api_key,
|
|
||||||
)
|
|
||||||
logger.info("✅ Hugging Face client initialized for text response")
|
logger.info("✅ Hugging Face client initialized for text response")
|
||||||
|
|
||||||
# Prepare input for the API
|
# Prepare input for the API
|
||||||
@@ -227,13 +239,12 @@ def huggingface_text_response(
|
|||||||
|
|
||||||
logger.info("🚀 Making Hugging Face API call (chat completion)...")
|
logger.info("🚀 Making Hugging Face API call (chat completion)...")
|
||||||
|
|
||||||
# Add rate limiting to prevent expensive API calls
|
|
||||||
import time
|
|
||||||
time.sleep(1) # 1 second delay between API calls
|
|
||||||
|
|
||||||
response = None
|
response = None
|
||||||
last_error = None
|
last_error = None
|
||||||
|
fallback_attempt = 0
|
||||||
for candidate_model in _fallback_model_sequence(model):
|
for candidate_model in _fallback_model_sequence(model):
|
||||||
|
fallback_attempt += 1
|
||||||
|
started_at = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=candidate_model,
|
model=candidate_model,
|
||||||
@@ -242,11 +253,25 @@ def huggingface_text_response(
|
|||||||
top_p=top_p,
|
top_p=top_p,
|
||||||
max_tokens=max_tokens
|
max_tokens=max_tokens
|
||||||
)
|
)
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF text attempt={} model={} elapsed_ms={:.2f}",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
if candidate_model != model:
|
if candidate_model != model:
|
||||||
logger.warning("HF text generation switched to fallback model: {}", candidate_model)
|
logger.warning("HF text generation switched to fallback model: {}", candidate_model)
|
||||||
break
|
break
|
||||||
except NotFoundError as nf_err:
|
except NotFoundError as nf_err:
|
||||||
last_error = nf_err
|
last_error = nf_err
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF text attempt={} model={} elapsed_ms={:.2f} status=model_not_found",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
|
logger.warning("HF model not found: {}. Trying fallback model.", candidate_model)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -270,7 +295,11 @@ def huggingface_text_response(
|
|||||||
logger.error(f"❌ Hugging Face text generation failed: {str(e)}")
|
logger.error(f"❌ Hugging Face text generation failed: {str(e)}")
|
||||||
raise Exception(f"Hugging Face text generation failed: {str(e)}")
|
raise Exception(f"Hugging Face text generation failed: {str(e)}")
|
||||||
|
|
||||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
|
@retry(
|
||||||
|
wait=wait_random_exponential(min=0.5, max=8),
|
||||||
|
stop=stop_after_attempt(3),
|
||||||
|
reraise=True,
|
||||||
|
)
|
||||||
def huggingface_structured_json_response(
|
def huggingface_structured_json_response(
|
||||||
prompt: str,
|
prompt: str,
|
||||||
schema: Dict[str, Any],
|
schema: Dict[str, Any],
|
||||||
@@ -335,12 +364,8 @@ def huggingface_structured_json_response(
|
|||||||
if not api_key:
|
if not api_key:
|
||||||
raise Exception("HF_TOKEN not found in environment variables")
|
raise Exception("HF_TOKEN not found in environment variables")
|
||||||
|
|
||||||
# Initialize OpenAI client with Hugging Face base URL
|
# Initialize/reuse OpenAI client with Hugging Face base URL
|
||||||
# Use standard Inference API endpoint
|
client = get_huggingface_client(api_key)
|
||||||
client = OpenAI(
|
|
||||||
base_url=f"https://router.huggingface.co/hf/v1",
|
|
||||||
api_key=api_key,
|
|
||||||
)
|
|
||||||
logger.info("✅ Hugging Face client initialized for structured JSON response")
|
logger.info("✅ Hugging Face client initialized for structured JSON response")
|
||||||
|
|
||||||
# Prepare input for the API
|
# Prepare input for the API
|
||||||
@@ -380,14 +405,13 @@ def huggingface_structured_json_response(
|
|||||||
json_schema_str = json.dumps(schema, indent=2)
|
json_schema_str = json.dumps(schema, indent=2)
|
||||||
messages[-1]["content"] += f"\n\nJSON Schema:\n{json_schema_str}"
|
messages[-1]["content"] += f"\n\nJSON Schema:\n{json_schema_str}"
|
||||||
|
|
||||||
# Add rate limiting to prevent expensive API calls
|
|
||||||
import time
|
|
||||||
time.sleep(1) # 1 second delay between API calls
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = None
|
response = None
|
||||||
last_error = None
|
last_error = None
|
||||||
|
fallback_attempt = 0
|
||||||
for candidate_model in _fallback_model_sequence(model):
|
for candidate_model in _fallback_model_sequence(model):
|
||||||
|
fallback_attempt += 1
|
||||||
|
started_at = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=candidate_model,
|
model=candidate_model,
|
||||||
@@ -396,11 +420,25 @@ def huggingface_structured_json_response(
|
|||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
response_format={"type": "json_object"} # Try to enforce JSON mode if supported
|
response_format={"type": "json_object"} # Try to enforce JSON mode if supported
|
||||||
)
|
)
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=json_object",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
if candidate_model != model:
|
if candidate_model != model:
|
||||||
logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
|
logger.warning("HF structured generation switched to fallback model: {}", candidate_model)
|
||||||
break
|
break
|
||||||
except NotFoundError as nf_err:
|
except NotFoundError as nf_err:
|
||||||
last_error = nf_err
|
last_error = nf_err
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=json_object",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
|
logger.warning("HF structured model not found: {}. Trying fallback model.", candidate_model)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -444,7 +482,10 @@ def huggingface_structured_json_response(
|
|||||||
logger.info("Retrying without response_format...")
|
logger.info("Retrying without response_format...")
|
||||||
response = None
|
response = None
|
||||||
last_error = None
|
last_error = None
|
||||||
|
fallback_attempt = 0
|
||||||
for candidate_model in _fallback_model_sequence(model):
|
for candidate_model in _fallback_model_sequence(model):
|
||||||
|
fallback_attempt += 1
|
||||||
|
started_at = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=candidate_model,
|
model=candidate_model,
|
||||||
@@ -452,11 +493,25 @@ def huggingface_structured_json_response(
|
|||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens
|
max_tokens=max_tokens
|
||||||
)
|
)
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} response_format=none",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
if candidate_model != model:
|
if candidate_model != model:
|
||||||
logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
|
logger.warning("HF structured no-response_format fallback model: {}", candidate_model)
|
||||||
break
|
break
|
||||||
except NotFoundError as nf_err:
|
except NotFoundError as nf_err:
|
||||||
last_error = nf_err
|
last_error = nf_err
|
||||||
|
elapsed_ms = (time.perf_counter() - started_at) * 1000
|
||||||
|
logger.debug(
|
||||||
|
"HF structured attempt={} model={} elapsed_ms={:.2f} status=model_not_found response_format=none",
|
||||||
|
fallback_attempt,
|
||||||
|
candidate_model,
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
|
logger.warning("HF structured model not found (no response_format path): {}", candidate_model)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user