Merge_PR_418_refine_hf_fallback_policy_and_sif_low_cost_routing

This commit is contained in:
ajaysi
2026-03-12 16:19:19 +05:30
3 changed files with 126 additions and 6 deletions

View File

@@ -137,6 +137,10 @@ from .routing_policy import (
)
>>>>>>> pr-417
# Fallback chain used for the Hugging Face provider on the premium/default
# path; deliberately kept to a single entry to avoid excessive model hopping.
PREMIUM_HF_MINIMAL_FALLBACK_MODELS = ["openai/gpt-oss-120b:groq"]
def llm_text_gen(
prompt: str,
@@ -403,6 +407,7 @@ def llm_text_gen(
provider_sequence = _resolve_provider_sequence(preferred_provider, env_provider_raw, available_providers)
>>>>>>> pr-416
<<<<<<< HEAD
<<<<<<< HEAD
if not provider_sequence:
logger.error("[llm_text_gen] No configured providers available for tenant.")
@@ -433,6 +438,24 @@ def llm_text_gen(
<<<<<<< HEAD
logger.info(f"[llm_text_gen][{flow_tag}] Using provider={gpt_provider}, model={model}")
=======
=======
hf_fallback_models: Optional[List[str]] = None
hf_allow_model_variant_fallback = True
if gpt_provider == "huggingface":
if preferred_hf_models is not None:
if preferred_hf_models:
model = preferred_hf_models[0]
hf_fallback_models = preferred_hf_models[1:]
logger.info(f"[llm_text_gen] Using caller-provided HF policy starting model: {model}")
else:
# Explicit empty policy: only requested model (plus optional variant handling).
hf_fallback_models = []
logger.info("[llm_text_gen] Using caller-provided HF policy with no fallback models")
else:
# Premium/default path: minimal safe fallback chain to avoid excessive model hopping.
hf_fallback_models = PREMIUM_HF_MINIMAL_FALLBACK_MODELS
>>>>>>> pr-418
logger.debug(f"[llm_text_gen] Using provider: {gpt_provider}, model: {model}")
emit_routing_event(
logger,
@@ -644,7 +667,9 @@ def llm_text_gen(
fallback_models=hf_fallback_models,
temperature=temperature,
max_tokens=max_tokens,
system_prompt=system_instructions
system_prompt=system_instructions,
fallback_models=hf_fallback_models,
allow_model_variant_fallback=hf_allow_model_variant_fallback,
)
else:
response_text = huggingface_text_response(
@@ -676,7 +701,9 @@ def llm_text_gen(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
system_prompt=system_instructions
system_prompt=system_instructions,
fallback_models=hf_fallback_models,
allow_model_variant_fallback=hf_allow_model_variant_fallback,
)
else:
logger.error(f"[llm_text_gen] Unknown provider: {gpt_provider}")
@@ -809,7 +836,12 @@ def llm_text_gen(
temperature=temperature,
max_tokens=max_tokens,
system_prompt=system_instructions,
<<<<<<< HEAD
api_key=hf_api_key_current,
=======
fallback_models=PREMIUM_HF_MINIMAL_FALLBACK_MODELS,
allow_model_variant_fallback=True,
>>>>>>> pr-418
)
else:
response_text = huggingface_text_response(
@@ -824,7 +856,9 @@ def llm_text_gen(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
system_prompt=system_instructions
system_prompt=system_instructions,
fallback_models=PREMIUM_HF_MINIMAL_FALLBACK_MODELS,
allow_model_variant_fallback=True,
)
elif fallback_provider == "wavespeed":
from .wavespeed_provider import wavespeed_text_response, wavespeed_structured_json_response