diff --git a/backend/services/intelligence/sif_agents.py b/backend/services/intelligence/sif_agents.py index 5b13c843..5fd2f22e 100644 --- a/backend/services/intelligence/sif_agents.py +++ b/backend/services/intelligence/sif_agents.py @@ -38,8 +38,12 @@ class SharedLLMWrapper: return llm_text_gen( prompt, user_id=self.user_id, +<<<<<<< HEAD preferred_hf_models=LOW_COST_SHARED_REMOTE_MODELS, flow_type="sif_agent", +======= + preferred_hf_models=REMOTE_LOW_COST_HF_MODELS, +>>>>>>> pr-418 ) except Exception as e: logger.error(f"SharedLLMWrapper failed to generate text: {e}") @@ -50,7 +54,12 @@ class SharedLLMWrapper: _local_llm_cache = {} +<<<<<<< HEAD LOW_COST_SHARED_REMOTE_MODELS = [ +======= + +REMOTE_LOW_COST_HF_MODELS = [ +>>>>>>> pr-418 "Qwen/Qwen2.5-1.5B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct", "TinyLlama/TinyLlama-1.1B-Chat-v1.0", diff --git a/backend/services/llm_providers/huggingface_provider.py b/backend/services/llm_providers/huggingface_provider.py index 7366c88c..c3573f47 100644 --- a/backend/services/llm_providers/huggingface_provider.py +++ b/backend/services/llm_providers/huggingface_provider.py @@ -49,8 +49,12 @@ Last Updated: January 2025 import os import json import re +<<<<<<< HEAD from functools import lru_cache from typing import Optional, Dict, Any +======= +from typing import Optional, Dict, Any, List, Iterable +>>>>>>> pr-418 from loguru import logger from utils.logger_utils import get_service_logger @@ -92,7 +96,7 @@ HF_FALLBACK_MODELS = [ ] -def _candidate_model_variants(model: str): +def _candidate_model_variants(model: str, allow_model_variant_fallback: bool = True): """Yield model ids to try for a single logical model preference.""" if not model: return @@ -101,12 +105,13 @@ def _candidate_model_variants(model: str): yield model # Fallback to base repo id when provider suffix is not recognized by the router - if ":" in model: + if allow_model_variant_fallback and ":" in model: base_model = model.split(":", 1)[0] if base_model: yield base_model +<<<<<<< HEAD def _fallback_model_sequence(model: str, fallback_models: Optional[List[str]] = None): # IMPORTANT: Do not apply implicit global fallback chains. # Callers must explicitly provide fallback_models when they want multi-model retries. @@ -114,9 +119,27 @@ def _fallback_model_sequence(model: str, fallback_models: Optional[List[str]] = sequence = [model] + fallback_models else: sequence = [model] +======= +def _fallback_model_sequence( + model: str, + fallback_models: Optional[List[str]] = None, + allow_model_variant_fallback: bool = True, +): + sequence: Iterable[str] + if fallback_models is None: + # Safe default only when caller doesn't provide explicit policy. + sequence = [model] + HF_FALLBACK_MODELS + else: + # Caller owns fallback policy fully. Empty list means only requested model. + sequence = [model] + list(fallback_models) + +>>>>>>> pr-418 seen = set() for preferred_model in sequence: - for candidate in _candidate_model_variants(preferred_model): + for candidate in _candidate_model_variants( + preferred_model, + allow_model_variant_fallback=allow_model_variant_fallback, + ): if candidate and candidate not in seen: seen.add(candidate) yield candidate @@ -237,7 +260,12 @@ def huggingface_text_response( max_tokens: int = 2048, top_p: float = 0.9, system_prompt: Optional[str] = None, +<<<<<<< HEAD api_key: Optional[str] = None, +======= + fallback_models: Optional[List[str]] = None, + allow_model_variant_fallback: bool = True, +>>>>>>> pr-418 ) -> str: """ Generate text response using Hugging Face Inference Providers API. @@ -333,6 +361,7 @@ def huggingface_text_response( import time time.sleep(1) # 1 second delay between API calls +<<<<<<< HEAD # Call exactly the requested model; no retries, no fallbacks, no variants ======= >>>>>>> pr-416 @@ -343,6 +372,33 @@ def huggingface_text_response( top_p=top_p, max_tokens=max_tokens ) +======= + response = None + last_error = None + for candidate_model in _fallback_model_sequence( + model=model, + fallback_models=fallback_models, + allow_model_variant_fallback=allow_model_variant_fallback, + ): + try: + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens + ) + if candidate_model != model: + logger.warning("HF text generation switched to fallback model: {}", candidate_model) + break + except NotFoundError as nf_err: + last_error = nf_err + logger.warning("HF model not found: {}. Trying fallback model.", candidate_model) + continue + + if response is None: + raise last_error or Exception("Hugging Face text generation failed: all fallback models failed") +>>>>>>> pr-418 # Extract text from response generated_text = response.choices[0].message.content @@ -394,7 +450,12 @@ def huggingface_structured_json_response( temperature: float = 0.7, max_tokens: int = 8192, system_prompt: Optional[str] = None, +<<<<<<< HEAD api_key: Optional[str] = None, +======= + fallback_models: Optional[List[str]] = None, + allow_model_variant_fallback: bool = True, +>>>>>>> pr-418 ) -> Dict[str, Any]: """ Generate structured JSON response using Hugging Face Inference Providers API. @@ -505,7 +566,15 @@ def huggingface_structured_json_response( <<<<<<< HEAD response = None last_error = None +<<<<<<< HEAD for candidate_model in _fallback_model_sequence(model, fallback_models): +======= + for candidate_model in _fallback_model_sequence( + model=model, + fallback_models=fallback_models, + allow_model_variant_fallback=allow_model_variant_fallback, + ): +>>>>>>> pr-418 try: response = client.chat.completions.create( model=candidate_model, @@ -562,7 +631,15 @@ def huggingface_structured_json_response( logger.info("Retrying without response_format...") response = None last_error = None +<<<<<<< HEAD for candidate_model in _fallback_model_sequence(model, fallback_models): +======= + for candidate_model in _fallback_model_sequence( + model=model, + fallback_models=fallback_models, + allow_model_variant_fallback=allow_model_variant_fallback, + ): +>>>>>>> pr-418 try: response = client.chat.completions.create( model=candidate_model, diff --git a/backend/services/llm_providers/main_text_generation.py b/backend/services/llm_providers/main_text_generation.py index ec98c991..e8e48369 100644 --- a/backend/services/llm_providers/main_text_generation.py +++ b/backend/services/llm_providers/main_text_generation.py @@ -137,6 +137,10 @@ from .routing_policy import ( ) >>>>>>> pr-417 +PREMIUM_HF_MINIMAL_FALLBACK_MODELS = [ + "openai/gpt-oss-120b:groq", +] + def llm_text_gen( prompt: str, @@ -403,6 +407,7 @@ def llm_text_gen( provider_sequence = _resolve_provider_sequence(preferred_provider, env_provider_raw, available_providers) >>>>>>> pr-416 +<<<<<<< HEAD <<<<<<< HEAD if not provider_sequence: logger.error("[llm_text_gen] No configured providers available for tenant.") @@ -433,6 +438,24 @@ def llm_text_gen( <<<<<<< HEAD logger.info(f"[llm_text_gen][{flow_tag}] Using provider={gpt_provider}, model={model}") ======= +======= + hf_fallback_models: Optional[List[str]] = None + hf_allow_model_variant_fallback = True + if gpt_provider == "huggingface": + if preferred_hf_models is not None: + if preferred_hf_models: + model = preferred_hf_models[0] + hf_fallback_models = preferred_hf_models[1:] + logger.info(f"[llm_text_gen] Using caller-provided HF policy starting model: {model}") + else: + # Explicit empty policy: only requested model (plus optional variant handling). + hf_fallback_models = [] + logger.info("[llm_text_gen] Using caller-provided HF policy with no fallback models") + else: + # Premium/default path: minimal safe fallback chain to avoid excessive model hopping. + hf_fallback_models = PREMIUM_HF_MINIMAL_FALLBACK_MODELS + +>>>>>>> pr-418 logger.debug(f"[llm_text_gen] Using provider: {gpt_provider}, model: {model}") emit_routing_event( logger, @@ -644,7 +667,9 @@ def llm_text_gen( fallback_models=hf_fallback_models, temperature=temperature, max_tokens=max_tokens, - system_prompt=system_instructions + system_prompt=system_instructions, + fallback_models=hf_fallback_models, + allow_model_variant_fallback=hf_allow_model_variant_fallback, ) else: response_text = huggingface_text_response( @@ -676,7 +701,9 @@ def llm_text_gen( temperature=temperature, max_tokens=max_tokens, top_p=top_p, - system_prompt=system_instructions + system_prompt=system_instructions, + fallback_models=hf_fallback_models, + allow_model_variant_fallback=hf_allow_model_variant_fallback, ) else: logger.error(f"[llm_text_gen] Unknown provider: {gpt_provider}") @@ -809,7 +836,12 @@ def llm_text_gen( temperature=temperature, max_tokens=max_tokens, system_prompt=system_instructions, +<<<<<<< HEAD api_key=hf_api_key_current, +======= + fallback_models=PREMIUM_HF_MINIMAL_FALLBACK_MODELS, + allow_model_variant_fallback=True, +>>>>>>> pr-418 ) else: response_text = huggingface_text_response( @@ -824,7 +856,9 @@ def llm_text_gen( temperature=temperature, max_tokens=max_tokens, top_p=top_p, - system_prompt=system_instructions + system_prompt=system_instructions, + fallback_models=PREMIUM_HF_MINIMAL_FALLBACK_MODELS, + allow_model_variant_fallback=True, ) elif fallback_provider == "wavespeed": from .wavespeed_provider import wavespeed_text_response, wavespeed_structured_json_response