diff --git a/backend/services/intelligence/agents/agent_orchestrator.py b/backend/services/intelligence/agents/agent_orchestrator.py index 6f0deff2..3f09948e 100644 --- a/backend/services/intelligence/agents/agent_orchestrator.py +++ b/backend/services/intelligence/agents/agent_orchestrator.py @@ -47,7 +47,7 @@ logger = get_service_logger(__name__) class AgentTeamConfiguration: """Configuration for the complete agent team""" user_id: str - shared_llm: str = "Qwen/Qwen2.5-3B-Instruct" # Updated to a stable model known for text-generation + shared_llm: str = "Qwen/Qwen2.5-1.5B-Instruct" # Reduced default memory footprint for local environments max_iterations: int = 15 enable_safety: bool = True enable_performance_monitoring: bool = True diff --git a/backend/services/intelligence/agents/core_agent_framework.py b/backend/services/intelligence/agents/core_agent_framework.py index 683de459..fe8b68e6 100644 --- a/backend/services/intelligence/agents/core_agent_framework.py +++ b/backend/services/intelligence/agents/core_agent_framework.py @@ -40,10 +40,17 @@ from services.intelligence.monitoring.semantic_dashboard import RealTimeSemantic from services.intelligence.agents.safety_framework import get_safety_framework from services.agent_activity_service import AgentActivityService, build_agent_event_payload from services.intelligence.agents.agent_usage_tracking import track_agent_usage_sync +from services.llm_providers.main_text_generation import llm_text_gen import time logger = get_service_logger(__name__) +LOW_COST_REMOTE_MODELS = [ + "Qwen/Qwen2.5-1.5B-Instruct", + "Qwen/Qwen2.5-0.5B-Instruct", + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", +] + class TrackingLLMWrapper: """ Wrapper for LLM instances to transparently track usage. @@ -169,7 +176,7 @@ class BaseALwrityAgent(ABC): _prompt_context_cache: Dict[str, Dict[str, Any]] = {} _profile_cache: Dict[str, Dict[str, Any]] = {} - def __init__(self, user_id: str, agent_type: str, model_name: str = "Qwen/Qwen3-4B-Instruct-2507", llm: Any = None, enable_tracing: bool = True, **kwargs): + def __init__(self, user_id: str, agent_type: str, model_name: str = "Qwen/Qwen2.5-1.5B-Instruct", llm: Any = None, enable_tracing: bool = True, **kwargs): self.user_id = user_id self.agent_type = agent_type self.model_name = model_name @@ -295,7 +302,8 @@ class BaseALwrityAgent(ABC): Centralized method for all agents inheriting from BaseALwrityAgent. """ if not self.llm: - return "[LLM Unavailable]" + logger.error("LLM unavailable for agent %s (%s)", self.agent_type, self.agent_id) + raise RuntimeError(f"LLM unavailable for agent {self.agent_type}") try: # Run in executor to avoid blocking if LLM is synchronous @@ -319,7 +327,37 @@ class BaseALwrityAgent(ABC): except Exception as e: logger.error(f"LLM generation failed in agent {self.agent_type}: {e}") - return "[Generation Failed]" + logger.warning( + "Attempting remote low-cost fallback via llm_text_gen for agent %s (user=%s)", + self.agent_type, + self.user_id, + ) + try: + loop = asyncio.get_event_loop() + fallback_response = await loop.run_in_executor( + None, + lambda: llm_text_gen( + prompt=prompt, + user_id=self.user_id, + preferred_hf_models=LOW_COST_REMOTE_MODELS, + ), + ) + logger.warning( + "Remote low-cost fallback succeeded for agent %s (user=%s)", + self.agent_type, + self.user_id, + ) + return fallback_response + except Exception as remote_e: + logger.error( + "Remote fallback failed for agent %s (user=%s): %s", + self.agent_type, + self.user_id, + remote_e, + ) + raise RuntimeError( + f"Local and remote LLM generation failed for agent {self.agent_type}: {remote_e}" + ) from remote_e def _resolve_agent_key(self, agent_type: str) -> str: value = str(agent_type or "").strip() @@ -524,7 +562,7 @@ class BaseALwrityAgent(ABC): result = await loop.run_in_executor(None, self.txtai_agent, prompt) if not self.txtai_agent: - result = "Agent not initialized" + raise RuntimeError(f"Agent {self.agent_id} not initialized (txtai_agent missing)") if activity and run_record: activity.log_event( @@ -848,19 +886,15 @@ class BaseALwrityAgent(ABC): raise e async def _execute_fallback(self, action: AgentAction) -> str: - """Execute fallback action when txtai is not available""" - # Simulate agent processing for development - logger.info(f"Executing fallback action: {action.action_type}") - - # Return simulated result based on action type - if action.action_type == "analyze_competitor": - return "Competitor analysis completed (fallback mode)" - elif action.action_type == "optimize_content": - return "Content optimization completed (fallback mode)" - elif action.action_type == "fix_seo_issue": - return "SEO issue fixed (fallback mode)" - else: - return f"Action {action.action_type} completed (fallback mode)" + """Fail-fast instead of returning mock fallback output.""" + logger.error( + "Fallback execution requested for action '%s' on agent %s. Failing fast to avoid mock output.", + action.action_type, + self.agent_id, + ) + raise RuntimeError( + f"Fallback execution is disabled for SIF reliability. Agent={self.agent_id}, action={action.action_type}" + ) def _prepare_agent_prompt(self, action: AgentAction) -> str: """Prepare prompt for txtai agent""" diff --git a/backend/services/intelligence/agents/specialized/base.py b/backend/services/intelligence/agents/specialized/base.py index a3e75055..163d9612 100644 --- a/backend/services/intelligence/agents/specialized/base.py +++ b/backend/services/intelligence/agents/specialized/base.py @@ -29,7 +29,7 @@ except ImportError: logger.warning("txtai not available, using fallback implementation") class SIFBaseAgent(BaseALwrityAgent): - def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-3B-Instruct", llm: Any = None, **kwargs): + def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-1.5B-Instruct", llm: Any = None, **kwargs): # Hybrid LLM Strategy: # 1. Shared LLM for external/high-quality generation self.shared_llm = SharedLLMWrapper(user_id) diff --git a/backend/services/intelligence/sif_agents.py b/backend/services/intelligence/sif_agents.py index 5cb04da1..52464915 100644 --- a/backend/services/intelligence/sif_agents.py +++ b/backend/services/intelligence/sif_agents.py @@ -44,6 +44,12 @@ class SharedLLMWrapper: _local_llm_cache = {} +LOCAL_LLM_FALLBACKS = [ + "Qwen/Qwen2.5-1.5B-Instruct", + "Qwen/Qwen2.5-0.5B-Instruct", + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", +] + class LocalLLMWrapper: """ Lazily loads a local LLM via txtai and caches it globally. @@ -72,22 +78,56 @@ class LocalLLMWrapper: if task_to_use == "text-generation": task_to_use = "language-generation" - logger.info(f"Loading local LLM (singleton): {self.model_path} (task={task_to_use})") - try: - _local_llm_cache[cache_key] = LLM(path=self.model_path, task=task_to_use) - except Exception as e: + candidate_models = [] + for candidate in [self.model_path, *LOCAL_LLM_FALLBACKS]: + if candidate not in candidate_models: + candidate_models.append(candidate) + + last_error = None + for candidate_model in candidate_models: + candidate_key = f"{candidate_model}:{self.task}" + if candidate_key in _local_llm_cache: + if candidate_model != self.model_path: + logger.warning(f"Using cached fallback local LLM model: {candidate_model}") + return _local_llm_cache[candidate_key] + + logger.info(f"Loading local LLM (singleton): {candidate_model} (task={task_to_use})") try: - import transformers - from transformers.pipelines import SUPPORTED_TASKS - logger.error( - f"LocalLLMWrapper init failed (model={self.model_path}, requested_task={task_to_use}, " - f"transformers={getattr(transformers, '__version__', 'unknown')}, " - f"supported_tasks={sorted(list(SUPPORTED_TASKS.keys()))[:50]})" + _local_llm_cache[candidate_key] = LLM(path=candidate_model, task=task_to_use) + if candidate_model != self.model_path: + logger.warning( + f"Loaded fallback local LLM model '{candidate_model}' after failure on '{self.model_path}'" + ) + return _local_llm_cache[candidate_key] + except Exception as e: + last_error = e + message = str(e).lower() + is_memory_issue = ( + "paging file is too small" in message + or "os error 1455" in message + or "out of memory" in message + or "not enough memory" in message ) - except Exception: - pass - logger.error(f"Failed to initialize LocalLLMWrapper: {e}") - raise e + if is_memory_issue: + logger.warning( + f"Local LLM memory load failure for '{candidate_model}', trying smaller fallback. Error: {e}" + ) + continue + logger.warning(f"Local LLM load failed for '{candidate_model}', trying next fallback. Error: {e}") + continue + + try: + import transformers + from transformers.pipelines import SUPPORTED_TASKS + logger.error( + f"LocalLLMWrapper init failed (model={self.model_path}, requested_task={task_to_use}, " + f"transformers={getattr(transformers, '__version__', 'unknown')}, " + f"supported_tasks={sorted(list(SUPPORTED_TASKS.keys()))[:50]})" + ) + except Exception: + pass + logger.error(f"Failed to initialize LocalLLMWrapper after fallback attempts: {last_error}") + raise last_error return _local_llm_cache[cache_key] @@ -98,7 +138,7 @@ class LocalLLMWrapper: return self.llm(prompt, **kwargs) class SIFBaseAgent(BaseALwrityAgent): - def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-3B-Instruct", llm: Any = None): + def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-1.5B-Instruct", llm: Any = None): # Hybrid LLM Strategy: # 1. Shared LLM for external/high-quality generation (available to all agents) self.shared_llm = SharedLLMWrapper(user_id) diff --git a/backend/services/intelligence/txtai_service.py b/backend/services/intelligence/txtai_service.py index b641e598..6f62d0a8 100644 --- a/backend/services/intelligence/txtai_service.py +++ b/backend/services/intelligence/txtai_service.py @@ -54,6 +54,7 @@ class TxtaiIntelligenceService: self.cache_manager = semantic_cache_manager if enable_caching else None self._backend = "faiss" # Default backend self._disable_ann_queries = False # Set when FAISS nprobe incompatibility is detected + self.fail_fast = str(os.getenv("SIF_FAIL_FAST", "true")).lower() in {"1", "true", "yes", "on"} # Mark as initialized for singleton pattern self._singleton_initialized = True @@ -226,6 +227,7 @@ class TxtaiIntelligenceService: Args: items: List of (id, text, metadata) tuples. """ +<<<<<<< HEAD # Check if already initialized if not self._initialized and not self._initialization_in_progress: # Trigger initialization in background (non-blocking) @@ -241,6 +243,14 @@ class TxtaiIntelligenceService: if not self.embeddings: logger.error(f"Cannot index content - embeddings not available for user {self.user_id}") +======= + self._ensure_initialized() + if not self._initialized or not self.embeddings: + message = f"Cannot index content - service not initialized for user {self.user_id}" + logger.error(message) + if self.fail_fast: + raise RuntimeError(message) +>>>>>>> 8b0547c (Make SIF fail fast and add low-cost remote LLM fallback) return try: @@ -287,7 +297,10 @@ class TxtaiIntelligenceService: """Perform semantic search with intelligent caching.""" self._ensure_initialized() if not self._initialized or not self.embeddings: - logger.error(f"Cannot perform search - service not initialized for user {self.user_id}") + message = f"Cannot perform search - service not initialized for user {self.user_id}" + logger.error(message) + if self.fail_fast: + raise RuntimeError(message) return [] try: @@ -321,6 +334,8 @@ class TxtaiIntelligenceService: return results except Exception as e: logger.error(f"Search failed for user {self.user_id}: {e}") + if self.fail_fast: + raise logger.error(f"Query: '{query}'") logger.error(f"Full traceback: {traceback.format_exc()}") return [] diff --git a/backend/services/llm_providers/huggingface_provider.py b/backend/services/llm_providers/huggingface_provider.py index 69970a4a..5f354ca7 100644 --- a/backend/services/llm_providers/huggingface_provider.py +++ b/backend/services/llm_providers/huggingface_provider.py @@ -82,11 +82,29 @@ from tenacity import ( try: from openai import OpenAI + from openai import NotFoundError OPENAI_AVAILABLE = True except ImportError: OPENAI_AVAILABLE = False + NotFoundError = Exception logger.warn("OpenAI library not available. Install with: pip install openai") +HF_FALLBACK_MODELS = [ + "openai/gpt-oss-120b:groq", + "moonshotai/Kimi-K2-Instruct-0905:groq", + "meta-llama/Llama-3.1-8B-Instruct:groq", + "mistralai/Mistral-7B-Instruct-v0.3:groq", +] + + +def _fallback_model_sequence(model: str): + sequence = [model] + HF_FALLBACK_MODELS + seen = set() + for candidate in sequence: + if candidate and candidate not in seen: + seen.add(candidate) + yield candidate + def get_huggingface_api_key() -> str: """Get Hugging Face API key with proper error handling.""" api_key = os.getenv('HF_TOKEN') @@ -197,14 +215,27 @@ def huggingface_text_response( import time time.sleep(1) # 1 second delay between API calls - # Make the API call using Chat Completions - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - top_p=top_p, - max_tokens=max_tokens - ) + response = None + last_error = None + for candidate_model in _fallback_model_sequence(model): + try: + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens + ) + if candidate_model != model: + logger.warning("HF text generation switched to fallback model: %s", candidate_model) + break + except NotFoundError as nf_err: + last_error = nf_err + logger.warning("HF model not found: %s. Trying fallback model.", candidate_model) + continue + + if response is None: + raise last_error or Exception("Hugging Face text generation failed: all fallback models failed") # Extract text from response generated_text = response.choices[0].message.content @@ -338,13 +369,27 @@ def huggingface_structured_json_response( time.sleep(1) # 1 second delay between API calls try: - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - response_format={"type": "json_object"} # Try to enforce JSON mode if supported - ) + response = None + last_error = None + for candidate_model in _fallback_model_sequence(model): + try: + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + response_format={"type": "json_object"} # Try to enforce JSON mode if supported + ) + if candidate_model != model: + logger.warning("HF structured generation switched to fallback model: %s", candidate_model) + break + except NotFoundError as nf_err: + last_error = nf_err + logger.warning("HF structured model not found: %s. Trying fallback model.", candidate_model) + continue + + if response is None: + raise last_error or Exception("Hugging Face structured generation failed: all fallback models failed") response_text = response.choices[0].message.content @@ -379,14 +424,28 @@ def huggingface_structured_json_response( except Exception as e: logger.error(f"❌ Hugging Face API call failed: {e}") # If 422 Unprocessable Entity (often due to response_format not supported), retry without it - if "422" in str(e) or "not supported" in str(e).lower(): + if "422" in str(e) or "not supported" in str(e).lower() or isinstance(e, NotFoundError): logger.info("Retrying without response_format...") - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens - ) + response = None + last_error = None + for candidate_model in _fallback_model_sequence(model): + try: + response = client.chat.completions.create( + model=candidate_model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens + ) + if candidate_model != model: + logger.warning("HF structured no-response_format fallback model: %s", candidate_model) + break + except NotFoundError as nf_err: + last_error = nf_err + logger.warning("HF structured model not found (no response_format path): %s", candidate_model) + continue + + if response is None: + raise last_error or e response_text = response.choices[0].message.content # ... (same parsing logic would apply, simplified here for brevity) try: diff --git a/backend/services/llm_providers/main_text_generation.py b/backend/services/llm_providers/main_text_generation.py index 3e34dc74..dd4ec672 100644 --- a/backend/services/llm_providers/main_text_generation.py +++ b/backend/services/llm_providers/main_text_generation.py @@ -6,7 +6,7 @@ migrated from the legacy lib/gpt_providers/text_generation/main_text_generation. import os import json -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List from datetime import datetime from loguru import logger from fastapi import HTTPException @@ -16,7 +16,13 @@ from .gemini_provider import gemini_text_response, gemini_structured_json_respon from .huggingface_provider import huggingface_text_response, huggingface_structured_json_response -def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: Optional[Dict[str, Any]] = None, user_id: str = None) -> str: +def llm_text_gen( + prompt: str, + system_prompt: Optional[str] = None, + json_struct: Optional[Dict[str, Any]] = None, + user_id: str = None, + preferred_hf_models: Optional[List[str]] = None, +) -> str: """ Generate text using Language Model (LLM) based on the provided prompt. @@ -54,7 +60,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: model = "gemini-2.0-flash-001" elif env_provider in ['hf_response_api', 'huggingface', 'hf']: gpt_provider = "huggingface" - model = "mistralai/Mistral-7B-Instruct-v0.3" + model = "mistralai/Mistral-7B-Instruct-v0.3:groq" # Default blog characteristics blog_tone = "Professional" @@ -80,7 +86,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: model = "gemini-2.0-flash-001" elif "huggingface" in available_providers: gpt_provider = "huggingface" - model = "mistralai/Mistral-7B-Instruct-v0.3" + model = "mistralai/Mistral-7B-Instruct-v0.3:groq" else: logger.error("[llm_text_gen] No API keys found for supported providers.") raise RuntimeError("No LLM API keys configured. Configure GEMINI_API_KEY or HF_TOKEN to enable AI responses.") @@ -93,9 +99,13 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: model = "gemini-2.0-flash-001" elif "huggingface" in available_providers: gpt_provider = "huggingface" - model = "mistralai/Mistral-7B-Instruct-v0.3" + model = "mistralai/Mistral-7B-Instruct-v0.3:groq" else: raise RuntimeError("No supported providers available.") + + if gpt_provider == "huggingface" and preferred_hf_models: + model = preferred_hf_models[0] + logger.info(f"[llm_text_gen] Using preferred low-cost HF model: {model}") logger.debug(f"[llm_text_gen] Using provider: {gpt_provider}, model: {model}") @@ -303,7 +313,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: elif fallback_provider == "huggingface": provider_enum = APIProvider.MISTRAL actual_provider_name = "huggingface" - fallback_model = "mistralai/Mistral-7B-Instruct-v0.3" + fallback_model = "mistralai/Mistral-7B-Instruct-v0.3:groq" if fallback_provider == "google": if json_struct: @@ -330,7 +340,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: response_text = huggingface_structured_json_response( prompt=prompt, schema=json_struct, - model="mistralai/Mistral-7B-Instruct-v0.3", + model="mistralai/Mistral-7B-Instruct-v0.3:groq", temperature=temperature, max_tokens=max_tokens, system_prompt=system_instructions @@ -338,7 +348,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: else: response_text = huggingface_text_response( prompt=prompt, - model="mistralai/Mistral-7B-Instruct-v0.3", + model="mistralai/Mistral-7B-Instruct-v0.3:groq", temperature=temperature, max_tokens=max_tokens, top_p=top_p, @@ -394,4 +404,4 @@ def get_api_key(gpt_provider: str) -> Optional[str]: return api_key_manager.get_api_key(mapped_provider) except Exception as e: logger.error(f"[get_api_key] Error getting API key for {gpt_provider}: {str(e)}") - return None \ No newline at end of file + return None