diff --git a/backend/services/intelligence/agents/agent_orchestrator.py b/backend/services/intelligence/agents/agent_orchestrator.py
index 6f0deff2..3f09948e 100644
--- a/backend/services/intelligence/agents/agent_orchestrator.py
+++ b/backend/services/intelligence/agents/agent_orchestrator.py
@@ -47,7 +47,7 @@ logger = get_service_logger(__name__)
 class AgentTeamConfiguration:
     """Configuration for the complete agent team"""
     user_id: str
-    shared_llm: str = "Qwen/Qwen2.5-3B-Instruct"  # Updated to a stable model known for text-generation
+    shared_llm: str = "Qwen/Qwen2.5-1.5B-Instruct"  # Reduced default memory footprint for local environments
     max_iterations: int = 15
     enable_safety: bool = True
     enable_performance_monitoring: bool = True
diff --git a/backend/services/intelligence/agents/core_agent_framework.py b/backend/services/intelligence/agents/core_agent_framework.py
index 683de459..fe8b68e6 100644
--- a/backend/services/intelligence/agents/core_agent_framework.py
+++ b/backend/services/intelligence/agents/core_agent_framework.py
@@ -40,10 +40,17 @@ from services.intelligence.monitoring.semantic_dashboard import RealTimeSemantic
 from services.intelligence.agents.safety_framework import get_safety_framework
 from services.agent_activity_service import AgentActivityService, build_agent_event_payload
 from services.intelligence.agents.agent_usage_tracking import track_agent_usage_sync
+from services.llm_providers.main_text_generation import llm_text_gen
 import time
 
 logger = get_service_logger(__name__)
 
+LOW_COST_REMOTE_MODELS = [
+    "Qwen/Qwen2.5-1.5B-Instruct",
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
 class TrackingLLMWrapper:
     """
     Wrapper for LLM instances to transparently track usage.
@@ -169,7 +176,7 @@ class BaseALwrityAgent(ABC):
     _prompt_context_cache: Dict[str, Dict[str, Any]] = {}
     _profile_cache: Dict[str, Dict[str, Any]] = {}
     
-    def __init__(self, user_id: str, agent_type: str, model_name: str = "Qwen/Qwen3-4B-Instruct-2507", llm: Any = None, enable_tracing: bool = True, **kwargs):
+    def __init__(self, user_id: str, agent_type: str, model_name: str = "Qwen/Qwen2.5-1.5B-Instruct", llm: Any = None, enable_tracing: bool = True, **kwargs):
         self.user_id = user_id
         self.agent_type = agent_type
         self.model_name = model_name
@@ -295,7 +302,8 @@ class BaseALwrityAgent(ABC):
         Centralized method for all agents inheriting from BaseALwrityAgent.
         """
         if not self.llm:
-            return "[LLM Unavailable]"
+            logger.error("LLM unavailable for agent %s (%s)", self.agent_type, self.agent_id)
+            raise RuntimeError(f"LLM unavailable for agent {self.agent_type}")
             
         try:
             # Run in executor to avoid blocking if LLM is synchronous
@@ -319,7 +327,37 @@ class BaseALwrityAgent(ABC):
             
         except Exception as e:
             logger.error(f"LLM generation failed in agent {self.agent_type}: {e}")
-            return "[Generation Failed]"
+            logger.warning(
+                "Attempting remote low-cost fallback via llm_text_gen for agent %s (user=%s)",
+                self.agent_type,
+                self.user_id,
+            )
+            try:
+                loop = asyncio.get_event_loop()
+                fallback_response = await loop.run_in_executor(
+                    None,
+                    lambda: llm_text_gen(
+                        prompt=prompt,
+                        user_id=self.user_id,
+                        preferred_hf_models=LOW_COST_REMOTE_MODELS,
+                    ),
+                )
+                logger.warning(
+                    "Remote low-cost fallback succeeded for agent %s (user=%s)",
+                    self.agent_type,
+                    self.user_id,
+                )
+                return fallback_response
+            except Exception as remote_e:
+                logger.error(
+                    "Remote fallback failed for agent %s (user=%s): %s",
+                    self.agent_type,
+                    self.user_id,
+                    remote_e,
+                )
+                raise RuntimeError(
+                    f"Local and remote LLM generation failed for agent {self.agent_type}: {remote_e}"
+                ) from remote_e
 
     def _resolve_agent_key(self, agent_type: str) -> str:
         value = str(agent_type or "").strip()
@@ -524,7 +562,7 @@ class BaseALwrityAgent(ABC):
                     result = await loop.run_in_executor(None, self.txtai_agent, prompt)
             
             if not self.txtai_agent:
-                result = "Agent not initialized"
+                raise RuntimeError(f"Agent {self.agent_id} not initialized (txtai_agent missing)")
 
             if activity and run_record:
                 activity.log_event(
@@ -848,19 +886,15 @@ class BaseALwrityAgent(ABC):
             raise e
     
     async def _execute_fallback(self, action: AgentAction) -> str:
-        """Execute fallback action when txtai is not available"""
-        # Simulate agent processing for development
-        logger.info(f"Executing fallback action: {action.action_type}")
-        
-        # Return simulated result based on action type
-        if action.action_type == "analyze_competitor":
-            return "Competitor analysis completed (fallback mode)"
-        elif action.action_type == "optimize_content":
-            return "Content optimization completed (fallback mode)"
-        elif action.action_type == "fix_seo_issue":
-            return "SEO issue fixed (fallback mode)"
-        else:
-            return f"Action {action.action_type} completed (fallback mode)"
+        """Fail-fast instead of returning mock fallback output."""
+        logger.error(
+            "Fallback execution requested for action '%s' on agent %s. Failing fast to avoid mock output.",
+            action.action_type,
+            self.agent_id,
+        )
+        raise RuntimeError(
+            f"Fallback execution is disabled for SIF reliability. Agent={self.agent_id}, action={action.action_type}"
+        )
     
     def _prepare_agent_prompt(self, action: AgentAction) -> str:
         """Prepare prompt for txtai agent"""
diff --git a/backend/services/intelligence/agents/specialized/base.py b/backend/services/intelligence/agents/specialized/base.py
index a3e75055..163d9612 100644
--- a/backend/services/intelligence/agents/specialized/base.py
+++ b/backend/services/intelligence/agents/specialized/base.py
@@ -29,7 +29,7 @@ except ImportError:
         logger.warning("txtai not available, using fallback implementation")
 
 class SIFBaseAgent(BaseALwrityAgent):
-    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-3B-Instruct", llm: Any = None, **kwargs):
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-1.5B-Instruct", llm: Any = None, **kwargs):
         # Hybrid LLM Strategy:
         # 1. Shared LLM for external/high-quality generation
         self.shared_llm = SharedLLMWrapper(user_id)
diff --git a/backend/services/intelligence/sif_agents.py b/backend/services/intelligence/sif_agents.py
index 5cb04da1..52464915 100644
--- a/backend/services/intelligence/sif_agents.py
+++ b/backend/services/intelligence/sif_agents.py
@@ -44,6 +44,12 @@ class SharedLLMWrapper:
 
 _local_llm_cache = {}
 
+LOCAL_LLM_FALLBACKS = [
+    "Qwen/Qwen2.5-1.5B-Instruct",
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
 class LocalLLMWrapper:
     """
     Lazily loads a local LLM via txtai and caches it globally.
@@ -72,22 +78,56 @@ class LocalLLMWrapper:
         if task_to_use == "text-generation":
             task_to_use = "language-generation"
             
-        logger.info(f"Loading local LLM (singleton): {self.model_path} (task={task_to_use})")
-        try:
-            _local_llm_cache[cache_key] = LLM(path=self.model_path, task=task_to_use)
-        except Exception as e:
+        candidate_models = []
+        for candidate in [self.model_path, *LOCAL_LLM_FALLBACKS]:
+            if candidate not in candidate_models:
+                candidate_models.append(candidate)
+
+        last_error = None
+        for candidate_model in candidate_models:
+            candidate_key = f"{candidate_model}:{self.task}"
+            if candidate_key in _local_llm_cache:
+                if candidate_model != self.model_path:
+                    logger.warning(f"Using cached fallback local LLM model: {candidate_model}")
+                return _local_llm_cache[candidate_key]
+
+            logger.info(f"Loading local LLM (singleton): {candidate_model} (task={task_to_use})")
             try:
-                import transformers
-                from transformers.pipelines import SUPPORTED_TASKS
-                logger.error(
-                    f"LocalLLMWrapper init failed (model={self.model_path}, requested_task={task_to_use}, "
-                    f"transformers={getattr(transformers, '__version__', 'unknown')}, "
-                    f"supported_tasks={sorted(list(SUPPORTED_TASKS.keys()))[:50]})"
+                _local_llm_cache[candidate_key] = LLM(path=candidate_model, task=task_to_use)
+                if candidate_model != self.model_path:
+                    logger.warning(
+                        f"Loaded fallback local LLM model '{candidate_model}' after failure on '{self.model_path}'"
+                    )
+                return _local_llm_cache[candidate_key]
+            except Exception as e:
+                last_error = e
+                message = str(e).lower()
+                is_memory_issue = (
+                    "paging file is too small" in message
+                    or "os error 1455" in message
+                    or "out of memory" in message
+                    or "not enough memory" in message
                 )
-            except Exception:
-                pass
-            logger.error(f"Failed to initialize LocalLLMWrapper: {e}")
-            raise e
+                if is_memory_issue:
+                    logger.warning(
+                        f"Local LLM memory load failure for '{candidate_model}', trying smaller fallback. Error: {e}"
+                    )
+                    continue
+                logger.warning(f"Local LLM load failed for '{candidate_model}', trying next fallback. Error: {e}")
+                continue
+
+        try:
+            import transformers
+            from transformers.pipelines import SUPPORTED_TASKS
+            logger.error(
+                f"LocalLLMWrapper init failed (model={self.model_path}, requested_task={task_to_use}, "
+                f"transformers={getattr(transformers, '__version__', 'unknown')}, "
+                f"supported_tasks={sorted(list(SUPPORTED_TASKS.keys()))[:50]})"
+            )
+        except Exception:
+            pass
+        logger.error(f"Failed to initialize LocalLLMWrapper after fallback attempts: {last_error}")
+        raise last_error
             
         return _local_llm_cache[cache_key]
         
@@ -98,7 +138,7 @@ class LocalLLMWrapper:
         return self.llm(prompt, **kwargs)
 
 class SIFBaseAgent(BaseALwrityAgent):
-    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-3B-Instruct", llm: Any = None):
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-1.5B-Instruct", llm: Any = None):
         # Hybrid LLM Strategy:
         # 1. Shared LLM for external/high-quality generation (available to all agents)
         self.shared_llm = SharedLLMWrapper(user_id)
diff --git a/backend/services/intelligence/txtai_service.py b/backend/services/intelligence/txtai_service.py
index b641e598..6f62d0a8 100644
--- a/backend/services/intelligence/txtai_service.py
+++ b/backend/services/intelligence/txtai_service.py
@@ -54,6 +54,7 @@ class TxtaiIntelligenceService:
         self.cache_manager = semantic_cache_manager if enable_caching else None
         self._backend = "faiss"  # Default backend
         self._disable_ann_queries = False  # Set when FAISS nprobe incompatibility is detected
+        self.fail_fast = str(os.getenv("SIF_FAIL_FAST", "true")).lower() in {"1", "true", "yes", "on"}
         
         # Mark as initialized for singleton pattern
         self._singleton_initialized = True
@@ -226,6 +227,7 @@ class TxtaiIntelligenceService:
         Args:
             items: List of (id, text, metadata) tuples.
         """
+<<<<<<< HEAD
         # Check if already initialized
         if not self._initialized and not self._initialization_in_progress:
             # Trigger initialization in background (non-blocking)
@@ -241,6 +243,14 @@ class TxtaiIntelligenceService:
         
         if not self.embeddings:
             logger.error(f"Cannot index content - embeddings not available for user {self.user_id}")
+=======
+        self._ensure_initialized()
+        if not self._initialized or not self.embeddings:
+            message = f"Cannot index content - service not initialized for user {self.user_id}"
+            logger.error(message)
+            if self.fail_fast:
+                raise RuntimeError(message)
+>>>>>>> 8b0547c (Make SIF fail fast and add low-cost remote LLM fallback)
             return
 
         try:
@@ -287,7 +297,10 @@ class TxtaiIntelligenceService:
         """Perform semantic search with intelligent caching."""
         self._ensure_initialized()
         if not self._initialized or not self.embeddings:
-            logger.error(f"Cannot perform search - service not initialized for user {self.user_id}")
+            message = f"Cannot perform search - service not initialized for user {self.user_id}"
+            logger.error(message)
+            if self.fail_fast:
+                raise RuntimeError(message)
             return []
 
         try:
@@ -321,6 +334,8 @@ class TxtaiIntelligenceService:
             return results
         except Exception as e:
             logger.error(f"Search failed for user {self.user_id}: {e}")
+            if self.fail_fast:
+                raise
             logger.error(f"Query: '{query}'")
             logger.error(f"Full traceback: {traceback.format_exc()}")
             return []
diff --git a/backend/services/llm_providers/huggingface_provider.py b/backend/services/llm_providers/huggingface_provider.py
index 69970a4a..5f354ca7 100644
--- a/backend/services/llm_providers/huggingface_provider.py
+++ b/backend/services/llm_providers/huggingface_provider.py
@@ -82,11 +82,29 @@ from tenacity import (
 
 try:
     from openai import OpenAI
+    from openai import NotFoundError
     OPENAI_AVAILABLE = True
 except ImportError:
     OPENAI_AVAILABLE = False
+    NotFoundError = Exception
     logger.warn("OpenAI library not available. Install with: pip install openai")
 
+HF_FALLBACK_MODELS = [
+    "openai/gpt-oss-120b:groq",
+    "moonshotai/Kimi-K2-Instruct-0905:groq",
+    "meta-llama/Llama-3.1-8B-Instruct:groq",
+    "mistralai/Mistral-7B-Instruct-v0.3:groq",
+]
+
+
+def _fallback_model_sequence(model: str):
+    sequence = [model] + HF_FALLBACK_MODELS
+    seen = set()
+    for candidate in sequence:
+        if candidate and candidate not in seen:
+            seen.add(candidate)
+            yield candidate
+
 def get_huggingface_api_key() -> str:
     """Get Hugging Face API key with proper error handling."""
     api_key = os.getenv('HF_TOKEN')
@@ -197,14 +215,27 @@ def huggingface_text_response(
         import time
         time.sleep(1)  # 1 second delay between API calls
         
-        # Make the API call using Chat Completions
-        response = client.chat.completions.create(
-            model=model,
-            messages=messages,
-            temperature=temperature,
-            top_p=top_p,
-            max_tokens=max_tokens
-        )
+        response = None
+        last_error = None
+        for candidate_model in _fallback_model_sequence(model):
+            try:
+                response = client.chat.completions.create(
+                    model=candidate_model,
+                    messages=messages,
+                    temperature=temperature,
+                    top_p=top_p,
+                    max_tokens=max_tokens
+                )
+                if candidate_model != model:
+                    logger.warning("HF text generation switched to fallback model: %s", candidate_model)
+                break
+            except NotFoundError as nf_err:
+                last_error = nf_err
+                logger.warning("HF model not found: %s. Trying fallback model.", candidate_model)
+                continue
+
+        if response is None:
+            raise last_error or Exception("Hugging Face text generation failed: all fallback models failed")
         
         # Extract text from response
         generated_text = response.choices[0].message.content
@@ -338,13 +369,27 @@ def huggingface_structured_json_response(
         time.sleep(1)  # 1 second delay between API calls
         
         try:
-            response = client.chat.completions.create(
-                model=model,
-                messages=messages,
-                temperature=temperature,
-                max_tokens=max_tokens,
-                response_format={"type": "json_object"} # Try to enforce JSON mode if supported
-            )
+            response = None
+            last_error = None
+            for candidate_model in _fallback_model_sequence(model):
+                try:
+                    response = client.chat.completions.create(
+                        model=candidate_model,
+                        messages=messages,
+                        temperature=temperature,
+                        max_tokens=max_tokens,
+                        response_format={"type": "json_object"} # Try to enforce JSON mode if supported
+                    )
+                    if candidate_model != model:
+                        logger.warning("HF structured generation switched to fallback model: %s", candidate_model)
+                    break
+                except NotFoundError as nf_err:
+                    last_error = nf_err
+                    logger.warning("HF structured model not found: %s. Trying fallback model.", candidate_model)
+                    continue
+
+            if response is None:
+                raise last_error or Exception("Hugging Face structured generation failed: all fallback models failed")
             
             response_text = response.choices[0].message.content
             
@@ -379,14 +424,28 @@ def huggingface_structured_json_response(
         except Exception as e:
             logger.error(f"❌ Hugging Face API call failed: {e}")
             # If 422 Unprocessable Entity (often due to response_format not supported), retry without it
-            if "422" in str(e) or "not supported" in str(e).lower():
+            if "422" in str(e) or "not supported" in str(e).lower() or isinstance(e, NotFoundError):
                 logger.info("Retrying without response_format...")
-                response = client.chat.completions.create(
-                    model=model,
-                    messages=messages,
-                    temperature=temperature,
-                    max_tokens=max_tokens
-                )
+                response = None
+                last_error = None
+                for candidate_model in _fallback_model_sequence(model):
+                    try:
+                        response = client.chat.completions.create(
+                            model=candidate_model,
+                            messages=messages,
+                            temperature=temperature,
+                            max_tokens=max_tokens
+                        )
+                        if candidate_model != model:
+                            logger.warning("HF structured no-response_format fallback model: %s", candidate_model)
+                        break
+                    except NotFoundError as nf_err:
+                        last_error = nf_err
+                        logger.warning("HF structured model not found (no response_format path): %s", candidate_model)
+                        continue
+
+                if response is None:
+                    raise last_error or e
                 response_text = response.choices[0].message.content
                 # ... (same parsing logic would apply, simplified here for brevity)
                 try:
diff --git a/backend/services/llm_providers/main_text_generation.py b/backend/services/llm_providers/main_text_generation.py
index 3e34dc74..dd4ec672 100644
--- a/backend/services/llm_providers/main_text_generation.py
+++ b/backend/services/llm_providers/main_text_generation.py
@@ -6,7 +6,7 @@ migrated from the legacy lib/gpt_providers/text_generation/main_text_generation.
 
 import os
 import json
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, List
 from datetime import datetime
 from loguru import logger
 from fastapi import HTTPException
@@ -16,7 +16,13 @@ from .gemini_provider import gemini_text_response, gemini_structured_json_respon
 from .huggingface_provider import huggingface_text_response, huggingface_structured_json_response
 
 
-def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct: Optional[Dict[str, Any]] = None, user_id: str = None) -> str:
+def llm_text_gen(
+    prompt: str,
+    system_prompt: Optional[str] = None,
+    json_struct: Optional[Dict[str, Any]] = None,
+    user_id: str = None,
+    preferred_hf_models: Optional[List[str]] = None,
+) -> str:
     """
     Generate text using Language Model (LLM) based on the provided prompt.
     
@@ -54,7 +60,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
             model = "gemini-2.0-flash-001"
         elif env_provider in ['hf_response_api', 'huggingface', 'hf']:
             gpt_provider = "huggingface"
-            model = "mistralai/Mistral-7B-Instruct-v0.3"
+            model = "mistralai/Mistral-7B-Instruct-v0.3:groq"
         
         # Default blog characteristics
         blog_tone = "Professional"
@@ -80,7 +86,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
                 model = "gemini-2.0-flash-001"
             elif "huggingface" in available_providers:
                 gpt_provider = "huggingface"
-                model = "mistralai/Mistral-7B-Instruct-v0.3"
+                model = "mistralai/Mistral-7B-Instruct-v0.3:groq"
             else:
                 logger.error("[llm_text_gen] No API keys found for supported providers.")
                 raise RuntimeError("No LLM API keys configured. Configure GEMINI_API_KEY or HF_TOKEN to enable AI responses.")
@@ -93,9 +99,13 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
                     model = "gemini-2.0-flash-001"
                 elif "huggingface" in available_providers:
                     gpt_provider = "huggingface"
-                    model = "mistralai/Mistral-7B-Instruct-v0.3"
+                    model = "mistralai/Mistral-7B-Instruct-v0.3:groq"
                 else:
                     raise RuntimeError("No supported providers available.")
+
+        if gpt_provider == "huggingface" and preferred_hf_models:
+            model = preferred_hf_models[0]
+            logger.info(f"[llm_text_gen] Using preferred low-cost HF model: {model}")
             
         logger.debug(f"[llm_text_gen] Using provider: {gpt_provider}, model: {model}")
 
@@ -303,7 +313,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
                     elif fallback_provider == "huggingface":
                         provider_enum = APIProvider.MISTRAL
                         actual_provider_name = "huggingface"
-                        fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
+                        fallback_model = "mistralai/Mistral-7B-Instruct-v0.3:groq"
                     
                     if fallback_provider == "google":
                         if json_struct:
@@ -330,7 +340,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
                             response_text = huggingface_structured_json_response(
                                 prompt=prompt,
                                 schema=json_struct,
-                                model="mistralai/Mistral-7B-Instruct-v0.3",
+                                model="mistralai/Mistral-7B-Instruct-v0.3:groq",
                                 temperature=temperature,
                                 max_tokens=max_tokens,
                                 system_prompt=system_instructions
@@ -338,7 +348,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
                         else:
                             response_text = huggingface_text_response(
                                 prompt=prompt,
-                                model="mistralai/Mistral-7B-Instruct-v0.3",
+                                model="mistralai/Mistral-7B-Instruct-v0.3:groq",
                                 temperature=temperature,
                                 max_tokens=max_tokens,
                                 top_p=top_p,
@@ -394,4 +404,4 @@ def get_api_key(gpt_provider: str) -> Optional[str]:
         return api_key_manager.get_api_key(mapped_provider)
     except Exception as e:
         logger.error(f"[get_api_key] Error getting API key for {gpt_provider}: {str(e)}")
-        return None 
\ No newline at end of file
+        return None