feat: image generation overhaul (model-aware text, dim clamping, \.30 pricing), event-driven dashboard cache invalidation, SEO insights (AI visibility, GSC, keyword gap), YouTube OAuth/publish, blog writer & content planning improvements, scheduler monitoring updates

2026-05-30 07:58:22 +05:30
parent aaf94049da
commit 64f1f88cdd
129 changed files with 8796 additions and 8755 deletions
--- a/backend/services/intelligence/sif_agents.py
+++ b/backend/services/intelligence/sif_agents.py
@@ -61,32 +61,32 @@ LOCAL_LLM_FALLBACKS = [

 class LocalLLMWrapper:
    """
-    Lazily loads a local LLM via txtai and caches it globally.
-    This prevents blocking server startup and redundant model loads.
+    Wraps a local LLM with async lifecycle support.
+    Model loading runs off the event loop so it never blocks the server.
+    Loaded models are cached globally (shared across all instances).
    """
+
    def __init__(self, model_path: str, task: str = None):
        self.model_path = model_path
        self.task = task
-        # No self._llm here, we use the global cache
-        
-    @property
-    def llm(self):
-        # Create a cache key based on model path and task
+        self._initialized = False
+        self._init_task = None
+
+    def _load_model_sync(self) -> Any:
+        """Load model (blocking — call via thread executor from async code)."""
        cache_key = f"{self.model_path}:{self.task}"
-        
        if cache_key in _local_llm_cache:
            return _local_llm_cache[cache_key]
-            
+
        if LLM is None:
            raise ImportError("txtai.pipeline.LLM is not available")
-            
+
        task_to_use = (self.task or "language-generation").strip()
-        # Explicitly force language-generation for known models if auto-detect fails
        if any(x in self.model_path for x in ["Qwen", "Instruct", "GPT", "Llama"]):
            task_to_use = "language-generation"
        if task_to_use == "text-generation":
            task_to_use = "language-generation"
-            
+
        candidate_models = []
        for candidate in [self.model_path, *LOCAL_LLM_FALLBACKS]:
            if candidate not in candidate_models:
@@ -137,12 +137,49 @@ class LocalLLMWrapper:
            pass
        logger.error(f"Failed to initialize LocalLLMWrapper after fallback attempts: {last_error}")
        raise last_error
-            
-        return _local_llm_cache[cache_key]
-        
+
+    @property
+    def llm(self):
+        """Sync accessor — lazy loads via global cache. Blocks on first call."""
+        cache_key = f"{self.model_path}:{self.task}"
+        if cache_key in _local_llm_cache:
+            return _local_llm_cache[cache_key]
+        result = self._load_model_sync()
+        self._initialized = True
+        return result
+
+    async def initialize(self) -> bool:
+        """Pre-load model asynchronously. Call at server startup to avoid first-request delay."""
+        if self._initialized:
+            return True
+        cache_key = f"{self.model_path}:{self.task}"
+        if cache_key in _local_llm_cache:
+            self._initialized = True
+            return True
+        try:
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, self._load_model_sync)
+            self._initialized = True
+            return True
+        except Exception as e:
+            logger.error(f"[LocalLLMWrapper] Async init failed for {self.model_path}: {e}")
+            return False
+
+    async def ensure_initialized_async(self) -> bool:
+        """Public async hook — ensures model is loaded without blocking the event loop."""
+        if self._initialized:
+            return True
+        return await self.initialize()
+
+    async def shutdown(self):
+        """Release model resources."""
+        cache_key = f"{self.model_path}:{self.task}"
+        _local_llm_cache.pop(cache_key, None)
+        self._initialized = False
+
    def __call__(self, prompt: str, **kwargs) -> str:
        return self.llm(prompt, **kwargs)
-        
+
    def generate(self, prompt: str, **kwargs) -> str:
        return self.llm(prompt, **kwargs)

@@ -177,6 +214,21 @@ class SIFBaseAgent(BaseALwrityAgent):

        return bool(getattr(self.intelligence, "_initialized", False) and self.intelligence.embeddings)

+    async def initialize_async(self):
+        """Async lifecycle hook — pre-initialize both the SIF index and the local LLM."""
+        await self._ensure_intelligence_ready()
+        llm = getattr(self, "llm", None)
+        if hasattr(llm, "ensure_initialized_async"):
+            await llm.ensure_initialized_async()
+        logger.info(f"[{self.__class__.__name__}] Async initialization complete")
+
+    async def shutdown(self):
+        """Async lifecycle hook — release model resources."""
+        llm = getattr(self, "llm", None)
+        if hasattr(llm, "shutdown"):
+            await llm.shutdown()
+        logger.info(f"[{self.__class__.__name__}] Shutdown complete")
+
    def _create_txtai_agent(self):
        """
        SIF agents primarily use the intelligence service directly, but we can expose
@@ -545,6 +597,84 @@ class ContentGuardianAgent(SIFBaseAgent):
        super().__init__(intelligence_service, user_id, agent_type="content_guardian")
        self.sif_service = sif_service

+    async def perform_site_audit(self, website_url: str) -> Dict[str, Any]:
+        """
+        Perform a comprehensive content audit on the indexed website content.
+        Called by the SIF indexing executor after content sync completes.
+        Returns a structured audit report with quality, brand voice, and safety assessments.
+        """
+        self._log_agent_operation("Performing site audit", website_url=website_url)
+        try:
+            # Search the user's SIF index for website content
+            results = await self.intelligence.search(
+                f"website content analysis {website_url}", limit=10
+            )
+
+            audit: Dict[str, Any] = {
+                "website_url": website_url,
+                "audit_timestamp": datetime.utcnow().isoformat(),
+                "total_pages_crawled": len(results),
+                "content_quality": None,
+                "brand_voice_consistency": None,
+                "safety_issues": None,
+                "cannibalization_issues": None,
+            }
+
+            if not results:
+                logger.warning(f"[{self.__class__.__name__}] No indexed content found for {website_url}")
+                return audit
+
+            # Run assessments on each indexed page
+            quality_scores = []
+            style_scores = []
+            safety_flags = []
+
+            for result in results:
+                text = result.get("text", "") or result.get("id", "")
+                if len(text) < 50:
+                    continue
+
+                quality = await self.assess_content_quality({"description": text, "title": website_url})
+                quality_scores.append(quality.get("score", 0.0))
+
+                style = await self.style_enforcer(text)
+                style_scores.append(style.get("compliance_score", 0.0))
+
+                safety = await self.safety_filter(text)
+                if not safety.get("is_safe", True):
+                    safety_flags.append(safety.get("flags", []))
+
+            audit["content_quality"] = {
+                "score": round(sum(quality_scores) / max(len(quality_scores), 1), 4),
+                "pages_analyzed": len(quality_scores),
+            }
+            audit["brand_voice_consistency"] = {
+                "compliance_score": round(sum(style_scores) / max(len(style_scores), 1), 4),
+                "pages_checked": len(style_scores),
+            }
+            audit["safety_issues"] = {
+                "has_issues": len(safety_flags) > 0,
+                "flagged_pages": len(safety_flags),
+            }
+
+            cannibalization = await self.check_cannibalization(website_url)
+            audit["cannibalization_issues"] = cannibalization
+
+            logger.info(
+                f"[{self.__class__.__name__}] Site audit complete for {website_url}: "
+                f"quality={audit['content_quality']['score']}, "
+                f"brand_voice={audit['brand_voice_consistency']['compliance_score']}"
+            )
+            return audit
+
+        except Exception as e:
+            logger.error(f"[{self.__class__.__name__}] Site audit failed for {website_url}: {e}")
+            return {
+                "website_url": website_url,
+                "error": str(e),
+                "audit_timestamp": datetime.utcnow().isoformat(),
+            }
+
    async def assess_content_quality(self, website_data: Dict[str, Any]) -> Dict[str, Any]:
        """Assess overall content quality based on website data."""
        self._log_agent_operation("Assessing content quality")
@@ -826,51 +956,21 @@ class LinkGraphAgent(SIFBaseAgent):
                logger.info(f"[{self.__class__.__name__}] No relevant internal pages found")
                return []
            
-            # 2. Get Authority Data (if available)
-            authority_map = {}
-            if self.sif_service:
-                try:
-                    # Fetch dashboard context to get top performing content
-                    # Note: This relies on what's available in the SIF index/dashboard summary
-                    dashboard_context = await self.sif_service.get_seo_dashboard_context()
-                    
-                    if "error" not in dashboard_context:
-                         # Extract top queries/pages if available in summary
-                         # Ideally, we'd have a map of URL -> Authority Score
-                         # For now, we'll try to extract what we can
-                         data = dashboard_context.get("dashboard_data", {})
-                         summary = data.get("summary", {})
-                         
-                         # Example: Boost if site health is good (general confidence)
-                         site_health = data.get("health_score", {}).get("score", 0)
-                         
-                         # If we had top pages in the summary, we'd use them.
-                         # For now, we'll use a placeholder authority map or just the site health
-                         pass
-                except Exception as e:
-                    logger.warning(f"Failed to fetch authority data: {e}")
-
            suggestions = []
            for result in results:
                relevance_score = result.get('score', 0.0)
                url = result.get('id', 'unknown')
                
-                # Apply authority boost (placeholder logic)
-                # In a full implementation, we'd look up 'url' in authority_map
-                authority_boost = 1.0
-                
-                final_score = relevance_score * authority_boost
-                
-                if final_score >= self.RELEVANCE_THRESHOLD:
+                if relevance_score >= self.RELEVANCE_THRESHOLD:
                    suggestion = {
                        "url": url,
                        "relevance": relevance_score,
-                        "final_score": final_score,
-                        "confidence": self._calculate_link_confidence(final_score),
+                        "final_score": relevance_score,
+                        "confidence": self._calculate_link_confidence(relevance_score),
                        "reason": f"Semantic similarity: {relevance_score:.3f}"
                    }
                    suggestions.append(suggestion)
-                    logger.debug(f"[{self.__class__.__name__}] Added link suggestion: {url} (score: {final_score:.3f})")
+                    logger.debug(f"[{self.__class__.__name__}] Added link suggestion: {url} (score: {relevance_score:.3f})")
            
            # Sort by final score
            suggestions.sort(key=lambda x: x['final_score'], reverse=True)
@@ -974,23 +1074,39 @@ class LinkGraphAgent(SIFBaseAgent):
        return min(1.0, relevance_score * 1.5)

    async def optimize_anchor_text(self, target_url: str, context: str) -> str:
-        """Suggest the best anchor text for a given link based on target page context."""
+        """Suggest anchor text for a link by searching the SIF index for the target page."""
        self._log_agent_operation("Optimizing anchor text", target_url=target_url, context_length=len(context))
-        
+
        try:
-            # In a real implementation, we would fetch the target page content via SIF
-            # and use an LLM to generate the anchor text.
-            
-            # Placeholder for LLM call
-            # if self.llm: ...
-            
-            logger.info(f"[{self.__class__.__name__}] Anchor text optimization stub completed")
-            return "relevant anchor text"  # Placeholder
-            
+            if not await self._ensure_intelligence_ready():
+                return self._extract_anchor_from_context(target_url, context)
+
+            results = await self.intelligence.search(f"{target_url} {context}", limit=3)
+            if results:
+                text = results[0].get("text", "") or results[0].get("id", "")
+                words = [w for w in text.split() if len(w) > 4][:5]
+                if words:
+                    return " ".join(words)
+            return self._extract_anchor_from_context(target_url, context)
+
        except Exception as e:
-            logger.error(f"[{self.__class__.__name__}] Failed to optimize anchor text: {e}")
-            logger.error(f"[{self.__class__.__name__}] Full traceback: {traceback.format_exc()}")
-            return "click here"  # Fallback anchor text
+            logger.error(f"[{self.__class__.__name__}] optimize_anchor_text failed: {e}")
+            return self._extract_anchor_from_context(target_url, context)
+
+    def _extract_anchor_from_context(self, target_url: str, context: str) -> str:
+        """Extract a usable anchor text from the URL or context when SIF is unavailable."""
+        from urllib.parse import urlparse
+        try:
+            parsed = urlparse(target_url)
+            path = parsed.path.strip("/").replace("-", " ").replace("/", " ")
+            if path:
+                words = [w for w in path.split() if len(w) > 3]
+                if words:
+                    return " ".join(words[:4]).title()
+        except Exception:
+            pass
+        words = [w for w in context.split() if len(w) > 4]
+        return " ".join(words[:4]).title() if words else "learn more"

 class CitationExpert(SIFBaseAgent):
    """