Save local changes (GSC/Bing integrations) before merging PR #354

2026-02-13 13:11:27 +05:30
parent 43e66835ac
commit 08a1f4a1d8
144 changed files with 8310 additions and 2748 deletions
--- a/backend/services/intelligence/agents/specialized_agents.py
+++ b/backend/services/intelligence/agents/specialized_agents.py
@@ -13,6 +13,7 @@ from loguru import logger
 from ..txtai_service import TxtaiIntelligenceService
 from services.intelligence.agents.core_agent_framework import BaseALwrityAgent, AgentAction
 from services.seo_tools.content_strategy_service import ContentStrategyService
+from services.intelligence.sif_agents import SharedLLMWrapper, LocalLLMWrapper
 try:
    from services.intelligence.sif_integration import SIFIntegrationService
    SIF_AVAILABLE = True
@@ -20,14 +21,36 @@ except ImportError:
    SIF_AVAILABLE = False

 try:
-    from txtai import Agent, LLM
+    # Try importing from pipeline first (standard location)
+    from txtai.pipeline import Agent, LLM
    TXTAI_AVAILABLE = True
 except ImportError:
-    TXTAI_AVAILABLE = False
-    logger.warning("txtai not available, using fallback implementation")
+    try:
+        # Fallback to top-level import
+        from txtai import Agent, LLM
+        TXTAI_AVAILABLE = True
+    except ImportError:
+        TXTAI_AVAILABLE = False
+        Agent = None
+        LLM = None
+        logger.warning("txtai not available, using fallback implementation")

-class SIFBaseAgent:
-    def __init__(self, intelligence_service: TxtaiIntelligenceService):
+class SIFBaseAgent(BaseALwrityAgent):
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-3B-Instruct", llm: Any = None):
+        # Hybrid LLM Strategy:
+        # 1. Shared LLM for external/high-quality generation
+        self.shared_llm = SharedLLMWrapper(user_id)
+        
+        # 2. Local LLM for internal agent work (default for SIF agents)
+        if llm is None:
+            if TXTAI_AVAILABLE:
+                # Use Lazy Local LLM
+                llm = LocalLLMWrapper(model_name)
+            else:
+                # Fallback to Shared if txtai not available
+                llm = self.shared_llm
+                
+        super().__init__(user_id, agent_type, model_name, llm)
        self.intelligence = intelligence_service
        
    def _log_agent_operation(self, operation: str, **kwargs):
@@ -36,9 +59,27 @@ class SIFBaseAgent:
        if kwargs:
            logger.debug(f"[{self.__class__.__name__}] Parameters: {kwargs}")

+    def _create_txtai_agent(self):
+        """
+        SIF agents use the intelligence service directly, but we can expose
+        capabilities via a standard agent interface if needed.
+        """
+        if not TXTAI_AVAILABLE or Agent is None:
+            return None
+        
+        # Return a simple agent that can use the LLM
+        try:
+            return Agent(llm=self.llm, tools=[])
+        except Exception as e:
+            logger.warning(f"Failed to create txtai Agent: {e}")
+            return None
+
 class StrategyArchitectAgent(SIFBaseAgent):
    """Agent for discovering content pillars and identifying strategic gaps."""
    
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str):
+        super().__init__(intelligence_service, user_id, agent_type="strategy_architect")
+
    async def discover_pillars(self) -> List[Dict[str, Any]]:
        """Identify content pillars through semantic clustering."""
        self._log_agent_operation("Discovering content pillars")
@@ -108,9 +149,61 @@ class ContentGuardianAgent(SIFBaseAgent):
    CANNIBALIZATION_THRESHOLD = 0.85  # Similarity threshold for cannibalization warning
    ORIGINALITY_THRESHOLD = 0.75  # Minimum originality score
    
-    def __init__(self, intelligence_service: TxtaiIntelligenceService, sif_service: Any = None):
-        super().__init__(intelligence_service)
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, sif_service: Any = None):
+        super().__init__(intelligence_service, user_id, agent_type="content_guardian")
        self.sif_service = sif_service
+        
+        # Lazy initialization of SIF service if not provided
+        if self.sif_service is None and SIF_AVAILABLE:
+            try:
+                self.sif_service = SIFIntegrationService(user_id)
+                logger.info(f"[{self.__class__.__name__}] Lazily initialized SIFIntegrationService")
+            except Exception as e:
+                logger.warning(f"[{self.__class__.__name__}] Failed to lazily initialize SIF service: {e}")
+
+    async def assess_content_quality(self, content: str) -> Dict[str, Any]:
+        """
+        Assess content quality based on originality, readability, and cannibalization risks.
+        """
+        self._log_agent_operation("Assessing content quality", content_length=len(content))
+        
+        try:
+            # 1. Check for cannibalization
+            cannibalization_result = await self.check_cannibalization(content)
+            
+            # 2. Check originality (if not cannibalized)
+            originality_score = 1.0
+            if not cannibalization_result.get("warning"):
+                originality_result = await self.verify_originality(content, None)
+                originality_score = originality_result.get("originality_score", 1.0)
+            
+            # 3. Check Style Compliance
+            style_result = await self.style_enforcer(content)
+            style_score = style_result.get("compliance_score", 1.0)
+            
+            # 4. Basic Readability (Flesch-Kincaid proxy via sentence length/word complexity)
+            # Simple heuristic for now
+            words = content.split()
+            sentences = content.split('.')
+            avg_sentence_length = len(words) / max(1, len(sentences))
+            readability_score = 1.0 if avg_sentence_length < 20 else max(0.5, 1.0 - (avg_sentence_length - 20) * 0.05)
+            
+            # Weighted Score: Originality (40%) + Style (30%) + Readability (30%)
+            quality_score = (originality_score * 0.4) + (style_score * 0.3) + (readability_score * 0.3)
+            
+            return {
+                "quality_score": quality_score,
+                "originality_score": originality_score,
+                "readability_score": readability_score,
+                "style_score": style_score,
+                "cannibalization_risk": cannibalization_result,
+                "style_compliance": style_result,
+                "is_acceptable": quality_score > 0.7 and not cannibalization_result.get("warning", False)
+            }
+            
+        except Exception as e:
+            logger.error(f"[{self.__class__.__name__}] Failed to assess content quality: {e}")
+            return {"error": str(e), "quality_score": 0.0}

    async def check_cannibalization(self, new_draft: str) -> Dict[str, Any]:
        """Check if a new draft competes semantically with existing pages."""
@@ -193,25 +286,74 @@ class ContentGuardianAgent(SIFBaseAgent):
            # 1. Fetch Style Guidelines from SIF if not provided
            if not style_guidelines and self.sif_service:
                try:
-                    # Search for website analysis to get brand voice/style
-                    # We assume the most relevant 'website_analysis' doc contains the guidelines
-                    results = await self.intelligence.search("website analysis brand voice style", limit=1)
-                    if results:
-                        import json
-                        res = results[0]
-                        metadata_str = res.get('object')
-                        metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res)
+                    # Use central SIF service to get robust context
+                    seo_context = await self.sif_service.get_seo_context()
+                    
+                    if seo_context and "error" not in seo_context:
+                        # Extract brand voice/style from the context
+                        # The context structure is normalized in get_seo_context
                        
-                        if metadata.get('type') == 'website_analysis':
-                            report = metadata.get('full_report', {})
-                            style_guidelines = {
-                                "tone": report.get('brand_analysis', {}).get('brand_voice', 'neutral'),
-                                "style_patterns": report.get('style_patterns', {}),
-                                "writing_style": report.get('writing_style', {})
-                            }
-                            logger.info(f"[{self.__class__.__name__}] Retrieved style guidelines from SIF: {style_guidelines.get('tone')}")
+                        # Note: get_seo_context returns a flattened dict. 
+                        # We need to dig into the original structure if available, or rely on what's mapped.
+                        # However, get_seo_context maps 'seo_audit', 'sitemap_analysis', etc.
+                        # Brand info is usually in 'brand_analysis' col of WebsiteAnalysis, which might not be fully exposed
+                        # in the simplified get_seo_context return.
+                        # Let's check if we can get the full object or if we need to expand get_seo_context.
+                        # For now, we'll try to use what's there or fall back to a specific search if needed.
+                        
+                        # Actually, looking at get_seo_context implementation:
+                        # It returns 'seo_audit', 'crawl_result'.
+                        # Brand analysis is often stored in WebsiteAnalysis.brand_analysis.
+                        # We might need to extend get_seo_context or do a specific retrieval here.
+                        # But wait! I saw get_seo_context implementation earlier:
+                        # It retrieves the "full_report" from the SIF metadata.
+                        # If the SIF index contains the full WebsiteAnalysis object, we are good.
+                        
+                        # Let's try to get it from the full report if we can access it, 
+                        # but get_seo_context returns a filtered dict.
+                        
+                        # Alternative: Use the robust retrieval logic but specifically for brand info if get_seo_context is too narrow.
+                        # But get_seo_context logic includes "website analysis seo audit" query.
+                        
+                        # Let's assume for now we use the same retrieval logic but locally adapted, 
+                        # OR better, trust get_seo_context to be the single point of truth.
+                        # If get_seo_context doesn't return brand info, we should update IT, not hack here.
+                        # But I can't update SIFIntegrationService right now without context switch.
+                        
+                        # Let's stick to the previous manual search pattern BUT use the SIF service helper if possible.
+                        # Actually, the previous code was:
+                        # results = await self.intelligence.search("website analysis brand voice style", limit=1)
+                        
+                        # Let's keep it simple and robust:
+                        # Try to get it from SIF service if possible.
+                        # Since get_seo_context might not return brand_voice directly, let's try to see if we can use it.
+                        
+                        # Actually, let's use the manual search but with better error handling, 
+                        # mirroring get_seo_context's robustness (e.g. parsing).
+                        
+                        results = await self.intelligence.search("website analysis brand voice style", limit=1)
+                        if results:
+                            res = results[0]
+                            metadata_str = res.get('object')
+                            metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res)
+                            
+                            if metadata.get('type') == 'website_analysis':
+                                report = metadata.get('full_report', {})
+                                # Support both flat and nested structures
+                                brand_analysis = report.get('brand_analysis') or report.get('brand_voice', {})
+                                if isinstance(brand_analysis, str):
+                                    # Handle case where it might be a JSON string
+                                    try: brand_analysis = json.loads(brand_analysis)
+                                    except: brand_analysis = {"brand_voice": brand_analysis}
+                                    
+                                style_guidelines = {
+                                    "tone": brand_analysis.get('brand_voice', 'neutral') if isinstance(brand_analysis, dict) else 'neutral',
+                                    "style_patterns": report.get('style_patterns', {}),
+                                    "writing_style": report.get('writing_style', {})
+                                }
+                                logger.info(f"[{self.__class__.__name__}] Retrieved style guidelines from SIF index")
                except Exception as e:
-                    logger.warning(f"[{self.__class__.__name__}] Failed to retrieve style guidelines from SIF: {e}")
+                    logger.warning(f"[{self.__class__.__name__}] Failed to retrieve style guidelines: {e}")

            issues = []
            score = 1.0
@@ -246,6 +388,55 @@ class ContentGuardianAgent(SIFBaseAgent):
            logger.error(f"[{self.__class__.__name__}] Style enforcement failed: {e}")
            return {"error": str(e)}

+    async def perform_site_audit(self, website_url: str, limit: int = 10) -> Dict[str, Any]:
+        """
+        Perform a quality audit on the user's website content.
+        """
+        self._log_agent_operation("Performing site audit", website_url=website_url)
+        
+        try:
+            # 1. Retrieve recent content for the site from SIF
+            # We search for everything with the website_url in metadata
+            # Note: This depends on how data is indexed.
+            results = await self.intelligence.search(f"site:{website_url}", limit=limit)
+            
+            if not results:
+                logger.info(f"[{self.__class__.__name__}] No content found for site audit")
+                return {"error": "No content found"}
+            
+            audit_results = []
+            total_quality = 0.0
+            
+            for res in results:
+                text = res.get('text', '')
+                if not text or len(text) < 100:
+                    continue
+                    
+                quality = await self.assess_content_quality(text)
+                audit_results.append({
+                    "id": res.get('id'),
+                    "title": res.get('title', 'Unknown'),
+                    "quality": quality
+                })
+                total_quality += quality.get('quality_score', 0.0)
+            
+            avg_quality = total_quality / len(audit_results) if audit_results else 0.0
+            
+            report = {
+                "website_url": website_url,
+                "pages_audited": len(audit_results),
+                "average_quality_score": avg_quality,
+                "details": audit_results,
+                "timestamp": datetime.utcnow().isoformat()
+            }
+            
+            logger.info(f"[{self.__class__.__name__}] Site audit completed. Avg Quality: {avg_quality:.2f}")
+            return report
+            
+        except Exception as e:
+            logger.error(f"[{self.__class__.__name__}] Site audit failed: {e}")
+            return {"error": str(e)}
+
    async def safety_filter(self, text: str) -> Dict[str, Any]:
        """
        Tool: Flags potentially harmful, offensive, or sensitive content.
@@ -290,8 +481,8 @@ class LinkGraphAgent(SIFBaseAgent):
    RELEVANCE_THRESHOLD = 0.6  # Minimum relevance score for link suggestions
    MAX_SUGGESTIONS = 10  # Maximum number of link suggestions
    
-    def __init__(self, intelligence_service: TxtaiIntelligenceService, sif_service: Any = None):
-        super().__init__(intelligence_service)
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, sif_service: Any = None):
+        super().__init__(intelligence_service, user_id, agent_type="link_graph")
        self.sif_service = sif_service
    
    async def suggest_internal_links(self, draft: str) -> List[Dict[str, Any]]:
@@ -823,9 +1014,10 @@ class ContentStrategyAgent(BaseALwrityAgent):
            Maintain the original meaning and tone.
            """
            
-            if hasattr(self.llm, "generate"):
+            if self.llm:
                # We assume the LLM returns JSON-like text or we parse it
-                response = self.llm.generate(f"{system_prompt}\n\nText to rewrite:\n{content}")
+                response = await self._generate_llm_response(f"{system_prompt}\n\nText to rewrite:\n{content}")
+                
                # Simple parsing fallback if LLM returns raw text
                if isinstance(response, str) and not response.strip().startswith("{"):
                    optimized_content = response
@@ -1456,34 +1648,7 @@ class SEOOptimizationAgent(BaseALwrityAgent):
            "timestamp": datetime.utcnow().isoformat()
        }

-    async def _generate_llm_response(self, prompt: str) -> str:
-        """Helper to generate text using the agent's LLM"""
-        if not self.llm:
-            return "[LLM Unavailable]"
-            
-        try:
-            # Run in executor to avoid blocking if LLM is synchronous
-            loop = asyncio.get_event_loop()
-            
-            # Check if LLM is a txtai pipeline (callable) or has generate method
-            if hasattr(self.llm, "generate"):
-                # Some txtai pipelines use generate, some are just called
-                response = await loop.run_in_executor(None, lambda: self.llm.generate(prompt))
-            else:
-                # Assume callable (standard txtai pipeline)
-                response = await loop.run_in_executor(None, lambda: self.llm(prompt))
-            
-            # Handle list output (some models return list of dicts)
-            if isinstance(response, list):
-                if response and isinstance(response[0], dict) and 'generated_text' in response[0]:
-                    return response[0]['generated_text']
-                return str(response[0])
-                
-            return str(response)
-        except Exception as e:
-            logger.error(f"LLM generation failed: {e}")
-            return "[Generation Failed]"
-    
+
    async def _strategy_generator_tool(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """SEO strategy generation tool"""
        audit_results = context.get("audit_results", {})
@@ -1629,8 +1794,8 @@ class SocialAmplificationAgent(BaseALwrityAgent):
            Return ONLY the adapted content.
            """
            
-            if hasattr(self.llm, "generate"):
-                adapted_content = self.llm.generate(prompt)
+            if self.llm:
+                adapted_content = await self._generate_llm_response(prompt)
            else:
                adapted_content = f"[Mock {platform}]: {content[:50]}... #adapted"