Save local changes (GSC/Bing integrations) before merging PR #354

2026-02-13 13:11:27 +05:30
parent 43e66835ac
commit 08a1f4a1d8
144 changed files with 8310 additions and 2748 deletions
--- a/backend/services/intelligence/sif_agents.py
+++ b/backend/services/intelligence/sif_agents.py
@@ -5,13 +5,76 @@ Each agent leverages TxtaiIntelligenceService for semantic operations.
 """

 import traceback
+import json
+import asyncio
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 from loguru import logger
-from .txtai_service import TxtaiIntelligenceService
+from .txtai_service import TxtaiIntelligenceService, TXTAI_AVAILABLE
+from services.intelligence.agents.core_agent_framework import BaseALwrityAgent
+from services.llm_providers.main_text_generation import llm_text_gen

-class SIFBaseAgent:
-    def __init__(self, intelligence_service: TxtaiIntelligenceService):
+# Optional txtai imports
+try:
+    from txtai.pipeline import Agent, LLM
+except ImportError:
+    Agent = None
+    LLM = None
+
+class SharedLLMWrapper:
+    """Wraps the shared ALwrity LLM service to look like a txtai LLM."""
+    def __init__(self, user_id: str):
+        self.user_id = user_id
+    
+    def generate(self, prompt: str, **kwargs) -> str:
+        """Generate text using the shared LLM provider."""
+        # We ignore kwargs like 'max_tokens' as llm_text_gen handles defaults,
+        # but we could map them if needed.
+        return llm_text_gen(prompt, user_id=self.user_id)
+        
+    def __call__(self, prompt: str, **kwargs) -> str:
+        return self.generate(prompt, **kwargs)
+
+class LocalLLMWrapper:
+    """
+    Lazily loads a local LLM via txtai.
+    This prevents blocking server startup with heavy model loads.
+    """
+    def __init__(self, model_path: str):
+        self.model_path = model_path
+        self._llm = None
+        
+    @property
+    def llm(self):
+        if self._llm is None:
+            if LLM is None:
+                raise ImportError("txtai.pipeline.LLM is not available")
+            logger.info(f"Loading local LLM: {self.model_path}")
+            self._llm = LLM(path=self.model_path)
+        return self._llm
+        
+    def __call__(self, prompt: str, **kwargs) -> str:
+        return self.llm(prompt, **kwargs)
+        
+    def generate(self, prompt: str, **kwargs) -> str:
+        return self.llm(prompt, **kwargs)
+
+class SIFBaseAgent(BaseALwrityAgent):
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, agent_type: str = "sif_agent", model_name: str = "Qwen/Qwen2.5-3B-Instruct", llm: Any = None):
+        # Hybrid LLM Strategy:
+        # 1. Shared LLM for external/high-quality generation (available to all agents)
+        self.shared_llm = SharedLLMWrapper(user_id)
+        
+        # 2. Local LLM for internal agent work (default for SIF agents)
+        if llm is None:
+            if TXTAI_AVAILABLE:
+                # Use Lazy Local LLM
+                llm = LocalLLMWrapper(model_name)
+            else:
+                # Fallback to Shared if txtai not available
+                llm = self.shared_llm
+            
+        super().__init__(user_id, agent_type, model_name, llm)
        self.intelligence = intelligence_service
        
    def _log_agent_operation(self, operation: str, **kwargs):
@@ -20,9 +83,23 @@ class SIFBaseAgent:
        if kwargs:
            logger.debug(f"[{self.__class__.__name__}] Parameters: {kwargs}")

+    def _create_txtai_agent(self):
+        """
+        SIF agents use the intelligence service directly, but we can expose
+        capabilities via a standard agent interface if needed.
+        """
+        if not TXTAI_AVAILABLE:
+            return None
+        
+        # Return a simple agent that can use the LLM
+        return Agent(llm=self.llm, tools=[])
+
 class StrategyArchitectAgent(SIFBaseAgent):
    """Agent for discovering content pillars and identifying strategic gaps."""
    
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str):
+        super().__init__(intelligence_service, user_id, agent_type="strategy_architect")
+    
    async def discover_pillars(self) -> List[Dict[str, Any]]:
        """Identify content pillars through semantic clustering."""
        self._log_agent_operation("Discovering content pillars")
@@ -58,6 +135,61 @@ class StrategyArchitectAgent(SIFBaseAgent):
            logger.error(f"[{self.__class__.__name__}] Failed to discover pillars: {e}")
            logger.error(f"[{self.__class__.__name__}] Full traceback: {traceback.format_exc()}")
            return []
+
+    async def analyze_content_strategy(self, website_data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Analyze content strategy based on website data and semantic insights.
+        
+        Args:
+            website_data: Dictionary containing website analysis data
+            
+        Returns:
+            List of strategic recommendations
+        """
+        self._log_agent_operation("Analyzing content strategy")
+        
+        try:
+            recommendations = []
+            
+            # 1. Discover existing pillars
+            pillars = await self.discover_pillars()
+            
+            # 2. Analyze gaps based on pillars (simplified logic for now)
+            if not pillars:
+                recommendations.append({
+                    "type": "strategy_gap",
+                    "priority": "high",
+                    "title": "Establish Core Content Pillars",
+                    "description": "No clear content clusters found. Focus on defining 3-5 core topics to build authority."
+                })
+            else:
+                # Suggest strengthening weak pillars
+                for pillar in pillars:
+                    if pillar['size'] < 3:
+                        recommendations.append({
+                            "type": "content_depth",
+                            "priority": "medium",
+                            "title": f"Strengthen Pillar {pillar['pillar_id']}",
+                            "description": "This topic cluster has few articles. Create more content to establish authority.",
+                            "pillar_id": pillar['pillar_id']
+                        })
+            
+            # 3. Add generic recommendations based on website data if available
+            if website_data:
+                if not website_data.get('description'):
+                     recommendations.append({
+                        "type": "metadata",
+                        "priority": "high",
+                        "title": "Missing Meta Description",
+                        "description": "Website is missing a meta description. Add one to improve SEO CTR."
+                    })
+            
+            logger.info(f"[{self.__class__.__name__}] Generated {len(recommendations)} strategic recommendations")
+            return recommendations
+            
+        except Exception as e:
+            logger.error(f"[{self.__class__.__name__}] Failed to analyze content strategy: {e}")
+            return []
    
    def _calculate_cluster_confidence(self, cluster_indices: List[int]) -> float:
        """Calculate confidence score for a cluster based on its size and coherence."""
@@ -92,10 +224,40 @@ class ContentGuardianAgent(SIFBaseAgent):
    CANNIBALIZATION_THRESHOLD = 0.85  # Similarity threshold for cannibalization warning
    ORIGINALITY_THRESHOLD = 0.75  # Minimum originality score
    
-    def __init__(self, intelligence_service: TxtaiIntelligenceService, sif_service: Any = None):
-        super().__init__(intelligence_service)
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, sif_service: Any = None):
+        super().__init__(intelligence_service, user_id, agent_type="content_guardian")
        self.sif_service = sif_service

+    async def assess_content_quality(self, website_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Assess overall content quality based on website data."""
+        self._log_agent_operation("Assessing content quality")
+        try:
+             # Extract sample text or description from website_data
+             text_to_analyze = website_data.get('description', '') or website_data.get('title', '')
+             if not text_to_analyze:
+                 return {"score": 0.5, "reason": "No content to analyze"}
+                 
+             # Run style check
+             style_result = await self.style_enforcer(text_to_analyze)
+             
+             # Run safety check
+             safety_result = await self.safety_filter(text_to_analyze)
+             
+             # Calculate aggregate score
+             base_score = style_result.get('compliance_score', 0.8)
+             if safety_result.get('action') == 'flag_for_review':
+                 base_score *= 0.5
+                 
+             return {
+                 "score": base_score,
+                 "style_analysis": style_result,
+                 "safety_analysis": safety_result,
+                 "analyzed_text_length": len(text_to_analyze)
+             }
+        except Exception as e:
+            logger.error(f"[{self.__class__.__name__}] Quality assessment failed: {e}")
+            return {"score": 0.0, "error": str(e)}
+
    async def check_cannibalization(self, new_draft: str) -> Dict[str, Any]:
        """Check if a new draft competes semantically with existing pages."""
        self._log_agent_operation("Checking for semantic cannibalization", draft_length=len(new_draft))
@@ -274,8 +436,8 @@ class LinkGraphAgent(SIFBaseAgent):
    RELEVANCE_THRESHOLD = 0.6  # Minimum relevance score for link suggestions
    MAX_SUGGESTIONS = 10  # Maximum number of link suggestions
    
-    def __init__(self, intelligence_service: TxtaiIntelligenceService, sif_service: Any = None):
-        super().__init__(intelligence_service)
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str, sif_service: Any = None):
+        super().__init__(intelligence_service, user_id, agent_type="link_graph")
        self.sif_service = sif_service
    
    async def suggest_internal_links(self, draft: str) -> List[Dict[str, Any]]:
@@ -479,6 +641,9 @@ class CitationExpert(SIFBaseAgent):
    EVIDENCE_THRESHOLD = 0.7  # Minimum relevance score for evidence
    MAX_EVIDENCE = 5  # Maximum number of evidence pieces to return
    
+    def __init__(self, intelligence_service: TxtaiIntelligenceService, user_id: str):
+        super().__init__(intelligence_service, user_id, agent_type="citation_expert")
+        
    async def fact_checker(self, claim: str) -> List[Dict[str, Any]]:
        """
        Tool: Verifies facts against trusted research data.
@@ -542,60 +707,25 @@ class CitationExpert(SIFBaseAgent):
                "claim": claim,
                "status": status,
                "evidence_count": len(evidence),
-                "top_evidence": evidence[0]['source'] if evidence else None
+                "top_evidence": evidence[0] if evidence else None
            })
            
        return {
-            "status": "verification_complete",
-            "total_claims": len(claims),
+            "status": "completed",
            "verified_claims": verified_results,
-            "unsupported_count": len([c for c in verified_results if c['status'] == 'unsupported']),
-            "timestamp": datetime.utcnow().isoformat()
+            "verification_score": len([c for c in verified_results if c['status'] == 'supported']) / len(verified_results)
        }

    async def verify_facts(self, claim: str) -> List[Dict[str, Any]]:
-        """Find supporting or contradicting evidence in the indexed research."""
-        self._log_agent_operation("Verifying facts", claim_length=len(claim))
+        """Verify a single claim against intelligence data."""
+        results = await self.intelligence.search(claim, limit=3)
        
-        try:
-            if not self.intelligence.is_initialized():
-                logger.error(f"[{self.__class__.__name__}] Intelligence service not initialized")
-                return []
-            
-            if not claim or len(claim.strip()) < 20:
-                logger.warning(f"[{self.__class__.__name__}] Claim too short for meaningful verification")
-                return []
-            
-            results = await self.intelligence.search(claim, limit=self.MAX_EVIDENCE)
-            
-            if not results:
-                logger.info(f"[{self.__class__.__name__}] No evidence found for claim")
-                return []
-            
-            evidence = []
-            for result in results:
-                relevance_score = result.get('score', 0.0)
-                
-                if relevance_score >= self.EVIDENCE_THRESHOLD:
-                    evidence_piece = {
-                        "source": result.get('id', 'unknown'),
-                        "relevance": relevance_score,
-                        "confidence": self._calculate_evidence_confidence(relevance_score),
-                        "type": "supporting" if relevance_score > 0.8 else "related",
-                        "excerpt": result.get('text', '')[:200] + "..." if len(result.get('text', '')) > 200 else result.get('text', '')
-                    }
-                    evidence.append(evidence_piece)
-                    logger.debug(f"[{self.__class__.__name__}] Found evidence: {evidence_piece['source']} (score: {relevance_score:.3f})")
-            
-            logger.info(f"[{self.__class__.__name__}] Found {len(evidence)} pieces of evidence for claim")
-            return evidence
-            
-        except Exception as e:
-            logger.error(f"[{self.__class__.__name__}] Failed to verify facts: {e}")
-            logger.error(f"[{self.__class__.__name__}] Full traceback: {traceback.format_exc()}")
-            return []
-    
-    def _calculate_evidence_confidence(self, relevance_score: float) -> float:
-        """Calculate confidence score for evidence."""
-        # Simple confidence based on relevance score
-        return min(1.0, relevance_score * 1.2)
+        evidence = []
+        for result in results:
+            if result.get('score', 0) > self.EVIDENCE_THRESHOLD:
+                evidence.append({
+                    "text": result.get('text'),
+                    "source": result.get('id'),
+                    "confidence": result.get('score')
+                })
+        return evidence