""" SIF Phase 2 Integration Module This module demonstrates how to integrate the intelligent caching system with the existing SIF framework for improved performance and user experience. """ import asyncio from typing import Dict, List, Any, Optional from loguru import logger from datetime import datetime from sqlalchemy import select, desc import json from services.database import get_session_for_user from models.onboarding import WebsiteAnalysis, OnboardingSession, CompetitorAnalysis # Import existing SIF components from .txtai_service import TxtaiIntelligenceService from .semantic_cache import semantic_cache_manager, SemanticCacheStats from services.intelligence.harvester import SemanticHarvesterService class SIFIntegrationService: """ Semantic Intelligence Framework service with Phase 2 improvements. Features: - Intelligent caching for all semantic operations - Performance monitoring and analytics - Real-time cache invalidation - User-specific semantic memory optimization """ def __init__(self, user_id: str, enable_caching: bool = True): self.user_id = user_id self.enable_caching = enable_caching self.cache_manager = semantic_cache_manager if enable_caching else None # Initialize core services with caching self.intelligence_service = TxtaiIntelligenceService( user_id=user_id, enable_caching=enable_caching ) self.harvester = SemanticHarvesterService() # Initialize agents (will be created when needed to avoid circular imports) self.strategy_agent = None self.guardian_agent = None self.trend_surfer_agent = None logger.info(f"SIF Integration Service initialized for user {user_id}") def get_trend_surfer_agent(self): """Lazy load TrendSurferAgent""" if not self.trend_surfer_agent: from services.intelligence.agents.trend_surfer_agent import TrendSurferAgent self.trend_surfer_agent = TrendSurferAgent( intelligence_service=self.intelligence_service, user_id=self.user_id ) return self.trend_surfer_agent async def index_market_trends_run(self, trends_result: Dict[str, Any], run_id: str) -> bool: try: latest_id = f"market_trends_latest:{self.user_id}" run_doc_id = f"market_trends_run:{self.user_id}:{run_id}" geo = trends_result.get("geo", "US") timeframe = trends_result.get("timeframe", "today 12-m") keywords = trends_result.get("keywords") or [] keywords_text = ", ".join([str(k) for k in keywords]) if isinstance(keywords, list) else str(keywords) related_queries_top = (trends_result.get("related_queries") or {}).get("top", []) related_topics_top = (trends_result.get("related_topics") or {}).get("top", []) text_content = ( f"Market Trends run for {geo} ({timeframe}). Keywords: {keywords_text}. " f"Related queries top: {len(related_queries_top)}. Related topics top: {len(related_topics_top)}." ) base_metadata = { "type": "market_trends", "user_id": self.user_id, "run_id": run_id, "run_timestamp": trends_result.get("timestamp") or datetime.utcnow().isoformat(), "timeframe": timeframe, "geo": geo, "keywords": keywords if isinstance(keywords, list) else [keywords_text], "full_report": trends_result, } await self.intelligence_service.index_content( [ (latest_id, f"LATEST {text_content}", {**base_metadata, "is_latest": True}), (run_doc_id, text_content, {**base_metadata, "is_latest": False}), ] ) return True except Exception as e: logger.error(f"Failed to index market trends run: {e}") return False async def sync_content_strategy_dashboard_to_sif(self, db=None) -> bool: close_db = False try: if db is None: db = get_session_for_user(self.user_id) close_db = True if not db: return False items_to_index = [] try: from sqlalchemy import select, desc from models.enhanced_strategy_models import EnhancedContentStrategy, EnhancedAIAnalysisResult stmt = ( select(EnhancedContentStrategy) .where(EnhancedContentStrategy.user_id == self.user_id) .order_by(desc(EnhancedContentStrategy.updated_at)) ) strategies = db.execute(stmt).scalars().all() if strategies: latest = strategies[0] latest_id = f"enhanced_strategy_latest:{self.user_id}" latest_text = f"Latest Content Strategy Dashboard snapshot. Name: {latest.name}. Industry: {latest.industry}." latest_meta = { "type": "enhanced_content_strategy", "user_id": self.user_id, "is_latest": True, "strategy_id": latest.id, "timestamp": (latest.updated_at or latest.created_at or datetime.utcnow()).isoformat(), "full_report": latest.to_dict() if hasattr(latest, "to_dict") else {}, } items_to_index.append((latest_id, latest_text, latest_meta)) for st in strategies[:25]: ts = (st.updated_at or st.created_at or datetime.utcnow()).isoformat() run_doc_id = f"enhanced_strategy_run:{self.user_id}:{st.id}:{ts}" text = f"Content Strategy Dashboard snapshot. Name: {st.name}. Industry: {st.industry}. " if st.market_gaps: text += f"Market gaps: {str(st.market_gaps)[:300]}. " if st.emerging_trends: text += f"Emerging trends: {str(st.emerging_trends)[:300]}. " if st.industry_trends: text += f"Industry trends: {str(st.industry_trends)[:300]}. " meta = { "type": "enhanced_content_strategy", "user_id": self.user_id, "is_latest": False, "strategy_id": st.id, "timestamp": ts, "full_report": st.to_dict() if hasattr(st, "to_dict") else {}, } items_to_index.append((run_doc_id, text, meta)) stmt_ai = ( select(EnhancedAIAnalysisResult) .where(EnhancedAIAnalysisResult.user_id == self.user_id) .order_by(desc(EnhancedAIAnalysisResult.updated_at)) ) ai_results = db.execute(stmt_ai).scalars().all() if ai_results: latest_ai = ai_results[0] latest_ai_id = f"enhanced_ai_latest:{self.user_id}" ts_ai = (latest_ai.updated_at or latest_ai.created_at or datetime.utcnow()).isoformat() text_ai = f"Latest strategic intelligence. analysis_type: {latest_ai.analysis_type}. " meta_ai = { "type": "enhanced_ai_analysis", "user_id": self.user_id, "is_latest": True, "analysis_id": latest_ai.id, "analysis_type": latest_ai.analysis_type, "timestamp": ts_ai, "full_report": latest_ai.to_dict() if hasattr(latest_ai, "to_dict") else {}, } items_to_index.append((latest_ai_id, text_ai, meta_ai)) for r in ai_results[:50]: ts_ai = (r.updated_at or r.created_at or datetime.utcnow()).isoformat() run_ai_id = f"enhanced_ai_run:{self.user_id}:{r.id}:{ts_ai}" text_ai = f"Strategic intelligence run. analysis_type: {r.analysis_type}. " meta_ai = { "type": "enhanced_ai_analysis", "user_id": self.user_id, "is_latest": False, "analysis_id": r.id, "analysis_type": r.analysis_type, "timestamp": ts_ai, "full_report": r.to_dict() if hasattr(r, "to_dict") else {}, } items_to_index.append((run_ai_id, text_ai, meta_ai)) except Exception as e: logger.warning(f"Failed to embed enhanced content strategy dashboard data: {e}") try: from sqlalchemy import select, desc from models.content_planning import ContentGapAnalysis stmt_gap = ( select(ContentGapAnalysis) .where(ContentGapAnalysis.user_id == self.user_id) .order_by(desc(ContentGapAnalysis.updated_at)) ) gaps = db.execute(stmt_gap).scalars().all() if gaps: latest_gap = gaps[0] latest_gap_id = f"content_gap_latest:{self.user_id}" ts_gap = (latest_gap.updated_at or latest_gap.created_at or datetime.utcnow()).isoformat() text_gap = f"Latest Content Gap Analysis for {latest_gap.website_url}. " meta_gap = { "type": "content_gap_analysis", "user_id": self.user_id, "is_latest": True, "gap_id": latest_gap.id, "website_url": latest_gap.website_url, "timestamp": ts_gap, "full_report": latest_gap.to_dict() if hasattr(latest_gap, "to_dict") else {}, } items_to_index.append((latest_gap_id, text_gap, meta_gap)) for g in gaps[:25]: ts_gap = (g.updated_at or g.created_at or datetime.utcnow()).isoformat() run_gap_id = f"content_gap_run:{self.user_id}:{g.id}:{ts_gap}" text_gap = f"Content Gap Analysis for {g.website_url}. " if g.target_keywords: text_gap += f"Target keywords: {str(g.target_keywords)[:300]}. " meta_gap = { "type": "content_gap_analysis", "user_id": self.user_id, "is_latest": False, "gap_id": g.id, "website_url": g.website_url, "timestamp": ts_gap, "full_report": g.to_dict() if hasattr(g, "to_dict") else {}, } items_to_index.append((run_gap_id, text_gap, meta_gap)) except Exception as e: logger.warning(f"Failed to embed content gap analysis data: {e}") if items_to_index: await self.intelligence_service.index_content(items_to_index) return True return False except Exception as e: logger.error(f"Failed to sync content strategy dashboard to SIF: {e}") return False finally: if close_db and db: db.close() async def sync_onboarding_data_to_sif(self): """ Embeds existing onboarding data (WebsiteAnalysis, CompetitorAnalysis) into the SIF index. This ensures agents can query this data semantically without direct DB access. """ try: logger.info(f"Syncing onboarding data to SIF for user {self.user_id}") db = get_session_for_user(self.user_id) if not db: return False items_to_index = [] # 1. Fetch Website Analysis stmt = ( select(WebsiteAnalysis) .join(OnboardingSession, WebsiteAnalysis.session_id == OnboardingSession.id) .where(OnboardingSession.user_id == self.user_id) .order_by(desc(WebsiteAnalysis.created_at)) ) website_analyses = db.execute(stmt).scalars().all() for analysis in website_analyses: # Create a rich text representation for semantic search text_content = f"Website Analysis for {analysis.website_url}. " if analysis.brand_analysis: text_content += f"Brand Voice: {analysis.brand_analysis.get('brand_voice', 'Unknown')}. " if analysis.seo_audit: issues = analysis.seo_audit.get('technical_issues', []) issue_summary = ", ".join([i.get('type', '') for i in issues[:5]]) text_content += f"SEO Issues: {issue_summary}. " if analysis.social_media_presence: social = analysis.social_media_presence platforms = ", ".join(social.keys()) if isinstance(social, dict) else "Unknown" text_content += f"Social Platforms: {platforms}. " # Metadata stores the structured data for retrieval metadata = { "type": "website_analysis", "url": analysis.website_url, "timestamp": analysis.created_at.isoformat() if analysis.created_at else datetime.utcnow().isoformat(), "full_report": analysis.to_dict() } items_to_index.append((f"wa_{analysis.id}", text_content, metadata)) # 2. Fetch Competitor Analysis stmt_comp = ( select(CompetitorAnalysis) .join(OnboardingSession, CompetitorAnalysis.session_id == OnboardingSession.id) .where(OnboardingSession.user_id == self.user_id) ) competitor_analyses = db.execute(stmt_comp).scalars().all() for comp in competitor_analyses: text_content = f"Competitor Analysis for {comp.competitor_url}. " if comp.analysis_data: text_content += f"Summary: {comp.analysis_data.get('summary', '')[:200]}... " metadata = { "type": "competitor_analysis", "url": comp.competitor_url, "timestamp": comp.created_at.isoformat() if comp.created_at else datetime.utcnow().isoformat(), "full_report": comp.analysis_data } items_to_index.append((f"ca_{comp.id}", text_content, metadata)) # Index content if items_to_index: await self.intelligence_service.index_content(items_to_index) logger.info(f"Successfully synced {len(items_to_index)} onboarding items to SIF") try: await self.sync_content_strategy_dashboard_to_sif(db=db) except Exception: pass return True else: logger.info("No onboarding data found to sync") return False except Exception as e: logger.error(f"Failed to sync onboarding data to SIF: {e}") return False finally: if db: db.close() async def sync_seo_dashboard_to_sif(self): """ Embeds SEO Dashboard data (GSC/Bing metrics) into the SIF index. """ try: logger.info(f"Syncing SEO Dashboard data to SIF for user {self.user_id}") db = get_session_for_user(self.user_id) if not db: return False from services.seo.dashboard_service import SEODashboardService dashboard_service = SEODashboardService(db) # Fetch aggregated dashboard data dashboard_data = await dashboard_service.get_dashboard_overview(self.user_id) items_to_index = [] # Create rich text representation site_url = dashboard_data.get('website_url', 'Unknown') summary = dashboard_data.get('summary', {}) health = dashboard_data.get('health_score', {}) text_content = f"SEO Dashboard Analysis for {site_url}. " text_content += f"Health Score: {health.get('score', 0)} ({health.get('label', 'Unknown')}). " text_content += f"Total Clicks: {summary.get('clicks', 0)}, Impressions: {summary.get('impressions', 0)}. " text_content += f"CTR: {summary.get('ctr', 0):.1%}, Avg Position: {summary.get('position', 0):.1f}. " # Add AI insights to text ai_insights = dashboard_data.get('ai_insights', []) if ai_insights: insights_text = " ".join([i.get('text', '') for i in ai_insights]) text_content += f"Insights: {insights_text} " # Add Competitor Insights comp_insights = dashboard_data.get('competitor_insights', {}) if comp_insights: opp_score = comp_insights.get('opportunity_score', 0) text_content += f"Competitive Opportunity Score: {opp_score}%. " gaps = comp_insights.get('content_gaps', []) if gaps: text_content += f"Content Gaps: {', '.join(gaps[:5])}. " # Add Advertools Insights adv_insights = dashboard_data.get('advertools_insights', {}) if adv_insights: themes = adv_insights.get('augmented_themes', []) if themes: text_content += f"Augmented Themes: {', '.join(themes[:5])}. " # Add Technical SEO overview tech_audit = dashboard_data.get('technical_seo_audit', {}) if tech_audit: text_content += f"Technical Audit: {tech_audit.get('pages_audited', 0)} pages audited. " text_content += f"Avg Score: {tech_audit.get('avg_score', 0)}. " if tech_audit.get('worst_pages'): worst = ", ".join([p.get('page_url', '') for p in tech_audit.get('worst_pages', [])[:3]]) text_content += f"Worst Pages: {worst}. " metadata = { "type": "seo_dashboard", "url": site_url, "timestamp": datetime.utcnow().isoformat(), "full_report": dashboard_data } items_to_index.append((f"seo_dash_{self.user_id}", text_content, metadata)) if items_to_index: await self.intelligence_service.index_content(items_to_index) logger.info(f"Successfully synced SEO Dashboard data to SIF") return True return False except Exception as e: logger.error(f"Failed to sync SEO Dashboard data: {e}") return False finally: if db: db.close() async def sync_user_website_content(self, website_url: str) -> bool: """ Harvests and indexes user website content using incremental upsert strategy. This ensures that: 1. New content is added to the index. 2. Existing content is updated (refreshed). 3. Only recent/relevant pages are processed (snapshot approach). """ try: logger.info(f"Syncing user website content for {website_url} (User: {self.user_id})") # 1. Harvest content (Limit to 50 pages for snapshot) # Use 'limit' to act as a snapshot, assuming harvester fetches most relevant/recent harvested_pages = await self.harvester.harvest_website(website_url, limit=50) if not harvested_pages: logger.warning(f"No content harvested from {website_url}") return False logger.info(f"Harvested {len(harvested_pages)} pages from {website_url}") # 2. Prepare items for indexing (Upsert Strategy) # Using URL as the unique ID ensures updates overwrite existing entries items_to_index = [] for page in harvested_pages: url = page.get("url") if not url: continue # Rich text content text_content = page.get("content", "") title = page.get("title", "") # Metadata metadata = { "type": "user_content", "url": url, "title": title, "source": "user_website", "crawled_at": datetime.utcnow().isoformat(), "full_report": { "url": url, "title": title, "snippet": text_content[:200] } } # ID format: "user_content_{url_hash}" or just URL if safe? # Txtai usually handles string IDs. Let's use a consistent prefix. # But wait, existing logic in SIFOnboardingIntegration uses URL as ID? # "user_items = [(page['url'], ...)]" # Yes, it uses URL directly. items_to_index.append((url, text_content, metadata)) # 3. Index (Upsert) if items_to_index: await self.intelligence_service.index_content(items_to_index) logger.info(f"Successfully synced {len(items_to_index)} pages to SIF index") return True return False except Exception as e: logger.error(f"Failed to sync user website content: {e}") return False async def get_seo_dashboard_context(self) -> Dict[str, Any]: """ Retrieve SEO Dashboard context from SIF (txtai index). If not found, triggers a sync and tries again. """ try: logger.info(f"Retrieving SEO Dashboard context via SIF for user {self.user_id}") # 1. Construct semantic query query = "seo dashboard analysis health score clicks" # 2. Search SIF results = await self.intelligence_service.search(query, limit=5) # 3. Filter for valid dashboard objects valid_result = None if results: for res in results: try: metadata_str = res.get('object') metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res) if metadata.get('type') == 'seo_dashboard': valid_result = metadata.get('full_report') break except Exception as parse_err: continue if valid_result: logger.info("Found SEO Dashboard context in SIF index") return { "dashboard_data": valid_result, "source": "sif_index" } # 4. If not found, Sync and Retry logger.info("SEO Dashboard context not found in SIF. Triggering sync...") synced = await self.sync_seo_dashboard_to_sif() if synced: results_retry = await self.intelligence_service.search(query, limit=5) if results_retry: for res in results_retry: try: metadata_str = res.get('object') metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res) if metadata.get('type') == 'seo_dashboard': valid_result = metadata.get('full_report') return { "dashboard_data": valid_result, "source": "sif_index_after_sync" } except: continue logger.warning("No SEO Dashboard data found in SIF even after sync.") return { "error": "No SEO Dashboard data found.", "source": "empty" } except Exception as e: logger.error(f"Failed to get SEO Dashboard context via SIF: {e}") return {"error": str(e)} async def get_seo_context(self, website_url: Optional[str] = None) -> Dict[str, Any]: """ Retrieve existing SEO context from SIF (txtai index). If not found, triggers a sync from DB and tries again. """ try: logger.info(f"Retrieving SEO context via SIF for user {self.user_id}") # 1. Construct semantic query query = f"website analysis seo audit {website_url if website_url else ''}" # 2. Search SIF results = await self.intelligence_service.search(query, limit=5) # 3. Filter for valid website analysis objects valid_result = None if results: for res in results: # txtai returns metadata in the result object directly if objects=True # Structure: {'id': '...', 'score': ..., 'text': '...', 'metadata': {...}} # Note: txtai_service.py search returns results. # If objects=True in embeddings, result is dict with metadata fields merged or in 'metadata'? # Let's check txtai_service.py implementation of search. # It calls self.embeddings.search(query, limit). # With objects=True, it usually returns list of dicts. # We check if the result is of type 'website_analysis' and matches URL if provided # Since we serialized metadata to JSON string in index_content, we might need to parse it back? # txtai_service.py: "metadata_json = json.dumps(metadata) ... processed_items.append((id, text, metadata_json))" # So the stored object IS the JSON string. try: # txtai might return the object as the 'object' field or merge it. # Let's assume standard txtai behavior: # If we indexed (id, text, object), search returns {'id': id, 'score': score, 'text': text, ...object_fields...} # OR if object was a string, it might be in 'object' field. # In txtai_service.py, we did: processed_items.append((id_val, text, metadata_json)) # So 'object' is a JSON string. metadata_str = res.get('object') # or it might be unpacked if it was a dict, but we stored string. if not metadata_str and 'type' in res: # Maybe it unpacks automatically? # If we stored a string, it is likely in 'object'. pass if metadata_str: if isinstance(metadata_str, str): metadata = json.loads(metadata_str) else: metadata = metadata_str # Already dict? else: # Fallback: maybe the dict keys are merged into res? metadata = res if metadata.get('type') == 'website_analysis': if website_url and website_url not in metadata.get('url', ''): continue # URL mismatch valid_result = metadata.get('full_report') break except Exception as parse_err: logger.warning(f"Failed to parse SIF result metadata: {parse_err}") continue if valid_result: logger.info(f"Found SEO context in SIF index for {valid_result.get('website_url')}") return { "website_url": valid_result.get('website_url'), "seo_audit": valid_result.get('seo_audit') or {}, "crawl_result": valid_result.get('crawl_result') or {}, "sitemap_analysis": valid_result.get('crawl_result', {}).get('sitemap_analysis', {}) if valid_result.get('crawl_result') else {}, "pagespeed_data": valid_result.get('crawl_result', {}).get('pagespeed', {}) if valid_result.get('crawl_result') else {}, "analysis_date": valid_result.get('analysis_date'), "source": "sif_index" } # 4. If not found, Sync and Retry (Lazy Embedding) logger.info("SEO context not found in SIF. Triggering DB sync...") synced = await self.sync_onboarding_data_to_sif() if synced: # Retry search once results_retry = await self.intelligence_service.search(query, limit=5) if results_retry: for res in results_retry: try: metadata_str = res.get('object') metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res) if metadata.get('type') == 'website_analysis': if website_url and website_url not in metadata.get('url', ''): continue valid_result = metadata.get('full_report') return { "website_url": valid_result.get('website_url'), "seo_audit": valid_result.get('seo_audit') or {}, "crawl_result": valid_result.get('crawl_result') or {}, "sitemap_analysis": valid_result.get('crawl_result', {}).get('sitemap_analysis', {}) if valid_result.get('crawl_result') else {}, "pagespeed_data": valid_result.get('crawl_result', {}).get('pagespeed', {}) if valid_result.get('crawl_result') else {}, "analysis_date": valid_result.get('analysis_date'), "source": "sif_index_after_sync" } except: continue logger.warning("No SEO data found in SIF even after sync.") return { "error": "No SEO data found. Please complete onboarding.", "source": "empty" } except Exception as e: logger.error(f"Failed to get SEO context via SIF: {e}") return {"error": str(e)} async def track_agent_failure(self, agent_id: str, error: Exception, context: Dict[str, Any]): """ Tracks agent failures to identify root causes and patterns. """ try: error_type = type(error).__name__ error_message = str(error) timestamp = datetime.utcnow().isoformat() # Categorize error category = "unknown" if "context window" in error_message.lower() or "token limit" in error_message.lower(): category = "context_window_exceeded" elif "timeout" in error_message.lower(): category = "timeout" elif "rate limit" in error_message.lower(): category = "rate_limit" elif "parse" in error_message.lower() or "json" in error_message.lower(): category = "parsing_error" elif "safety" in error_message.lower(): category = "safety_violation" elif "tool" in error_message.lower(): category = "tool_execution_failed" failure_record = { "agent_id": agent_id, "error_type": error_type, "error_message": error_message, "category": category, "context": context, "timestamp": timestamp } logger.error(f"Agent Failure Tracked: {agent_id} - {category} - {error_message}") # Index failure for semantic analysis (optional, but useful for 'why failed?') text_content = f"Agent Failure: {agent_id} encountered {category}. Error: {error_message}." metadata = { "type": "agent_failure_log", "agent_id": agent_id, "category": category, "timestamp": timestamp, "full_report": failure_record } # Fire and forget indexing to avoid blocking asyncio.create_task(self.intelligence_service.index_content([(f"fail_{agent_id}_{timestamp}", text_content, metadata)])) try: from services.database import get_session_for_user from services.agent_activity_service import AgentActivityService db = get_session_for_user(self.user_id) if db: service = AgentActivityService(db, self.user_id) service.create_alert( alert_type="agent_failure", title=f"Agent failure: {category}", message=error_message[:2000], severity="error" if category in {"timeout", "context_window_exceeded", "tool_execution_failed", "safety_violation"} else "warning", payload=failure_record, cta_path="/content-planning", ) db.close() except Exception: pass return failure_record except Exception as e: logger.error(f"Failed to track agent failure: {e}") async def get_agent_failure_analysis(self, time_window_hours: int = 24) -> Dict[str, Any]: """ Analyzes recent agent failures to provide insights. """ try: # Search for failure logs query = "agent failure error" results = await self.intelligence_service.search(query, limit=50) failures = [] if results: for res in results: try: metadata_str = res.get('object') metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res) if metadata.get('type') == 'agent_failure_log': failures.append(metadata.get('full_report')) except: continue # Aggregate stats categories = {} for f in failures: cat = f.get('category', 'unknown') categories[cat] = categories.get(cat, 0) + 1 return { "total_failures": len(failures), "breakdown": categories, "recent_failures": failures[:5] } except Exception as e: logger.error(f"Failed to analyze agent failures: {e}") return {"error": str(e)} async def get_competitor_context(self, competitor_url: Optional[str] = None) -> Dict[str, Any]: """ Retrieve existing Competitor context from SIF (txtai index). If not found, triggers a sync from DB and tries again. """ try: logger.info(f"Retrieving Competitor context via SIF for user {self.user_id}") # 1. Construct semantic query query = f"competitor analysis {competitor_url if competitor_url else ''}" # 2. Search SIF results = await self.intelligence_service.search(query, limit=5) # 3. Filter for valid competitor analysis objects valid_results = [] if results: for res in results: try: metadata_str = res.get('object') metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res) if metadata.get('type') == 'competitor_analysis': if competitor_url and competitor_url not in metadata.get('url', ''): continue valid_results.append(metadata.get('full_report')) except Exception as parse_err: continue if valid_results: logger.info(f"Found {len(valid_results)} competitor contexts in SIF index") return { "competitors": valid_results, "source": "sif_index" } # 4. If not found, Sync and Retry logger.info("Competitor context not found in SIF. Triggering DB sync...") synced = await self.sync_onboarding_data_to_sif() if synced: results_retry = await self.intelligence_service.search(query, limit=5) if results_retry: for res in results_retry: try: metadata_str = res.get('object') metadata = json.loads(metadata_str) if isinstance(metadata_str, str) else (metadata_str or res) if metadata.get('type') == 'competitor_analysis': if competitor_url and competitor_url not in metadata.get('url', ''): continue valid_results.append(metadata.get('full_report')) except: continue if valid_results: return { "competitors": valid_results, "source": "sif_index_after_sync" } logger.warning("No Competitor data found in SIF even after sync.") return { "error": "No Competitor data found. Please complete onboarding.", "source": "empty" } except Exception as e: logger.error(f"Failed to get Competitor context via SIF: {e}") return {"error": str(e)} async def get_semantic_insights(self, website_data: Dict[str, Any]) -> Dict[str, Any]: """ Get semantic insights with intelligent caching. Args: website_data: User website analysis data Returns: Semantic insights with caching metadata """ try: logger.info(f"Getting semantic insights for user {self.user_id}") # Check cache first if self.enable_caching and self.cache_manager: cached_insights = self.cache_manager.get_cached_semantic_insights( user_id=self.user_id, force_refresh=False ) if cached_insights: logger.info("Returning cached semantic insights") return { "insights": cached_insights, "source": "cache", "cached_at": cached_insights.get("timestamp", "unknown"), "cache_hit": True } # Generate new insights if cache miss or caching disabled logger.info("Generating new semantic insights") # Perform semantic analysis insights = await self._generate_semantic_insights(website_data) # Cache the results if self.enable_caching and self.cache_manager: self.cache_manager.cache_semantic_insights( user_id=self.user_id, insights=insights, ttl=3600, # 1 hour TTL metadata={ "generated_at": datetime.now().isoformat(), "website_data_hash": hash(str(website_data)), "analysis_version": "v2.0" } ) logger.info("Cached new semantic insights") return { "insights": insights, "source": "analysis", "generated_at": datetime.now().isoformat(), "cache_hit": False } except Exception as e: logger.error(f"Failed to get semantic insights: {e}") return { "insights": {}, "error": str(e), "source": "error" } async def _generate_semantic_insights(self, website_data: Dict[str, Any]) -> Dict[str, Any]: """Generate semantic insights using multiple analysis methods.""" try: insights = { "user_id": self.user_id, "timestamp": datetime.now().isoformat(), "analysis_version": "v2.0" } # Content pillar analysis if self.intelligence_service.is_initialized(): clusters = await self.intelligence_service.cluster(min_score=0.6) if asyncio.iscoroutine(clusters): clusters = await clusters insights["content_pillars"] = self._format_clusters_as_pillars(clusters) # Semantic gaps analysis gaps = await self._identify_semantic_gaps(website_data) insights["semantic_gaps"] = gaps # Competitor comparison competitor_analysis = await self._analyze_competitor_semantics(website_data) insights["competitor_analysis"] = competitor_analysis # Strategic recommendations (lazy initialization to avoid circular imports) if not self.strategy_agent: from .sif_agents import StrategyArchitectAgent self.strategy_agent = StrategyArchitectAgent(self.intelligence_service, user_id=self.user_id) recommendations = await self.strategy_agent.analyze_content_strategy(website_data) insights["strategic_recommendations"] = recommendations # Content quality assessment (lazy initialization to avoid circular imports) if not self.guardian_agent: from .sif_agents import ContentGuardianAgent self.guardian_agent = ContentGuardianAgent(self.intelligence_service, user_id=self.user_id, sif_service=self) quality_score = await self.guardian_agent.assess_content_quality(website_data) insights["content_quality"] = quality_score return insights except Exception as e: logger.error(f"Failed to generate semantic insights: {e}") return {"error": str(e)} def _format_clusters_as_pillars(self, clusters: List[List[int]]) -> List[Dict[str, Any]]: """Format clustering results as content pillars.""" pillars = [] for i, cluster in enumerate(clusters): if cluster: # Only include non-empty clusters pillar = { "pillar_id": f"pillar_{i}", "size": len(cluster), "relevance_score": 0.8, # Placeholder - would be calculated "key_topics": [f"topic_{j}" for j in range(min(5, len(cluster)))], "competitor_coverage": 0.6, # Placeholder "user_coverage": 0.4 # Placeholder } pillars.append(pillar) return pillars async def _identify_semantic_gaps(self, website_data: Dict[str, Any]) -> List[Dict[str, Any]]: """Identify semantic gaps using StrategyArchitectAgent evidence-driven analysis.""" try: if not self.strategy_agent: from .sif_agents import StrategyArchitectAgent self.strategy_agent = StrategyArchitectAgent(self.intelligence_service, user_id=self.user_id) competitor_ids = website_data.get("competitor_indices", []) or [] gaps = await self.strategy_agent.find_semantic_gaps(competitor_indices=competitor_ids) normalized_gaps = [] for gap in gaps: density = gap.get("topic_density", {}) normalized_gaps.append({ "topic": gap.get("topic"), "priority": gap.get("priority", "medium"), "reason": gap.get("reason", "Competitor coverage gap"), "confidence": gap.get("confidence", 0.0), "current_coverage_score": density.get("user", 0.0), "competitor_coverage_score": density.get("competitor", 0.0), "gap_severity": gap.get("priority", "medium"), "suggested_action": f"Create dedicated content for '{gap.get('topic', 'this topic')}'", "topic_density": density, "evidence": gap.get("evidence", {}) }) return normalized_gaps except Exception as e: logger.error(f"Error identifying semantic gaps: {e}") return [] async def _analyze_competitor_semantics(self, website_data: Dict[str, Any]) -> Dict[str, Any]: """Analyze competitor semantic positioning.""" # This would perform actual competitor analysis return { "total_competitors_analyzed": 5, "semantic_overlap": 0.65, "unique_positioning": ["AI-powered content", "Data-driven insights"], "competitive_advantages": ["Technical depth", "Industry expertise"], "threats": ["Large competitor budgets", "Established brand presence"] } def get_cache_performance_stats(self) -> Optional[Dict[str, Any]]: """Get cache performance statistics.""" if not self.enable_caching or not self.cache_manager: return None try: stats = self.cache_manager.get_cache_stats() return { "hit_rate": stats.hit_rate, "total_hits": stats.total_hits, "total_misses": stats.total_misses, "cache_size": stats.cache_size, "memory_usage_mb": stats.memory_usage_mb, "average_hit_time_ms": stats.average_hit_time_ms, "total_invalidations": stats.total_invalidations } except Exception as e: logger.error(f"Failed to get cache stats: {e}") return None async def invalidate_user_cache(self, reason: str = "user_request") -> bool: """Invalidate cache for the current user.""" try: if self.enable_caching and self.cache_manager: self.cache_manager.invalidate_user_cache(self.user_id) logger.info(f"Invalidated cache for user {self.user_id}. Reason: {reason}") return True return False except Exception as e: logger.error(f"Failed to invalidate user cache: {e}") return False async def warm_user_cache(self, common_queries: List[str]) -> bool: """Pre-populate cache with common queries for the user.""" try: if self.enable_caching and self.cache_manager: self.cache_manager.warm_cache_for_user(self.user_id, common_queries) logger.info(f"Warmed cache for user {self.user_id} with {len(common_queries)} queries") return True return False except Exception as e: logger.error(f"Failed to warm user cache: {e}") return False # Integration with existing API endpoints class SIFIntegrationAPI: """API wrapper for SIF operations with caching integration.""" def __init__(self): self.services: Dict[str, SIFIntegrationService] = {} def get_service(self, user_id: str) -> SIFIntegrationService: """Get or create SIF service for a user.""" if user_id not in self.services: self.services[user_id] = SIFIntegrationService(user_id) return self.services[user_id] async def get_semantic_insights_with_cache(self, user_id: str, website_data: Dict[str, Any]) -> Dict[str, Any]: """Get semantic insights with caching metadata.""" service = self.get_service(user_id) return await service.get_semantic_insights(website_data) async def get_cache_performance(self, user_id: str) -> Dict[str, Any]: """Get cache performance metrics for a user.""" service = self.get_service(user_id) stats = service.get_cache_performance_stats() return { "user_id": user_id, "cache_enabled": stats is not None, "performance": stats or {}, "timestamp": datetime.now().isoformat() } async def invalidate_user_cache(self, user_id: str, reason: str = "api_request") -> Dict[str, Any]: """Invalidate cache for a specific user.""" service = self.get_service(user_id) success = await service.invalidate_user_cache(reason) return { "user_id": user_id, "success": success, "reason": reason, "timestamp": datetime.now().isoformat() } # Global API instance sif_integration_api = SIFIntegrationAPI() # Example usage and testing async def test_sif_integration_service(): """Test the SIF integration service with caching.""" logger.info("Testing SIF Integration Service with Caching") # Create test service user_id = "test_user_123" service = SIFIntegrationService(user_id, enable_caching=True) # Test data website_data = { "url": "https://example.com", "content": [ {"title": "SEO Best Practices", "content": "Learn about search engine optimization..."}, {"title": "Content Marketing", "content": "Discover content marketing strategies..."} ], "competitors": [ {"url": "https://competitor1.com", "name": "Competitor 1"}, {"url": "https://competitor2.com", "name": "Competitor 2"} ] } # First call - should generate new insights logger.info("First call (cache miss expected):") result1 = await service.get_semantic_insights(website_data) logger.info(f"Source: {result1.get('source')}") logger.info(f"Cache hit: {result1.get('cache_hit')}") # Second call - should hit cache logger.info("\nSecond call (cache hit expected):") result2 = await service.get_semantic_insights(website_data) logger.info(f"Source: {result2.get('source')}") logger.info(f"Cache hit: {result2.get('cache_hit')}") # Get cache performance stats logger.info("\nCache Performance Stats:") stats = service.get_cache_performance_stats() if stats: logger.info(f"Hit rate: {stats['hit_rate']:.2%}") logger.info(f"Total hits: {stats['total_hits']}") logger.info(f"Total misses: {stats['total_misses']}") logger.info(f"Memory usage: {stats['memory_usage_mb']:.2f} MB") logger.info("SIF Integration Service test completed successfully!") if __name__ == "__main__": # Run test asyncio.run(test_sif_integration_service())