Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions
--- a/backend/services/intelligence/harvester.py
+++ b/backend/services/intelligence/harvester.py
@@ -0,0 +1,145 @@
+"""
+Semantic Harvester Service
+Handles deep content acquisition using Exa AI.
+Prioritizes Exa for scale (hundreds of URLs) to avoid IP bans.
+"""
+
+import traceback
+from datetime import datetime
+from typing import List, Dict, Any, Optional
+from loguru import logger
+from services.research.exa_service import ExaService
+
+class SemanticHarvesterService:
+    def __init__(self, api_key: Optional[str] = None):
+        self.exa_service = ExaService()
+        self._harvest_stats = {
+            "total_urls_processed": 0,
+            "successful_extractions": 0,
+            "failed_extractions": 0,
+            "last_harvest_time": None
+        }
+
+    async def harvest_website(self, website_url: str, limit: int = 100) -> List[Dict[str, Any]]:
+        """
+        Deep crawl a website using Exa AI.
+        
+        Args:
+            website_url: The root URL to crawl.
+            limit: Maximum number of pages to retrieve.
+            
+        Returns:
+            List of pages with content and metadata.
+        """
+        logger.info(f"[SemanticHarvester] Starting harvest for {website_url} (Limit: {limit})")
+        
+        try:
+            # Validate input
+            if not website_url or not website_url.strip():
+                logger.error(f"[SemanticHarvester] Invalid website URL provided: {website_url}")
+                return []
+            
+            # Normalize URL
+            website_url = website_url.strip()
+            if not website_url.startswith(('http://', 'https://')):
+                website_url = f"https://{website_url}"
+                logger.debug(f"[SemanticHarvester] Normalized URL to: {website_url}")
+            
+            logger.debug(f"[SemanticHarvester] Processing domain: {website_url}")
+            
+            # Use ExaService to find similar contents (which effectively crawls the site if we search by domain)
+            # OR better: Use Exa's search with 'site:' operator or include_domains
+            
+            # Since ExaService.discover_competitors finds *similar* sites, we need a method to crawl *specific* site.
+            # Exa SDK supports searching within a domain.
+            
+            if not self.exa_service.enabled:
+                self.exa_service._try_initialize()
+                if not self.exa_service.enabled:
+                     logger.warning("[SemanticHarvester] Exa service disabled. Returning placeholder data.")
+                     return self._get_placeholder_data(website_url)
+
+            # Use Exa to search for all pages in this domain
+            search_response = self.exa_service.exa.search_and_contents(
+                query=f"site:{website_url}",
+                num_results=min(limit, 50), # Exa limit per request
+                text=True,
+                highlights=True
+            )
+            
+            results = []
+            if search_response and hasattr(search_response, 'results'):
+                for result in search_response.results:
+                    results.append({
+                        "url": getattr(result, 'url', ''),
+                        "title": getattr(result, 'title', ''),
+                        "content": getattr(result, 'text', '') or getattr(result, 'summary', ''),
+                        "metadata": {
+                            "published_date": getattr(result, 'published_date', None),
+                            "author": getattr(result, 'author', None),
+                            "highlights": getattr(result, 'highlights', [])
+                        }
+                    })
+            
+            logger.info(f"[SemanticHarvester] Successfully harvested {len(results)} pages from {website_url}")
+            return results
+            
+        except Exception as e:
+            logger.error(f"[SemanticHarvester] Failed to harvest {website_url}: {e}")
+            logger.error(f"[SemanticHarvester] Full traceback: {traceback.format_exc()}")
+            return []
+
+    def _get_placeholder_data(self, website_url: str) -> List[Dict[str, Any]]:
+        """Return placeholder data for testing."""
+        return [
+            {
+                "url": f"{website_url}/page1",
+                "title": "Sample Page 1",
+                "content": "This is sample content from page 1",
+                "metadata": {"word_count": 100}
+            }
+        ]
+
+    async def harvest_competitors(self, competitor_urls: List[str], pages_per_competitor: int = 10) -> List[Dict[str, Any]]:
+        """Harvest content from multiple competitors with detailed logging."""
+        logger.info(f"[SemanticHarvester] Starting competitor harvest for {len(competitor_urls)} competitors")
+        
+        if not competitor_urls:
+            logger.warning("[SemanticHarvester] No competitor URLs provided")
+            return []
+        
+        all_content = []
+        successful_harvests = 0
+        failed_harvests = 0
+        
+        for i, url in enumerate(competitor_urls, 1):
+            try:
+                logger.debug(f"[SemanticHarvester] Processing competitor {i}/{len(competitor_urls)}: {url}")
+                content = await self.harvest_website(url, limit=pages_per_competitor)
+                
+                if content:
+                    all_content.extend(content)
+                    successful_harvests += 1
+                    logger.debug(f"[SemanticHarvester] Successfully harvested {len(content)} pages from {url}")
+                else:
+                    failed_harvests += 1
+                    logger.warning(f"[SemanticHarvester] No content harvested from {url}")
+                    
+            except Exception as e:
+                failed_harvests += 1
+                logger.error(f"[SemanticHarvester] Failed to harvest competitor {url}: {e}")
+        
+        # Update statistics
+        self._harvest_stats["total_urls_processed"] += len(competitor_urls)
+        self._harvest_stats["successful_extractions"] += successful_harvests
+        self._harvest_stats["failed_extractions"] += failed_harvests
+        self._harvest_stats["last_harvest_time"] = datetime.now().isoformat()
+        
+        logger.info(f"[SemanticHarvester] Competitor harvest completed: {successful_harvests} successful, {failed_harvests} failed")
+        logger.info(f"[SemanticHarvester] Total content pieces harvested: {len(all_content)}")
+        
+        return all_content
+    
+    def get_harvest_stats(self) -> Dict[str, Any]:
+        """Get statistics about harvesting operations."""
+        return self._harvest_stats.copy()