AI Analysis and Content Strategy fixes. Enhanced Strategy Routes refactoring.

2026-01-10 19:32:50 +05:30
parent 0b63ae7fc1
commit 8193cdba67
298 changed files with 45678 additions and 10952 deletions
--- a/backend/services/blog_writer/README.md
+++ b/backend/services/blog_writer/README.md
@@ -35,7 +35,7 @@ blog_writer/
 - Delegates to specialized modules for specific functionality

 ### Research Module (`research/`)
- **`ResearchService`**: Orchestrates comprehensive research using Google Search grounding
+- **`ResearchService`**: Orchestrates comprehensive research using Exa neural search (currently Exa-only for testing)
 - **`KeywordAnalyzer`**: AI-powered keyword analysis and extraction
 - **`CompetitorAnalyzer`**: Competitor intelligence and market analysis
 - **`ContentAngleGenerator`**: Strategic content angle discovery
--- a/backend/services/blog_writer/research/init.py
+++ b/backend/services/blog_writer/research/init.py
@@ -2,10 +2,12 @@
 Research module for AI Blog Writer.

 This module handles all research-related functionality including:
- Google Search grounding integration
+- Exa neural search integration (currently the only provider, for testing)
 - Keyword analysis and competitor research
 - Content angle discovery
 - Research caching and optimization
+
+Note: Currently Exa-only for testing. Google Search grounding code preserved for future use.
 """

 from .research_service import ResearchService
--- a/backend/services/blog_writer/research/exa_provider.py
+++ b/backend/services/blog_writer/research/exa_provider.py
@@ -29,10 +29,15 @@ class ExaResearchProvider(BaseProvider):
        # Determine category: use exa_category if set, otherwise map from source_types
        category = config.exa_category if config.exa_category else self._map_source_type_to_category(config.source_types)
        
+        # Use exa_num_results if set; otherwise fall back to max_sources (capped at 25)
+        num_results = config.exa_num_results if hasattr(config, 'exa_num_results') and config.exa_num_results else min(config.max_sources, 25)
+        # Cap at 100 as per Exa API limits
+        num_results = min(num_results, 100)
+        
        # Build search kwargs - use correct Exa API format
        search_kwargs = {
            'type': config.exa_search_type or "auto",
-            'num_results': min(config.max_sources, 25),
+            'num_results': num_results,
            'text': {'max_characters': 1000},
            'summary': {'query': f"Key insights about {topic}"},
            'highlights': {
@@ -49,37 +54,133 @@ class ExaResearchProvider(BaseProvider):
        if config.exa_exclude_domains:
            search_kwargs['exclude_domains'] = config.exa_exclude_domains
        
+        # Add date filters if configured
+        if hasattr(config, 'exa_date_filter') and config.exa_date_filter:
+            search_kwargs['start_published_date'] = config.exa_date_filter
+        if hasattr(config, 'exa_end_published_date') and config.exa_end_published_date:
+            search_kwargs['end_published_date'] = config.exa_end_published_date
+        if hasattr(config, 'exa_start_crawl_date') and config.exa_start_crawl_date:
+            search_kwargs['start_crawl_date'] = config.exa_start_crawl_date
+        if hasattr(config, 'exa_end_crawl_date') and config.exa_end_crawl_date:
+            search_kwargs['end_crawl_date'] = config.exa_end_crawl_date
+        
+        # Add context if configured (supports boolean or object with maxCharacters)
+        if hasattr(config, 'exa_context') and config.exa_context is not None:
+            if config.exa_context:
+                if hasattr(config, 'exa_context_max_characters') and config.exa_context_max_characters:
+                    search_kwargs['context'] = {'maxCharacters': config.exa_context_max_characters}
+                else:
+                    search_kwargs['context'] = True
+            # If False, don't add context parameter (default behavior)
+        
+        # Add text filters if configured
+        if hasattr(config, 'exa_include_text') and config.exa_include_text:
+            search_kwargs['include_text'] = config.exa_include_text
+        if hasattr(config, 'exa_exclude_text') and config.exa_exclude_text:
+            search_kwargs['exclude_text'] = config.exa_exclude_text
+        
        logger.info(f"[Exa Research] Executing search: {query}")
        
        # Execute Exa search - pass contents parameters directly, not nested
        try:
+            # Build optional parameters dict
+            optional_params = {}
+            if category:
+                optional_params['category'] = category
+            if config.exa_include_domains:
+                optional_params['include_domains'] = config.exa_include_domains
+            if config.exa_exclude_domains:
+                optional_params['exclude_domains'] = config.exa_exclude_domains
+            if hasattr(config, 'exa_date_filter') and config.exa_date_filter:
+                optional_params['start_published_date'] = config.exa_date_filter
+            if hasattr(config, 'exa_end_published_date') and config.exa_end_published_date:
+                optional_params['end_published_date'] = config.exa_end_published_date
+            if hasattr(config, 'exa_start_crawl_date') and config.exa_start_crawl_date:
+                optional_params['start_crawl_date'] = config.exa_start_crawl_date
+            if hasattr(config, 'exa_end_crawl_date') and config.exa_end_crawl_date:
+                optional_params['end_crawl_date'] = config.exa_end_crawl_date
+            # Add context if configured (supports boolean or object with maxCharacters)
+            if hasattr(config, 'exa_context') and config.exa_context:
+                if hasattr(config, 'exa_context_max_characters') and config.exa_context_max_characters:
+                    optional_params['context'] = {'maxCharacters': config.exa_context_max_characters}
+                else:
+                    optional_params['context'] = True
+            
+            # Add text filters if configured
+            if hasattr(config, 'exa_include_text') and config.exa_include_text:
+                optional_params['include_text'] = config.exa_include_text
+            if hasattr(config, 'exa_exclude_text') and config.exa_exclude_text:
+                optional_params['exclude_text'] = config.exa_exclude_text
+            
+            # Add additional_queries for Deep search (only works with type="deep")
+            if config.exa_search_type == 'deep' and hasattr(config, 'exa_additional_queries') and config.exa_additional_queries:
+                optional_params['additional_queries'] = config.exa_additional_queries
+            
+            # Build contents parameters (text, summary, highlights)
+            text_params = {}
+            if hasattr(config, 'exa_text_max_characters') and config.exa_text_max_characters:
+                text_params['max_characters'] = config.exa_text_max_characters
+            else:
+                text_params['max_characters'] = 1000  # Default
+            
+            summary_params = {}
+            if hasattr(config, 'exa_summary_query') and config.exa_summary_query:
+                summary_params['query'] = config.exa_summary_query
+            else:
+                summary_params['query'] = f"Key insights about {topic}"  # Default
+            
+            highlights_params = {}
+            if hasattr(config, 'exa_highlights') and config.exa_highlights:
+                if hasattr(config, 'exa_highlights_num_sentences') and config.exa_highlights_num_sentences:
+                    highlights_params['num_sentences'] = config.exa_highlights_num_sentences
+                else:
+                    highlights_params['num_sentences'] = 2  # Default
+                
+                if hasattr(config, 'exa_highlights_per_url') and config.exa_highlights_per_url:
+                    highlights_params['highlights_per_url'] = config.exa_highlights_per_url
+                else:
+                    highlights_params['highlights_per_url'] = 3  # Default
+            
            results = self.exa.search_and_contents(
                query,
-                text={'max_characters': 1000},
-                summary={'query': f"Key insights about {topic}"},
-                highlights={'num_sentences': 2, 'highlights_per_url': 3},
+                text=text_params,
+                summary=summary_params,
+                highlights=highlights_params if highlights_params else None,
                type=config.exa_search_type or "auto",
-                num_results=min(config.max_sources, 25),
-                **({k: v for k, v in {
-                    'category': category,
-                    'include_domains': config.exa_include_domains,
-                    'exclude_domains': config.exa_exclude_domains
-                }.items() if v})
+                num_results=num_results,
+                **optional_params
            )
        except Exception as e:
            logger.error(f"[Exa Research] API call failed: {e}")
            # Try simpler call without contents if the above fails
            try:
                logger.info("[Exa Research] Retrying with simplified parameters")
+                # Build minimal optional parameters for retry
+                optional_params = {}
+                if category:
+                    optional_params['category'] = category
+                if config.exa_include_domains:
+                    optional_params['include_domains'] = config.exa_include_domains
+                if config.exa_exclude_domains:
+                    optional_params['exclude_domains'] = config.exa_exclude_domains
+                if hasattr(config, 'exa_date_filter') and config.exa_date_filter:
+                    optional_params['start_published_date'] = config.exa_date_filter
+                if hasattr(config, 'exa_end_published_date') and config.exa_end_published_date:
+                    optional_params['end_published_date'] = config.exa_end_published_date
+                if hasattr(config, 'exa_start_crawl_date') and config.exa_start_crawl_date:
+                    optional_params['start_crawl_date'] = config.exa_start_crawl_date
+                if hasattr(config, 'exa_end_crawl_date') and config.exa_end_crawl_date:
+                    optional_params['end_crawl_date'] = config.exa_end_crawl_date
+                
+                # Add additional_queries for Deep search (only works with type="deep")
+                if config.exa_search_type == 'deep' and hasattr(config, 'exa_additional_queries') and config.exa_additional_queries:
+                    optional_params['additional_queries'] = config.exa_additional_queries
+                
                results = self.exa.search_and_contents(
                    query,
                    type=config.exa_search_type or "auto",
-                    num_results=min(config.max_sources, 25),
-                    **({k: v for k, v in {
-                        'category': category,
-                        'include_domains': config.exa_include_domains,
-                        'exclude_domains': config.exa_exclude_domains
-                    }.items() if v})
+                    num_results=num_results,
+                    **optional_params
                )
            except Exception as retry_error:
                logger.error(f"[Exa Research] Retry also failed: {retry_error}")
--- a/backend/services/blog_writer/research/research_service.py
+++ b/backend/services/blog_writer/research/research_service.py
@@ -31,7 +31,11 @@ from .research_strategies import get_strategy_for_mode


 class ResearchService:
-    """Service for conducting comprehensive research using Google Search grounding."""
+    """Service for conducting comprehensive research using Exa neural search.
+    
+    Exa is currently the only supported provider while testing and debugging.
+    Google Search grounding code is preserved for future use.
+    """
    
    def __init__(self):
        self.keyword_analyzer = KeywordAnalyzer()
@@ -43,9 +47,11 @@ class ResearchService:
    async def research(self, request: BlogResearchRequest, user_id: str) -> BlogResearchResponse:
        """
        Stage 1: Research & Strategy (AI Orchestration)
-        Uses ONLY Gemini's native Google Search grounding - ONE API call for everything.
+        Uses Exa neural search as the primary research provider.
        Follows LinkedIn service pattern for efficiency and cost optimization.
        Includes intelligent caching for exact keyword matches.
+        
+        Note: Currently Exa-only for testing. Failures will raise errors instead of falling back.
        """
        try:
            from services.cache.research_cache import research_cache
@@ -88,7 +94,7 @@ class ResearchService:

            # Determine research mode and get appropriate strategy
            research_mode = request.research_mode or ResearchMode.BASIC
-            config = request.config or ResearchConfig(mode=research_mode, provider=ResearchProvider.GOOGLE)
+            config = request.config or ResearchConfig(mode=research_mode, provider=ResearchProvider.EXA)
            strategy = get_strategy_for_mode(research_mode)
            
            logger.info(f"Research: mode={research_mode.value}, provider={config.provider.value}")
@@ -96,7 +102,11 @@ class ResearchService:
            # Build research prompt based on strategy
            research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
            
-            # Route to appropriate provider
+            # Currently Exa-only for testing - fail if other providers are requested
+            if config.provider != ResearchProvider.EXA:
+                raise ValueError(f"Only Exa provider is currently supported for testing. Requested provider: {config.provider.value}")
+            
+            # Route to Exa provider
            if config.provider == ResearchProvider.EXA:
                # Exa research workflow
                from .exa_provider import ExaResearchProvider
@@ -145,13 +155,9 @@ class ResearchService:
                    grounding_metadata = None  # Exa doesn't provide grounding metadata
                    
                except RuntimeError as e:
-                    if "EXA_API_KEY not configured" in str(e):
-                        logger.warning("Exa not configured, falling back to Google")
-                        config.provider = ResearchProvider.GOOGLE
-                        # Continue to Google flow below
-                        raw_result = None
-                    else:
-                        raise
+                    # Fail fast - no fallback for testing/debugging
+                    logger.error(f"Exa research failed: {e}")
+                    raise RuntimeError(f"Exa research failed: {e}. Please ensure EXA_API_KEY is configured.") from e
            
            elif config.provider == ResearchProvider.TAVILY:
                # Tavily research workflow
@@ -231,41 +237,13 @@ class ResearchService:
                    grounding_metadata = None  # Tavily doesn't provide grounding metadata
                    
                except RuntimeError as e:
-                    if "TAVILY_API_KEY not configured" in str(e):
-                        logger.warning("Tavily not configured, falling back to Google")
-                        config.provider = ResearchProvider.GOOGLE
-                        # Continue to Google flow below
-                        raw_result = None
-                    else:
-                        raise
-                
-            if config.provider not in [ResearchProvider.EXA, ResearchProvider.TAVILY]:
-                # Google research (existing flow) or fallback from Exa
-                from .google_provider import GoogleResearchProvider
-                import time
-                
-                api_start_time = time.time()
-                google_provider = GoogleResearchProvider()
-                gemini_result = await google_provider.search(
-                    research_prompt, topic, industry, target_audience, config, user_id
-                )
-                api_duration_ms = (time.time() - api_start_time) * 1000
-                
-                # Log API call performance
-                blog_writer_logger.log_api_call(
-                    "gemini_grounded",
-                    "generate_grounded_content",
-                    api_duration_ms,
-                    token_usage=gemini_result.get("token_usage", {}),
-                    content_length=len(gemini_result.get("content", ""))
-                )
-                
-                # Extract sources and content
-                sources = self._extract_sources_from_grounding(gemini_result)
-                content = gemini_result.get("content", "")
-                search_widget = gemini_result.get("search_widget", "") or ""
-                search_queries = gemini_result.get("search_queries", []) or []
-                grounding_metadata = self._extract_grounding_metadata(gemini_result)
+                    # Fail fast - no fallback for testing/debugging
+                    logger.error(f"Tavily research failed: {e}")
+                    raise RuntimeError(f"Tavily research failed: {e}. Please ensure TAVILY_API_KEY is configured.") from e
+            
+            # Validate that we have content and sources before proceeding
+            if 'content' not in locals() or 'sources' not in locals():
+                raise RuntimeError(f"{config.provider.value} research did not return content or sources. Research failed.")
            
            # Continue with common analysis (same for both providers)
            keyword_analysis = self.keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
@@ -434,7 +412,7 @@ class ResearchService:
            
            # Determine research mode and get appropriate strategy
            research_mode = request.research_mode or ResearchMode.BASIC
-            config = request.config or ResearchConfig(mode=research_mode, provider=ResearchProvider.GOOGLE)
+            config = request.config or ResearchConfig(mode=research_mode, provider=ResearchProvider.EXA)
            strategy = get_strategy_for_mode(research_mode)
            
            logger.info(f"Research: mode={research_mode.value}, provider={config.provider.value}")
@@ -442,7 +420,11 @@ class ResearchService:
            # Build research prompt based on strategy
            research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
            
-            # Route to appropriate provider
+            # Currently Exa-only for testing - fail if other providers are requested
+            if config.provider != ResearchProvider.EXA:
+                raise ValueError(f"Only Exa provider is currently supported for testing. Requested provider: {config.provider.value}")
+            
+            # Route to Exa provider
            if config.provider == ResearchProvider.EXA:
                # Exa research workflow
                from .exa_provider import ExaResearchProvider
@@ -495,13 +477,10 @@ class ResearchService:
                    grounding_metadata = None  # Exa doesn't provide grounding metadata
                    
                except RuntimeError as e:
-                    if "EXA_API_KEY not configured" in str(e):
-                        logger.warning("Exa not configured, falling back to Google")
-                        await task_manager.update_progress(task_id, "⚠️ Exa not configured, falling back to Google Search")
-                        config.provider = ResearchProvider.GOOGLE
-                        # Continue to Google flow below
-                    else:
-                        raise
+                    # Fail fast - no fallback for testing/debugging
+                    logger.error(f"Exa research failed: {e}")
+                    await task_manager.update_progress(task_id, f"❌ Exa research failed: {str(e)}")
+                    raise RuntimeError(f"Exa research failed: {e}. Please ensure EXA_API_KEY is configured.") from e
            
            elif config.provider == ResearchProvider.TAVILY:
                # Tavily research workflow
@@ -581,43 +560,18 @@ class ResearchService:
                    grounding_metadata = None  # Tavily doesn't provide grounding metadata
                    
                except RuntimeError as e:
-                    if "TAVILY_API_KEY not configured" in str(e):
-                        logger.warning("Tavily not configured, falling back to Google")
-                        await task_manager.update_progress(task_id, "⚠️ Tavily not configured, falling back to Google Search")
-                        config.provider = ResearchProvider.GOOGLE
-                        # Continue to Google flow below
-                    else:
-                        raise
-                
-            if config.provider not in [ResearchProvider.EXA, ResearchProvider.TAVILY]:
-                # Google research (existing flow)
-                from .google_provider import GoogleResearchProvider
-                
-                await task_manager.update_progress(task_id, "🌐 Connecting to Google Search grounding...")
-                google_provider = GoogleResearchProvider()
-                
-                await task_manager.update_progress(task_id, "🤖 Making AI request to Gemini with Google Search grounding...")
-                try:
-                    gemini_result = await google_provider.search(
-                        research_prompt, topic, industry, target_audience, config, user_id
-                    )
-                except HTTPException as http_error:
-                    logger.error(f"Subscription limit exceeded for Google research: {http_error.detail}")
-                    await task_manager.update_progress(task_id, f"❌ Subscription limit exceeded: {http_error.detail.get('message', str(http_error.detail)) if isinstance(http_error.detail, dict) else str(http_error.detail)}")
-                    raise
-                
-                await task_manager.update_progress(task_id, "📊 Processing research results and extracting insights...")
-                # Extract sources and content
-                # Handle None result case
-                if gemini_result is None:
-                    logger.error("gemini_result is None after search - this should not happen if HTTPException was raised")
-                    raise ValueError("Research result is None - search operation failed unexpectedly")
-                
-                sources = self._extract_sources_from_grounding(gemini_result)
-                content = gemini_result.get("content", "") if isinstance(gemini_result, dict) else ""
-                search_widget = gemini_result.get("search_widget", "") or "" if isinstance(gemini_result, dict) else ""
-                search_queries = gemini_result.get("search_queries", []) or [] if isinstance(gemini_result, dict) else []
-                grounding_metadata = self._extract_grounding_metadata(gemini_result)
+                    # Fail fast - no fallback for testing/debugging
+                    logger.error(f"Tavily research failed: {e}")
+                    await task_manager.update_progress(task_id, f"❌ Tavily research failed: {str(e)}")
+                    raise RuntimeError(f"Tavily research failed: {e}. Please ensure TAVILY_API_KEY is configured.") from e
+            
+            # Validate that we have content and sources before proceeding
+            if config.provider == ResearchProvider.EXA and ('content' not in locals() or 'sources' not in locals()):
+                await task_manager.update_progress(task_id, "❌ Exa research did not return content or sources")
+                raise RuntimeError("Exa research did not return content or sources. Research failed.")
+            elif config.provider == ResearchProvider.TAVILY and ('content' not in locals() or 'sources' not in locals()):
+                await task_manager.update_progress(task_id, "❌ Tavily research did not return content or sources")
+                raise RuntimeError("Tavily research did not return content or sources. Research failed.")
            
            # Continue with common analysis (same for both providers)
            await task_manager.update_progress(task_id, "🔍 Analyzing keywords and content angles...")