feat: LinkedIn LLM alignment - Phase 1-3 complete

Phase 1: Dead Code Cleanup
- Remove GeminiGroundedProvider import and property from linkedin_service.py
- Remove fallback_provider property (gemini_provider imports)
- Fix routers/linkedin.py edit endpoint to use llm_text_gen
- Delete dead LinkedInImageEditor class
- Remove dead _transform_gemini_sources from content_generator.py

Phase 2: Research Infrastructure Alignment
- Add user_id to _conduct_research() for pre-flight validation
- Add validate_exa_research_operations() before Exa/Tavily calls
- Pass user_id to provider.simple_search() for usage tracking
- Inject research content into LLM prompts via _build_research_context()
- Fix Google engine path to fall back to Exa
- Add Exa → Tavily fallback on research failure

Phase 3: Cosmetic Cleanup
- Rename _generate_prompts_with_gemini → _generate_prompts_with_llm
- Rename _build_gemini_prompt → _build_image_prompt
- Rename _parse_gemini_response → _parse_llm_response
- Remove all Gemini references from LinkedIn code (0 remaining)
- Update docstrings and log messages

Additional:
- Research caching using existing ResearchCache
- Shared ExaContentResearchProvider in services/research/
- Persona service uses llm_text_gen instead of gemini_structured_json_response
- LinkedInWriter.tsx ChatMessage → ChatMsg type mapping fix
- RegisterLinkedInActionsEnhanced.tsx content_format_rules typing fix
2026-06-12 18:58:53 +05:30
parent e54aaa7a3e
commit 63a0df2536
37 changed files with 2891 additions and 1355 deletions
--- a/backend/services/research/__init__.py
+++ b/backend/services/research/__init__.py
@@ -7,6 +7,7 @@ replacing mock research with real-time industry information.
 Available Services:
 - GoogleSearchService: Real-time industry research using Google Custom Search API
 - ExaService: Competitor discovery and analysis using Exa API
+- ExaContentResearchProvider: Shared content research provider for LinkedIn/Blog
 - TavilyService: AI-powered web search with real-time information
 - Source ranking and credibility assessment
 - Content extraction and insight generation
@@ -17,12 +18,13 @@ Core Module (v2.0):
 - ParameterOptimizer: AI-driven parameter optimization

 Author: ALwrity Team
-Version: 2.0
-Last Updated: December 2025
+Version: 2.1
+Last Updated: June 2026
 """

 from .google_search_service import GoogleSearchService
 from .exa_service import ExaService
+from .exa_content_research import ExaContentResearchProvider, get_exa_content_provider
 from .tavily_service import TavilyService

 # Core Research Engine (v2.0)
@@ -43,6 +45,10 @@ __all__ = [
    "ExaService",
    "TavilyService",
    
+    # Shared content research provider
+    "ExaContentResearchProvider",
+    "get_exa_content_provider",
+    
    # Core Research Engine (v2.0)
    "ResearchEngine",
    "ResearchContext",
--- a/backend/services/research/exa_content_research.py
+++ b/backend/services/research/exa_content_research.py
@@ -0,0 +1,198 @@
+"""
+Exa Content Research Provider
+
+Shared Exa neural search provider for content research across ALwrity modules.
+Provides simple_search() for fact-checking, content grounding, and research.
+
+Used by:
+- LinkedIn Writer (content generation research)
+- Blog Writer (fact-checking and writing assistance)
+
+This is the content-research variant. For competitor discovery/analysis,
+use ExaService in exa_service.py.
+"""
+
+import os
+import asyncio
+from typing import List, Dict, Any, Optional
+from loguru import logger
+
+
+class ExaContentResearchProvider:
+    """Exa neural search provider for content research.
+
+    Wraps an ``exa_py`` client and exposes an async ``simple_search()`` that
+    optionally runs a subscription preflight check before the search and
+    records usage after it.
+    """
+
+    def __init__(self):
+        """Initialize the Exa content research provider.
+
+        Raises:
+            RuntimeError: If the EXA_API_KEY environment variable is not set.
+        """
+        self.api_key = os.getenv("EXA_API_KEY")
+        if not self.api_key:
+            raise RuntimeError("EXA_API_KEY not configured")
+
+        # Function-scope import: exa_py is only loaded once a provider is built.
+        from exa_py import Exa
+        self.exa = Exa(self.api_key)
+        logger.info("✅ Exa Content Research Provider initialized")
+
+    async def simple_search(
+        self,
+        query: str,
+        num_results: int = 5,
+        user_id: Optional[str] = None,
+        include_domains: Optional[List[str]] = None,
+        exclude_domains: Optional[List[str]] = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        Simple Exa search for content research and fact-checking.
+        Handles subscription preflight check and usage tracking.
+
+        The Exa client call is synchronous, so it is dispatched to the event
+        loop's default executor. If the first attempt fails, the search is
+        retried once with simplified content options.
+
+        Args:
+            query: Search query string
+            num_results: Number of results to return (default 5)
+            user_id: Optional user ID for subscription checking and usage tracking
+            include_domains: Only return results from these domains
+            exclude_domains: Exclude results from these domains
+
+        Returns:
+            List of source dicts with title, url, text, publishedDate, author, score keys
+
+        Raises:
+            HTTPException(429): If user has exceeded subscription limits
+            RuntimeError: If both the initial search and the retry fail
+        """
+        # Preflight subscription check
+        if user_id:
+            self._check_usage_limits(user_id)
+
+        search_kwargs: Dict[str, Any] = {
+            "type": "auto",
+            "num_results": num_results,
+            "text": {"max_characters": 1000},
+            "highlights": {"num_sentences": 2, "highlights_per_url": 2},
+        }
+        if include_domains:
+            search_kwargs["include_domains"] = include_domains
+        if exclude_domains:
+            search_kwargs["exclude_domains"] = exclude_domains
+
+        # Resolved outside the try block so the retry path can always use it.
+        loop = asyncio.get_running_loop()
+        try:
+            results = await loop.run_in_executor(
+                None,
+                lambda: self.exa.search_and_contents(query, **search_kwargs),
+            )
+        except Exception as e:
+            logger.error(f"[Exa simple_search] API call failed: {e}")
+            # Retry with simpler parameters
+            retry_kwargs: Dict[str, Any] = {
+                "type": "auto",
+                "num_results": num_results,
+                "text": True,
+            }
+            if include_domains:
+                retry_kwargs["include_domains"] = include_domains
+            if exclude_domains:
+                retry_kwargs["exclude_domains"] = exclude_domains
+            try:
+                logger.info("[Exa simple_search] Retrying with simplified parameters")
+                results = await loop.run_in_executor(
+                    None,
+                    lambda: self.exa.search_and_contents(query, **retry_kwargs),
+                )
+            except Exception as retry_error:
+                logger.error(f"[Exa simple_search] Retry also failed: {retry_error}")
+                raise RuntimeError(f"Exa search failed: {str(retry_error)}") from retry_error
+
+        # A response without a result list is treated as "no sources".
+        raw_results = getattr(results, 'results', None) or []
+        sources = [self._result_to_source(result) for result in raw_results]
+
+        # Track usage
+        if user_id:
+            cost = 0.005  # ~0.5 cents per search
+            try:
+                self.track_usage(user_id, cost)
+            except Exception as e:
+                # Usage tracking is best-effort; never fail a successful search.
+                logger.warning(f"[Exa simple_search] Failed to track usage: {e}")
+
+        logger.info(f"[Exa simple_search] Found {len(sources)} sources for query: {query[:80]}...")
+        return sources
+
+    def _check_usage_limits(self, user_id: str) -> None:
+        """Run the subscription preflight check for an Exa search.
+
+        Fails open: if no DB session is available the check is skipped, and
+        if the check itself errors a warning is logged and the search proceeds.
+
+        Raises:
+            HTTPException(429): If the pricing service reports the user cannot proceed.
+        """
+        from models.subscription_models import APIProvider
+        from services.subscription import PricingService
+        from services.database import get_session_for_user
+        from fastapi import HTTPException
+
+        db = get_session_for_user(user_id)
+        if not db:
+            return
+        try:
+            pricing_service = PricingService(db)
+            can_proceed, message, usage_info = pricing_service.check_usage_limits(
+                user_id=user_id,
+                provider=APIProvider.EXA,
+                tokens_requested=0,
+                actual_provider_name="exa",
+            )
+            if not can_proceed:
+                raise HTTPException(status_code=429, detail={
+                    'error': 'insufficient_balance',
+                    'message': message,
+                    'provider': 'exa',
+                    'usage_info': usage_info or {}
+                })
+        except HTTPException:
+            # Limit violations must reach the caller unchanged.
+            raise
+        except Exception as e:
+            logger.warning(f"[Exa simple_search] Preflight check failed: {e}")
+        finally:
+            try:
+                db.close()
+            except Exception as close_error:
+                # Closing is best-effort, but leave a trace instead of hiding it.
+                logger.debug(f"[Exa simple_search] Failed to close DB session: {close_error}")
+
+    @staticmethod
+    def _result_to_source(result: Any) -> Dict[str, Any]:
+        """Convert one Exa result object into the source dict returned to callers.
+
+        Attributes that are missing or ``None`` are replaced by the documented
+        defaults so callers never receive ``None`` for a key.
+        """
+        # exa_py result objects expose the date as ``published_date``; the
+        # camelCase name is kept as a fallback for other result shapes.
+        published = (
+            getattr(result, 'published_date', None)
+            or getattr(result, 'publishedDate', None)
+        )
+        score = getattr(result, 'score', None)
+        return {
+            'title': getattr(result, 'title', None) or 'Untitled',
+            'url': getattr(result, 'url', None) or '',
+            'text': getattr(result, 'text', None) or '',
+            'publishedDate': published or '',
+            'author': getattr(result, 'author', None) or '',
+            # 0 is a legitimate score, so only None falls back to 0.5.
+            'score': score if score is not None else 0.5,
+        }
+
+    def track_usage(self, user_id: str, cost: float):
+        """Track Exa API usage after successful call.
+
+        Increments the Exa and total call/cost counters of the user's
+        ``usage_summaries`` row for the current billing period. Failures are
+        logged and rolled back, never raised to the caller.
+
+        Args:
+            user_id: User whose usage summary is updated
+            cost: Cost of the call to add to the Exa and total cost counters
+        """
+        from services.database import get_session_for_user
+        from services.subscription import PricingService
+        from sqlalchemy import text
+
+        db = get_session_for_user(user_id)
+        if not db:
+            logger.warning(f"[track_usage] Could not get DB session for user {user_id}")
+            return
+        try:
+            pricing_service = PricingService(db)
+            current_period = pricing_service.get_current_billing_period(user_id)
+
+            # Update exa_calls and exa_cost via SQL UPDATE.
+            # Every counter is wrapped in COALESCE: NULL + 1 stays NULL in SQL.
+            update_query = text("""
+                UPDATE usage_summaries 
+                SET exa_calls = COALESCE(exa_calls, 0) + 1,
+                    exa_cost = COALESCE(exa_cost, 0) + :cost,
+                    total_calls = COALESCE(total_calls, 0) + 1,
+                    total_cost = COALESCE(total_cost, 0) + :cost
+                WHERE user_id = :user_id AND billing_period = :period
+            """)
+            result = db.execute(update_query, {
+                'cost': cost,
+                'user_id': user_id,
+                'period': current_period
+            })
+            db.commit()
+
+            # An UPDATE that matches no row records nothing; do not report success.
+            if getattr(result, 'rowcount', None) == 0:
+                logger.warning(
+                    f"[Exa] No usage summary row for user={user_id}, "
+                    f"period={current_period}; usage not recorded"
+                )
+            else:
+                logger.info(f"[Exa] Tracked usage: user={user_id}, cost=${cost}")
+        except Exception as e:
+            logger.error(f"[Exa] Failed to track usage: {e}")
+            db.rollback()
+        finally:
+            db.close()
+
+
+# Module-level slot holding the single shared provider (None until first use)
+_exa_content_provider: Optional[ExaContentResearchProvider] = None
+
+
+def get_exa_content_provider() -> ExaContentResearchProvider:
+    """Return the shared Exa content research provider, building it on first use."""
+    global _exa_content_provider
+    if _exa_content_provider is not None:
+        return _exa_content_provider
+    _exa_content_provider = ExaContentResearchProvider()
+    return _exa_content_provider