feat: LinkedIn LLM alignment - Phase 1-3 complete

Phase 1: Dead Code Cleanup
- Remove GeminiGroundedProvider import and property from linkedin_service.py
- Remove fallback_provider property (gemini_provider imports)
- Fix routers/linkedin.py edit endpoint to use llm_text_gen
- Delete dead LinkedInImageEditor class
- Remove dead _transform_gemini_sources from content_generator.py

Phase 2: Research Infrastructure Alignment
- Add user_id to _conduct_research() for pre-flight validation
- Add validate_exa_research_operations() before Exa/Tavily calls
- Pass user_id to provider.simple_search() for usage tracking
- Inject research content into LLM prompts via _build_research_context()
- Fix Google engine path to fallback to Exa
- Add Exa → Tavily fallback on research failure

Phase 3: Cosmetic Cleanup
- Rename _generate_prompts_with_gemini → _generate_prompts_with_llm
- Rename _build_gemini_prompt → _build_image_prompt
- Rename _parse_gemini_response → _parse_llm_response
- Remove all Gemini references from LinkedIn code (0 remaining)
- Update docstrings and log messages

Additional:
- Research caching using existing ResearchCache
- Shared ExaContentResearchProvider in services/research/
- Persona service uses llm_text_gen instead of gemini_structured_json_response
- LinkedInWriter.tsx ChatMessage → ChatMsg type mapping fix
- RegisterLinkedInActionsEnhanced.tsx content_format_rules typing fix
2026-06-12 18:58:53 +05:30
parent e54aaa7a3e
commit 63a0df2536
37 changed files with 2891 additions and 1355 deletions
--- a/backend/services/linkedin/content_generator.py
+++ b/backend/services/linkedin/content_generator.py
@@ -2,6 +2,7 @@
 Content Generator for LinkedIn Content Generation

 Handles the main content generation logic for posts and articles.
+Uses llm_text_gen for provider-agnostic LLM access (respects GPT_PROVIDER).
 """

 from typing import Dict, Any, List, Optional
@@ -21,6 +22,7 @@ from services.linkedin.content_generator_prompts import (
    CarouselGenerator,
    VideoScriptGenerator
 )
+from services.llm_providers.main_text_generation import llm_text_gen
 from services.persona_analysis_service import PersonaAnalysisService
 import time

@@ -28,11 +30,9 @@ import time
 class ContentGenerator:
    """Handles content generation for all LinkedIn content types."""
    
-    def __init__(self, citation_manager=None, quality_analyzer=None, gemini_grounded=None, fallback_provider=None):
+    def __init__(self, citation_manager=None, quality_analyzer=None):
        self.citation_manager = citation_manager
        self.quality_analyzer = quality_analyzer
-        self.gemini_grounded = gemini_grounded
-        self.fallback_provider = fallback_provider
        
        # Persona caching
        self._persona_cache: Dict[str, Dict[str, Any]] = {}
@@ -105,22 +105,24 @@ class ContentGenerator:
                del self._cache_timestamps[key]
            logger.info(f"Cleared persona cache for user {user_id}")
    
-    def _transform_gemini_sources(self, gemini_sources):
-        """Transform Gemini sources to ResearchSource format."""
-        transformed_sources = []
-        for source in gemini_sources:
-            transformed_source = ResearchSource(
-                title=source.get('title', 'Unknown Source'),
-                url=source.get('url', ''),
-                content=f"Source from {source.get('title', 'Unknown')}",
-                relevance_score=0.8,  # Default relevance score
-                credibility_score=0.7,  # Default credibility score
-                domain_authority=0.6,   # Default domain authority
-                source_type=source.get('type', 'web'),
-                publication_date=datetime.now().strftime('%Y-%m-%d')
-            )
-            transformed_sources.append(transformed_source)
-        return transformed_sources
+    def _build_research_context(self, research_sources: List) -> str:
+        """Build research context string from research sources for prompt injection."""
+        if not research_sources:
+            return ""
+        
+        context_parts = ["\n\nRESEARCH CONTEXT (use this information to ground your content with facts and data):"]
+        for i, source in enumerate(research_sources[:5], 1):  # Limit to top 5 sources
+            title = getattr(source, 'title', f'Source {i}')
+            url = getattr(source, 'url', '')
+            content = getattr(source, 'content', '')
+            context_parts.append(f"\n{i}. {title}")
+            if url:
+                context_parts.append(f"   URL: {url}")
+            if content:
+                context_parts.append(f"   Key insight: {content[:300]}")
+        
+        context_parts.append("\nInstructions: Use the research above to include specific data points, statistics, and factual claims in your content. Cite sources where appropriate.")
+        return "\n".join(context_parts)
    
    async def generate_post(
        self,
@@ -155,21 +157,12 @@ class ContentGenerator:
                logger.info(f"  - First research source: {research_sources[0] if research_sources else 'None'}")
                logger.info(f"  - Research sources types: {[type(s) for s in research_sources[:3]]}")
            
-            # Step 3: Add citations if requested - POST METHOD
+            # Step 3: Add citations if requested
            citations = []
            source_list = None
-            final_research_sources = research_sources  # Default to passed research_sources
+            final_research_sources = research_sources
            
-            # Use sources and citations from content_result if available (from Gemini grounding)
-            if content_result.get('citations') and content_result.get('sources'):
-                logger.info(f"Using citations and sources from Gemini grounding: {len(content_result['citations'])} citations, {len(content_result['sources'])} sources")
-                citations = content_result['citations']
-                # Transform Gemini sources to ResearchSource format
-                gemini_sources = self._transform_gemini_sources(content_result['sources'])
-                source_list = self.citation_manager.generate_source_list(gemini_sources) if self.citation_manager else None
-                # Use transformed sources for the response
-                final_research_sources = gemini_sources
-            elif request.include_citations and research_sources and self.citation_manager:
+            if request.include_citations and research_sources and self.citation_manager:
                try:
                    logger.info(f"Processing citations for content length: {len(content_result['content'])}")
                    citations = self.citation_manager.extract_citations(content_result['content'])
@@ -224,7 +217,7 @@ class ContentGenerator:
                data=post_content,
                research_sources=final_research_sources,  # Use final_research_sources
                generation_metadata={
-                    'model_used': 'gemini-2.0-flash-001',
+                    'model_used': 'llm_text_gen',
                    'generation_time': generation_time,
                    'research_time': research_time,
                    'grounding_enabled': grounding_enabled
@@ -251,21 +244,12 @@ class ContentGenerator:
        try:
            start_time = datetime.now()
            
-            # Step 3: Add citations if requested - ARTICLE METHOD
+            # Step 3: Add citations if requested
            citations = []
            source_list = None
-            final_research_sources = research_sources  # Default to passed research_sources
+            final_research_sources = research_sources
            
-            # Use sources and citations from content_result if available (from Gemini grounding)
-            if content_result.get('citations') and content_result.get('sources'):
-                logger.info(f"Using citations and sources from Gemini grounding: {len(content_result['citations'])} citations, {len(content_result['sources'])} sources")
-                citations = content_result['citations']
-                # Transform Gemini sources to ResearchSource format
-                gemini_sources = self._transform_gemini_sources(content_result['sources'])
-                source_list = self.citation_manager.generate_source_list(gemini_sources) if self.citation_manager else None
-                # Use transformed sources for the response
-                final_research_sources = gemini_sources
-            elif request.include_citations and research_sources and self.citation_manager:
+            if request.include_citations and research_sources and self.citation_manager:
                try:
                    citations = self.citation_manager.extract_citations(content_result['content'])
                    source_list = self.citation_manager.generate_source_list(research_sources)
@@ -317,7 +301,7 @@ class ContentGenerator:
                data=article_content,
                research_sources=final_research_sources,  # Use final_research_sources
                generation_metadata={
-                    'model_used': 'gemini-2.0-flash-001',
+                    'model_used': 'llm_text_gen',
                    'generation_time': generation_time,
                    'research_time': research_time,
                    'grounding_enabled': grounding_enabled
@@ -386,7 +370,7 @@ class ContentGenerator:
                'alternative_responses': content_result.get('alternative_responses', []),
                'tone_analysis': content_result.get('tone_analysis'),
                'generation_metadata': {
-                    'model_used': 'gemini-2.0-flash-001',
+                    'model_used': 'llm_text_gen',
                    'generation_time': generation_time,
                    'research_time': research_time,
                    'grounding_enabled': grounding_enabled
@@ -402,19 +386,14 @@ class ContentGenerator:
            }
    
    # Grounded content generation methods
-    async def generate_grounded_post_content(self, request, research_sources: List) -> Dict[str, Any]:
-        """Generate grounded post content using the enhanced Gemini provider with native grounding."""
+    async def generate_grounded_post_content(self, request, research_sources: List, user_id: str = None) -> Dict[str, Any]:
+        """Generate post content using provider-agnostic llm_text_gen."""
        try:
-            if not self.gemini_grounded:
-                logger.error("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                raise Exception("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                
-            # Build the prompt for grounded generation using persona if available (DB vs session override)
-            user_id = int(getattr(request, "user_id", 0) or 0)
-            persona_data = self._get_cached_persona_data(user_id, 'linkedin')
+            # Build the prompt using persona if available
+            uid = int(getattr(request, "user_id", 0) or 0)
+            persona_data = self._get_cached_persona_data(uid, 'linkedin')
            if getattr(request, 'persona_override', None):
                try:
-                    # Merge shallowly: override core and platform adaptation parts
                    override = request.persona_override
                    if persona_data:
                        core = persona_data.get('core_persona', {})
@@ -431,61 +410,40 @@ class ContentGenerator:
                    pass
            prompt = PostPromptBuilder.build_post_prompt(request, persona=persona_data)
            
-            # Generate grounded content using native Google Search grounding
-            result = await self.gemini_grounded.generate_grounded_content(
+            # Inject research context into prompt
+            research_context = self._build_research_context(research_sources)
+            if research_context:
+                prompt += research_context
+            
+            # Generate content using provider-agnostic gateway
+            raw_response = llm_text_gen(
                prompt=prompt,
-                content_type="linkedin_post",
-                temperature=0.7,
-                max_tokens=request.max_length
+                user_id=user_id,
+                flow_type="linkedin_post",
+                max_tokens=request.max_length,
+                temperature=0.7
            )
            
-            return result
+            content_text = raw_response if isinstance(raw_response, str) else str(raw_response or "")
+            
+            return {
+                'content': content_text,
+                'sources': [],
+                'citations': [],
+                'grounding_enabled': bool(research_sources),
+                'fallback_used': False
+            }
            
        except Exception as e:
-            logger.error(f"Error generating grounded post content: {str(e)}")
-            logger.info("Attempting fallback to standard content generation...")
-            
-            # Fallback to standard content generation without grounding
-            try:
-                if not self.fallback_provider:
-                    raise Exception("No fallback provider available")
-                
-                # Build a simpler prompt for fallback generation
-                prompt = PostPromptBuilder.build_post_prompt(request)
-                
-                # Generate content using fallback provider (it's a dict with functions)
-                if 'generate_text' in self.fallback_provider:
-                    result = await self.fallback_provider['generate_text'](
-                        prompt=prompt,
-                        temperature=0.7,
-                        max_tokens=request.max_length
-                    )
-                else:
-                    raise Exception("Fallback provider doesn't have generate_text method")
-                
-                # Return result in the expected format
-                return {
-                    'content': result.get('content', '') if isinstance(result, dict) else str(result),
-                    'sources': [],
-                    'citations': [],
-                    'grounding_enabled': False,
-                    'fallback_used': True
-                }
-                
-            except Exception as fallback_error:
-                logger.error(f"Fallback generation also failed: {str(fallback_error)}")
-                raise Exception(f"Failed to generate content: {str(e)}. Fallback also failed: {str(fallback_error)}")
+            logger.error(f"Error generating post content: {str(e)}")
+            raise Exception(f"Failed to generate LinkedIn post: {str(e)}")
    
-    async def generate_grounded_article_content(self, request, research_sources: List) -> Dict[str, Any]:
-        """Generate grounded article content using the enhanced Gemini provider with native grounding."""
+    async def generate_grounded_article_content(self, request, research_sources: List, user_id: str = None) -> Dict[str, Any]:
+        """Generate article content using provider-agnostic llm_text_gen."""
        try:
-            if not self.gemini_grounded:
-                logger.error("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                raise Exception("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                
-            # Build the prompt for grounded generation using persona if available (DB vs session override)
-            user_id = int(getattr(request, "user_id", 0) or 0)
-            persona_data = self._get_cached_persona_data(user_id, 'linkedin')
+            # Build the prompt using persona if available
+            uid = int(getattr(request, "user_id", 0) or 0)
+            persona_data = self._get_cached_persona_data(uid, 'linkedin')
            if getattr(request, 'persona_override', None):
                try:
                    override = request.persona_override
@@ -504,88 +462,129 @@ class ContentGenerator:
                    pass
            prompt = ArticlePromptBuilder.build_article_prompt(request, persona=persona_data)
            
-            # Generate grounded content using native Google Search grounding
-            result = await self.gemini_grounded.generate_grounded_content(
+            # Inject research context into prompt
+            research_context = self._build_research_context(research_sources)
+            if research_context:
+                prompt += research_context
+            
+            # Generate content using provider-agnostic gateway
+            raw_response = llm_text_gen(
                prompt=prompt,
-                content_type="linkedin_article",
-                temperature=0.7,
-                max_tokens=request.word_count * 10  # Approximate character count
+                user_id=user_id,
+                flow_type="linkedin_article",
+                max_tokens=request.word_count * 10,
+                temperature=0.7
            )
            
-            return result
+            content_text = raw_response if isinstance(raw_response, str) else str(raw_response or "")
+            
+            return {
+                'content': content_text,
+                'sources': [],
+                'citations': [],
+                'grounding_enabled': bool(research_sources),
+                'fallback_used': False
+            }
                
        except Exception as e:
-            logger.error(f"Error generating grounded article content: {str(e)}")
-            raise Exception(f"Failed to generate grounded article content: {str(e)}")
+            logger.error(f"Error generating article content: {str(e)}")
+            raise Exception(f"Failed to generate LinkedIn article: {str(e)}")
    
-    async def generate_grounded_carousel_content(self, request, research_sources: List) -> Dict[str, Any]:
-        """Generate grounded carousel content using the enhanced Gemini provider with native grounding."""
+    async def generate_grounded_carousel_content(self, request, research_sources: List, user_id: str = None) -> Dict[str, Any]:
+        """Generate carousel content using provider-agnostic llm_text_gen."""
        try:
-            if not self.gemini_grounded:
-                logger.error("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                raise Exception("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                
-            # Build the prompt for grounded generation using the new prompt builder
            prompt = CarouselPromptBuilder.build_carousel_prompt(request)
            
-            # Generate grounded content using native Google Search grounding
-            result = await self.gemini_grounded.generate_grounded_content(
+            # Inject research context into prompt
+            research_context = self._build_research_context(research_sources)
+            if research_context:
+                prompt += research_context
+            
+            # Generate content using provider-agnostic gateway
+            raw_response = llm_text_gen(
                prompt=prompt,
-                content_type="linkedin_carousel",
-                temperature=0.7,
-                max_tokens=2000
+                user_id=user_id,
+                flow_type="linkedin_carousel",
+                max_tokens=2000,
+                temperature=0.7
            )
            
-            return result
+            content_text = raw_response if isinstance(raw_response, str) else str(raw_response or "")
+            
+            return {
+                'content': content_text,
+                'sources': [],
+                'citations': [],
+                'grounding_enabled': bool(research_sources),
+                'fallback_used': False
+            }
            
        except Exception as e:
-            logger.error(f"Error generating grounded carousel content: {str(e)}")
-            raise Exception(f"Failed to generate grounded carousel content: {str(e)}")
+            logger.error(f"Error generating carousel content: {str(e)}")
+            raise Exception(f"Failed to generate LinkedIn carousel: {str(e)}")
    
-    async def generate_grounded_video_script_content(self, request, research_sources: List) -> Dict[str, Any]:
-        """Generate grounded video script content using the enhanced Gemini provider with native grounding."""
+    async def generate_grounded_video_script_content(self, request, research_sources: List, user_id: str = None) -> Dict[str, Any]:
+        """Generate video script content using provider-agnostic llm_text_gen."""
        try:
-            if not self.gemini_grounded:
-                logger.error("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                raise Exception("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                
-            # Build the prompt for grounded generation using the new prompt builder
            prompt = VideoScriptPromptBuilder.build_video_script_prompt(request)
            
-            # Generate grounded content using native Google Search grounding
-            result = await self.gemini_grounded.generate_grounded_content(
+            # Inject research context into prompt
+            research_context = self._build_research_context(research_sources)
+            if research_context:
+                prompt += research_context
+            
+            # Generate content using provider-agnostic gateway
+            raw_response = llm_text_gen(
                prompt=prompt,
-                content_type="linkedin_video_script",
-                temperature=0.7,
-                max_tokens=1500
+                user_id=user_id,
+                flow_type="linkedin_video_script",
+                max_tokens=1500,
+                temperature=0.7
            )
            
-            return result
+            content_text = raw_response if isinstance(raw_response, str) else str(raw_response or "")
+            
+            return {
+                'content': content_text,
+                'sources': [],
+                'citations': [],
+                'grounding_enabled': bool(research_sources),
+                'fallback_used': False
+            }
            
        except Exception as e:
-            logger.error(f"Error generating grounded video script content: {str(e)}")
-            raise Exception(f"Failed to generate grounded video script content: {str(e)}")
+            logger.error(f"Error generating video script content: {str(e)}")
+            raise Exception(f"Failed to generate LinkedIn video script: {str(e)}")
    
-    async def generate_grounded_comment_response(self, request, research_sources: List) -> Dict[str, Any]:
-        """Generate grounded comment response using the enhanced Gemini provider with native grounding."""
+    async def generate_grounded_comment_response(self, request, research_sources: List, user_id: str = None) -> Dict[str, Any]:
+        """Generate comment response using provider-agnostic llm_text_gen."""
        try:
-            if not self.gemini_grounded:
-                logger.error("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                raise Exception("Gemini Grounded Provider not available - cannot generate content without AI provider")
-                
-            # Build the prompt for grounded generation using the new prompt builder
            prompt = CommentResponsePromptBuilder.build_comment_response_prompt(request)
            
-            # Generate grounded content using native Google Search grounding
-            result = await self.gemini_grounded.generate_grounded_content(
+            # Inject research context into prompt
+            research_context = self._build_research_context(research_sources)
+            if research_context:
+                prompt += research_context
+            
+            # Generate content using provider-agnostic gateway
+            raw_response = llm_text_gen(
                prompt=prompt,
-                content_type="linkedin_comment_response",
-                temperature=0.7,
-                max_tokens=2000
+                user_id=user_id,
+                flow_type="linkedin_comment_response",
+                max_tokens=2000,
+                temperature=0.7
            )
            
-            return result
+            content_text = raw_response if isinstance(raw_response, str) else str(raw_response or "")
+            
+            return {
+                'content': content_text,
+                'sources': [],
+                'citations': [],
+                'grounding_enabled': bool(research_sources),
+                'fallback_used': False
+            }
                
        except Exception as e:
-            logger.error(f"Error generating grounded comment response: {str(e)}")
-            raise Exception(f"Failed to generate grounded comment response: {str(e)}")
+            logger.error(f"Error generating comment response: {str(e)}")
+            raise Exception(f"Failed to generate LinkedIn comment response: {str(e)}")