Added citation and quality metrics to the content editor.

2025-09-03 09:40:05 +05:30
parent 10b50f9732
commit 5efee4235d
35 changed files with 6987 additions and 1123 deletions
--- a/backend/services/llm_providers/init.py
+++ b/backend/services/llm_providers/init.py
@@ -4,11 +4,11 @@ This service handles all LLM (Language Model) provider integrations,
 migrated from the legacy lib/gpt_providers functionality.
 """

-from .main_text_generation import llm_text_gen
-from .openai_provider import openai_chatgpt, test_openai_api_key
-from .gemini_provider import gemini_text_response, gemini_structured_json_response, test_gemini_api_key
-from .anthropic_provider import anthropic_text_response
-from .deepseek_provider import deepseek_text_response
+from services.llm_providers.main_text_generation import llm_text_gen
+from services.llm_providers.openai_provider import openai_chatgpt, test_openai_api_key
+from services.llm_providers.gemini_provider import gemini_text_response, gemini_structured_json_response
+from services.llm_providers.anthropic_provider import anthropic_text_response
+from services.llm_providers.deepseek_provider import deepseek_text_response

 __all__ = [
    "llm_text_gen",
@@ -16,7 +16,6 @@ __all__ = [
    "test_openai_api_key",
    "gemini_text_response", 
    "gemini_structured_json_response",
-    "test_gemini_api_key",
    "anthropic_text_response",
    "deepseek_text_response"
 ] 
--- a/backend/services/llm_providers/gemini_grounded_provider.py
+++ b/backend/services/llm_providers/gemini_grounded_provider.py
@@ -0,0 +1,577 @@
+"""
+Enhanced Gemini Provider for Grounded Content Generation
+
+This provider uses native Google Search grounding to generate content that is
+factually grounded in current web sources, with automatic citation generation.
+Based on Google AI's official grounding documentation.
+"""
+
+import os
+import json
+import re
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+from loguru import logger
+
+try:
+    from google import genai
+    from google.genai import types
+    GOOGLE_GENAI_AVAILABLE = True
+except ImportError:
+    GOOGLE_GENAI_AVAILABLE = False
+    logger.warn("Google GenAI not available. Install with: pip install google-genai")
+
+
+class GeminiGroundedProvider:
+    """
+    Enhanced Gemini provider for grounded content generation with native Google Search.
+    
+    This provider uses the official Google Search grounding tool to generate content
+    that is factually grounded in current web sources, with automatic citation generation.
+    
+    Based on: https://ai.google.dev/gemini-api/docs/google-search
+    """
+    
+    def __init__(self):
+        """Initialize the Gemini Grounded Provider."""
+        if not GOOGLE_GENAI_AVAILABLE:
+            raise ImportError("Google GenAI library not available. Install with: pip install google-genai")
+        
+        self.api_key = os.getenv('GEMINI_API_KEY')
+        if not self.api_key:
+            raise ValueError("GEMINI_API_KEY environment variable is required")
+        
+        # Initialize the Gemini client
+        self.client = genai.Client(api_key=self.api_key)
+        logger.info("✅ Gemini Grounded Provider initialized with native Google Search grounding")
+    
+    async def generate_grounded_content(
+        self, 
+        prompt: str, 
+        content_type: str = "linkedin_post",
+        temperature: float = 0.7,
+        max_tokens: int = 2048
+    ) -> Dict[str, Any]:
+        """
+        Generate grounded content using native Google Search grounding.
+        
+        Args:
+            prompt: The content generation prompt
+            content_type: Type of content to generate
+            temperature: Creativity level (0.0-1.0)
+            max_tokens: Maximum tokens in response
+            
+        Returns:
+            Dictionary containing generated content and grounding metadata
+        """
+        try:
+            logger.info(f"Generating grounded content for {content_type} using native Google Search")
+            
+            # Build the grounded prompt
+            grounded_prompt = self._build_grounded_prompt(prompt, content_type)
+            
+            # Configure the grounding tool
+            grounding_tool = types.Tool(
+                google_search=types.GoogleSearch()
+            )
+            
+            # Configure generation settings
+            config = types.GenerateContentConfig(
+                tools=[grounding_tool],
+                max_output_tokens=max_tokens,
+                temperature=temperature
+            )
+            
+            # Make the request with native grounding
+            response = self.client.models.generate_content(
+                model="gemini-2.5-flash",
+                contents=grounded_prompt,
+                config=config,
+            )
+            
+            # Process the grounded response
+            result = self._process_grounded_response(response, content_type)
+            
+            logger.info(f"✅ Grounded content generated successfully with {len(result.get('sources', []))} sources")
+            return result
+            
+        except Exception as e:
+            logger.error(f"❌ Error generating grounded content: {str(e)}")
+            raise
+    
+    def _build_grounded_prompt(self, prompt: str, content_type: str) -> str:
+        """
+        Build a prompt optimized for grounded content generation.
+        
+        Args:
+            prompt: Base prompt
+            content_type: Type of content being generated
+            
+        Returns:
+            Enhanced prompt for grounded generation
+        """
+        content_type_instructions = {
+            "linkedin_post": "Generate a professional LinkedIn post that is factually accurate and cites current sources. Include engaging hashtags and a call-to-action.",
+            "linkedin_article": "Generate a comprehensive LinkedIn article with proper structure, factual accuracy, and source citations. Include an engaging title and conclusion.",
+            "linkedin_carousel": "Generate LinkedIn carousel content with multiple slides, each containing factual information with proper source attribution.",
+            "linkedin_video_script": "Generate a video script with hook, main content, and conclusion. Ensure all claims are factually grounded.",
+            "linkedin_comment_response": "Generate a professional comment response that adds value to the conversation."
+        }
+        
+        instruction = content_type_instructions.get(content_type, "Generate professional content with factual accuracy.")
+        
+        grounded_prompt = f"""
+        {instruction}
+        
+        IMPORTANT: Use current, factual information from reliable sources. Cite specific sources for any claims, statistics, or recent developments.
+        
+        User Request: {prompt}
+        
+        Requirements:
+        - Ensure all factual claims are backed by current sources
+        - Use professional, engaging language appropriate for LinkedIn
+        - Include relevant industry insights and trends
+        - Make content shareable and valuable for the target audience
+        """
+        
+        return grounded_prompt.strip()
+    
+    def _process_grounded_response(self, response, content_type: str) -> Dict[str, Any]:
+        """
+        Process the Gemini response with grounding metadata.
+        
+        Args:
+            response: Gemini API response
+            content_type: Type of content generated
+            
+        Returns:
+            Processed content with sources and citations
+        """
+        try:
+            # Extract the main content
+            content = ""
+            if hasattr(response, 'text'):
+                content = response.text
+            elif hasattr(response, 'candidates') and response.candidates:
+                candidate = response.candidates[0]
+                if hasattr(candidate, 'content') and candidate.content:
+                    # Extract text from content parts
+                    text_parts = []
+                    for part in candidate.content:
+                        if hasattr(part, 'text'):
+                            text_parts.append(part.text)
+                    content = " ".join(text_parts)
+            
+            logger.info(f"Extracted content length: {len(content) if content else 0}")
+            if not content:
+                logger.warning("No content extracted from response")
+                content = "Generated content about the requested topic."
+            
+            # Initialize result structure
+            result = {
+                'content': content,
+                'sources': [],
+                'citations': [],
+                'search_queries': [],
+                'grounding_metadata': {},
+                'content_type': content_type,
+                'generation_timestamp': datetime.now().isoformat()
+            }
+            
+            # Debug: Log response structure
+            logger.info(f"Response type: {type(response)}")
+            logger.info(f"Response attributes: {dir(response)}")
+            
+            # Extract grounding metadata if available
+            if hasattr(response, 'candidates') and response.candidates:
+                candidate = response.candidates[0]
+                logger.info(f"Candidate attributes: {dir(candidate)}")
+                
+                if hasattr(candidate, 'grounding_metadata') and candidate.grounding_metadata:
+                    grounding_metadata = candidate.grounding_metadata
+                    result['grounding_metadata'] = grounding_metadata
+                    logger.info(f"Grounding metadata attributes: {dir(grounding_metadata)}")
+                    logger.info(f"Grounding metadata type: {type(grounding_metadata)}")
+                    logger.info(f"Grounding metadata value: {grounding_metadata}")
+                    
+                    # Log all available attributes and their values
+                    for attr in dir(grounding_metadata):
+                        if not attr.startswith('_'):
+                            try:
+                                value = getattr(grounding_metadata, attr)
+                                logger.info(f"  {attr}: {type(value)} = {value}")
+                            except Exception as e:
+                                logger.warning(f"  {attr}: Error accessing - {e}")
+                    
+                    # Extract search queries
+                    if hasattr(grounding_metadata, 'web_search_queries'):
+                        result['search_queries'] = grounding_metadata.web_search_queries
+                        logger.info(f"Search queries: {grounding_metadata.web_search_queries}")
+                    
+                    # Extract sources from grounding chunks
+                    if hasattr(grounding_metadata, 'grounding_chunks') and grounding_metadata.grounding_chunks:
+                        sources = []
+                        for i, chunk in enumerate(grounding_metadata.grounding_chunks):
+                            logger.info(f"Chunk {i} attributes: {dir(chunk)}")
+                            if hasattr(chunk, 'web'):
+                                source = {
+                                    'index': i,
+                                    'title': getattr(chunk.web, 'title', f'Source {i+1}'),
+                                    'url': getattr(chunk.web, 'uri', ''),
+                                    'type': 'web'
+                                }
+                                sources.append(source)
+                        result['sources'] = sources
+                        logger.info(f"Extracted {len(sources)} sources")
+                    else:
+                        logger.error("❌ CRITICAL: No grounding chunks found in response")
+                        logger.error(f"Grounding metadata structure: {dir(grounding_metadata)}")
+                        if hasattr(grounding_metadata, 'grounding_chunks'):
+                            logger.error(f"Grounding chunks type: {type(grounding_metadata.grounding_chunks)}")
+                            logger.error(f"Grounding chunks value: {grounding_metadata.grounding_chunks}")
+                        raise ValueError("No grounding chunks found - grounding is not working properly")
+                    
+                    # Extract citations from grounding supports
+                    if hasattr(grounding_metadata, 'grounding_supports') and grounding_metadata.grounding_supports:
+                        citations = []
+                        for support in grounding_metadata.grounding_supports:
+                            if hasattr(support, 'segment') and hasattr(support, 'grounding_chunk_indices'):
+                                citation = {
+                                    'type': 'inline',
+                                    'start_index': getattr(support.segment, 'start_index', 0),
+                                    'end_index': getattr(support.segment, 'end_index', 0),
+                                    'text': getattr(support.segment, 'text', ''),
+                                    'source_indices': support.grounding_chunk_indices,
+                                    'reference': f"Source {support.grounding_chunk_indices[0] + 1}" if support.grounding_chunk_indices else "Unknown"
+                                }
+                                citations.append(citation)
+                        result['citations'] = citations
+                        logger.info(f"Extracted {len(citations)} citations")
+                    else:
+                        logger.error("❌ CRITICAL: No grounding supports found in response")
+                        logger.error(f"Grounding metadata structure: {dir(grounding_metadata)}")
+                        if hasattr(grounding_metadata, 'grounding_supports'):
+                            logger.error(f"Grounding supports type: {type(grounding_metadata.grounding_supports)}")
+                            logger.error(f"Grounding supports value: {grounding_metadata.grounding_supports}")
+                        raise ValueError("No grounding supports found - grounding is not working properly")
+                    
+                    logger.info(f"✅ Successfully extracted {len(result['sources'])} sources and {len(result['citations'])} citations from grounding metadata")
+                    logger.info(f"Sources: {result['sources']}")
+                    logger.info(f"Citations: {result['citations']}")
+                else:
+                    logger.error("❌ CRITICAL: No grounding metadata found in response")
+                    logger.error(f"Response structure: {dir(response)}")
+                    logger.error(f"First candidate structure: {dir(candidates[0]) if candidates else 'No candidates'}")
+                    raise ValueError("No grounding metadata found - grounding is not working properly")
+            else:
+                logger.error("❌ CRITICAL: No candidates found in response")
+                logger.error(f"Response structure: {dir(response)}")
+                raise ValueError("No candidates found in response - grounding is not working properly")
+            
+            # Add content-specific processing
+            if content_type == "linkedin_post":
+                result.update(self._process_post_content(content))
+            elif content_type == "linkedin_article":
+                result.update(self._process_article_content(content))
+            elif content_type == "linkedin_carousel":
+                result.update(self._process_carousel_content(content))
+            elif content_type == "linkedin_video_script":
+                result.update(self._process_video_script_content(content))
+            
+            return result
+            
+        except Exception as e:
+            logger.error(f"❌ CRITICAL: Error processing grounded response: {str(e)}")
+            logger.error(f"Exception type: {type(e)}")
+            logger.error(f"Exception details: {e}")
+            raise ValueError(f"Failed to process grounded response: {str(e)}")
+    
+    def _process_post_content(self, content: str) -> Dict[str, Any]:
+        """Process LinkedIn post content for hashtags and engagement elements."""
+        try:
+            # Handle None content
+            if content is None:
+                content = ""
+                logger.warning("Content is None, using empty string")
+            
+            # Extract hashtags
+            hashtags = re.findall(r'#\w+', content)
+            
+            # Generate call-to-action if not present
+            cta_patterns = [
+                r'What do you think\?',
+                r'Share your thoughts',
+                r'Comment below',
+                r'What\'s your experience\?',
+                r'Let me know in the comments'
+            ]
+            
+            has_cta = any(re.search(pattern, content, re.IGNORECASE) for pattern in cta_patterns)
+            call_to_action = None
+            if not has_cta:
+                call_to_action = "What are your thoughts on this? Share in the comments!"
+            
+            return {
+                'hashtags': [{'hashtag': tag, 'category': 'general', 'popularity_score': 0.8} for tag in hashtags],
+                'call_to_action': call_to_action,
+                'engagement_prediction': {
+                    'estimated_likes': max(50, len(content) // 10),
+                    'estimated_comments': max(5, len(content) // 100)
+                }
+            }
+        except Exception as e:
+            logger.error(f"Error processing post content: {str(e)}")
+            return {}
+    
+    def _process_article_content(self, content: str) -> Dict[str, Any]:
+        """Process LinkedIn article content for structure and SEO."""
+        try:
+            # Extract title (first line or first sentence)
+            lines = content.split('\n')
+            title = lines[0].strip() if lines else "Article Title"
+            
+            # Estimate word count
+            word_count = len(content.split())
+            
+            # Generate sections based on content structure
+            sections = []
+            current_section = ""
+            
+            for line in lines:
+                if line.strip().startswith('#') or line.strip().startswith('##'):
+                    if current_section:
+                        sections.append({'title': 'Section', 'content': current_section.strip()})
+                        current_section = ""
+                else:
+                    current_section += line + "\n"
+            
+            if current_section:
+                sections.append({'title': 'Content', 'content': current_section.strip()})
+            
+            return {
+                'title': title,
+                'word_count': word_count,
+                'sections': sections,
+                'reading_time': max(1, word_count // 200),  # 200 words per minute
+                'seo_metadata': {
+                    'meta_description': content[:160] + "..." if len(content) > 160 else content,
+                    'keywords': self._extract_keywords(content)
+                }
+            }
+        except Exception as e:
+            logger.error(f"Error processing article content: {str(e)}")
+            return {}
+    
+    def _process_carousel_content(self, content: str) -> Dict[str, Any]:
+        """Process LinkedIn carousel content for slide structure."""
+        try:
+            # Split content into slides (basic implementation)
+            slides = []
+            content_parts = content.split('\n\n')
+            
+            for i, part in enumerate(content_parts[:10]):  # Max 10 slides
+                if part.strip():
+                    slides.append({
+                        'slide_number': i + 1,
+                        'title': f"Slide {i + 1}",
+                        'content': part.strip(),
+                        'visual_elements': [],
+                        'design_notes': None
+                    })
+            
+            return {
+                'title': f"Carousel on {content[:50]}...",
+                'slides': slides,
+                'design_guidelines': {
+                    'color_scheme': 'professional',
+                    'typography': 'clean',
+                    'layout': 'minimal'
+                }
+            }
+        except Exception as e:
+            logger.error(f"Error processing carousel content: {str(e)}")
+            return {}
+    
+    def _process_video_script_content(self, content: str) -> Dict[str, Any]:
+        """Process LinkedIn video script content for structure."""
+        try:
+            # Basic video script processing
+            lines = content.split('\n')
+            hook = ""
+            main_content = []
+            conclusion = ""
+            
+            # Extract hook (first few lines)
+            hook_lines = []
+            for line in lines[:3]:
+                if line.strip() and not line.strip().startswith('#'):
+                    hook_lines.append(line.strip())
+                    if len(' '.join(hook_lines)) > 100:
+                        break
+            hook = ' '.join(hook_lines)
+            
+            # Extract conclusion (last few lines)
+            conclusion_lines = []
+            for line in lines[-3:]:
+                if line.strip() and not line.strip().startswith('#'):
+                    conclusion_lines.insert(0, line.strip())
+                    if len(' '.join(conclusion_lines)) > 100:
+                        break
+            conclusion = ' '.join(conclusion_lines)
+            
+            # Main content (everything in between)
+            main_content_text = content[len(hook):len(content)-len(conclusion)].strip()
+            
+            return {
+                'hook': hook,
+                'main_content': [{
+                    'scene_number': 1,
+                    'content': main_content_text,
+                    'duration': 60,
+                    'visual_notes': 'Professional presentation style'
+                }],
+                'conclusion': conclusion,
+                'thumbnail_suggestions': ['Professional thumbnail', 'Industry-focused image'],
+                'video_description': f"Professional insights on {content[:100]}..."
+            }
+        except Exception as e:
+            logger.error(f"Error processing video script content: {str(e)}")
+            return {}
+    
+    def _extract_keywords(self, content: str) -> List[str]:
+        """Extract relevant keywords from content."""
+        try:
+            # Simple keyword extraction (can be enhanced with NLP)
+            words = re.findall(r'\b\w+\b', content.lower())
+            word_freq = {}
+            
+            # Filter out common words
+            stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', 'a', 'an'}
+            
+            for word in words:
+                if word not in stop_words and len(word) > 3:
+                    word_freq[word] = word_freq.get(word, 0) + 1
+            
+            # Return top keywords
+            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+            return [word for word, freq in sorted_words[:10]]
+            
+        except Exception as e:
+            logger.error(f"Error extracting keywords: {str(e)}")
+            return []
+    
+    def add_citations(self, content: str, sources: List[Dict[str, Any]]) -> str:
+        """
+        Add inline citations to content based on grounding metadata.
+        
+        Args:
+            content: The content to add citations to
+            sources: List of sources from grounding metadata
+            
+        Returns:
+            Content with inline citations
+        """
+        try:
+            if not sources:
+                return content
+            
+            # Create citation mapping
+            citation_map = {}
+            for source in sources:
+                index = source.get('index', 0)
+                citation_map[index] = f"[Source {index + 1}]({source.get('url', '')})"
+            
+            # Add citations at the end of sentences or paragraphs
+            # This is a simplified approach - in practice, you'd use the groundingSupports data
+            citation_text = "\n\n**Sources:**\n"
+            for i, source in enumerate(sources):
+                citation_text += f"{i+1}. **{source.get('title', f'Source {i+1}')}**\n   - URL: [{source.get('url', '')}]({source.get('url', '')})\n\n"
+            
+            return content + citation_text
+            
+        except Exception as e:
+            logger.error(f"Error adding citations: {str(e)}")
+            return content
+    
+    def extract_citations(self, content: str) -> List[Dict[str, Any]]:
+        """
+        Extract citations from content.
+        
+        Args:
+            content: Content to extract citations from
+            
+        Returns:
+            List of citation objects
+        """
+        try:
+            citations = []
+            # Look for citation patterns
+            citation_patterns = [
+                r'\[Source (\d+)\]',
+                r'\[(\d+)\]',
+                r'\(Source (\d+)\)'
+            ]
+            
+            for pattern in citation_patterns:
+                matches = re.finditer(pattern, content)
+                for match in matches:
+                    citations.append({
+                        'type': 'inline',
+                        'reference': match.group(0),
+                        'position': match.start(),
+                        'source_index': int(match.group(1)) - 1
+                    })
+            
+            return citations
+            
+        except Exception as e:
+            logger.error(f"Error extracting citations: {str(e)}")
+            return []
+    
+    def assess_content_quality(self, content: str, sources: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Assess the quality of generated content.
+        
+        Args:
+            content: The generated content
+            sources: List of sources used
+            
+        Returns:
+            Quality metrics dictionary
+        """
+        try:
+            # Basic quality metrics
+            word_count = len(content.split())
+            char_count = len(content)
+            
+            # Source coverage
+            source_coverage = min(1.0, len(sources) / max(1, word_count / 100))
+            
+            # Professional tone indicators
+            professional_indicators = ['research', 'analysis', 'insights', 'trends', 'industry', 'professional']
+            unprofessional_indicators = ['awesome', 'amazing', 'incredible', 'mind-blowing']
+            
+            professional_score = sum(1 for indicator in professional_indicators if indicator.lower() in content.lower()) / len(professional_indicators)
+            unprofessional_score = sum(1 for indicator in unprofessional_indicators if indicator.lower() in content.lower()) / len(unprofessional_indicators)
+            
+            tone_score = max(0, professional_score - unprofessional_score)
+            
+            # Overall quality score
+            overall_score = (source_coverage * 0.4 + tone_score * 0.3 + min(1.0, word_count / 500) * 0.3)
+            
+            return {
+                'overall_score': round(overall_score, 2),
+                'source_coverage': round(source_coverage, 2),
+                'tone_score': round(tone_score, 2),
+                'word_count': word_count,
+                'char_count': char_count,
+                'sources_count': len(sources),
+                'quality_level': 'high' if overall_score > 0.8 else 'medium' if overall_score > 0.6 else 'low'
+            }
+            
+        except Exception as e:
+            logger.error(f"Error assessing content quality: {str(e)}")
+            return {
+                'overall_score': 0.0,
+                'error': str(e)
+            }