578 lines
26 KiB
Python
578 lines
26 KiB
Python
"""
Enhanced Gemini Provider for Grounded Content Generation

This provider uses native Google Search grounding to generate content that is
factually grounded in current web sources, with automatic citation generation.
Based on Google AI's official grounding documentation.
"""
|
|
|
|
import os
|
|
import json
|
|
import re
|
|
from typing import List, Dict, Any, Optional
|
|
from datetime import datetime
|
|
from loguru import logger
|
|
|
|
# Optional dependency guard: the module must stay importable without the
# google-genai SDK so that callers can check GOOGLE_GENAI_AVAILABLE (the
# provider's constructor raises ImportError when it is False).
try:
    from google import genai
    from google.genai import types
    GOOGLE_GENAI_AVAILABLE = True
except ImportError:
    GOOGLE_GENAI_AVAILABLE = False
    # Use ``warning`` (the level method used everywhere else in this module);
    # loguru's logger does not offer the stdlib-style ``warn`` alias, so calling
    # it here would turn a missing optional dependency into an AttributeError.
    logger.warning("Google GenAI not available. Install with: pip install google-genai")
|
|
|
|
|
|
class GeminiGroundedProvider:
    """
    Enhanced Gemini provider for grounded content generation with native Google Search.

    This provider uses the official Google Search grounding tool to generate content
    that is factually grounded in current web sources, with automatic citation generation.

    Based on: https://ai.google.dev/gemini-api/docs/google-search
    """

    # Model used for every grounded generation request.
    MODEL_NAME = "gemini-2.5-flash"

    # Per-content-type instruction placed at the top of the grounded prompt.
    _CONTENT_TYPE_INSTRUCTIONS = {
        "linkedin_post": "Generate a professional LinkedIn post that is factually accurate and cites current sources. Include engaging hashtags and a call-to-action.",
        "linkedin_article": "Generate a comprehensive LinkedIn article with proper structure, factual accuracy, and source citations. Include an engaging title and conclusion.",
        "linkedin_carousel": "Generate LinkedIn carousel content with multiple slides, each containing factual information with proper source attribution.",
        "linkedin_video_script": "Generate a video script with hook, main content, and conclusion. Ensure all claims are factually grounded.",
        "linkedin_comment_response": "Generate a professional comment response that adds value to the conversation.",
    }
    _DEFAULT_INSTRUCTION = "Generate professional content with factual accuracy."

    # Phrases whose presence means the post already ends with a call-to-action.
    _CTA_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'What do you think\?',
            r'Share your thoughts',
            r'Comment below',
            r"What's your experience\?",
            r'Let me know in the comments',
        )
    )

    # Inline citation markers recognised by extract_citations().
    _CITATION_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'\[Source (\d+)\]',
            r'\[(\d+)\]',
            r'\(Source (\d+)\)',
        )
    )

    # Markdown ATX heading: 1-6 '#' followed by whitespace (or nothing).
    # A hashtag such as "#AI" has no whitespace after '#', so it is not a heading.
    _HEADING_RE = re.compile(r'^#{1,6}(?:\s+(.*))?$')

    # Common words ignored by the keyword extractor.
    _STOP_WORDS = frozenset({
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had',
        'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
        'can', 'this', 'that', 'these', 'those', 'a', 'an',
    })

    def __init__(self):
        """
        Initialize the Gemini Grounded Provider.

        Raises:
            ImportError: If the google-genai library could not be imported.
            ValueError: If the GEMINI_API_KEY environment variable is not set.
        """
        if not GOOGLE_GENAI_AVAILABLE:
            raise ImportError("Google GenAI library not available. Install with: pip install google-genai")

        self.api_key = os.getenv('GEMINI_API_KEY')
        if not self.api_key:
            raise ValueError("GEMINI_API_KEY environment variable is required")

        # Initialize the Gemini client
        self.client = genai.Client(api_key=self.api_key)
        logger.info("✅ Gemini Grounded Provider initialized with native Google Search grounding")

    async def generate_grounded_content(
        self,
        prompt: str,
        content_type: str = "linkedin_post",
        temperature: float = 0.7,
        max_tokens: int = 2048
    ) -> Dict[str, Any]:
        """
        Generate grounded content using native Google Search grounding.

        Args:
            prompt: The content generation prompt
            content_type: Type of content to generate
            temperature: Creativity level (0.0-1.0)
            max_tokens: Maximum tokens in response

        Returns:
            Dictionary containing generated content and grounding metadata

        Raises:
            Exception: Any error from the API call or from response processing
                is logged and re-raised unchanged.
        """
        try:
            logger.info(f"Generating grounded content for {content_type} using native Google Search")

            # Build the grounded prompt
            grounded_prompt = self._build_grounded_prompt(prompt, content_type)

            # Configure the grounding tool
            grounding_tool = types.Tool(google_search=types.GoogleSearch())

            # Configure generation settings
            config = types.GenerateContentConfig(
                tools=[grounding_tool],
                max_output_tokens=max_tokens,
                temperature=temperature,
            )

            # Use the SDK's async client: this is a coroutine, and the
            # synchronous client would block the event loop for the whole
            # duration of the network round-trip.
            response = await self.client.aio.models.generate_content(
                model=self.MODEL_NAME,
                contents=grounded_prompt,
                config=config,
            )

            # Process the grounded response
            result = self._process_grounded_response(response, content_type)

            logger.info(f"✅ Grounded content generated successfully with {len(result.get('sources', []))} sources")
            return result

        except Exception as e:
            logger.error(f"❌ Error generating grounded content: {str(e)}")
            raise

    def _build_grounded_prompt(self, prompt: str, content_type: str) -> str:
        """
        Build a prompt optimized for grounded content generation.

        Args:
            prompt: Base prompt
            content_type: Type of content being generated; unknown types fall
                back to a generic instruction

        Returns:
            Enhanced prompt for grounded generation
        """
        instruction = self._CONTENT_TYPE_INSTRUCTIONS.get(content_type, self._DEFAULT_INSTRUCTION)

        # Assembled from explicit lines so that source-code indentation can
        # never leak into the text sent to the model.
        prompt_lines = [
            instruction,
            "",
            "IMPORTANT: Use current, factual information from reliable sources. Cite specific sources for any claims, statistics, or recent developments.",
            "",
            f"User Request: {prompt}",
            "",
            "Requirements:",
            "- Ensure all factual claims are backed by current sources",
            "- Use professional, engaging language appropriate for LinkedIn",
            "- Include relevant industry insights and trends",
            "- Make content shareable and valuable for the target audience",
        ]
        return "\n".join(prompt_lines).strip()

    def _extract_response_text(self, response) -> str:
        """Return the generated text, falling back to the first candidate's parts."""
        text = getattr(response, 'text', None)
        if text:
            return text

        # ``text`` can exist but be empty/None, so test its value rather than
        # its presence before falling back to the individual content parts.
        candidates = getattr(response, 'candidates', None) or []
        if not candidates:
            return ""
        candidate_content = getattr(candidates[0], 'content', None)
        parts = getattr(candidate_content, 'parts', None) or []
        return "".join(part.text for part in parts if getattr(part, 'text', None))

    def _sources_from_chunks(self, chunks) -> List[Dict[str, Any]]:
        """Convert grounding chunks into source dicts, skipping chunks without web data."""
        sources = []
        for i, chunk in enumerate(chunks):
            web = getattr(chunk, 'web', None)
            if web is None:
                continue
            sources.append({
                # Keep the chunk position: citations refer to sources by it.
                'index': i,
                'title': getattr(web, 'title', None) or f'Source {i+1}',
                'url': getattr(web, 'uri', None) or '',
                'type': 'web',
            })
        return sources

    def _citations_from_supports(self, supports) -> List[Dict[str, Any]]:
        """Convert grounding supports into inline citation dicts."""
        citations = []
        for support in supports:
            segment = getattr(support, 'segment', None)
            if segment is None:
                continue
            indices = list(getattr(support, 'grounding_chunk_indices', None) or [])
            citations.append({
                'type': 'inline',
                # Offsets/text may be present but None; normalise to 0 / ''.
                'start_index': getattr(segment, 'start_index', None) or 0,
                'end_index': getattr(segment, 'end_index', None) or 0,
                'text': getattr(segment, 'text', None) or '',
                'source_indices': indices,
                'reference': f"Source {indices[0] + 1}" if indices else "Unknown",
            })
        return citations

    def _process_grounded_response(self, response, content_type: str) -> Dict[str, Any]:
        """
        Process the Gemini response with grounding metadata.

        Args:
            response: Gemini API response
            content_type: Type of content generated

        Returns:
            Processed content with sources and citations

        Raises:
            ValueError: If the response has no candidates, no grounding
                metadata, no grounding chunks or no grounding supports, or if
                any other error occurs while processing (the original
                exception is chained as the cause).
        """
        try:
            # Extract the main content
            content = self._extract_response_text(response)
            logger.info(f"Extracted content length: {len(content)}")
            if not content:
                logger.warning("No content extracted from response")
                content = "Generated content about the requested topic."

            # Initialize result structure
            result = {
                'content': content,
                'sources': [],
                'citations': [],
                'search_queries': [],
                'grounding_metadata': {},
                'content_type': content_type,
                'generation_timestamp': datetime.now().isoformat()
            }

            logger.debug(f"Response type: {type(response)}")

            candidates = getattr(response, 'candidates', None)
            if not candidates:
                logger.error("❌ CRITICAL: No candidates found in response")
                logger.error(f"Response structure: {dir(response)}")
                raise ValueError("No candidates found in response - grounding is not working properly")

            candidate = candidates[0]
            grounding_metadata = getattr(candidate, 'grounding_metadata', None)
            if not grounding_metadata:
                logger.error("❌ CRITICAL: No grounding metadata found in response")
                logger.error(f"Response structure: {dir(response)}")
                logger.error(f"First candidate structure: {dir(candidate)}")
                raise ValueError("No grounding metadata found - grounding is not working properly")

            result['grounding_metadata'] = grounding_metadata
            logger.debug(f"Grounding metadata type: {type(grounding_metadata)}")
            logger.debug(f"Grounding metadata value: {grounding_metadata}")

            # Extract search queries (the attribute may exist but be None)
            result['search_queries'] = list(getattr(grounding_metadata, 'web_search_queries', None) or [])
            logger.info(f"Search queries: {result['search_queries']}")

            # Extract sources from grounding chunks
            chunks = getattr(grounding_metadata, 'grounding_chunks', None)
            if not chunks:
                logger.error("❌ CRITICAL: No grounding chunks found in response")
                logger.error(f"Grounding metadata structure: {dir(grounding_metadata)}")
                logger.error(f"Grounding chunks value: {chunks}")
                raise ValueError("No grounding chunks found - grounding is not working properly")
            result['sources'] = self._sources_from_chunks(chunks)
            logger.info(f"Extracted {len(result['sources'])} sources")

            # Extract citations from grounding supports
            supports = getattr(grounding_metadata, 'grounding_supports', None)
            if not supports:
                logger.error("❌ CRITICAL: No grounding supports found in response")
                logger.error(f"Grounding metadata structure: {dir(grounding_metadata)}")
                logger.error(f"Grounding supports value: {supports}")
                raise ValueError("No grounding supports found - grounding is not working properly")
            result['citations'] = self._citations_from_supports(supports)
            logger.info(f"Extracted {len(result['citations'])} citations")

            logger.info(f"✅ Successfully extracted {len(result['sources'])} sources and {len(result['citations'])} citations from grounding metadata")
            logger.debug(f"Sources: {result['sources']}")
            logger.debug(f"Citations: {result['citations']}")

            # Add content-specific processing
            processors = {
                "linkedin_post": self._process_post_content,
                "linkedin_article": self._process_article_content,
                "linkedin_carousel": self._process_carousel_content,
                "linkedin_video_script": self._process_video_script_content,
            }
            processor = processors.get(content_type)
            if processor is not None:
                result.update(processor(content))

            return result

        except Exception as e:
            logger.error(f"❌ CRITICAL: Error processing grounded response: {str(e)}")
            logger.error(f"Exception type: {type(e)}")
            logger.error(f"Exception details: {e}")
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Failed to process grounded response: {str(e)}") from e

    def _process_post_content(self, content: str) -> Dict[str, Any]:
        """
        Process LinkedIn post content for hashtags and engagement elements.

        Returns a dict with 'hashtags', 'call_to_action' (None when the post
        already contains one) and 'engagement_prediction'; returns {} if
        processing fails.
        """
        try:
            # Handle None content
            if content is None:
                content = ""
                logger.warning("Content is None, using empty string")

            # Extract hashtags
            hashtags = re.findall(r'#\w+', content)

            # Generate call-to-action if not present
            has_cta = any(pattern.search(content) for pattern in self._CTA_PATTERNS)
            call_to_action = None if has_cta else "What are your thoughts on this? Share in the comments!"

            return {
                'hashtags': [{'hashtag': tag, 'category': 'general', 'popularity_score': 0.8} for tag in hashtags],
                'call_to_action': call_to_action,
                'engagement_prediction': {
                    'estimated_likes': max(50, len(content) // 10),
                    'estimated_comments': max(5, len(content) // 100)
                }
            }
        except Exception as e:
            logger.error(f"Error processing post content: {str(e)}")
            return {}

    def _process_article_content(self, content: str) -> Dict[str, Any]:
        """
        Process LinkedIn article content for structure and SEO.

        The title is the first non-empty line with any markdown heading marker
        removed. Sections are split on markdown headings and named after them;
        text that precedes the first heading is titled 'Content'. Returns {}
        if processing fails.
        """
        try:
            lines = content.split('\n')

            # Extract title: first line that carries text, without '#' markers
            title = "Article Title"
            for line in lines:
                stripped = line.strip()
                heading = self._HEADING_RE.match(stripped)
                text = (heading.group(1) or '').strip() if heading else stripped
                if text:
                    title = text
                    break

            # Estimate word count
            word_count = len(content.split())

            # Generate sections based on content structure
            sections = []
            section_title = 'Content'
            buffer = []
            for line in lines:
                heading = self._HEADING_RE.match(line.strip())
                if heading is None:
                    buffer.append(line)
                    continue
                # A heading closes the section collected so far.
                body = "\n".join(buffer).strip()
                if body:
                    sections.append({'title': section_title, 'content': body})
                section_title = (heading.group(1) or '').strip() or 'Section'
                buffer = []

            body = "\n".join(buffer).strip()
            if body:
                sections.append({'title': section_title, 'content': body})

            return {
                'title': title,
                'word_count': word_count,
                'sections': sections,
                'reading_time': max(1, word_count // 200),  # 200 words per minute
                'seo_metadata': {
                    'meta_description': content[:160] + "..." if len(content) > 160 else content,
                    'keywords': self._extract_keywords(content)
                }
            }
        except Exception as e:
            logger.error(f"Error processing article content: {str(e)}")
            return {}

    def _process_carousel_content(self, content: str) -> Dict[str, Any]:
        """
        Process LinkedIn carousel content for slide structure.

        Each non-empty block separated by a blank line becomes one slide
        (at most 10), numbered consecutively from 1. Returns {} if processing
        fails.
        """
        try:
            # Drop empty blocks first so slide numbers have no gaps.
            blocks = [part.strip() for part in content.split('\n\n') if part.strip()]

            slides = [
                {
                    'slide_number': number,
                    'title': f"Slide {number}",
                    'content': block,
                    'visual_elements': [],
                    'design_notes': None
                }
                for number, block in enumerate(blocks[:10], start=1)  # Max 10 slides
            ]

            return {
                'title': f"Carousel on {content[:50]}...",
                'slides': slides,
                'design_guidelines': {
                    'color_scheme': 'professional',
                    'typography': 'clean',
                    'layout': 'minimal'
                }
            }
        except Exception as e:
            logger.error(f"Error processing carousel content: {str(e)}")
            return {}

    def _process_video_script_content(self, content: str) -> Dict[str, Any]:
        """
        Process LinkedIn video script content for structure.

        The hook is taken from the first (up to three) lines, the conclusion
        from the last (up to three) lines that are not already part of the
        hook, and the main content is every line in between. Heading lines
        (starting with '#') and blank lines are skipped when building the hook
        and conclusion. Returns {} if processing fails.
        """
        try:
            lines = content.split('\n')
            total = len(lines)

            # Extract hook (first few lines); hook_end is the index of the
            # first line that does NOT belong to the hook region.
            hook_lines = []
            hook_end = 0
            for idx in range(min(3, total)):
                hook_end = idx + 1
                text = lines[idx].strip()
                if text and not text.startswith('#'):
                    hook_lines.append(text)
                    if len(' '.join(hook_lines)) > 100:
                        break
            hook = ' '.join(hook_lines)

            # Extract conclusion (last few lines), walking backwards so the
            # final lines are preferred while the text keeps its reading order.
            conclusion_lines = []
            conclusion_start = total
            for idx in range(total - 1, max(hook_end, total - 3) - 1, -1):
                conclusion_start = idx
                text = lines[idx].strip()
                if text and not text.startswith('#'):
                    conclusion_lines.insert(0, text)
                    if len(' '.join(conclusion_lines)) > 100:
                        break
            conclusion = ' '.join(conclusion_lines)

            # Main content (everything in between), selected by line position
            # rather than by the lengths of the normalised hook/conclusion.
            main_content_text = '\n'.join(lines[hook_end:conclusion_start]).strip()

            return {
                'hook': hook,
                'main_content': [{
                    'scene_number': 1,
                    'content': main_content_text,
                    'duration': 60,
                    'visual_notes': 'Professional presentation style'
                }],
                'conclusion': conclusion,
                'thumbnail_suggestions': ['Professional thumbnail', 'Industry-focused image'],
                'video_description': f"Professional insights on {content[:100]}..."
            }
        except Exception as e:
            logger.error(f"Error processing video script content: {str(e)}")
            return {}

    def _extract_keywords(self, content: str) -> List[str]:
        """
        Extract relevant keywords from content.

        Returns up to 10 lower-cased words longer than three characters that
        are not stop words, most frequent first (ties keep first-seen order).
        Returns [] if extraction fails.
        """
        try:
            # Simple keyword extraction (can be enhanced with NLP)
            word_freq = {}
            for word in re.findall(r'\b\w+\b', content.lower()):
                if len(word) > 3 and word not in self._STOP_WORDS:
                    word_freq[word] = word_freq.get(word, 0) + 1

            # Return top keywords
            ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
            return [word for word, _ in ranked[:10]]

        except Exception as e:
            logger.error(f"Error extracting keywords: {str(e)}")
            return []

    def add_citations(self, content: str, sources: List[Dict[str, Any]]) -> str:
        """
        Append a numbered "Sources" list to content.

        This is a simplified approach: sources are listed after the content
        rather than placed inline next to the text they support.

        Args:
            content: The content to add citations to
            sources: List of sources from grounding metadata

        Returns:
            Content followed by the sources list; the content unchanged when
            there are no sources or when formatting fails
        """
        try:
            if not sources:
                return content

            entries = []
            for number, source in enumerate(sources, start=1):
                title = source.get('title', f'Source {number}')
                url = source.get('url', '')
                entries.append(f"{number}. **{title}**\n - URL: [{url}]({url})\n\n")

            return content + "\n\n**Sources:**\n" + "".join(entries)

        except Exception as e:
            logger.error(f"Error adding citations: {str(e)}")
            return content

    def extract_citations(self, content: str) -> List[Dict[str, Any]]:
        """
        Extract citations from content.

        Recognises "[Source N]", "[N]" and "(Source N)" markers. Results are
        grouped by marker style in that order, not sorted by position.

        Args:
            content: Content to extract citations from

        Returns:
            List of citation objects; 'source_index' is zero-based (N - 1).
            Returns [] if extraction fails.
        """
        try:
            citations = []
            for pattern in self._CITATION_PATTERNS:
                for match in pattern.finditer(content):
                    citations.append({
                        'type': 'inline',
                        'reference': match.group(0),
                        'position': match.start(),
                        'source_index': int(match.group(1)) - 1
                    })
            return citations

        except Exception as e:
            logger.error(f"Error extracting citations: {str(e)}")
            return []

    def assess_content_quality(self, content: str, sources: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Assess the quality of generated content.

        The overall score is 40% source coverage (one source per 100 words
        counts as full coverage), 30% tone and 30% length (500 words counts
        as full length).

        Args:
            content: The generated content
            sources: List of sources used

        Returns:
            Quality metrics dictionary; on failure a dict with
            'overall_score' 0.0 and an 'error' message
        """
        try:
            # Basic quality metrics
            word_count = len(content.split())
            char_count = len(content)

            # Source coverage
            source_coverage = min(1.0, len(sources) / max(1, word_count / 100))

            # Professional tone indicators
            professional_indicators = ['research', 'analysis', 'insights', 'trends', 'industry', 'professional']
            unprofessional_indicators = ['awesome', 'amazing', 'incredible', 'mind-blowing']

            # Lower-case once instead of once per indicator.
            lowered = content.lower()
            professional_score = sum(1 for indicator in professional_indicators if indicator in lowered) / len(professional_indicators)
            unprofessional_score = sum(1 for indicator in unprofessional_indicators if indicator in lowered) / len(unprofessional_indicators)

            tone_score = max(0, professional_score - unprofessional_score)

            # Overall quality score
            overall_score = (source_coverage * 0.4 + tone_score * 0.3 + min(1.0, word_count / 500) * 0.3)

            return {
                'overall_score': round(overall_score, 2),
                'source_coverage': round(source_coverage, 2),
                'tone_score': round(tone_score, 2),
                'word_count': word_count,
                'char_count': char_count,
                'sources_count': len(sources),
                'quality_level': 'high' if overall_score > 0.8 else 'medium' if overall_score > 0.6 else 'low'
            }

        except Exception as e:
            logger.error(f"Error assessing content quality: {str(e)}")
            return {
                'overall_score': 0.0,
                'error': str(e)
            }