ALwrity AI Blog Writer - Added Google Grounding UI Implementation
This commit is contained in:
@@ -12,10 +12,12 @@ from .research_service import ResearchService
|
||||
from .keyword_analyzer import KeywordAnalyzer
|
||||
from .competitor_analyzer import CompetitorAnalyzer
|
||||
from .content_angle_generator import ContentAngleGenerator
|
||||
from .data_filter import ResearchDataFilter
|
||||
|
||||
__all__ = [
|
||||
'ResearchService',
|
||||
'KeywordAnalyzer',
|
||||
'CompetitorAnalyzer',
|
||||
'ContentAngleGenerator'
|
||||
'ContentAngleGenerator',
|
||||
'ResearchDataFilter'
|
||||
]
|
||||
|
||||
519
backend/services/blog_writer/research/data_filter.py
Normal file
519
backend/services/blog_writer/research/data_filter.py
Normal file
@@ -0,0 +1,519 @@
|
||||
"""
|
||||
Research Data Filter - Filters and cleans research data for optimal AI processing.
|
||||
|
||||
This module provides intelligent filtering and cleaning of research data to:
|
||||
1. Remove low-quality sources and irrelevant content
|
||||
2. Optimize data for AI processing (reduce tokens, improve quality)
|
||||
3. Ensure only high-value insights are sent to AI prompts
|
||||
4. Maintain data integrity while improving processing efficiency
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import (
|
||||
BlogResearchResponse,
|
||||
ResearchSource,
|
||||
GroundingMetadata,
|
||||
GroundingChunk,
|
||||
GroundingSupport,
|
||||
Citation,
|
||||
)
|
||||
|
||||
|
||||
class ResearchDataFilter:
|
||||
"""Filters and cleans research data for optimal AI processing."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the research data filter with default settings."""
|
||||
# Be conservative but avoid over-filtering which can lead to empty UI
|
||||
self.min_credibility_score = 0.5
|
||||
self.min_excerpt_length = 20
|
||||
self.max_sources = 15
|
||||
self.max_grounding_chunks = 20
|
||||
self.max_content_gaps = 5
|
||||
self.max_keywords_per_category = 10
|
||||
self.min_grounding_confidence = 0.5
|
||||
self.max_source_age_days = 365 * 5 # allow up to 5 years if relevant
|
||||
|
||||
# Common stop words for keyword cleaning
|
||||
self.stop_words = {
|
||||
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
||||
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
|
||||
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'
|
||||
}
|
||||
|
||||
# Irrelevant source patterns
|
||||
self.irrelevant_patterns = [
|
||||
r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx)$', # Document files
|
||||
r'\.(jpg|jpeg|png|gif|svg|webp)$', # Image files
|
||||
r'\.(mp4|avi|mov|wmv|flv|webm)$', # Video files
|
||||
r'\.(mp3|wav|flac|aac)$', # Audio files
|
||||
r'\.(zip|rar|7z|tar|gz)$', # Archive files
|
||||
r'^https?://(www\.)?(facebook|twitter|instagram|linkedin|youtube)\.com', # Social media
|
||||
r'^https?://(www\.)?(amazon|ebay|etsy)\.com', # E-commerce
|
||||
r'^https?://(www\.)?(wikipedia)\.org', # Wikipedia (too generic)
|
||||
]
|
||||
|
||||
logger.info("✅ ResearchDataFilter initialized with quality thresholds")
|
||||
|
||||
def filter_research_data(self, research_data: BlogResearchResponse) -> BlogResearchResponse:
|
||||
"""
|
||||
Main filtering method that processes all research data components.
|
||||
|
||||
Args:
|
||||
research_data: Raw research data from the research service
|
||||
|
||||
Returns:
|
||||
Filtered and cleaned research data optimized for AI processing
|
||||
"""
|
||||
logger.info(f"Starting research data filtering for {len(research_data.sources)} sources")
|
||||
|
||||
# Track original counts for logging
|
||||
original_counts = {
|
||||
'sources': len(research_data.sources),
|
||||
'grounding_chunks': len(research_data.grounding_metadata.grounding_chunks) if research_data.grounding_metadata else 0,
|
||||
'grounding_supports': len(research_data.grounding_metadata.grounding_supports) if research_data.grounding_metadata else 0,
|
||||
'citations': len(research_data.grounding_metadata.citations) if research_data.grounding_metadata else 0,
|
||||
}
|
||||
|
||||
# Filter sources
|
||||
filtered_sources = self.filter_sources(research_data.sources)
|
||||
|
||||
# Filter grounding metadata
|
||||
filtered_grounding_metadata = self.filter_grounding_metadata(research_data.grounding_metadata)
|
||||
|
||||
# Clean keyword analysis
|
||||
cleaned_keyword_analysis = self.clean_keyword_analysis(research_data.keyword_analysis)
|
||||
|
||||
# Clean competitor analysis
|
||||
cleaned_competitor_analysis = self.clean_competitor_analysis(research_data.competitor_analysis)
|
||||
|
||||
# Filter content gaps
|
||||
filtered_content_gaps = self.filter_content_gaps(
|
||||
research_data.keyword_analysis.get('content_gaps', []),
|
||||
research_data
|
||||
)
|
||||
|
||||
# Update keyword analysis with filtered content gaps
|
||||
cleaned_keyword_analysis['content_gaps'] = filtered_content_gaps
|
||||
|
||||
# Create filtered research response
|
||||
filtered_research = BlogResearchResponse(
|
||||
success=research_data.success,
|
||||
sources=filtered_sources,
|
||||
keyword_analysis=cleaned_keyword_analysis,
|
||||
competitor_analysis=cleaned_competitor_analysis,
|
||||
suggested_angles=research_data.suggested_angles, # Keep as-is for now
|
||||
search_widget=research_data.search_widget,
|
||||
search_queries=research_data.search_queries,
|
||||
grounding_metadata=filtered_grounding_metadata,
|
||||
error_message=research_data.error_message
|
||||
)
|
||||
|
||||
# Log filtering results
|
||||
self._log_filtering_results(original_counts, filtered_research)
|
||||
|
||||
return filtered_research
|
||||
|
||||
def filter_sources(self, sources: List[ResearchSource]) -> List[ResearchSource]:
|
||||
"""
|
||||
Filter sources based on quality, relevance, and recency criteria.
|
||||
|
||||
Args:
|
||||
sources: List of research sources to filter
|
||||
|
||||
Returns:
|
||||
Filtered list of high-quality sources
|
||||
"""
|
||||
if not sources:
|
||||
return []
|
||||
|
||||
filtered_sources = []
|
||||
|
||||
for source in sources:
|
||||
# Quality filters
|
||||
if not self._is_source_high_quality(source):
|
||||
continue
|
||||
|
||||
# Relevance filters
|
||||
if not self._is_source_relevant(source):
|
||||
continue
|
||||
|
||||
# Recency filters
|
||||
if not self._is_source_recent(source):
|
||||
continue
|
||||
|
||||
filtered_sources.append(source)
|
||||
|
||||
# Sort by credibility score and limit to max_sources
|
||||
filtered_sources.sort(key=lambda s: s.credibility_score or 0.8, reverse=True)
|
||||
filtered_sources = filtered_sources[:self.max_sources]
|
||||
|
||||
# Fail-open: if everything was filtered out, return a trimmed set of original sources
|
||||
if not filtered_sources and sources:
|
||||
logger.warning("All sources filtered out by thresholds. Falling back to top sources without strict filters.")
|
||||
fallback = sorted(
|
||||
sources,
|
||||
key=lambda s: (s.credibility_score or 0.8),
|
||||
reverse=True
|
||||
)[: self.max_sources]
|
||||
return fallback
|
||||
|
||||
logger.info(f"Filtered sources: {len(sources)} → {len(filtered_sources)}")
|
||||
return filtered_sources
|
||||
|
||||
def filter_grounding_metadata(self, grounding_metadata: Optional[GroundingMetadata]) -> Optional[GroundingMetadata]:
|
||||
"""
|
||||
Filter grounding metadata to keep only high-confidence, relevant data.
|
||||
|
||||
Args:
|
||||
grounding_metadata: Raw grounding metadata to filter
|
||||
|
||||
Returns:
|
||||
Filtered grounding metadata with high-quality data only
|
||||
"""
|
||||
if not grounding_metadata:
|
||||
return None
|
||||
|
||||
# Filter grounding chunks by confidence
|
||||
filtered_chunks = []
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
if chunk.confidence_score and chunk.confidence_score >= self.min_grounding_confidence:
|
||||
filtered_chunks.append(chunk)
|
||||
|
||||
# Limit chunks to max_grounding_chunks
|
||||
filtered_chunks = filtered_chunks[:self.max_grounding_chunks]
|
||||
|
||||
# Filter grounding supports by confidence
|
||||
filtered_supports = []
|
||||
for support in grounding_metadata.grounding_supports:
|
||||
if support.confidence_scores and max(support.confidence_scores) >= self.min_grounding_confidence:
|
||||
filtered_supports.append(support)
|
||||
|
||||
# Filter citations by type and relevance
|
||||
filtered_citations = []
|
||||
for citation in grounding_metadata.citations:
|
||||
if self._is_citation_relevant(citation):
|
||||
filtered_citations.append(citation)
|
||||
|
||||
# Fail-open strategies to avoid empty UI:
|
||||
if not filtered_chunks and grounding_metadata.grounding_chunks:
|
||||
logger.warning("All grounding chunks filtered out. Falling back to first N chunks without confidence filter.")
|
||||
filtered_chunks = grounding_metadata.grounding_chunks[: self.max_grounding_chunks]
|
||||
if not filtered_supports and grounding_metadata.grounding_supports:
|
||||
logger.warning("All grounding supports filtered out. Falling back to first N supports without confidence filter.")
|
||||
filtered_supports = grounding_metadata.grounding_supports[: self.max_grounding_chunks]
|
||||
|
||||
# Create filtered grounding metadata
|
||||
filtered_metadata = GroundingMetadata(
|
||||
grounding_chunks=filtered_chunks,
|
||||
grounding_supports=filtered_supports,
|
||||
citations=filtered_citations,
|
||||
search_entry_point=grounding_metadata.search_entry_point,
|
||||
web_search_queries=grounding_metadata.web_search_queries
|
||||
)
|
||||
|
||||
logger.info(f"Filtered grounding metadata: {len(grounding_metadata.grounding_chunks)} chunks → {len(filtered_chunks)} chunks")
|
||||
return filtered_metadata
|
||||
|
||||
def clean_keyword_analysis(self, keyword_analysis: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Clean and deduplicate keyword analysis data.
|
||||
|
||||
Args:
|
||||
keyword_analysis: Raw keyword analysis data
|
||||
|
||||
Returns:
|
||||
Cleaned and deduplicated keyword analysis
|
||||
"""
|
||||
if not keyword_analysis:
|
||||
return {}
|
||||
|
||||
cleaned_analysis = {}
|
||||
|
||||
# Clean and deduplicate keyword lists
|
||||
keyword_categories = ['primary', 'secondary', 'long_tail', 'semantic_keywords', 'trending_terms']
|
||||
|
||||
for category in keyword_categories:
|
||||
if category in keyword_analysis and isinstance(keyword_analysis[category], list):
|
||||
cleaned_keywords = self._clean_keyword_list(keyword_analysis[category])
|
||||
cleaned_analysis[category] = cleaned_keywords[:self.max_keywords_per_category]
|
||||
|
||||
# Clean other fields
|
||||
other_fields = ['search_intent', 'difficulty', 'analysis_insights']
|
||||
for field in other_fields:
|
||||
if field in keyword_analysis:
|
||||
cleaned_analysis[field] = keyword_analysis[field]
|
||||
|
||||
# Clean content gaps separately (handled by filter_content_gaps)
|
||||
# Don't add content_gaps if it's empty to avoid adding empty lists
|
||||
if 'content_gaps' in keyword_analysis and keyword_analysis['content_gaps']:
|
||||
cleaned_analysis['content_gaps'] = keyword_analysis['content_gaps'] # Will be filtered later
|
||||
|
||||
logger.info(f"Cleaned keyword analysis: {len(keyword_analysis)} categories → {len(cleaned_analysis)} categories")
|
||||
return cleaned_analysis
|
||||
|
||||
def clean_competitor_analysis(self, competitor_analysis: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Clean and validate competitor analysis data.
|
||||
|
||||
Args:
|
||||
competitor_analysis: Raw competitor analysis data
|
||||
|
||||
Returns:
|
||||
Cleaned competitor analysis data
|
||||
"""
|
||||
if not competitor_analysis:
|
||||
return {}
|
||||
|
||||
cleaned_analysis = {}
|
||||
|
||||
# Clean competitor lists
|
||||
competitor_lists = ['top_competitors', 'opportunities', 'competitive_advantages']
|
||||
for field in competitor_lists:
|
||||
if field in competitor_analysis and isinstance(competitor_analysis[field], list):
|
||||
cleaned_list = [item.strip() for item in competitor_analysis[field] if item.strip()]
|
||||
cleaned_analysis[field] = cleaned_list[:10] # Limit to top 10
|
||||
|
||||
# Clean other fields
|
||||
other_fields = ['market_positioning', 'competitive_landscape', 'market_share']
|
||||
for field in other_fields:
|
||||
if field in competitor_analysis:
|
||||
cleaned_analysis[field] = competitor_analysis[field]
|
||||
|
||||
logger.info(f"Cleaned competitor analysis: {len(competitor_analysis)} fields → {len(cleaned_analysis)} fields")
|
||||
return cleaned_analysis
|
||||
|
||||
def filter_content_gaps(self, content_gaps: List[str], research_data: BlogResearchResponse) -> List[str]:
|
||||
"""
|
||||
Filter content gaps to keep only actionable, high-value ones.
|
||||
|
||||
Args:
|
||||
content_gaps: List of identified content gaps
|
||||
research_data: Research data for context
|
||||
|
||||
Returns:
|
||||
Filtered list of actionable content gaps
|
||||
"""
|
||||
if not content_gaps:
|
||||
return []
|
||||
|
||||
filtered_gaps = []
|
||||
|
||||
for gap in content_gaps:
|
||||
# Quality filters
|
||||
if not self._is_gap_high_quality(gap):
|
||||
continue
|
||||
|
||||
# Relevance filters
|
||||
if not self._is_gap_relevant_to_topic(gap, research_data):
|
||||
continue
|
||||
|
||||
# Actionability filters
|
||||
if not self._is_gap_actionable(gap):
|
||||
continue
|
||||
|
||||
filtered_gaps.append(gap)
|
||||
|
||||
# Limit to max_content_gaps
|
||||
filtered_gaps = filtered_gaps[:self.max_content_gaps]
|
||||
|
||||
logger.info(f"Filtered content gaps: {len(content_gaps)} → {len(filtered_gaps)}")
|
||||
return filtered_gaps
|
||||
|
||||
# Private helper methods
|
||||
|
||||
def _is_source_high_quality(self, source: ResearchSource) -> bool:
|
||||
"""Check if source meets quality criteria."""
|
||||
# Credibility score check
|
||||
if source.credibility_score and source.credibility_score < self.min_credibility_score:
|
||||
return False
|
||||
|
||||
# Excerpt length check
|
||||
if source.excerpt and len(source.excerpt) < self.min_excerpt_length:
|
||||
return False
|
||||
|
||||
# Title quality check
|
||||
if not source.title or len(source.title.strip()) < 10:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_source_relevant(self, source: ResearchSource) -> bool:
|
||||
"""Check if source is relevant (not irrelevant patterns)."""
|
||||
if not source.url:
|
||||
return True # Keep sources without URLs
|
||||
|
||||
# Check against irrelevant patterns
|
||||
for pattern in self.irrelevant_patterns:
|
||||
if re.search(pattern, source.url, re.IGNORECASE):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_source_recent(self, source: ResearchSource) -> bool:
|
||||
"""Check if source is recent enough."""
|
||||
if not source.published_at:
|
||||
return True # Keep sources without dates
|
||||
|
||||
try:
|
||||
# Parse date (assuming ISO format or common formats)
|
||||
published_date = self._parse_date(source.published_at)
|
||||
if published_date:
|
||||
cutoff_date = datetime.now() - timedelta(days=self.max_source_age_days)
|
||||
return published_date >= cutoff_date
|
||||
except Exception as e:
|
||||
logger.warning(f"Error parsing date '{source.published_at}': {e}")
|
||||
|
||||
return True # Keep sources with unparseable dates
|
||||
|
||||
def _is_citation_relevant(self, citation: Citation) -> bool:
|
||||
"""Check if citation is relevant and high-quality."""
|
||||
# Check citation type
|
||||
relevant_types = ['expert_opinion', 'statistical_data', 'recent_news', 'research_study']
|
||||
if citation.citation_type not in relevant_types:
|
||||
return False
|
||||
|
||||
# Check text quality
|
||||
if not citation.text or len(citation.text.strip()) < 20:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_gap_high_quality(self, gap: str) -> bool:
|
||||
"""Check if content gap is high quality."""
|
||||
gap = gap.strip()
|
||||
|
||||
# Length check
|
||||
if len(gap) < 10:
|
||||
return False
|
||||
|
||||
# Generic gap check
|
||||
generic_gaps = ['general', 'overview', 'introduction', 'basics', 'fundamentals']
|
||||
if gap.lower() in generic_gaps:
|
||||
return False
|
||||
|
||||
# Check for meaningful content
|
||||
if len(gap.split()) < 3:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_gap_relevant_to_topic(self, gap: str, research_data: BlogResearchResponse) -> bool:
|
||||
"""Check if content gap is relevant to the research topic."""
|
||||
# Simple relevance check - could be enhanced with more sophisticated matching
|
||||
primary_keywords = research_data.keyword_analysis.get('primary', [])
|
||||
|
||||
if not primary_keywords:
|
||||
return True # Keep gaps if no keywords available
|
||||
|
||||
gap_lower = gap.lower()
|
||||
for keyword in primary_keywords:
|
||||
if keyword.lower() in gap_lower:
|
||||
return True
|
||||
|
||||
# If no direct keyword match, check for common AI-related terms
|
||||
ai_terms = ['ai', 'artificial intelligence', 'machine learning', 'automation', 'technology', 'digital']
|
||||
for term in ai_terms:
|
||||
if term in gap_lower:
|
||||
return True
|
||||
|
||||
return True # Default to keeping gaps if no clear relevance check
|
||||
|
||||
def _is_gap_actionable(self, gap: str) -> bool:
|
||||
"""Check if content gap is actionable (can be addressed with content)."""
|
||||
gap_lower = gap.lower()
|
||||
|
||||
# Check for actionable indicators
|
||||
actionable_indicators = [
|
||||
'how to', 'guide', 'tutorial', 'steps', 'process', 'method',
|
||||
'best practices', 'tips', 'strategies', 'techniques', 'approach',
|
||||
'comparison', 'vs', 'versus', 'difference', 'pros and cons',
|
||||
'trends', 'future', '2024', '2025', 'emerging', 'new'
|
||||
]
|
||||
|
||||
for indicator in actionable_indicators:
|
||||
if indicator in gap_lower:
|
||||
return True
|
||||
|
||||
return True # Default to actionable if no specific indicators
|
||||
|
||||
def _clean_keyword_list(self, keywords: List[str]) -> List[str]:
|
||||
"""Clean and deduplicate a list of keywords."""
|
||||
cleaned_keywords = []
|
||||
seen_keywords = set()
|
||||
|
||||
for keyword in keywords:
|
||||
if not keyword or not isinstance(keyword, str):
|
||||
continue
|
||||
|
||||
# Clean keyword
|
||||
cleaned_keyword = keyword.strip().lower()
|
||||
|
||||
# Skip empty or too short keywords
|
||||
if len(cleaned_keyword) < 2:
|
||||
continue
|
||||
|
||||
# Skip stop words
|
||||
if cleaned_keyword in self.stop_words:
|
||||
continue
|
||||
|
||||
# Skip duplicates
|
||||
if cleaned_keyword in seen_keywords:
|
||||
continue
|
||||
|
||||
cleaned_keywords.append(cleaned_keyword)
|
||||
seen_keywords.add(cleaned_keyword)
|
||||
|
||||
return cleaned_keywords
|
||||
|
||||
def _parse_date(self, date_str: str) -> Optional[datetime]:
|
||||
"""Parse date string into datetime object."""
|
||||
if not date_str:
|
||||
return None
|
||||
|
||||
# Common date formats
|
||||
date_formats = [
|
||||
'%Y-%m-%d',
|
||||
'%Y-%m-%dT%H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%SZ',
|
||||
'%Y-%m-%dT%H:%M:%S.%fZ',
|
||||
'%B %d, %Y',
|
||||
'%b %d, %Y',
|
||||
'%d %B %Y',
|
||||
'%d %b %Y',
|
||||
'%m/%d/%Y',
|
||||
'%d/%m/%Y'
|
||||
]
|
||||
|
||||
for fmt in date_formats:
|
||||
try:
|
||||
return datetime.strptime(date_str, fmt)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def _log_filtering_results(self, original_counts: Dict[str, int], filtered_research: BlogResearchResponse):
|
||||
"""Log the results of filtering operations."""
|
||||
filtered_counts = {
|
||||
'sources': len(filtered_research.sources),
|
||||
'grounding_chunks': len(filtered_research.grounding_metadata.grounding_chunks) if filtered_research.grounding_metadata else 0,
|
||||
'grounding_supports': len(filtered_research.grounding_metadata.grounding_supports) if filtered_research.grounding_metadata else 0,
|
||||
'citations': len(filtered_research.grounding_metadata.citations) if filtered_research.grounding_metadata else 0,
|
||||
}
|
||||
|
||||
logger.info("📊 Research Data Filtering Results:")
|
||||
for key, original_count in original_counts.items():
|
||||
filtered_count = filtered_counts[key]
|
||||
reduction_percent = ((original_count - filtered_count) / original_count * 100) if original_count > 0 else 0
|
||||
logger.info(f" {key}: {original_count} → {filtered_count} ({reduction_percent:.1f}% reduction)")
|
||||
|
||||
# Log content gaps filtering
|
||||
original_gaps = len(filtered_research.keyword_analysis.get('content_gaps', []))
|
||||
logger.info(f" content_gaps: {original_gaps} → {len(filtered_research.keyword_analysis.get('content_gaps', []))}")
|
||||
|
||||
logger.info("✅ Research data filtering completed successfully")
|
||||
@@ -11,11 +11,16 @@ from models.blog_models import (
|
||||
BlogResearchRequest,
|
||||
BlogResearchResponse,
|
||||
ResearchSource,
|
||||
GroundingMetadata,
|
||||
GroundingChunk,
|
||||
GroundingSupport,
|
||||
Citation,
|
||||
)
|
||||
|
||||
from .keyword_analyzer import KeywordAnalyzer
|
||||
from .competitor_analyzer import CompetitorAnalyzer
|
||||
from .content_angle_generator import ContentAngleGenerator
|
||||
from .data_filter import ResearchDataFilter
|
||||
|
||||
|
||||
class ResearchService:
|
||||
@@ -25,6 +30,7 @@ class ResearchService:
|
||||
self.keyword_analyzer = KeywordAnalyzer()
|
||||
self.competitor_analyzer = CompetitorAnalyzer()
|
||||
self.content_angle_generator = ContentAngleGenerator()
|
||||
self.data_filter = ResearchDataFilter()
|
||||
|
||||
async def research(self, request: BlogResearchRequest) -> BlogResearchResponse:
|
||||
"""
|
||||
@@ -85,6 +91,9 @@ class ResearchService:
|
||||
# Extract sources from grounding metadata
|
||||
sources = self._extract_sources_from_grounding(gemini_result)
|
||||
|
||||
# Extract grounding metadata for detailed UI display
|
||||
grounding_metadata = self._extract_grounding_metadata(gemini_result)
|
||||
|
||||
# Extract search widget and queries for UI display
|
||||
search_widget = gemini_result.get("search_widget", "") or ""
|
||||
search_queries = gemini_result.get("search_queries", []) or []
|
||||
@@ -107,17 +116,31 @@ class ResearchService:
|
||||
# Add search widget and queries for UI display
|
||||
search_widget=search_widget if 'search_widget' in locals() else "",
|
||||
search_queries=search_queries if 'search_queries' in locals() else [],
|
||||
# Add grounding metadata for detailed UI display
|
||||
grounding_metadata=grounding_metadata,
|
||||
)
|
||||
|
||||
# Cache the successful result for future exact keyword matches
|
||||
# Filter and clean research data for optimal AI processing
|
||||
filtered_response = self.data_filter.filter_research_data(response)
|
||||
logger.info("Research data filtering completed successfully")
|
||||
|
||||
# Cache the successful result for future exact keyword matches (both caches)
|
||||
persistent_research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
# Also cache in memory for faster access
|
||||
research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=response.dict()
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
return response
|
||||
return filtered_response
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
@@ -142,27 +165,38 @@ class ResearchService:
|
||||
try:
|
||||
from services.llm_providers.gemini_grounded_provider import GeminiGroundedProvider
|
||||
from services.cache.research_cache import research_cache
|
||||
from api.blog_writer.router import _update_progress
|
||||
from services.cache.persistent_research_cache import persistent_research_cache
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
|
||||
topic = request.topic or ", ".join(request.keywords)
|
||||
industry = request.industry or (request.persona.industry if request.persona and request.persona.industry else "General")
|
||||
target_audience = getattr(request.persona, 'target_audience', 'General') if request.persona else 'General'
|
||||
|
||||
# Check cache first for exact keyword match
|
||||
await _update_progress(task_id, "🔍 Checking cache for existing research...")
|
||||
cached_result = research_cache.get_cached_result(
|
||||
# Check cache first for exact keyword match (try both caches)
|
||||
await task_manager.update_progress(task_id, "🔍 Checking cache for existing research...")
|
||||
|
||||
# Try persistent cache first (survives restarts)
|
||||
cached_result = persistent_research_cache.get_cached_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience
|
||||
)
|
||||
|
||||
# Fallback to in-memory cache
|
||||
if not cached_result:
|
||||
cached_result = research_cache.get_cached_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience
|
||||
)
|
||||
|
||||
if cached_result:
|
||||
await _update_progress(task_id, "✅ Found cached research results! Returning instantly...")
|
||||
await task_manager.update_progress(task_id, "✅ Found cached research results! Returning instantly...")
|
||||
logger.info(f"Returning cached research result for keywords: {request.keywords}")
|
||||
return BlogResearchResponse(**cached_result)
|
||||
|
||||
# Cache miss - proceed with API call
|
||||
await _update_progress(task_id, "🌐 Cache miss - connecting to Google Search grounding...")
|
||||
await task_manager.update_progress(task_id, "🌐 Cache miss - connecting to Google Search grounding...")
|
||||
logger.info(f"Cache miss - making API call for keywords: {request.keywords}")
|
||||
gemini = GeminiGroundedProvider()
|
||||
|
||||
@@ -185,7 +219,7 @@ class ResearchService:
|
||||
Structure your response with clear sections for each analysis area.
|
||||
"""
|
||||
|
||||
await _update_progress(task_id, "🤖 Making AI request to Gemini with Google Search grounding...")
|
||||
await task_manager.update_progress(task_id, "🤖 Making AI request to Gemini with Google Search grounding...")
|
||||
# Single Gemini call with native Google Search grounding - no fallbacks
|
||||
gemini_result = await gemini.generate_grounded_content(
|
||||
prompt=research_prompt,
|
||||
@@ -193,22 +227,25 @@ class ResearchService:
|
||||
max_tokens=2000
|
||||
)
|
||||
|
||||
await _update_progress(task_id, "📊 Processing research results and extracting insights...")
|
||||
await task_manager.update_progress(task_id, "📊 Processing research results and extracting insights...")
|
||||
# Extract sources from grounding metadata
|
||||
sources = self._extract_sources_from_grounding(gemini_result)
|
||||
|
||||
# Extract grounding metadata for detailed UI display
|
||||
grounding_metadata = self._extract_grounding_metadata(gemini_result)
|
||||
|
||||
# Extract search widget and queries for UI display
|
||||
search_widget = gemini_result.get("search_widget", "") or ""
|
||||
search_queries = gemini_result.get("search_queries", []) or []
|
||||
|
||||
await _update_progress(task_id, "🔍 Analyzing keywords and content angles...")
|
||||
await task_manager.update_progress(task_id, "🔍 Analyzing keywords and content angles...")
|
||||
# Parse the comprehensive response for different analysis components
|
||||
content = gemini_result.get("content", "")
|
||||
keyword_analysis = self.keyword_analyzer.analyze(content, request.keywords)
|
||||
competitor_analysis = self.competitor_analyzer.analyze(content)
|
||||
suggested_angles = self.content_angle_generator.generate(content, topic, industry)
|
||||
|
||||
await _update_progress(task_id, "💾 Caching results for future use...")
|
||||
await task_manager.update_progress(task_id, "💾 Caching results for future use...")
|
||||
logger.info(f"Research completed successfully with {len(sources)} sources and {len(search_queries)} search queries")
|
||||
|
||||
# Create the response
|
||||
@@ -221,17 +258,34 @@ class ResearchService:
|
||||
# Add search widget and queries for UI display
|
||||
search_widget=search_widget if 'search_widget' in locals() else "",
|
||||
search_queries=search_queries if 'search_queries' in locals() else [],
|
||||
# Add grounding metadata for detailed UI display
|
||||
grounding_metadata=grounding_metadata,
|
||||
# Preserve original user keywords for caching
|
||||
original_keywords=request.keywords,
|
||||
)
|
||||
|
||||
# Cache the successful result for future exact keyword matches
|
||||
# Filter and clean research data for optimal AI processing
|
||||
await task_manager.update_progress(task_id, "🔍 Filtering and cleaning research data...")
|
||||
filtered_response = self.data_filter.filter_research_data(response)
|
||||
logger.info("Research data filtering completed successfully")
|
||||
|
||||
# Cache the successful result for future exact keyword matches (both caches)
|
||||
persistent_research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
# Also cache in memory for faster access
|
||||
research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=response.dict()
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
return response
|
||||
return filtered_response
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
@@ -261,8 +315,104 @@ class ResearchService:
|
||||
url=src.get("url", ""),
|
||||
excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
|
||||
credibility_score=float(src.get("credibility_score", 0.8)),
|
||||
published_at=str(src.get("publication_date", "2024-01-01"))
|
||||
published_at=str(src.get("publication_date", "2024-01-01")),
|
||||
index=src.get("index"),
|
||||
source_type=src.get("type", "web")
|
||||
)
|
||||
sources.append(source)
|
||||
|
||||
return sources
|
||||
|
||||
def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> GroundingMetadata:
|
||||
"""Extract detailed grounding metadata from Gemini result."""
|
||||
grounding_chunks = []
|
||||
grounding_supports = []
|
||||
citations = []
|
||||
|
||||
# Extract grounding chunks from the raw grounding metadata
|
||||
raw_grounding = gemini_result.get("grounding_metadata", {})
|
||||
|
||||
# Handle case where grounding_metadata might be a GroundingMetadata object
|
||||
if hasattr(raw_grounding, 'grounding_chunks'):
|
||||
raw_chunks = raw_grounding.grounding_chunks
|
||||
else:
|
||||
raw_chunks = raw_grounding.get("grounding_chunks", [])
|
||||
|
||||
for chunk in raw_chunks:
|
||||
if "web" in chunk:
|
||||
web_data = chunk["web"]
|
||||
grounding_chunk = GroundingChunk(
|
||||
title=web_data.get("title", "Untitled"),
|
||||
url=web_data.get("uri", ""),
|
||||
confidence_score=None # Will be set from supports
|
||||
)
|
||||
grounding_chunks.append(grounding_chunk)
|
||||
|
||||
# Extract grounding supports with confidence scores
|
||||
if hasattr(raw_grounding, 'grounding_supports'):
|
||||
raw_supports = raw_grounding.grounding_supports
|
||||
else:
|
||||
raw_supports = raw_grounding.get("grounding_supports", [])
|
||||
for support in raw_supports:
|
||||
# Handle both dictionary and GroundingSupport object formats
|
||||
if hasattr(support, 'confidence_scores'):
|
||||
confidence_scores = support.confidence_scores
|
||||
chunk_indices = support.grounding_chunk_indices
|
||||
segment_text = getattr(support, 'segment_text', '')
|
||||
start_index = getattr(support, 'start_index', None)
|
||||
end_index = getattr(support, 'end_index', None)
|
||||
else:
|
||||
confidence_scores = support.get("confidence_scores", [])
|
||||
chunk_indices = support.get("grounding_chunk_indices", [])
|
||||
segment = support.get("segment", {})
|
||||
segment_text = segment.get("text", "")
|
||||
start_index = segment.get("start_index")
|
||||
end_index = segment.get("end_index")
|
||||
|
||||
grounding_support = GroundingSupport(
|
||||
confidence_scores=confidence_scores,
|
||||
grounding_chunk_indices=chunk_indices,
|
||||
segment_text=segment_text,
|
||||
start_index=start_index,
|
||||
end_index=end_index
|
||||
)
|
||||
grounding_supports.append(grounding_support)
|
||||
|
||||
# Update confidence scores for chunks
|
||||
if confidence_scores and chunk_indices:
|
||||
avg_confidence = sum(confidence_scores) / len(confidence_scores)
|
||||
for idx in chunk_indices:
|
||||
if idx < len(grounding_chunks):
|
||||
grounding_chunks[idx].confidence_score = avg_confidence
|
||||
|
||||
# Extract citations from the raw result
|
||||
raw_citations = gemini_result.get("citations", [])
|
||||
for citation in raw_citations:
|
||||
citation_obj = Citation(
|
||||
citation_type=citation.get("type", "inline"),
|
||||
start_index=citation.get("start_index", 0),
|
||||
end_index=citation.get("end_index", 0),
|
||||
text=citation.get("text", ""),
|
||||
source_indices=citation.get("source_indices", []),
|
||||
reference=citation.get("reference", "")
|
||||
)
|
||||
citations.append(citation_obj)
|
||||
|
||||
# Extract search entry point and web search queries
|
||||
if hasattr(raw_grounding, 'search_entry_point'):
|
||||
search_entry_point = getattr(raw_grounding.search_entry_point, 'rendered_content', '') if raw_grounding.search_entry_point else ''
|
||||
else:
|
||||
search_entry_point = raw_grounding.get("search_entry_point", {}).get("rendered_content", "")
|
||||
|
||||
if hasattr(raw_grounding, 'web_search_queries'):
|
||||
web_search_queries = raw_grounding.web_search_queries
|
||||
else:
|
||||
web_search_queries = raw_grounding.get("web_search_queries", [])
|
||||
|
||||
return GroundingMetadata(
|
||||
grounding_chunks=grounding_chunks,
|
||||
grounding_supports=grounding_supports,
|
||||
citations=citations,
|
||||
search_entry_point=search_entry_point,
|
||||
web_search_queries=web_search_queries
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user