ALwrity AI Blog Writer - Added Google Grounding UI Implementation
This commit is contained in:
@@ -12,10 +12,14 @@ from .outline_service import OutlineService
|
||||
from .outline_generator import OutlineGenerator
|
||||
from .outline_optimizer import OutlineOptimizer
|
||||
from .section_enhancer import SectionEnhancer
|
||||
from .source_mapper import SourceToSectionMapper
|
||||
from .grounding_engine import GroundingContextEngine
|
||||
|
||||
__all__ = [
|
||||
'OutlineService',
|
||||
'OutlineGenerator',
|
||||
'OutlineOptimizer',
|
||||
'SectionEnhancer'
|
||||
'SectionEnhancer',
|
||||
'SourceToSectionMapper',
|
||||
'GroundingContextEngine'
|
||||
]
|
||||
|
||||
644
backend/services/blog_writer/outline/grounding_engine.py
Normal file
644
backend/services/blog_writer/outline/grounding_engine.py
Normal file
@@ -0,0 +1,644 @@
|
||||
"""
|
||||
Grounding Context Engine - Enhanced utilization of grounding metadata.
|
||||
|
||||
This module extracts and utilizes rich contextual information from Google Search
|
||||
grounding metadata to enhance outline generation with authoritative insights,
|
||||
temporal relevance, and content relationships.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Tuple, Optional
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import (
|
||||
GroundingMetadata,
|
||||
GroundingChunk,
|
||||
GroundingSupport,
|
||||
Citation,
|
||||
BlogOutlineSection,
|
||||
ResearchSource,
|
||||
)
|
||||
|
||||
|
||||
class GroundingContextEngine:
|
||||
"""Extract and utilize rich context from grounding metadata."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the grounding context engine."""
|
||||
self.min_confidence_threshold = 0.7
|
||||
self.high_confidence_threshold = 0.9
|
||||
self.max_contextual_insights = 10
|
||||
self.max_authority_sources = 5
|
||||
|
||||
# Authority indicators for source scoring
|
||||
self.authority_indicators = {
|
||||
'high_authority': ['research', 'study', 'analysis', 'report', 'journal', 'academic', 'university', 'institute'],
|
||||
'medium_authority': ['guide', 'tutorial', 'best practices', 'expert', 'professional', 'industry'],
|
||||
'low_authority': ['blog', 'opinion', 'personal', 'review', 'commentary']
|
||||
}
|
||||
|
||||
# Temporal relevance patterns
|
||||
self.temporal_patterns = {
|
||||
'recent': ['2024', '2025', 'latest', 'new', 'recent', 'current', 'updated'],
|
||||
'trending': ['trend', 'emerging', 'growing', 'increasing', 'rising'],
|
||||
'evergreen': ['fundamental', 'basic', 'principles', 'foundation', 'core']
|
||||
}
|
||||
|
||||
logger.info("✅ GroundingContextEngine initialized with contextual analysis capabilities")
|
||||
|
||||
def extract_contextual_insights(self, grounding_metadata: Optional[GroundingMetadata]) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract comprehensive contextual insights from grounding metadata.
|
||||
|
||||
Args:
|
||||
grounding_metadata: Google Search grounding metadata
|
||||
|
||||
Returns:
|
||||
Dictionary containing contextual insights and analysis
|
||||
"""
|
||||
if not grounding_metadata:
|
||||
return self._get_empty_insights()
|
||||
|
||||
logger.info("Extracting contextual insights from grounding metadata...")
|
||||
|
||||
insights = {
|
||||
'confidence_analysis': self._analyze_confidence_patterns(grounding_metadata),
|
||||
'authority_analysis': self._analyze_source_authority(grounding_metadata),
|
||||
'temporal_analysis': self._analyze_temporal_relevance(grounding_metadata),
|
||||
'content_relationships': self._analyze_content_relationships(grounding_metadata),
|
||||
'citation_insights': self._analyze_citation_patterns(grounding_metadata),
|
||||
'search_intent_insights': self._analyze_search_intent(grounding_metadata),
|
||||
'quality_indicators': self._assess_quality_indicators(grounding_metadata)
|
||||
}
|
||||
|
||||
logger.info(f"✅ Extracted {len(insights)} contextual insight categories")
|
||||
return insights
|
||||
|
||||
def enhance_sections_with_grounding(
|
||||
self,
|
||||
sections: List[BlogOutlineSection],
|
||||
grounding_metadata: Optional[GroundingMetadata],
|
||||
insights: Dict[str, Any]
|
||||
) -> List[BlogOutlineSection]:
|
||||
"""
|
||||
Enhance outline sections using grounding metadata insights.
|
||||
|
||||
Args:
|
||||
sections: List of outline sections to enhance
|
||||
grounding_metadata: Google Search grounding metadata
|
||||
insights: Extracted contextual insights
|
||||
|
||||
Returns:
|
||||
Enhanced sections with grounding-driven improvements
|
||||
"""
|
||||
if not grounding_metadata or not insights:
|
||||
return sections
|
||||
|
||||
logger.info(f"Enhancing {len(sections)} sections with grounding insights...")
|
||||
|
||||
enhanced_sections = []
|
||||
for section in sections:
|
||||
enhanced_section = self._enhance_single_section(section, grounding_metadata, insights)
|
||||
enhanced_sections.append(enhanced_section)
|
||||
|
||||
logger.info("✅ Section enhancement with grounding insights completed")
|
||||
return enhanced_sections
|
||||
|
||||
def get_authority_sources(self, grounding_metadata: Optional[GroundingMetadata]) -> List[Tuple[GroundingChunk, float]]:
|
||||
"""
|
||||
Get high-authority sources from grounding metadata.
|
||||
|
||||
Args:
|
||||
grounding_metadata: Google Search grounding metadata
|
||||
|
||||
Returns:
|
||||
List of (chunk, authority_score) tuples sorted by authority
|
||||
"""
|
||||
if not grounding_metadata:
|
||||
return []
|
||||
|
||||
authority_sources = []
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
authority_score = self._calculate_chunk_authority(chunk)
|
||||
if authority_score >= 0.6: # Only include sources with reasonable authority
|
||||
authority_sources.append((chunk, authority_score))
|
||||
|
||||
# Sort by authority score (descending)
|
||||
authority_sources.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
return authority_sources[:self.max_authority_sources]
|
||||
|
||||
def get_high_confidence_insights(self, grounding_metadata: Optional[GroundingMetadata]) -> List[str]:
|
||||
"""
|
||||
Extract high-confidence insights from grounding supports.
|
||||
|
||||
Args:
|
||||
grounding_metadata: Google Search grounding metadata
|
||||
|
||||
Returns:
|
||||
List of high-confidence insights
|
||||
"""
|
||||
if not grounding_metadata:
|
||||
return []
|
||||
|
||||
high_confidence_insights = []
|
||||
for support in grounding_metadata.grounding_supports:
|
||||
if support.confidence_scores and max(support.confidence_scores) >= self.high_confidence_threshold:
|
||||
# Extract meaningful insights from segment text
|
||||
insight = self._extract_insight_from_segment(support.segment_text)
|
||||
if insight:
|
||||
high_confidence_insights.append(insight)
|
||||
|
||||
return high_confidence_insights[:self.max_contextual_insights]
|
||||
|
||||
# Private helper methods
|
||||
|
||||
def _get_empty_insights(self) -> Dict[str, Any]:
|
||||
"""Return empty insights structure when no grounding metadata is available."""
|
||||
return {
|
||||
'confidence_analysis': {
|
||||
'average_confidence': 0.0,
|
||||
'high_confidence_sources_count': 0,
|
||||
'confidence_distribution': {'high': 0, 'medium': 0, 'low': 0}
|
||||
},
|
||||
'authority_analysis': {
|
||||
'average_authority_score': 0.0,
|
||||
'high_authority_sources': [],
|
||||
'authority_distribution': {'high': 0, 'medium': 0, 'low': 0}
|
||||
},
|
||||
'temporal_analysis': {
|
||||
'recent_content': 0,
|
||||
'trending_topics': [],
|
||||
'evergreen_content': 0
|
||||
},
|
||||
'content_relationships': {
|
||||
'related_concepts': [],
|
||||
'content_gaps': [],
|
||||
'concept_coverage_score': 0.0
|
||||
},
|
||||
'citation_insights': {
|
||||
'citation_types': {},
|
||||
'citation_density': 0.0
|
||||
},
|
||||
'search_intent_insights': {
|
||||
'primary_intent': 'informational',
|
||||
'intent_signals': [],
|
||||
'user_questions': []
|
||||
},
|
||||
'quality_indicators': {
|
||||
'overall_quality': 0.0,
|
||||
'quality_factors': []
|
||||
}
|
||||
}
|
||||
|
||||
def _analyze_confidence_patterns(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze confidence patterns across grounding data."""
|
||||
all_confidences = []
|
||||
|
||||
# Collect confidence scores from chunks
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
if chunk.confidence_score:
|
||||
all_confidences.append(chunk.confidence_score)
|
||||
|
||||
# Collect confidence scores from supports
|
||||
for support in grounding_metadata.grounding_supports:
|
||||
all_confidences.extend(support.confidence_scores)
|
||||
|
||||
if not all_confidences:
|
||||
return {
|
||||
'average_confidence': 0.0,
|
||||
'high_confidence_sources_count': 0,
|
||||
'confidence_distribution': {'high': 0, 'medium': 0, 'low': 0}
|
||||
}
|
||||
|
||||
average_confidence = sum(all_confidences) / len(all_confidences)
|
||||
high_confidence_count = sum(1 for c in all_confidences if c >= self.high_confidence_threshold)
|
||||
|
||||
return {
|
||||
'average_confidence': average_confidence,
|
||||
'high_confidence_sources_count': high_confidence_count,
|
||||
'confidence_distribution': self._get_confidence_distribution(all_confidences)
|
||||
}
|
||||
|
||||
def _analyze_source_authority(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze source authority patterns."""
|
||||
authority_scores = []
|
||||
authority_distribution = defaultdict(int)
|
||||
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
authority_score = self._calculate_chunk_authority(chunk)
|
||||
authority_scores.append(authority_score)
|
||||
|
||||
# Categorize authority level
|
||||
if authority_score >= 0.8:
|
||||
authority_distribution['high'] += 1
|
||||
elif authority_score >= 0.6:
|
||||
authority_distribution['medium'] += 1
|
||||
else:
|
||||
authority_distribution['low'] += 1
|
||||
|
||||
return {
|
||||
'average_authority_score': sum(authority_scores) / len(authority_scores) if authority_scores else 0.0,
|
||||
'high_authority_sources': [{'title': 'High Authority Source', 'url': 'example.com', 'score': 0.9}], # Placeholder
|
||||
'authority_distribution': dict(authority_distribution)
|
||||
}
|
||||
|
||||
def _analyze_temporal_relevance(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze temporal relevance of grounding content."""
|
||||
recent_content = 0
|
||||
trending_topics = []
|
||||
evergreen_content = 0
|
||||
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
chunk_text = f"{chunk.title} {chunk.url}".lower()
|
||||
|
||||
# Check for recent indicators
|
||||
if any(pattern in chunk_text for pattern in self.temporal_patterns['recent']):
|
||||
recent_content += 1
|
||||
|
||||
# Check for trending indicators
|
||||
if any(pattern in chunk_text for pattern in self.temporal_patterns['trending']):
|
||||
trending_topics.append(chunk.title)
|
||||
|
||||
# Check for evergreen indicators
|
||||
if any(pattern in chunk_text for pattern in self.temporal_patterns['evergreen']):
|
||||
evergreen_content += 1
|
||||
|
||||
return {
|
||||
'recent_content': recent_content,
|
||||
'trending_topics': trending_topics[:5], # Limit to top 5
|
||||
'evergreen_content': evergreen_content,
|
||||
'temporal_balance': self._calculate_temporal_balance(recent_content, evergreen_content)
|
||||
}
|
||||
|
||||
def _analyze_content_relationships(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze content relationships and identify gaps."""
|
||||
all_text = []
|
||||
|
||||
# Collect text from chunks
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
all_text.append(chunk.title)
|
||||
|
||||
# Collect text from supports
|
||||
for support in grounding_metadata.grounding_supports:
|
||||
all_text.append(support.segment_text)
|
||||
|
||||
# Extract related concepts
|
||||
related_concepts = self._extract_related_concepts(all_text)
|
||||
|
||||
# Identify potential content gaps
|
||||
content_gaps = self._identify_content_gaps(all_text)
|
||||
|
||||
# Calculate concept coverage score (0-1 scale)
|
||||
concept_coverage_score = min(1.0, len(related_concepts) / 10.0) if related_concepts else 0.0
|
||||
|
||||
return {
|
||||
'related_concepts': related_concepts,
|
||||
'content_gaps': content_gaps,
|
||||
'concept_coverage_score': concept_coverage_score,
|
||||
'gap_count': len(content_gaps)
|
||||
}
|
||||
|
||||
def _analyze_citation_patterns(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze citation patterns and types."""
|
||||
citation_types = Counter()
|
||||
total_citations = len(grounding_metadata.citations)
|
||||
|
||||
for citation in grounding_metadata.citations:
|
||||
citation_types[citation.citation_type] += 1
|
||||
|
||||
# Calculate citation density (citations per 1000 words of content)
|
||||
total_content_length = sum(len(support.segment_text) for support in grounding_metadata.grounding_supports)
|
||||
citation_density = (total_citations / max(total_content_length, 1)) * 1000 if total_content_length > 0 else 0.0
|
||||
|
||||
return {
|
||||
'citation_types': dict(citation_types),
|
||||
'total_citations': total_citations,
|
||||
'citation_density': citation_density,
|
||||
'citation_quality': self._assess_citation_quality(grounding_metadata.citations)
|
||||
}
|
||||
|
||||
def _analyze_search_intent(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze search intent signals from grounding data."""
|
||||
intent_signals = []
|
||||
user_questions = []
|
||||
|
||||
# Analyze search queries
|
||||
for query in grounding_metadata.web_search_queries:
|
||||
query_lower = query.lower()
|
||||
|
||||
# Identify intent signals
|
||||
if any(word in query_lower for word in ['how', 'what', 'why', 'when', 'where']):
|
||||
intent_signals.append('informational')
|
||||
elif any(word in query_lower for word in ['best', 'top', 'compare', 'vs']):
|
||||
intent_signals.append('comparison')
|
||||
elif any(word in query_lower for word in ['buy', 'price', 'cost', 'deal']):
|
||||
intent_signals.append('transactional')
|
||||
|
||||
# Extract potential user questions
|
||||
if query_lower.startswith(('how to', 'what is', 'why does', 'when should')):
|
||||
user_questions.append(query)
|
||||
|
||||
return {
|
||||
'intent_signals': list(set(intent_signals)),
|
||||
'user_questions': user_questions[:5], # Limit to top 5
|
||||
'primary_intent': self._determine_primary_intent(intent_signals)
|
||||
}
|
||||
|
||||
def _assess_quality_indicators(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Assess overall quality indicators from grounding metadata."""
|
||||
quality_factors = []
|
||||
quality_score = 0.0
|
||||
|
||||
# Factor 1: Confidence levels
|
||||
confidences = [chunk.confidence_score for chunk in grounding_metadata.grounding_chunks if chunk.confidence_score]
|
||||
if confidences:
|
||||
avg_confidence = sum(confidences) / len(confidences)
|
||||
quality_score += avg_confidence * 0.3
|
||||
quality_factors.append(f"Average confidence: {avg_confidence:.2f}")
|
||||
|
||||
# Factor 2: Source diversity
|
||||
unique_domains = set()
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
try:
|
||||
domain = chunk.url.split('/')[2] if '://' in chunk.url else chunk.url.split('/')[0]
|
||||
unique_domains.add(domain)
|
||||
except:
|
||||
continue
|
||||
|
||||
diversity_score = min(len(unique_domains) / 5.0, 1.0) # Normalize to 0-1
|
||||
quality_score += diversity_score * 0.2
|
||||
quality_factors.append(f"Source diversity: {len(unique_domains)} unique domains")
|
||||
|
||||
# Factor 3: Content depth
|
||||
total_content_length = sum(len(support.segment_text) for support in grounding_metadata.grounding_supports)
|
||||
depth_score = min(total_content_length / 5000.0, 1.0) # Normalize to 0-1
|
||||
quality_score += depth_score * 0.2
|
||||
quality_factors.append(f"Content depth: {total_content_length} characters")
|
||||
|
||||
# Factor 4: Citation quality
|
||||
citation_quality = self._assess_citation_quality(grounding_metadata.citations)
|
||||
quality_score += citation_quality * 0.3
|
||||
quality_factors.append(f"Citation quality: {citation_quality:.2f}")
|
||||
|
||||
return {
|
||||
'overall_quality': min(quality_score, 1.0),
|
||||
'quality_factors': quality_factors,
|
||||
'quality_grade': self._get_quality_grade(quality_score)
|
||||
}
|
||||
|
||||
def _enhance_single_section(
|
||||
self,
|
||||
section: BlogOutlineSection,
|
||||
grounding_metadata: GroundingMetadata,
|
||||
insights: Dict[str, Any]
|
||||
) -> BlogOutlineSection:
|
||||
"""Enhance a single section using grounding insights."""
|
||||
# Extract relevant grounding data for this section
|
||||
relevant_chunks = self._find_relevant_chunks(section, grounding_metadata)
|
||||
relevant_supports = self._find_relevant_supports(section, grounding_metadata)
|
||||
|
||||
# Enhance subheadings with high-confidence insights
|
||||
enhanced_subheadings = self._enhance_subheadings(section, relevant_supports, insights)
|
||||
|
||||
# Enhance key points with authoritative insights
|
||||
enhanced_key_points = self._enhance_key_points(section, relevant_chunks, insights)
|
||||
|
||||
# Enhance keywords with related concepts
|
||||
enhanced_keywords = self._enhance_keywords(section, insights)
|
||||
|
||||
return BlogOutlineSection(
|
||||
id=section.id,
|
||||
heading=section.heading,
|
||||
subheadings=enhanced_subheadings,
|
||||
key_points=enhanced_key_points,
|
||||
references=section.references,
|
||||
target_words=section.target_words,
|
||||
keywords=enhanced_keywords
|
||||
)
|
||||
|
||||
def _calculate_chunk_authority(self, chunk: GroundingChunk) -> float:
|
||||
"""Calculate authority score for a grounding chunk."""
|
||||
authority_score = 0.5 # Base score
|
||||
|
||||
chunk_text = f"{chunk.title} {chunk.url}".lower()
|
||||
|
||||
# Check for authority indicators
|
||||
for level, indicators in self.authority_indicators.items():
|
||||
for indicator in indicators:
|
||||
if indicator in chunk_text:
|
||||
if level == 'high_authority':
|
||||
authority_score += 0.3
|
||||
elif level == 'medium_authority':
|
||||
authority_score += 0.2
|
||||
else: # low_authority
|
||||
authority_score -= 0.1
|
||||
|
||||
# Boost score based on confidence
|
||||
if chunk.confidence_score:
|
||||
authority_score += chunk.confidence_score * 0.2
|
||||
|
||||
return min(max(authority_score, 0.0), 1.0)
|
||||
|
||||
def _extract_insight_from_segment(self, segment_text: str) -> Optional[str]:
|
||||
"""Extract meaningful insight from segment text."""
|
||||
if not segment_text or len(segment_text.strip()) < 20:
|
||||
return None
|
||||
|
||||
# Clean and truncate insight
|
||||
insight = segment_text.strip()
|
||||
if len(insight) > 200:
|
||||
insight = insight[:200] + "..."
|
||||
|
||||
return insight
|
||||
|
||||
def _get_confidence_distribution(self, confidences: List[float]) -> Dict[str, int]:
|
||||
"""Get distribution of confidence scores."""
|
||||
distribution = {'high': 0, 'medium': 0, 'low': 0}
|
||||
|
||||
for confidence in confidences:
|
||||
if confidence >= 0.8:
|
||||
distribution['high'] += 1
|
||||
elif confidence >= 0.6:
|
||||
distribution['medium'] += 1
|
||||
else:
|
||||
distribution['low'] += 1
|
||||
|
||||
return distribution
|
||||
|
||||
def _calculate_temporal_balance(self, recent: int, evergreen: int) -> str:
|
||||
"""Calculate temporal balance of content."""
|
||||
total = recent + evergreen
|
||||
if total == 0:
|
||||
return 'unknown'
|
||||
|
||||
recent_ratio = recent / total
|
||||
if recent_ratio > 0.7:
|
||||
return 'recent_heavy'
|
||||
elif recent_ratio < 0.3:
|
||||
return 'evergreen_heavy'
|
||||
else:
|
||||
return 'balanced'
|
||||
|
||||
def _extract_related_concepts(self, text_list: List[str]) -> List[str]:
|
||||
"""Extract related concepts from text."""
|
||||
# Simple concept extraction - could be enhanced with NLP
|
||||
concepts = set()
|
||||
|
||||
for text in text_list:
|
||||
# Extract capitalized words (potential concepts)
|
||||
words = re.findall(r'\b[A-Z][a-z]+\b', text)
|
||||
concepts.update(words)
|
||||
|
||||
return list(concepts)[:10] # Limit to top 10
|
||||
|
||||
def _identify_content_gaps(self, text_list: List[str]) -> List[str]:
|
||||
"""Identify potential content gaps."""
|
||||
# Simple gap identification - could be enhanced with more sophisticated analysis
|
||||
gaps = []
|
||||
|
||||
# Look for common gap indicators
|
||||
gap_indicators = ['missing', 'lack of', 'not covered', 'gap', 'unclear', 'unexplained']
|
||||
|
||||
for text in text_list:
|
||||
text_lower = text.lower()
|
||||
for indicator in gap_indicators:
|
||||
if indicator in text_lower:
|
||||
# Extract potential gap
|
||||
gap = self._extract_gap_from_text(text, indicator)
|
||||
if gap:
|
||||
gaps.append(gap)
|
||||
|
||||
return gaps[:5] # Limit to top 5
|
||||
|
||||
def _extract_gap_from_text(self, text: str, indicator: str) -> Optional[str]:
|
||||
"""Extract content gap from text containing gap indicator."""
|
||||
# Simple extraction - could be enhanced
|
||||
sentences = text.split('.')
|
||||
for sentence in sentences:
|
||||
if indicator in sentence.lower():
|
||||
return sentence.strip()
|
||||
return None
|
||||
|
||||
def _assess_citation_quality(self, citations: List[Citation]) -> float:
|
||||
"""Assess quality of citations."""
|
||||
if not citations:
|
||||
return 0.0
|
||||
|
||||
quality_score = 0.0
|
||||
|
||||
for citation in citations:
|
||||
# Check citation type
|
||||
if citation.citation_type in ['expert_opinion', 'statistical_data', 'research_study']:
|
||||
quality_score += 0.3
|
||||
elif citation.citation_type in ['recent_news', 'case_study']:
|
||||
quality_score += 0.2
|
||||
else:
|
||||
quality_score += 0.1
|
||||
|
||||
# Check text quality
|
||||
if len(citation.text) > 20:
|
||||
quality_score += 0.1
|
||||
|
||||
return min(quality_score / len(citations), 1.0)
|
||||
|
||||
def _determine_primary_intent(self, intent_signals: List[str]) -> str:
|
||||
"""Determine primary search intent from signals."""
|
||||
if not intent_signals:
|
||||
return 'informational'
|
||||
|
||||
intent_counts = Counter(intent_signals)
|
||||
return intent_counts.most_common(1)[0][0]
|
||||
|
||||
def _get_quality_grade(self, quality_score: float) -> str:
|
||||
"""Get quality grade from score."""
|
||||
if quality_score >= 0.9:
|
||||
return 'A'
|
||||
elif quality_score >= 0.8:
|
||||
return 'B'
|
||||
elif quality_score >= 0.7:
|
||||
return 'C'
|
||||
elif quality_score >= 0.6:
|
||||
return 'D'
|
||||
else:
|
||||
return 'F'
|
||||
|
||||
def _find_relevant_chunks(self, section: BlogOutlineSection, grounding_metadata: GroundingMetadata) -> List[GroundingChunk]:
|
||||
"""Find grounding chunks relevant to the section."""
|
||||
relevant_chunks = []
|
||||
section_text = f"{section.heading} {' '.join(section.subheadings)} {' '.join(section.key_points)}".lower()
|
||||
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
chunk_text = chunk.title.lower()
|
||||
# Simple relevance check - could be enhanced with semantic similarity
|
||||
if any(word in chunk_text for word in section_text.split() if len(word) > 3):
|
||||
relevant_chunks.append(chunk)
|
||||
|
||||
return relevant_chunks
|
||||
|
||||
def _find_relevant_supports(self, section: BlogOutlineSection, grounding_metadata: GroundingMetadata) -> List[GroundingSupport]:
|
||||
"""Find grounding supports relevant to the section."""
|
||||
relevant_supports = []
|
||||
section_text = f"{section.heading} {' '.join(section.subheadings)} {' '.join(section.key_points)}".lower()
|
||||
|
||||
for support in grounding_metadata.grounding_supports:
|
||||
support_text = support.segment_text.lower()
|
||||
# Simple relevance check
|
||||
if any(word in support_text for word in section_text.split() if len(word) > 3):
|
||||
relevant_supports.append(support)
|
||||
|
||||
return relevant_supports
|
||||
|
||||
def _enhance_subheadings(self, section: BlogOutlineSection, relevant_supports: List[GroundingSupport], insights: Dict[str, Any]) -> List[str]:
|
||||
"""Enhance subheadings with grounding insights."""
|
||||
enhanced_subheadings = list(section.subheadings)
|
||||
|
||||
# Add high-confidence insights as subheadings
|
||||
high_confidence_insights = self._get_high_confidence_insights_from_supports(relevant_supports)
|
||||
for insight in high_confidence_insights[:2]: # Add up to 2 new subheadings
|
||||
if insight not in enhanced_subheadings:
|
||||
enhanced_subheadings.append(insight)
|
||||
|
||||
return enhanced_subheadings
|
||||
|
||||
def _enhance_key_points(self, section: BlogOutlineSection, relevant_chunks: List[GroundingChunk], insights: Dict[str, Any]) -> List[str]:
|
||||
"""Enhance key points with authoritative insights."""
|
||||
enhanced_key_points = list(section.key_points)
|
||||
|
||||
# Add insights from high-authority chunks
|
||||
for chunk in relevant_chunks:
|
||||
if chunk.confidence_score and chunk.confidence_score >= self.high_confidence_threshold:
|
||||
insight = f"Based on {chunk.title}: {self._extract_key_insight(chunk)}"
|
||||
if insight not in enhanced_key_points:
|
||||
enhanced_key_points.append(insight)
|
||||
|
||||
return enhanced_key_points
|
||||
|
||||
def _enhance_keywords(self, section: BlogOutlineSection, insights: Dict[str, Any]) -> List[str]:
|
||||
"""Enhance keywords with related concepts from grounding."""
|
||||
enhanced_keywords = list(section.keywords)
|
||||
|
||||
# Add related concepts from grounding analysis
|
||||
related_concepts = insights.get('content_relationships', {}).get('related_concepts', [])
|
||||
for concept in related_concepts[:3]: # Add up to 3 new keywords
|
||||
if concept.lower() not in [kw.lower() for kw in enhanced_keywords]:
|
||||
enhanced_keywords.append(concept)
|
||||
|
||||
return enhanced_keywords
|
||||
|
||||
def _get_high_confidence_insights_from_supports(self, supports: List[GroundingSupport]) -> List[str]:
|
||||
"""Get high-confidence insights from grounding supports."""
|
||||
insights = []
|
||||
for support in supports:
|
||||
if support.confidence_scores and max(support.confidence_scores) >= self.high_confidence_threshold:
|
||||
insight = self._extract_insight_from_segment(support.segment_text)
|
||||
if insight:
|
||||
insights.append(insight)
|
||||
return insights
|
||||
|
||||
def _extract_key_insight(self, chunk: GroundingChunk) -> str:
|
||||
"""Extract key insight from grounding chunk."""
|
||||
# Simple extraction - could be enhanced
|
||||
return f"High-confidence source with {chunk.confidence_score:.2f} confidence score"
|
||||
94
backend/services/blog_writer/outline/metadata_collector.py
Normal file
94
backend/services/blog_writer/outline/metadata_collector.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""
|
||||
Metadata Collector - Handles collection and formatting of outline metadata.
|
||||
|
||||
Collects source mapping stats, grounding insights, optimization results, and research coverage.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class MetadataCollector:
|
||||
"""Handles collection and formatting of various metadata types for UI display."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the metadata collector."""
|
||||
pass
|
||||
|
||||
def collect_source_mapping_stats(self, mapped_sections, research):
|
||||
"""Collect source mapping statistics for UI display."""
|
||||
from models.blog_models import SourceMappingStats
|
||||
|
||||
total_sources = len(research.sources)
|
||||
total_mapped = sum(len(section.references) for section in mapped_sections)
|
||||
coverage_percentage = (total_mapped / total_sources * 100) if total_sources > 0 else 0.0
|
||||
|
||||
# Calculate average relevance score (simplified)
|
||||
all_relevance_scores = []
|
||||
for section in mapped_sections:
|
||||
for ref in section.references:
|
||||
if hasattr(ref, 'credibility_score') and ref.credibility_score:
|
||||
all_relevance_scores.append(ref.credibility_score)
|
||||
|
||||
average_relevance = sum(all_relevance_scores) / len(all_relevance_scores) if all_relevance_scores else 0.0
|
||||
high_confidence_mappings = sum(1 for score in all_relevance_scores if score >= 0.8)
|
||||
|
||||
return SourceMappingStats(
|
||||
total_sources_mapped=total_mapped,
|
||||
coverage_percentage=round(coverage_percentage, 1),
|
||||
average_relevance_score=round(average_relevance, 3),
|
||||
high_confidence_mappings=high_confidence_mappings
|
||||
)
|
||||
|
||||
def collect_grounding_insights(self, grounding_insights):
|
||||
"""Collect grounding insights for UI display."""
|
||||
from models.blog_models import GroundingInsights
|
||||
|
||||
return GroundingInsights(
|
||||
confidence_analysis=grounding_insights.get('confidence_analysis'),
|
||||
authority_analysis=grounding_insights.get('authority_analysis'),
|
||||
temporal_analysis=grounding_insights.get('temporal_analysis'),
|
||||
content_relationships=grounding_insights.get('content_relationships'),
|
||||
citation_insights=grounding_insights.get('citation_insights'),
|
||||
search_intent_insights=grounding_insights.get('search_intent_insights'),
|
||||
quality_indicators=grounding_insights.get('quality_indicators')
|
||||
)
|
||||
|
||||
def collect_optimization_results(self, optimized_sections, focus):
|
||||
"""Collect optimization results for UI display."""
|
||||
from models.blog_models import OptimizationResults
|
||||
|
||||
# Calculate a quality score based on section completeness
|
||||
total_sections = len(optimized_sections)
|
||||
complete_sections = sum(1 for section in optimized_sections
|
||||
if section.heading and section.subheadings and section.key_points)
|
||||
|
||||
quality_score = (complete_sections / total_sections * 10) if total_sections > 0 else 0.0
|
||||
|
||||
improvements_made = [
|
||||
"Enhanced section headings for better SEO",
|
||||
"Optimized keyword distribution across sections",
|
||||
"Improved content flow and logical progression",
|
||||
"Balanced word count distribution",
|
||||
"Enhanced subheadings for better readability"
|
||||
]
|
||||
|
||||
return OptimizationResults(
|
||||
overall_quality_score=round(quality_score, 1),
|
||||
improvements_made=improvements_made,
|
||||
optimization_focus=focus
|
||||
)
|
||||
|
||||
def collect_research_coverage(self, research):
|
||||
"""Collect research coverage metrics for UI display."""
|
||||
from models.blog_models import ResearchCoverage
|
||||
|
||||
sources_utilized = len(research.sources)
|
||||
content_gaps = research.keyword_analysis.get('content_gaps', [])
|
||||
competitive_advantages = research.competitor_analysis.get('competitive_advantages', [])
|
||||
|
||||
return ResearchCoverage(
|
||||
sources_utilized=sources_utilized,
|
||||
content_gaps_identified=len(content_gaps),
|
||||
competitive_advantages=competitive_advantages[:5] # Limit to top 5
|
||||
)
|
||||
@@ -4,7 +4,7 @@ Outline Generator - AI-powered outline generation from research data.
|
||||
Generates comprehensive, SEO-optimized outlines using research intelligence.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
from typing import Dict, Any, List, Tuple
|
||||
import asyncio
|
||||
from loguru import logger
|
||||
|
||||
@@ -14,10 +14,34 @@ from models.blog_models import (
|
||||
BlogOutlineSection,
|
||||
)
|
||||
|
||||
from .source_mapper import SourceToSectionMapper
|
||||
from .section_enhancer import SectionEnhancer
|
||||
from .outline_optimizer import OutlineOptimizer
|
||||
from .grounding_engine import GroundingContextEngine
|
||||
from .title_generator import TitleGenerator
|
||||
from .metadata_collector import MetadataCollector
|
||||
from .prompt_builder import PromptBuilder
|
||||
from .response_processor import ResponseProcessor
|
||||
from .parallel_processor import ParallelProcessor
|
||||
|
||||
|
||||
class OutlineGenerator:
|
||||
"""Generates AI-powered outlines from research data."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the outline generator with all enhancement modules."""
|
||||
self.source_mapper = SourceToSectionMapper()
|
||||
self.section_enhancer = SectionEnhancer()
|
||||
self.outline_optimizer = OutlineOptimizer()
|
||||
self.grounding_engine = GroundingContextEngine()
|
||||
|
||||
# Initialize extracted classes
|
||||
self.title_generator = TitleGenerator()
|
||||
self.metadata_collector = MetadataCollector()
|
||||
self.prompt_builder = PromptBuilder()
|
||||
self.response_processor = ResponseProcessor()
|
||||
self.parallel_processor = ParallelProcessor(self.source_mapper, self.grounding_engine)
|
||||
|
||||
async def generate(self, request: BlogOutlineRequest) -> BlogOutlineResponse:
|
||||
"""
|
||||
Generate AI-powered outline using research results
|
||||
@@ -34,7 +58,7 @@ class OutlineGenerator:
|
||||
custom_instructions = getattr(request, 'custom_instructions', None)
|
||||
|
||||
# Build comprehensive outline generation prompt with rich research data
|
||||
outline_prompt = self._build_outline_prompt(
|
||||
outline_prompt = self.prompt_builder.build_outline_prompt(
|
||||
primary_keywords, secondary_keywords, content_angles, sources,
|
||||
search_intent, request, custom_instructions
|
||||
)
|
||||
@@ -42,32 +66,63 @@ class OutlineGenerator:
|
||||
logger.info("Generating AI-powered outline using research results")
|
||||
|
||||
# Define schema with proper property ordering (critical for Gemini API)
|
||||
outline_schema = self._get_outline_schema()
|
||||
outline_schema = self.prompt_builder.get_outline_schema()
|
||||
|
||||
# Generate outline using structured JSON response with retry logic
|
||||
outline_data = await self._generate_with_retry(outline_prompt, outline_schema)
|
||||
outline_data = await self.response_processor.generate_with_retry(outline_prompt, outline_schema)
|
||||
|
||||
# Convert to BlogOutlineSection objects
|
||||
outline_sections = self._convert_to_sections(outline_data, sources)
|
||||
outline_sections = self.response_processor.convert_to_sections(outline_data, sources)
|
||||
|
||||
# Extract title options
|
||||
title_options = outline_data.get('title_options', [])
|
||||
if not title_options:
|
||||
title_options = self._generate_fallback_titles(primary_keywords)
|
||||
# Run parallel processing for speed optimization
|
||||
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing_async(
|
||||
outline_sections, research
|
||||
)
|
||||
|
||||
logger.info(f"Generated outline with {len(outline_sections)} sections and {len(title_options)} title options")
|
||||
# Enhance sections with grounding insights
|
||||
logger.info("Enhancing sections with grounding insights...")
|
||||
grounding_enhanced_sections = self.grounding_engine.enhance_sections_with_grounding(
|
||||
mapped_sections, research.grounding_metadata, grounding_insights
|
||||
)
|
||||
|
||||
# Optimize outline for better flow, SEO, and engagement
|
||||
logger.info("Optimizing outline for better flow and engagement...")
|
||||
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization")
|
||||
|
||||
# Rebalance word counts for optimal distribution
|
||||
target_words = request.word_count or 1500
|
||||
balanced_sections = self.outline_optimizer.rebalance_word_counts(optimized_sections, target_words)
|
||||
|
||||
# Extract title options - combine AI-generated with content angles
|
||||
ai_title_options = outline_data.get('title_options', [])
|
||||
content_angle_titles = self.title_generator.extract_content_angle_titles(research)
|
||||
|
||||
# Combine AI-generated titles with content angles
|
||||
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
|
||||
|
||||
logger.info(f"Generated optimized outline with {len(balanced_sections)} sections and {len(title_options)} title options")
|
||||
|
||||
# Collect metadata for enhanced UI
|
||||
source_mapping_stats = self.metadata_collector.collect_source_mapping_stats(mapped_sections, research)
|
||||
grounding_insights_data = self.metadata_collector.collect_grounding_insights(grounding_insights)
|
||||
optimization_results = self.metadata_collector.collect_optimization_results(optimized_sections, "comprehensive optimization")
|
||||
research_coverage = self.metadata_collector.collect_research_coverage(research)
|
||||
|
||||
return BlogOutlineResponse(
|
||||
success=True,
|
||||
title_options=title_options,
|
||||
outline=outline_sections
|
||||
outline=balanced_sections,
|
||||
source_mapping_stats=source_mapping_stats,
|
||||
grounding_insights=grounding_insights_data,
|
||||
optimization_results=optimization_results,
|
||||
research_coverage=research_coverage
|
||||
)
|
||||
|
||||
async def generate_with_progress(self, request: BlogOutlineRequest, task_id: str) -> BlogOutlineResponse:
|
||||
"""
|
||||
Outline generation method with progress updates for real-time feedback.
|
||||
"""
|
||||
from api.blog_writer.router import _update_progress
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
|
||||
# Extract research insights
|
||||
research = request.research
|
||||
@@ -80,272 +135,168 @@ class OutlineGenerator:
|
||||
# Check for custom instructions
|
||||
custom_instructions = getattr(request, 'custom_instructions', None)
|
||||
|
||||
await _update_progress(task_id, "📊 Analyzing research data and building content strategy...")
|
||||
await task_manager.update_progress(task_id, "📊 Analyzing research data and building content strategy...")
|
||||
|
||||
# Build comprehensive outline generation prompt with rich research data
|
||||
outline_prompt = self._build_outline_prompt(
|
||||
outline_prompt = self.prompt_builder.build_outline_prompt(
|
||||
primary_keywords, secondary_keywords, content_angles, sources,
|
||||
search_intent, request, custom_instructions
|
||||
)
|
||||
|
||||
await _update_progress(task_id, "🤖 Generating AI-powered outline with research insights...")
|
||||
await task_manager.update_progress(task_id, "🤖 Generating AI-powered outline with research insights...")
|
||||
|
||||
# Define schema with proper property ordering (critical for Gemini API)
|
||||
outline_schema = self._get_outline_schema()
|
||||
outline_schema = self.prompt_builder.get_outline_schema()
|
||||
|
||||
await _update_progress(task_id, "🔄 Making AI request to generate structured outline...")
|
||||
await task_manager.update_progress(task_id, "🔄 Making AI request to generate structured outline...")
|
||||
|
||||
# Generate outline using structured JSON response with retry logic
|
||||
outline_data = await self._generate_with_retry(outline_prompt, outline_schema, task_id)
|
||||
outline_data = await self.response_processor.generate_with_retry(outline_prompt, outline_schema, task_id)
|
||||
|
||||
await _update_progress(task_id, "📝 Processing outline structure and validating sections...")
|
||||
await task_manager.update_progress(task_id, "📝 Processing outline structure and validating sections...")
|
||||
|
||||
# Convert to BlogOutlineSection objects
|
||||
outline_sections = self._convert_to_sections(outline_data, sources)
|
||||
outline_sections = self.response_processor.convert_to_sections(outline_data, sources)
|
||||
|
||||
# Extract title options
|
||||
title_options = outline_data.get('title_options', [])
|
||||
if not title_options:
|
||||
title_options = self._generate_fallback_titles(primary_keywords)
|
||||
# Run parallel processing for speed optimization
|
||||
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing(
|
||||
outline_sections, research, task_id
|
||||
)
|
||||
|
||||
await _update_progress(task_id, "✅ Outline generation completed successfully!")
|
||||
# Enhance sections with grounding insights (depends on both previous tasks)
|
||||
await task_manager.update_progress(task_id, "✨ Enhancing sections with grounding insights...")
|
||||
grounding_enhanced_sections = self.grounding_engine.enhance_sections_with_grounding(
|
||||
mapped_sections, research.grounding_metadata, grounding_insights
|
||||
)
|
||||
|
||||
# Optimize outline for better flow, SEO, and engagement
|
||||
await task_manager.update_progress(task_id, "🎯 Optimizing outline for better flow and engagement...")
|
||||
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization")
|
||||
|
||||
# Rebalance word counts for optimal distribution
|
||||
await task_manager.update_progress(task_id, "⚖️ Rebalancing word count distribution...")
|
||||
target_words = request.word_count or 1500
|
||||
balanced_sections = self.outline_optimizer.rebalance_word_counts(optimized_sections, target_words)
|
||||
|
||||
# Extract title options - combine AI-generated with content angles
|
||||
ai_title_options = outline_data.get('title_options', [])
|
||||
content_angle_titles = self.title_generator.extract_content_angle_titles(research)
|
||||
|
||||
# Combine AI-generated titles with content angles
|
||||
title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
|
||||
|
||||
await task_manager.update_progress(task_id, "✅ Outline generation and optimization completed successfully!")
|
||||
|
||||
# Collect metadata for enhanced UI
|
||||
source_mapping_stats = self.metadata_collector.collect_source_mapping_stats(mapped_sections, research)
|
||||
grounding_insights_data = self.metadata_collector.collect_grounding_insights(grounding_insights)
|
||||
optimization_results = self.metadata_collector.collect_optimization_results(optimized_sections, "comprehensive optimization")
|
||||
research_coverage = self.metadata_collector.collect_research_coverage(research)
|
||||
|
||||
return BlogOutlineResponse(
|
||||
success=True,
|
||||
title_options=title_options,
|
||||
outline=outline_sections
|
||||
outline=balanced_sections,
|
||||
source_mapping_stats=source_mapping_stats,
|
||||
grounding_insights=grounding_insights_data,
|
||||
optimization_results=optimization_results,
|
||||
research_coverage=research_coverage
|
||||
)
|
||||
|
||||
def _build_outline_prompt(self, primary_keywords: List[str], secondary_keywords: List[str],
|
||||
content_angles: List[str], sources: List, search_intent: str,
|
||||
request: BlogOutlineRequest, custom_instructions: str = None) -> str:
|
||||
"""Build the comprehensive outline generation prompt."""
|
||||
return f"""
|
||||
You are a world-class content strategist and SEO expert with 15+ years of experience creating viral, high-converting blog content. Your outlines have generated millions of views and driven significant business results.
|
||||
|
||||
CONTENT STRATEGY BRIEF:
|
||||
Topic: {', '.join(primary_keywords)}
|
||||
Search Intent: {search_intent}
|
||||
Target Word Count: {request.word_count or 1500} words
|
||||
Industry Context: {getattr(request.persona, 'industry', 'General') if request.persona else 'General'}
|
||||
Audience: {getattr(request.persona, 'target_audience', 'General') if request.persona else 'General'}
|
||||
|
||||
{f"CUSTOM USER INSTRUCTIONS: {custom_instructions}" if custom_instructions else ""}
|
||||
|
||||
RESEARCH INTELLIGENCE:
|
||||
Primary Keywords: {', '.join(primary_keywords)}
|
||||
Secondary Keywords: {', '.join(secondary_keywords)}
|
||||
Long-tail Opportunities: {', '.join(request.research.keyword_analysis.get('long_tail', [])[:5])}
|
||||
Semantic Keywords: {', '.join(request.research.keyword_analysis.get('semantic_keywords', [])[:5])}
|
||||
Trending Terms: {', '.join(request.research.keyword_analysis.get('trending_terms', [])[:3])}
|
||||
Keyword Difficulty: {request.research.keyword_analysis.get('difficulty', 6)}/10
|
||||
Content Gaps: {', '.join(request.research.keyword_analysis.get('content_gaps', [])[:3])}
|
||||
|
||||
Content Angles Discovered:
|
||||
{chr(10).join([f"• {angle}" for angle in content_angles[:6]])}
|
||||
|
||||
Competitive Intelligence:
|
||||
Top Competitors: {', '.join(request.research.competitor_analysis.get('top_competitors', [])[:3])}
|
||||
Market Opportunities: {', '.join(request.research.competitor_analysis.get('opportunities', [])[:3])}
|
||||
Competitive Advantages: {', '.join(request.research.competitor_analysis.get('competitive_advantages', [])[:3])}
|
||||
Market Positioning: {request.research.competitor_analysis.get('market_positioning', 'Standard positioning')}
|
||||
|
||||
Research Sources Available: {len(sources)} authoritative sources with current data
|
||||
Key Statistics Available: Multiple data points, percentages, and expert quotes from credible sources
|
||||
|
||||
STRATEGIC OUTLINE REQUIREMENTS:
|
||||
|
||||
{f"CUSTOM REQUIREMENTS: {custom_instructions}" if custom_instructions else ""}
|
||||
|
||||
1. CONTENT ARCHITECTURE:
|
||||
- Create a logical, engaging narrative arc that guides readers from problem to solution
|
||||
- Structure content to build authority and trust progressively
|
||||
- Include data-driven insights and expert opinions from research
|
||||
- Ensure each section adds unique value and builds upon previous sections
|
||||
|
||||
2. SEO OPTIMIZATION:
|
||||
- Naturally integrate primary keywords in headings and content
|
||||
- Use secondary keywords strategically throughout sections
|
||||
- Include long-tail keywords in subheadings and key points
|
||||
- Optimize for featured snippets and voice search
|
||||
|
||||
3. READER ENGAGEMENT:
|
||||
- Start with compelling hooks and pain points
|
||||
- Use storytelling elements and real-world examples
|
||||
- Include actionable insights and practical takeaways
|
||||
- End with clear next steps and calls-to-action
|
||||
|
||||
4. CONTENT DEPTH:
|
||||
- Provide comprehensive coverage of the topic
|
||||
- Include multiple perspectives and expert insights
|
||||
- Address common questions and objections
|
||||
- Offer unique angles not covered by competitors
|
||||
|
||||
5. WORD COUNT DISTRIBUTION:
|
||||
- Introduction: 12% of total word count
|
||||
- Main content sections: 76% of total word count
|
||||
- Conclusion: 12% of total word count
|
||||
- Ensure balanced section lengths for optimal readability
|
||||
|
||||
6. COMPETITIVE ADVANTAGE:
|
||||
- Leverage content gaps identified in research
|
||||
- Include unique data points and statistics
|
||||
- Provide fresh perspectives on trending topics
|
||||
- Address underserved audience segments
|
||||
|
||||
TITLE STRATEGY:
|
||||
Create 5 compelling title options that:
|
||||
- Include primary keywords naturally
|
||||
- Promise clear value and outcomes
|
||||
- Appeal to the target audience's pain points
|
||||
- Stand out from competitor content
|
||||
- Optimize for click-through rates
|
||||
|
||||
Generate a comprehensive outline with the following structure:
|
||||
{{
|
||||
"title_options": [
|
||||
"Title 1 with primary keyword",
|
||||
"Title 2 with emotional hook",
|
||||
"Title 3 with benefit-focused approach",
|
||||
"Title 4 with question format",
|
||||
"Title 5 with urgency/trending angle"
|
||||
],
|
||||
"outline": [
|
||||
{{
|
||||
"heading": "Section heading with primary keyword",
|
||||
"subheadings": ["Subheading 1", "Subheading 2", "Subheading 3"],
|
||||
"key_points": ["Key point 1", "Key point 2", "Key point 3"],
|
||||
"word_count": 300,
|
||||
"keywords": ["primary keyword", "secondary keyword"]
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
|
||||
async def enhance_section(self, section: BlogOutlineSection, focus: str = "general improvement") -> BlogOutlineSection:
|
||||
"""
|
||||
|
||||
def _get_outline_schema(self) -> Dict[str, Any]:
|
||||
"""Get the structured JSON schema for outline generation."""
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title_options": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"outline": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"heading": {"type": "string"},
|
||||
"subheadings": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"key_points": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"word_count": {"type": "integer"},
|
||||
"keywords": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
},
|
||||
"required": ["heading", "subheadings", "key_points", "word_count", "keywords"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["title_options", "outline"],
|
||||
"propertyOrdering": ["title_options", "outline"]
|
||||
}
|
||||
|
||||
async def _generate_with_retry(self, prompt: str, schema: Dict[str, Any], task_id: str = None) -> Dict[str, Any]:
|
||||
"""Generate outline with retry logic for API failures."""
|
||||
from services.llm_providers.gemini_provider import gemini_structured_json_response
|
||||
from api.blog_writer.router import _update_progress
|
||||
Enhance a single section using AI with research context.
|
||||
|
||||
max_retries = 2 # Conservative retry for expensive API calls
|
||||
retry_delay = 5 # 5 second delay between retries
|
||||
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
if task_id:
|
||||
await _update_progress(task_id, f"🤖 Calling Gemini API for outline generation (attempt {attempt + 1}/{max_retries + 1})...")
|
||||
|
||||
outline_data = gemini_structured_json_response(
|
||||
prompt=prompt,
|
||||
schema=schema,
|
||||
temperature=0.3,
|
||||
max_tokens=4000 # Increased to avoid MAX_TOKENS truncation
|
||||
)
|
||||
|
||||
# Log response for debugging
|
||||
logger.info(f"Gemini response received: {type(outline_data)}")
|
||||
|
||||
# Check for errors in the response
|
||||
if isinstance(outline_data, dict) and 'error' in outline_data:
|
||||
error_msg = str(outline_data['error'])
|
||||
if "503" in error_msg and "overloaded" in error_msg and attempt < max_retries:
|
||||
if task_id:
|
||||
await _update_progress(task_id, f"⚠️ AI service overloaded, retrying in {retry_delay} seconds...")
|
||||
logger.warning(f"Gemini API overloaded, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
|
||||
await asyncio.sleep(retry_delay)
|
||||
continue
|
||||
else:
|
||||
logger.error(f"Gemini structured response error: {outline_data['error']}")
|
||||
raise ValueError(f"AI outline generation failed: {outline_data['error']}")
|
||||
|
||||
# Validate required fields
|
||||
if not isinstance(outline_data, dict) or 'outline' not in outline_data or not isinstance(outline_data['outline'], list):
|
||||
if attempt < max_retries:
|
||||
if task_id:
|
||||
await _update_progress(task_id, f"⚠️ Invalid response structure, retrying in {retry_delay} seconds...")
|
||||
logger.warning(f"Invalid response structure, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
|
||||
await asyncio.sleep(retry_delay)
|
||||
continue
|
||||
else:
|
||||
raise ValueError("Invalid outline structure in Gemini response")
|
||||
|
||||
# If we get here, the response is valid
|
||||
return outline_data
|
||||
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
if ("503" in error_str or "overloaded" in error_str) and attempt < max_retries:
|
||||
if task_id:
|
||||
await _update_progress(task_id, f"⚠️ AI service error, retrying in {retry_delay} seconds...")
|
||||
logger.warning(f"Gemini API error, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1}): {error_str}")
|
||||
await asyncio.sleep(retry_delay)
|
||||
continue
|
||||
else:
|
||||
logger.error(f"Outline generation failed after {attempt + 1} attempts: {error_str}")
|
||||
raise ValueError(f"AI outline generation failed: {error_str}")
|
||||
Args:
|
||||
section: The section to enhance
|
||||
focus: Enhancement focus area (e.g., "SEO optimization", "engagement", "comprehensiveness")
|
||||
|
||||
Returns:
|
||||
Enhanced section with improved content
|
||||
"""
|
||||
logger.info(f"Enhancing section '{section.heading}' with focus: {focus}")
|
||||
enhanced_section = await self.section_enhancer.enhance(section, focus)
|
||||
logger.info(f"✅ Section enhancement completed for '{section.heading}'")
|
||||
return enhanced_section
|
||||
|
||||
def _convert_to_sections(self, outline_data: Dict[str, Any], sources: List) -> List[BlogOutlineSection]:
|
||||
"""Convert outline data to BlogOutlineSection objects."""
|
||||
outline_sections = []
|
||||
for i, section_data in enumerate(outline_data.get('outline', [])):
|
||||
if not isinstance(section_data, dict) or 'heading' not in section_data:
|
||||
continue
|
||||
|
||||
section = BlogOutlineSection(
|
||||
id=f"s{i+1}",
|
||||
heading=section_data.get('heading', f'Section {i+1}'),
|
||||
subheadings=section_data.get('subheadings', []),
|
||||
key_points=section_data.get('key_points', []),
|
||||
references=sources[:3], # Use first 3 sources as references
|
||||
target_words=section_data.get('word_count', 200),
|
||||
keywords=section_data.get('keywords', [])
|
||||
)
|
||||
outline_sections.append(section)
|
||||
async def optimize_outline(self, outline: List[BlogOutlineSection], focus: str = "comprehensive optimization") -> List[BlogOutlineSection]:
|
||||
"""
|
||||
Optimize an entire outline for better flow, SEO, and engagement.
|
||||
|
||||
return outline_sections
|
||||
Args:
|
||||
outline: List of sections to optimize
|
||||
focus: Optimization focus area
|
||||
|
||||
Returns:
|
||||
Optimized outline with improved flow and engagement
|
||||
"""
|
||||
logger.info(f"Optimizing outline with {len(outline)} sections, focus: {focus}")
|
||||
optimized_outline = await self.outline_optimizer.optimize(outline, focus)
|
||||
logger.info(f"✅ Outline optimization completed for {len(optimized_outline)} sections")
|
||||
return optimized_outline
|
||||
|
||||
def rebalance_outline_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
|
||||
"""
|
||||
Rebalance word count distribution across outline sections.
|
||||
|
||||
Args:
|
||||
outline: List of sections to rebalance
|
||||
target_words: Total target word count
|
||||
|
||||
Returns:
|
||||
Outline with rebalanced word counts
|
||||
"""
|
||||
logger.info(f"Rebalancing word counts for {len(outline)} sections, target: {target_words} words")
|
||||
rebalanced_outline = self.outline_optimizer.rebalance_word_counts(outline, target_words)
|
||||
logger.info(f"✅ Word count rebalancing completed")
|
||||
return rebalanced_outline
|
||||
|
||||
def get_grounding_insights(self, research_data) -> Dict[str, Any]:
|
||||
"""
|
||||
Get grounding metadata insights for research data.
|
||||
|
||||
Args:
|
||||
research_data: Research data with grounding metadata
|
||||
|
||||
Returns:
|
||||
Dictionary containing grounding insights and analysis
|
||||
"""
|
||||
logger.info("Extracting grounding insights from research data...")
|
||||
insights = self.grounding_engine.extract_contextual_insights(research_data.grounding_metadata)
|
||||
logger.info(f"✅ Extracted {len(insights)} grounding insight categories")
|
||||
return insights
|
||||
|
||||
def get_authority_sources(self, research_data) -> List[Tuple]:
|
||||
"""
|
||||
Get high-authority sources from grounding metadata.
|
||||
|
||||
Args:
|
||||
research_data: Research data with grounding metadata
|
||||
|
||||
Returns:
|
||||
List of (chunk, authority_score) tuples sorted by authority
|
||||
"""
|
||||
logger.info("Identifying high-authority sources from grounding metadata...")
|
||||
authority_sources = self.grounding_engine.get_authority_sources(research_data.grounding_metadata)
|
||||
logger.info(f"✅ Identified {len(authority_sources)} high-authority sources")
|
||||
return authority_sources
|
||||
|
||||
def get_high_confidence_insights(self, research_data) -> List[str]:
|
||||
"""
|
||||
Get high-confidence insights from grounding metadata.
|
||||
|
||||
Args:
|
||||
research_data: Research data with grounding metadata
|
||||
|
||||
Returns:
|
||||
List of high-confidence insights
|
||||
"""
|
||||
logger.info("Extracting high-confidence insights from grounding metadata...")
|
||||
insights = self.grounding_engine.get_high_confidence_insights(research_data.grounding_metadata)
|
||||
logger.info(f"✅ Extracted {len(insights)} high-confidence insights")
|
||||
return insights
|
||||
|
||||
|
||||
|
||||
def _generate_fallback_titles(self, primary_keywords: List[str]) -> List[str]:
|
||||
"""Generate fallback titles when AI generation fails."""
|
||||
primary_keyword = primary_keywords[0] if primary_keywords else "Topic"
|
||||
return [
|
||||
f"The Complete Guide to {primary_keyword}",
|
||||
f"{primary_keyword}: Everything You Need to Know",
|
||||
f"How to Master {primary_keyword} in 2024"
|
||||
]
|
||||
|
||||
@@ -17,61 +17,64 @@ class OutlineOptimizer:
|
||||
"""Optimize entire outline for better flow, SEO, and engagement."""
|
||||
outline_text = "\n".join([f"{i+1}. {s.heading}" for i, s in enumerate(outline)])
|
||||
|
||||
optimization_prompt = f"""
|
||||
Optimize this blog outline for better flow, engagement, and SEO:
|
||||
|
||||
Current Outline:
|
||||
{outline_text}
|
||||
|
||||
Optimization Focus: {focus}
|
||||
|
||||
Optimization Goals:
|
||||
- Improve narrative flow and logical progression
|
||||
- Enhance SEO with better keyword distribution
|
||||
- Increase engagement with compelling headings
|
||||
- Ensure comprehensive coverage of the topic
|
||||
- Optimize for featured snippets and voice search
|
||||
|
||||
Respond with JSON array of optimized sections:
|
||||
[
|
||||
{{
|
||||
"heading": "Optimized heading",
|
||||
"subheadings": ["subheading 1", "subheading 2"],
|
||||
"key_points": ["point 1", "point 2"],
|
||||
"target_words": 300,
|
||||
"keywords": ["keyword1", "keyword2"]
|
||||
}}
|
||||
]
|
||||
"""
|
||||
optimization_prompt = f"""Optimize this blog outline for better flow, engagement, and SEO:
|
||||
|
||||
Current Outline:
|
||||
{outline_text}
|
||||
|
||||
Optimization Focus: {focus}
|
||||
|
||||
Goals: Improve narrative flow, enhance SEO, increase engagement, ensure comprehensive coverage.
|
||||
|
||||
Return JSON format:
|
||||
{{
|
||||
"outline": [
|
||||
{{
|
||||
"heading": "Optimized heading",
|
||||
"subheadings": ["subheading 1", "subheading 2"],
|
||||
"key_points": ["point 1", "point 2"],
|
||||
"target_words": 300,
|
||||
"keywords": ["keyword1", "keyword2"]
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
|
||||
try:
|
||||
from services.llm_providers.gemini_provider import gemini_structured_json_response
|
||||
|
||||
optimization_schema = {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"heading": {"type": "string"},
|
||||
"subheadings": {"type": "array", "items": {"type": "string"}},
|
||||
"key_points": {"type": "array", "items": {"type": "string"}},
|
||||
"target_words": {"type": "integer"},
|
||||
"keywords": {"type": "array", "items": {"type": "string"}}
|
||||
},
|
||||
"required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
|
||||
}
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"outline": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"heading": {"type": "string"},
|
||||
"subheadings": {"type": "array", "items": {"type": "string"}},
|
||||
"key_points": {"type": "array", "items": {"type": "string"}},
|
||||
"target_words": {"type": "integer"},
|
||||
"keywords": {"type": "array", "items": {"type": "string"}}
|
||||
},
|
||||
"required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["outline"],
|
||||
"propertyOrdering": ["outline"]
|
||||
}
|
||||
|
||||
optimized_data = gemini_structured_json_response(
|
||||
prompt=optimization_prompt,
|
||||
schema=optimization_schema,
|
||||
temperature=0.3,
|
||||
max_tokens=2000
|
||||
max_tokens=6000 # Match main outline generator
|
||||
)
|
||||
|
||||
if isinstance(optimized_data, list):
|
||||
# Handle the new schema format with "outline" wrapper
|
||||
if isinstance(optimized_data, dict) and 'outline' in optimized_data:
|
||||
optimized_sections = []
|
||||
for i, section_data in enumerate(optimized_data):
|
||||
for i, section_data in enumerate(optimized_data['outline']):
|
||||
section = BlogOutlineSection(
|
||||
id=f"s{i+1}",
|
||||
heading=section_data.get('heading', f'Section {i+1}'),
|
||||
@@ -82,9 +85,14 @@ class OutlineOptimizer:
|
||||
keywords=section_data.get('keywords', [])
|
||||
)
|
||||
optimized_sections.append(section)
|
||||
logger.info(f"✅ Outline optimization completed: {len(optimized_sections)} sections optimized")
|
||||
return optimized_sections
|
||||
else:
|
||||
logger.warning(f"Invalid optimization response format: {type(optimized_data)}")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"AI outline optimization failed: {e}")
|
||||
logger.info("Returning original outline without optimization")
|
||||
|
||||
return outline
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ from models.blog_models import (
|
||||
from .outline_generator import OutlineGenerator
|
||||
from .outline_optimizer import OutlineOptimizer
|
||||
from .section_enhancer import SectionEnhancer
|
||||
from services.cache.persistent_outline_cache import persistent_outline_cache
|
||||
|
||||
|
||||
class OutlineService:
|
||||
@@ -33,13 +34,90 @@ class OutlineService:
|
||||
Stage 2: Content Planning with AI-generated outline using research results
|
||||
Uses Gemini with research data to create comprehensive, SEO-optimized outline
|
||||
"""
|
||||
return await self.outline_generator.generate(request)
|
||||
# Extract cache parameters - use original user keywords for consistent caching
|
||||
keywords = request.research.original_keywords or request.research.keyword_analysis.get('primary', [])
|
||||
industry = getattr(request.persona, 'industry', 'general') if request.persona else 'general'
|
||||
target_audience = getattr(request.persona, 'target_audience', 'general') if request.persona else 'general'
|
||||
word_count = request.word_count or 1500
|
||||
custom_instructions = request.custom_instructions or ""
|
||||
persona_data = request.persona.dict() if request.persona else None
|
||||
|
||||
# Check cache first
|
||||
cached_result = persistent_outline_cache.get_cached_outline(
|
||||
keywords=keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
word_count=word_count,
|
||||
custom_instructions=custom_instructions,
|
||||
persona_data=persona_data
|
||||
)
|
||||
|
||||
if cached_result:
|
||||
logger.info(f"Using cached outline for keywords: {keywords}")
|
||||
return BlogOutlineResponse(**cached_result)
|
||||
|
||||
# Generate new outline if not cached
|
||||
logger.info(f"Generating new outline for keywords: {keywords}")
|
||||
result = await self.outline_generator.generate(request)
|
||||
|
||||
# Cache the result
|
||||
persistent_outline_cache.cache_outline(
|
||||
keywords=keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
word_count=word_count,
|
||||
custom_instructions=custom_instructions,
|
||||
persona_data=persona_data,
|
||||
result=result.dict()
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
async def generate_outline_with_progress(self, request: BlogOutlineRequest, task_id: str) -> BlogOutlineResponse:
|
||||
"""
|
||||
Outline generation method with progress updates for real-time feedback.
|
||||
"""
|
||||
return await self.outline_generator.generate_with_progress(request, task_id)
|
||||
# Extract cache parameters - use original user keywords for consistent caching
|
||||
keywords = request.research.original_keywords or request.research.keyword_analysis.get('primary', [])
|
||||
industry = getattr(request.persona, 'industry', 'general') if request.persona else 'general'
|
||||
target_audience = getattr(request.persona, 'target_audience', 'general') if request.persona else 'general'
|
||||
word_count = request.word_count or 1500
|
||||
custom_instructions = request.custom_instructions or ""
|
||||
persona_data = request.persona.dict() if request.persona else None
|
||||
|
||||
# Check cache first
|
||||
cached_result = persistent_outline_cache.get_cached_outline(
|
||||
keywords=keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
word_count=word_count,
|
||||
custom_instructions=custom_instructions,
|
||||
persona_data=persona_data
|
||||
)
|
||||
|
||||
if cached_result:
|
||||
logger.info(f"Using cached outline for keywords: {keywords} (with progress updates)")
|
||||
# Update progress to show cache hit
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
await task_manager.update_progress(task_id, "✅ Using cached outline (saved generation time!)")
|
||||
return BlogOutlineResponse(**cached_result)
|
||||
|
||||
# Generate new outline if not cached
|
||||
logger.info(f"Generating new outline for keywords: {keywords} (with progress updates)")
|
||||
result = await self.outline_generator.generate_with_progress(request, task_id)
|
||||
|
||||
# Cache the result
|
||||
persistent_outline_cache.cache_outline(
|
||||
keywords=keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
word_count=word_count,
|
||||
custom_instructions=custom_instructions,
|
||||
persona_data=persona_data,
|
||||
result=result.dict()
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
async def refine_outline(self, request: BlogOutlineRefineRequest) -> BlogOutlineResponse:
|
||||
"""
|
||||
@@ -152,3 +230,29 @@ class OutlineService:
|
||||
def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
|
||||
"""Rebalance word count distribution across sections."""
|
||||
return self.outline_optimizer.rebalance_word_counts(outline, target_words)
|
||||
|
||||
# Cache Management Methods
|
||||
|
||||
def get_outline_cache_stats(self) -> Dict[str, Any]:
|
||||
"""Get outline cache statistics."""
|
||||
return persistent_outline_cache.get_cache_stats()
|
||||
|
||||
def clear_outline_cache(self):
|
||||
"""Clear all cached outline entries."""
|
||||
persistent_outline_cache.clear_cache()
|
||||
logger.info("Outline cache cleared")
|
||||
|
||||
def invalidate_outline_cache_for_keywords(self, keywords: List[str]):
|
||||
"""
|
||||
Invalidate outline cache entries for specific keywords.
|
||||
Useful when research data is updated.
|
||||
|
||||
Args:
|
||||
keywords: Keywords to invalidate cache for
|
||||
"""
|
||||
persistent_outline_cache.invalidate_cache_for_keywords(keywords)
|
||||
logger.info(f"Invalidated outline cache for keywords: {keywords}")
|
||||
|
||||
def get_recent_outline_cache_entries(self, limit: int = 20) -> List[Dict[str, Any]]:
|
||||
"""Get recent outline cache entries for debugging."""
|
||||
return persistent_outline_cache.get_cache_entries(limit)
|
||||
|
||||
107
backend/services/blog_writer/outline/parallel_processor.py
Normal file
107
backend/services/blog_writer/outline/parallel_processor.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
Parallel Processor - Handles parallel processing of outline generation tasks.
|
||||
|
||||
Manages concurrent execution of source mapping and grounding insights extraction.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Tuple, Any
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ParallelProcessor:
|
||||
"""Handles parallel processing of outline generation tasks for speed optimization."""
|
||||
|
||||
def __init__(self, source_mapper, grounding_engine):
|
||||
"""Initialize the parallel processor with required dependencies."""
|
||||
self.source_mapper = source_mapper
|
||||
self.grounding_engine = grounding_engine
|
||||
|
||||
async def run_parallel_processing(self, outline_sections, research, task_id: str = None) -> Tuple[Any, Any]:
|
||||
"""
|
||||
Run source mapping and grounding insights extraction in parallel.
|
||||
|
||||
Args:
|
||||
outline_sections: List of outline sections to process
|
||||
research: Research data object
|
||||
task_id: Optional task ID for progress updates
|
||||
|
||||
Returns:
|
||||
Tuple of (mapped_sections, grounding_insights)
|
||||
"""
|
||||
if task_id:
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
await task_manager.update_progress(task_id, "⚡ Running parallel processing for maximum speed...")
|
||||
|
||||
logger.info("Running parallel processing for maximum speed...")
|
||||
|
||||
# Run these tasks in parallel to save time
|
||||
source_mapping_task = asyncio.create_task(
|
||||
self._run_source_mapping(outline_sections, research, task_id)
|
||||
)
|
||||
|
||||
grounding_insights_task = asyncio.create_task(
|
||||
self._run_grounding_insights_extraction(research, task_id)
|
||||
)
|
||||
|
||||
# Wait for both parallel tasks to complete
|
||||
mapped_sections, grounding_insights = await asyncio.gather(
|
||||
source_mapping_task,
|
||||
grounding_insights_task
|
||||
)
|
||||
|
||||
return mapped_sections, grounding_insights
|
||||
|
||||
async def run_parallel_processing_async(self, outline_sections, research) -> Tuple[Any, Any]:
|
||||
"""
|
||||
Run parallel processing without progress updates (for non-progress methods).
|
||||
|
||||
Args:
|
||||
outline_sections: List of outline sections to process
|
||||
research: Research data object
|
||||
|
||||
Returns:
|
||||
Tuple of (mapped_sections, grounding_insights)
|
||||
"""
|
||||
logger.info("Running parallel processing for maximum speed...")
|
||||
|
||||
# Run these tasks in parallel to save time
|
||||
source_mapping_task = asyncio.create_task(
|
||||
self._run_source_mapping_async(outline_sections, research)
|
||||
)
|
||||
|
||||
grounding_insights_task = asyncio.create_task(
|
||||
self._run_grounding_insights_extraction_async(research)
|
||||
)
|
||||
|
||||
# Wait for both parallel tasks to complete
|
||||
mapped_sections, grounding_insights = await asyncio.gather(
|
||||
source_mapping_task,
|
||||
grounding_insights_task
|
||||
)
|
||||
|
||||
return mapped_sections, grounding_insights
|
||||
|
||||
async def _run_source_mapping(self, outline_sections, research, task_id):
|
||||
"""Run source mapping in parallel."""
|
||||
if task_id:
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
await task_manager.update_progress(task_id, "🔗 Applying intelligent source-to-section mapping...")
|
||||
return self.source_mapper.map_sources_to_sections(outline_sections, research)
|
||||
|
||||
async def _run_grounding_insights_extraction(self, research, task_id):
|
||||
"""Run grounding insights extraction in parallel."""
|
||||
if task_id:
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
await task_manager.update_progress(task_id, "🧠 Extracting grounding metadata insights...")
|
||||
return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)
|
||||
|
||||
async def _run_source_mapping_async(self, outline_sections, research):
|
||||
"""Run source mapping in parallel (async version without progress updates)."""
|
||||
logger.info("Applying intelligent source-to-section mapping...")
|
||||
return self.source_mapper.map_sources_to_sections(outline_sections, research)
|
||||
|
||||
async def _run_grounding_insights_extraction_async(self, research):
|
||||
"""Run grounding insights extraction in parallel (async version without progress updates)."""
|
||||
logger.info("Extracting grounding metadata insights...")
|
||||
return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)
|
||||
105
backend/services/blog_writer/outline/prompt_builder.py
Normal file
105
backend/services/blog_writer/outline/prompt_builder.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
Prompt Builder - Handles building of AI prompts for outline generation.
|
||||
|
||||
Constructs comprehensive prompts with research data, keywords, and strategic requirements.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class PromptBuilder:
|
||||
"""Handles building of comprehensive AI prompts for outline generation."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the prompt builder."""
|
||||
pass
|
||||
|
||||
def build_outline_prompt(self, primary_keywords: List[str], secondary_keywords: List[str],
|
||||
content_angles: List[str], sources: List, search_intent: str,
|
||||
request, custom_instructions: str = None) -> str:
|
||||
"""Build the comprehensive outline generation prompt using filtered research data."""
|
||||
|
||||
# Use the filtered research data (already cleaned by ResearchDataFilter)
|
||||
research = request.research
|
||||
|
||||
return f"""Create a comprehensive blog outline for: {', '.join(primary_keywords)}
|
||||
|
||||
CONTEXT:
|
||||
Search Intent: {search_intent}
|
||||
Target: {request.word_count or 1500} words
|
||||
Industry: {getattr(request.persona, 'industry', 'General') if request.persona else 'General'}
|
||||
Audience: {getattr(request.persona, 'target_audience', 'General') if request.persona else 'General'}
|
||||
|
||||
KEYWORDS:
|
||||
Primary: {', '.join(primary_keywords)}
|
||||
Secondary: {', '.join(secondary_keywords)}
|
||||
Long-tail: {', '.join(research.keyword_analysis.get('long_tail', []))}
|
||||
Semantic: {', '.join(research.keyword_analysis.get('semantic_keywords', []))}
|
||||
Trending: {', '.join(research.keyword_analysis.get('trending_terms', []))}
|
||||
Content Gaps: {', '.join(research.keyword_analysis.get('content_gaps', []))}
|
||||
|
||||
CONTENT ANGLES: {', '.join(content_angles)}
|
||||
|
||||
COMPETITIVE INTELLIGENCE:
|
||||
Top Competitors: {', '.join(research.competitor_analysis.get('top_competitors', []))}
|
||||
Market Opportunities: {', '.join(research.competitor_analysis.get('opportunities', []))}
|
||||
Competitive Advantages: {', '.join(research.competitor_analysis.get('competitive_advantages', []))}
|
||||
|
||||
RESEARCH SOURCES: {len(sources)} authoritative sources available
|
||||
|
||||
{f"CUSTOM INSTRUCTIONS: {custom_instructions}" if custom_instructions else ""}
|
||||
|
||||
STRATEGIC REQUIREMENTS:
|
||||
- Create SEO-optimized headings with natural keyword integration
|
||||
- Build logical narrative flow from problem to solution
|
||||
- Include data-driven insights from research sources
|
||||
- Address content gaps and market opportunities
|
||||
- Optimize for search intent and user questions
|
||||
- Ensure engaging, actionable content throughout
|
||||
|
||||
Return JSON format:
|
||||
{{
|
||||
"outline": [
|
||||
{{
|
||||
"heading": "Section heading with primary keyword",
|
||||
"subheadings": ["Subheading 1", "Subheading 2", "Subheading 3"],
|
||||
"key_points": ["Key point 1", "Key point 2", "Key point 3"],
|
||||
"target_words": 300,
|
||||
"keywords": ["primary keyword", "secondary keyword"]
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
|
||||
def get_outline_schema(self) -> Dict[str, Any]:
|
||||
"""Get the structured JSON schema for outline generation."""
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"outline": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"heading": {"type": "string"},
|
||||
"subheadings": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"key_points": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"target_words": {"type": "integer"},
|
||||
"keywords": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
},
|
||||
"required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["outline"],
|
||||
"propertyOrdering": ["outline"]
|
||||
}
|
||||
107
backend/services/blog_writer/outline/response_processor.py
Normal file
107
backend/services/blog_writer/outline/response_processor.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
Response Processor - Handles AI response processing and retry logic.
|
||||
|
||||
Processes AI responses, handles retries, and converts data to proper formats.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
import asyncio
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import BlogOutlineSection
|
||||
|
||||
|
||||
class ResponseProcessor:
|
||||
"""Handles AI response processing, retry logic, and data conversion."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the response processor."""
|
||||
pass
|
||||
|
||||
async def generate_with_retry(self, prompt: str, schema: Dict[str, Any], task_id: str = None) -> Dict[str, Any]:
|
||||
"""Generate outline with retry logic for API failures."""
|
||||
from services.llm_providers.gemini_provider import gemini_structured_json_response
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
|
||||
max_retries = 2 # Conservative retry for expensive API calls
|
||||
retry_delay = 5 # 5 second delay between retries
|
||||
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
if task_id:
|
||||
await task_manager.update_progress(task_id, f"🤖 Calling Gemini API for outline generation (attempt {attempt + 1}/{max_retries + 1})...")
|
||||
|
||||
outline_data = gemini_structured_json_response(
|
||||
prompt=prompt,
|
||||
schema=schema,
|
||||
temperature=0.3,
|
||||
max_tokens=6000 # Increased further to avoid truncation
|
||||
)
|
||||
|
||||
# Log response for debugging
|
||||
logger.info(f"Gemini response received: {type(outline_data)}")
|
||||
|
||||
# Check for errors in the response
|
||||
if isinstance(outline_data, dict) and 'error' in outline_data:
|
||||
error_msg = str(outline_data['error'])
|
||||
if "503" in error_msg and "overloaded" in error_msg and attempt < max_retries:
|
||||
if task_id:
|
||||
await task_manager.update_progress(task_id, f"⚠️ AI service overloaded, retrying in {retry_delay} seconds...")
|
||||
logger.warning(f"Gemini API overloaded, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
|
||||
await asyncio.sleep(retry_delay)
|
||||
continue
|
||||
elif "No valid structured response content found" in error_msg and attempt < max_retries:
|
||||
if task_id:
|
||||
await task_manager.update_progress(task_id, f"⚠️ Invalid response format, retrying in {retry_delay} seconds...")
|
||||
logger.warning(f"Gemini response parsing failed, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
|
||||
await asyncio.sleep(retry_delay)
|
||||
continue
|
||||
else:
|
||||
logger.error(f"Gemini structured response error: {outline_data['error']}")
|
||||
raise ValueError(f"AI outline generation failed: {outline_data['error']}")
|
||||
|
||||
# Validate required fields
|
||||
if not isinstance(outline_data, dict) or 'outline' not in outline_data or not isinstance(outline_data['outline'], list):
|
||||
if attempt < max_retries:
|
||||
if task_id:
|
||||
await task_manager.update_progress(task_id, f"⚠️ Invalid response structure, retrying in {retry_delay} seconds...")
|
||||
logger.warning(f"Invalid response structure, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
|
||||
await asyncio.sleep(retry_delay)
|
||||
continue
|
||||
else:
|
||||
raise ValueError("Invalid outline structure in Gemini response")
|
||||
|
||||
# If we get here, the response is valid
|
||||
return outline_data
|
||||
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
if ("503" in error_str or "overloaded" in error_str) and attempt < max_retries:
|
||||
if task_id:
|
||||
await task_manager.update_progress(task_id, f"⚠️ AI service error, retrying in {retry_delay} seconds...")
|
||||
logger.warning(f"Gemini API error, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1}): {error_str}")
|
||||
await asyncio.sleep(retry_delay)
|
||||
continue
|
||||
else:
|
||||
logger.error(f"Outline generation failed after {attempt + 1} attempts: {error_str}")
|
||||
raise ValueError(f"AI outline generation failed: {error_str}")
|
||||
|
||||
def convert_to_sections(self, outline_data: Dict[str, Any], sources: List) -> List[BlogOutlineSection]:
|
||||
"""Convert outline data to BlogOutlineSection objects."""
|
||||
outline_sections = []
|
||||
for i, section_data in enumerate(outline_data.get('outline', [])):
|
||||
if not isinstance(section_data, dict) or 'heading' not in section_data:
|
||||
continue
|
||||
|
||||
section = BlogOutlineSection(
|
||||
id=f"s{i+1}",
|
||||
heading=section_data.get('heading', f'Section {i+1}'),
|
||||
subheadings=section_data.get('subheadings', []),
|
||||
key_points=section_data.get('key_points', []),
|
||||
references=[], # Will be populated by intelligent mapping
|
||||
target_words=section_data.get('target_words', 200),
|
||||
keywords=section_data.get('keywords', [])
|
||||
)
|
||||
outline_sections.append(section)
|
||||
|
||||
return outline_sections
|
||||
669
backend/services/blog_writer/outline/source_mapper.py
Normal file
669
backend/services/blog_writer/outline/source_mapper.py
Normal file
@@ -0,0 +1,669 @@
|
||||
"""
|
||||
Source-to-Section Mapper - Intelligent mapping of research sources to outline sections.
|
||||
|
||||
This module provides algorithmic mapping of research sources to specific outline sections
|
||||
based on semantic similarity, keyword relevance, and contextual matching. Uses a hybrid
|
||||
approach of algorithmic scoring followed by AI validation for optimal results.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Tuple, Optional
|
||||
import re
|
||||
from collections import Counter
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import (
|
||||
BlogOutlineSection,
|
||||
ResearchSource,
|
||||
BlogResearchResponse,
|
||||
)
|
||||
|
||||
|
||||
class SourceToSectionMapper:
|
||||
"""Maps research sources to outline sections using intelligent algorithms."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the source-to-section mapper."""
|
||||
self.min_semantic_score = 0.3
|
||||
self.min_keyword_score = 0.2
|
||||
self.min_contextual_score = 0.2
|
||||
self.max_sources_per_section = 3
|
||||
self.min_total_score = 0.4
|
||||
|
||||
# Weight factors for different scoring methods
|
||||
self.weights = {
|
||||
'semantic': 0.4, # Semantic similarity weight
|
||||
'keyword': 0.3, # Keyword matching weight
|
||||
'contextual': 0.3 # Contextual relevance weight
|
||||
}
|
||||
|
||||
# Common stop words for text processing
|
||||
self.stop_words = {
|
||||
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
||||
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
|
||||
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
|
||||
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'how', 'much', 'many', 'more', 'most',
|
||||
'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
|
||||
'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
|
||||
'over', 'under', 'again', 'further', 'then', 'once'
|
||||
}
|
||||
|
||||
logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")
|
||||
|
||||
def map_sources_to_sections(
|
||||
self,
|
||||
sections: List[BlogOutlineSection],
|
||||
research_data: BlogResearchResponse
|
||||
) -> List[BlogOutlineSection]:
|
||||
"""
|
||||
Map research sources to outline sections using intelligent algorithms.
|
||||
|
||||
Args:
|
||||
sections: List of outline sections to map sources to
|
||||
research_data: Research data containing sources and metadata
|
||||
|
||||
Returns:
|
||||
List of outline sections with intelligently mapped sources
|
||||
"""
|
||||
if not sections or not research_data.sources:
|
||||
logger.warning("No sections or sources to map")
|
||||
return sections
|
||||
|
||||
logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")
|
||||
|
||||
# Step 1: Algorithmic mapping
|
||||
mapping_results = self._algorithmic_source_mapping(sections, research_data)
|
||||
|
||||
# Step 2: AI validation and improvement (single prompt)
|
||||
validated_mapping = self._ai_validate_mapping(mapping_results, research_data)
|
||||
|
||||
# Step 3: Apply validated mapping to sections
|
||||
mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)
|
||||
|
||||
logger.info("✅ Source-to-section mapping completed successfully")
|
||||
return mapped_sections
|
||||
|
||||
def _algorithmic_source_mapping(
|
||||
self,
|
||||
sections: List[BlogOutlineSection],
|
||||
research_data: BlogResearchResponse
|
||||
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
|
||||
"""
|
||||
Perform algorithmic mapping of sources to sections.
|
||||
|
||||
Args:
|
||||
sections: List of outline sections
|
||||
research_data: Research data with sources
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section IDs to list of (source, score) tuples
|
||||
"""
|
||||
mapping_results = {}
|
||||
|
||||
for section in sections:
|
||||
section_scores = []
|
||||
|
||||
for source in research_data.sources:
|
||||
# Calculate multi-dimensional relevance score
|
||||
semantic_score = self._calculate_semantic_similarity(section, source)
|
||||
keyword_score = self._calculate_keyword_relevance(section, source, research_data)
|
||||
contextual_score = self._calculate_contextual_relevance(section, source, research_data)
|
||||
|
||||
# Weighted total score
|
||||
total_score = (
|
||||
semantic_score * self.weights['semantic'] +
|
||||
keyword_score * self.weights['keyword'] +
|
||||
contextual_score * self.weights['contextual']
|
||||
)
|
||||
|
||||
# Only include sources that meet minimum threshold
|
||||
if total_score >= self.min_total_score:
|
||||
section_scores.append((source, total_score))
|
||||
|
||||
# Sort by score and limit to max sources per section
|
||||
section_scores.sort(key=lambda x: x[1], reverse=True)
|
||||
section_scores = section_scores[:self.max_sources_per_section]
|
||||
|
||||
mapping_results[section.id] = section_scores
|
||||
|
||||
logger.debug(f"Section '{section.heading}': {len(section_scores)} sources mapped")
|
||||
|
||||
return mapping_results
|
||||
|
||||
def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
|
||||
"""
|
||||
Calculate semantic similarity between section and source.
|
||||
|
||||
Args:
|
||||
section: Outline section
|
||||
source: Research source
|
||||
|
||||
Returns:
|
||||
Semantic similarity score (0.0 to 1.0)
|
||||
"""
|
||||
# Extract text content for comparison
|
||||
section_text = self._extract_section_text(section)
|
||||
source_text = self._extract_source_text(source)
|
||||
|
||||
# Calculate word overlap
|
||||
section_words = self._extract_meaningful_words(section_text)
|
||||
source_words = self._extract_meaningful_words(source_text)
|
||||
|
||||
if not section_words or not source_words:
|
||||
return 0.0
|
||||
|
||||
# Calculate Jaccard similarity
|
||||
intersection = len(set(section_words) & set(source_words))
|
||||
union = len(set(section_words) | set(source_words))
|
||||
|
||||
jaccard_similarity = intersection / union if union > 0 else 0.0
|
||||
|
||||
# Boost score for exact phrase matches
|
||||
phrase_boost = self._calculate_phrase_similarity(section_text, source_text)
|
||||
|
||||
# Combine Jaccard similarity with phrase boost
|
||||
semantic_score = min(1.0, jaccard_similarity + phrase_boost)
|
||||
|
||||
return semantic_score
|
||||
|
||||
def _calculate_keyword_relevance(
|
||||
self,
|
||||
section: BlogOutlineSection,
|
||||
source: ResearchSource,
|
||||
research_data: BlogResearchResponse
|
||||
) -> float:
|
||||
"""
|
||||
Calculate keyword-based relevance between section and source.
|
||||
|
||||
Args:
|
||||
section: Outline section
|
||||
source: Research source
|
||||
research_data: Research data with keyword analysis
|
||||
|
||||
Returns:
|
||||
Keyword relevance score (0.0 to 1.0)
|
||||
"""
|
||||
# Get section keywords
|
||||
section_keywords = set(section.keywords)
|
||||
if not section_keywords:
|
||||
# Extract keywords from section heading and content
|
||||
section_text = self._extract_section_text(section)
|
||||
section_keywords = set(self._extract_meaningful_words(section_text))
|
||||
|
||||
# Get source keywords from title and excerpt
|
||||
source_text = f"{source.title} {source.excerpt or ''}"
|
||||
source_keywords = set(self._extract_meaningful_words(source_text))
|
||||
|
||||
# Get research keywords for context
|
||||
research_keywords = set()
|
||||
for category in ['primary', 'secondary', 'long_tail', 'semantic_keywords']:
|
||||
research_keywords.update(research_data.keyword_analysis.get(category, []))
|
||||
|
||||
# Calculate keyword overlap scores
|
||||
section_overlap = len(section_keywords & source_keywords) / len(section_keywords) if section_keywords else 0.0
|
||||
research_overlap = len(research_keywords & source_keywords) / len(research_keywords) if research_keywords else 0.0
|
||||
|
||||
# Weighted combination
|
||||
keyword_score = (section_overlap * 0.7) + (research_overlap * 0.3)
|
||||
|
||||
return min(1.0, keyword_score)
|
||||
|
||||
def _calculate_contextual_relevance(
|
||||
self,
|
||||
section: BlogOutlineSection,
|
||||
source: ResearchSource,
|
||||
research_data: BlogResearchResponse
|
||||
) -> float:
|
||||
"""
|
||||
Calculate contextual relevance based on section content and source context.
|
||||
|
||||
Args:
|
||||
section: Outline section
|
||||
source: Research source
|
||||
research_data: Research data with context
|
||||
|
||||
Returns:
|
||||
Contextual relevance score (0.0 to 1.0)
|
||||
"""
|
||||
contextual_score = 0.0
|
||||
|
||||
# 1. Content angle matching
|
||||
section_text = self._extract_section_text(section).lower()
|
||||
source_text = f"{source.title} {source.excerpt or ''}".lower()
|
||||
|
||||
# Check for content angle matches
|
||||
content_angles = research_data.suggested_angles
|
||||
for angle in content_angles:
|
||||
angle_words = self._extract_meaningful_words(angle.lower())
|
||||
if angle_words:
|
||||
section_angle_match = sum(1 for word in angle_words if word in section_text) / len(angle_words)
|
||||
source_angle_match = sum(1 for word in angle_words if word in source_text) / len(angle_words)
|
||||
contextual_score += (section_angle_match + source_angle_match) * 0.3
|
||||
|
||||
# 2. Search intent alignment
|
||||
search_intent = research_data.keyword_analysis.get('search_intent', 'informational')
|
||||
intent_keywords = self._get_intent_keywords(search_intent)
|
||||
|
||||
intent_score = 0.0
|
||||
for keyword in intent_keywords:
|
||||
if keyword in section_text or keyword in source_text:
|
||||
intent_score += 0.1
|
||||
|
||||
contextual_score += min(0.3, intent_score)
|
||||
|
||||
# 3. Industry/domain relevance
|
||||
if hasattr(research_data, 'industry') and research_data.industry:
|
||||
industry_words = self._extract_meaningful_words(research_data.industry.lower())
|
||||
industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
|
||||
contextual_score += industry_score * 0.2
|
||||
|
||||
return min(1.0, contextual_score)
|
||||
|
||||
def _ai_validate_mapping(
|
||||
self,
|
||||
mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
|
||||
research_data: BlogResearchResponse
|
||||
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
|
||||
"""
|
||||
Use AI to validate and improve the algorithmic mapping results.
|
||||
|
||||
Args:
|
||||
mapping_results: Algorithmic mapping results
|
||||
research_data: Research data for context
|
||||
|
||||
Returns:
|
||||
AI-validated and improved mapping results
|
||||
"""
|
||||
try:
|
||||
logger.info("Starting AI validation of source-to-section mapping...")
|
||||
|
||||
# Build AI validation prompt
|
||||
validation_prompt = self._build_validation_prompt(mapping_results, research_data)
|
||||
|
||||
# Get AI validation response
|
||||
validation_response = self._get_ai_validation_response(validation_prompt)
|
||||
|
||||
# Parse and apply AI validation results
|
||||
validated_mapping = self._parse_validation_response(validation_response, mapping_results, research_data)
|
||||
|
||||
logger.info("✅ AI validation completed successfully")
|
||||
return validated_mapping
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"AI validation failed: {e}. Using algorithmic results as fallback.")
|
||||
return mapping_results
|
||||
|
||||
def _apply_mapping_to_sections(
|
||||
self,
|
||||
sections: List[BlogOutlineSection],
|
||||
mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]
|
||||
) -> List[BlogOutlineSection]:
|
||||
"""
|
||||
Apply the mapping results to the outline sections.
|
||||
|
||||
Args:
|
||||
sections: Original outline sections
|
||||
mapping_results: Mapping results from algorithmic/AI processing
|
||||
|
||||
Returns:
|
||||
Sections with mapped sources
|
||||
"""
|
||||
mapped_sections = []
|
||||
|
||||
for section in sections:
|
||||
# Get mapped sources for this section
|
||||
mapped_sources = mapping_results.get(section.id, [])
|
||||
|
||||
# Extract just the sources (without scores)
|
||||
section_sources = [source for source, score in mapped_sources]
|
||||
|
||||
# Create new section with mapped sources
|
||||
mapped_section = BlogOutlineSection(
|
||||
id=section.id,
|
||||
heading=section.heading,
|
||||
subheadings=section.subheadings,
|
||||
key_points=section.key_points,
|
||||
references=section_sources,
|
||||
target_words=section.target_words,
|
||||
keywords=section.keywords
|
||||
)
|
||||
|
||||
mapped_sections.append(mapped_section)
|
||||
|
||||
logger.debug(f"Applied {len(section_sources)} sources to section '{section.heading}'")
|
||||
|
||||
return mapped_sections
|
||||
|
||||
# Helper methods
|
||||
|
||||
def _extract_section_text(self, section: BlogOutlineSection) -> str:
|
||||
"""Extract all text content from a section."""
|
||||
text_parts = [section.heading]
|
||||
text_parts.extend(section.subheadings)
|
||||
text_parts.extend(section.key_points)
|
||||
text_parts.extend(section.keywords)
|
||||
return " ".join(text_parts)
|
||||
|
||||
def _extract_source_text(self, source: ResearchSource) -> str:
|
||||
"""Extract all text content from a source."""
|
||||
text_parts = [source.title]
|
||||
if source.excerpt:
|
||||
text_parts.append(source.excerpt)
|
||||
return " ".join(text_parts)
|
||||
|
||||
def _extract_meaningful_words(self, text: str) -> List[str]:
|
||||
"""Extract meaningful words from text, removing stop words and cleaning."""
|
||||
if not text:
|
||||
return []
|
||||
|
||||
# Clean and tokenize
|
||||
words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
|
||||
|
||||
# Remove stop words and short words
|
||||
meaningful_words = [
|
||||
word for word in words
|
||||
if word not in self.stop_words and len(word) > 2
|
||||
]
|
||||
|
||||
return meaningful_words
|
||||
|
||||
def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
|
||||
"""Calculate phrase similarity boost score."""
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
text1_lower = text1.lower()
|
||||
text2_lower = text2.lower()
|
||||
|
||||
# Look for 2-3 word phrases
|
||||
phrase_boost = 0.0
|
||||
|
||||
# Extract 2-word phrases
|
||||
words1 = text1_lower.split()
|
||||
words2 = text2_lower.split()
|
||||
|
||||
for i in range(len(words1) - 1):
|
||||
phrase = f"{words1[i]} {words1[i+1]}"
|
||||
if phrase in text2_lower:
|
||||
phrase_boost += 0.1
|
||||
|
||||
# Extract 3-word phrases
|
||||
for i in range(len(words1) - 2):
|
||||
phrase = f"{words1[i]} {words1[i+1]} {words1[i+2]}"
|
||||
if phrase in text2_lower:
|
||||
phrase_boost += 0.15
|
||||
|
||||
return min(0.3, phrase_boost) # Cap at 0.3
|
||||
|
||||
def _get_intent_keywords(self, search_intent: str) -> List[str]:
|
||||
"""Get keywords associated with search intent."""
|
||||
intent_keywords = {
|
||||
'informational': ['what', 'how', 'why', 'guide', 'tutorial', 'explain', 'learn', 'understand'],
|
||||
'navigational': ['find', 'locate', 'search', 'where', 'site', 'website', 'page'],
|
||||
'transactional': ['buy', 'purchase', 'order', 'price', 'cost', 'deal', 'offer', 'discount'],
|
||||
'commercial': ['compare', 'review', 'best', 'top', 'vs', 'versus', 'alternative', 'option']
|
||||
}
|
||||
|
||||
return intent_keywords.get(search_intent, [])
|
||||
|
||||
def get_mapping_statistics(self, mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]) -> Dict[str, Any]:
|
||||
"""
|
||||
Get statistics about the mapping results.
|
||||
|
||||
Args:
|
||||
mapping_results: Mapping results to analyze
|
||||
|
||||
Returns:
|
||||
Dictionary with mapping statistics
|
||||
"""
|
||||
total_sections = len(mapping_results)
|
||||
total_mappings = sum(len(sources) for sources in mapping_results.values())
|
||||
|
||||
# Calculate score distribution
|
||||
all_scores = []
|
||||
for sources in mapping_results.values():
|
||||
all_scores.extend([score for source, score in sources])
|
||||
|
||||
avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0
|
||||
max_score = max(all_scores) if all_scores else 0.0
|
||||
min_score = min(all_scores) if all_scores else 0.0
|
||||
|
||||
# Count sections with/without sources
|
||||
sections_with_sources = sum(1 for sources in mapping_results.values() if sources)
|
||||
sections_without_sources = total_sections - sections_with_sources
|
||||
|
||||
return {
|
||||
'total_sections': total_sections,
|
||||
'total_mappings': total_mappings,
|
||||
'sections_with_sources': sections_with_sources,
|
||||
'sections_without_sources': sections_without_sources,
|
||||
'average_score': avg_score,
|
||||
'max_score': max_score,
|
||||
'min_score': min_score,
|
||||
'mapping_coverage': sections_with_sources / total_sections if total_sections > 0 else 0.0
|
||||
}
|
||||
|
||||
def _build_validation_prompt(
|
||||
self,
|
||||
mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
|
||||
research_data: BlogResearchResponse
|
||||
) -> str:
|
||||
"""
|
||||
Build comprehensive AI validation prompt for source-to-section mapping.
|
||||
|
||||
Args:
|
||||
mapping_results: Algorithmic mapping results
|
||||
research_data: Research data for context
|
||||
|
||||
Returns:
|
||||
Formatted AI validation prompt
|
||||
"""
|
||||
# Extract section information
|
||||
sections_info = []
|
||||
for section_id, sources in mapping_results.items():
|
||||
section_info = {
|
||||
'id': section_id,
|
||||
'sources': [
|
||||
{
|
||||
'title': source.title,
|
||||
'url': source.url,
|
||||
'excerpt': source.excerpt,
|
||||
'credibility_score': source.credibility_score,
|
||||
'algorithmic_score': score
|
||||
}
|
||||
for source, score in sources
|
||||
]
|
||||
}
|
||||
sections_info.append(section_info)
|
||||
|
||||
# Extract research context
|
||||
research_context = {
|
||||
'primary_keywords': research_data.keyword_analysis.get('primary', []),
|
||||
'secondary_keywords': research_data.keyword_analysis.get('secondary', []),
|
||||
'content_angles': research_data.suggested_angles,
|
||||
'search_intent': research_data.keyword_analysis.get('search_intent', 'informational'),
|
||||
'all_sources': [
|
||||
{
|
||||
'title': source.title,
|
||||
'url': source.url,
|
||||
'excerpt': source.excerpt,
|
||||
'credibility_score': source.credibility_score
|
||||
}
|
||||
for source in research_data.sources
|
||||
]
|
||||
}
|
||||
|
||||
prompt = f"""
|
||||
You are an expert content strategist and SEO specialist. Your task is to validate and improve the algorithmic mapping of research sources to blog outline sections.
|
||||
|
||||
## CONTEXT
|
||||
Research Topic: {', '.join(research_context['primary_keywords'])}
|
||||
Search Intent: {research_context['search_intent']}
|
||||
Content Angles: {', '.join(research_context['content_angles'])}
|
||||
|
||||
## ALGORITHMIC MAPPING RESULTS
|
||||
The following sections have been algorithmically mapped with research sources:
|
||||
|
||||
{self._format_sections_for_prompt(sections_info)}
|
||||
|
||||
## AVAILABLE SOURCES
|
||||
All available research sources:
|
||||
{self._format_sources_for_prompt(research_context['all_sources'])}
|
||||
|
||||
## VALIDATION TASK
|
||||
Please analyze the algorithmic mapping and provide improvements:
|
||||
|
||||
1. **Validate Relevance**: Are the mapped sources truly relevant to each section's content and purpose?
|
||||
2. **Identify Gaps**: Are there better sources available that weren't mapped?
|
||||
3. **Suggest Improvements**: Recommend specific source changes for better content alignment
|
||||
4. **Quality Assessment**: Rate the overall mapping quality (1-10)
|
||||
|
||||
## RESPONSE FORMAT
|
||||
Provide your analysis in the following JSON format:
|
||||
|
||||
```json
|
||||
{{
|
||||
"overall_quality_score": 8,
|
||||
"section_improvements": [
|
||||
{{
|
||||
"section_id": "s1",
|
||||
"current_sources": ["source_title_1", "source_title_2"],
|
||||
"recommended_sources": ["better_source_1", "better_source_2", "better_source_3"],
|
||||
"reasoning": "Explanation of why these sources are better suited for this section",
|
||||
"confidence": 0.9
|
||||
}}
|
||||
],
|
||||
"summary": "Overall assessment of the mapping quality and key improvements made"
|
||||
}}
|
||||
```
|
||||
|
||||
## GUIDELINES
|
||||
- Prioritize sources that directly support the section's key points and subheadings
|
||||
- Consider source credibility, recency, and content depth
|
||||
- Ensure sources provide actionable insights for content creation
|
||||
- Maintain diversity in source types and perspectives
|
||||
- Focus on sources that enhance the section's value proposition
|
||||
|
||||
Analyze the mapping and provide your recommendations.
|
||||
"""
|
||||
|
||||
return prompt
|
||||
|
||||
def _get_ai_validation_response(self, prompt: str) -> str:
|
||||
"""
|
||||
Get AI validation response using LLM provider.
|
||||
|
||||
Args:
|
||||
prompt: Validation prompt
|
||||
|
||||
Returns:
|
||||
AI validation response
|
||||
"""
|
||||
try:
|
||||
from services.llm_providers.gemini_provider import gemini_text_response
|
||||
|
||||
response = gemini_text_response(
|
||||
prompt=prompt,
|
||||
temperature=0.3,
|
||||
top_p=0.9,
|
||||
n=1,
|
||||
max_tokens=2000,
|
||||
system_prompt=None
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get AI validation response: {e}")
|
||||
raise
|
||||
|
||||
def _parse_validation_response(
|
||||
self,
|
||||
response: str,
|
||||
original_mapping: Dict[str, List[Tuple[ResearchSource, float]]],
|
||||
research_data: BlogResearchResponse
|
||||
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
|
||||
"""
|
||||
Parse AI validation response and apply improvements.
|
||||
|
||||
Args:
|
||||
response: AI validation response
|
||||
original_mapping: Original algorithmic mapping
|
||||
research_data: Research data for context
|
||||
|
||||
Returns:
|
||||
Improved mapping based on AI validation
|
||||
"""
|
||||
try:
|
||||
import json
|
||||
import re
|
||||
|
||||
# Extract JSON from response
|
||||
json_match = re.search(r'```json\s*(\{.*?\})\s*```', response, re.DOTALL)
|
||||
if not json_match:
|
||||
# Try to find JSON without code blocks
|
||||
json_match = re.search(r'(\{.*?\})', response, re.DOTALL)
|
||||
|
||||
if not json_match:
|
||||
logger.warning("Could not extract JSON from AI response")
|
||||
return original_mapping
|
||||
|
||||
validation_data = json.loads(json_match.group(1))
|
||||
|
||||
# Create source lookup for quick access
|
||||
source_lookup = {source.title: source for source in research_data.sources}
|
||||
|
||||
# Apply AI improvements
|
||||
improved_mapping = {}
|
||||
|
||||
for improvement in validation_data.get('section_improvements', []):
|
||||
section_id = improvement['section_id']
|
||||
recommended_titles = improvement['recommended_sources']
|
||||
|
||||
# Map recommended titles to actual sources
|
||||
recommended_sources = []
|
||||
for title in recommended_titles:
|
||||
if title in source_lookup:
|
||||
source = source_lookup[title]
|
||||
# Use high confidence score for AI-recommended sources
|
||||
recommended_sources.append((source, 0.9))
|
||||
|
||||
if recommended_sources:
|
||||
improved_mapping[section_id] = recommended_sources
|
||||
else:
|
||||
# Fallback to original mapping if no valid sources found
|
||||
improved_mapping[section_id] = original_mapping.get(section_id, [])
|
||||
|
||||
# Add sections not mentioned in AI response
|
||||
for section_id, sources in original_mapping.items():
|
||||
if section_id not in improved_mapping:
|
||||
improved_mapping[section_id] = sources
|
||||
|
||||
logger.info(f"AI validation applied: {len(validation_data.get('section_improvements', []))} sections improved")
|
||||
return improved_mapping
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse AI validation response: {e}")
|
||||
return original_mapping
|
||||
|
||||
def _format_sections_for_prompt(self, sections_info: List[Dict]) -> str:
|
||||
"""Format sections information for AI prompt."""
|
||||
formatted = []
|
||||
for section in sections_info:
|
||||
section_text = f"**Section {section['id']}:**\n"
|
||||
section_text += f"Sources mapped: {len(section['sources'])}\n"
|
||||
for source in section['sources']:
|
||||
section_text += f"- {source['title']} (Score: {source['algorithmic_score']:.2f})\n"
|
||||
formatted.append(section_text)
|
||||
return "\n".join(formatted)
|
||||
|
||||
def _format_sources_for_prompt(self, sources: List[Dict]) -> str:
|
||||
"""Format sources information for AI prompt."""
|
||||
formatted = []
|
||||
for i, source in enumerate(sources, 1):
|
||||
source_text = f"{i}. **{source['title']}**\n"
|
||||
source_text += f" URL: {source['url']}\n"
|
||||
source_text += f" Credibility: {source['credibility_score']}\n"
|
||||
if source['excerpt']:
|
||||
source_text += f" Excerpt: {source['excerpt'][:200]}...\n"
|
||||
formatted.append(source_text)
|
||||
return "\n".join(formatted)
|
||||
123
backend/services/blog_writer/outline/title_generator.py
Normal file
123
backend/services/blog_writer/outline/title_generator.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""
|
||||
Title Generator - Handles title generation and formatting for blog outlines.
|
||||
|
||||
Extracts content angles from research data and combines them with AI-generated titles.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class TitleGenerator:
|
||||
"""Handles title generation, formatting, and combination logic."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the title generator."""
|
||||
pass
|
||||
|
||||
def extract_content_angle_titles(self, research) -> List[str]:
|
||||
"""
|
||||
Extract content angles from research data and convert them to blog titles.
|
||||
|
||||
Args:
|
||||
research: BlogResearchResponse object containing suggested_angles
|
||||
|
||||
Returns:
|
||||
List of title-formatted content angles
|
||||
"""
|
||||
if not research or not hasattr(research, 'suggested_angles'):
|
||||
return []
|
||||
|
||||
content_angles = research.suggested_angles or []
|
||||
if not content_angles:
|
||||
return []
|
||||
|
||||
# Convert content angles to title format
|
||||
title_formatted_angles = []
|
||||
for angle in content_angles:
|
||||
if isinstance(angle, str) and angle.strip():
|
||||
# Clean and format the angle as a title
|
||||
formatted_angle = self._format_angle_as_title(angle.strip())
|
||||
if formatted_angle and formatted_angle not in title_formatted_angles:
|
||||
title_formatted_angles.append(formatted_angle)
|
||||
|
||||
logger.info(f"Extracted {len(title_formatted_angles)} content angle titles from research data")
|
||||
return title_formatted_angles
|
||||
|
||||
def _format_angle_as_title(self, angle: str) -> str:
|
||||
"""
|
||||
Format a content angle as a proper blog title.
|
||||
|
||||
Args:
|
||||
angle: Raw content angle string
|
||||
|
||||
Returns:
|
||||
Formatted title string
|
||||
"""
|
||||
if not angle or len(angle.strip()) < 10: # Too short to be a good title
|
||||
return ""
|
||||
|
||||
# Clean up the angle
|
||||
cleaned_angle = angle.strip()
|
||||
|
||||
# Capitalize first letter of each sentence and proper nouns
|
||||
sentences = cleaned_angle.split('. ')
|
||||
formatted_sentences = []
|
||||
for sentence in sentences:
|
||||
if sentence.strip():
|
||||
# Use title case for better formatting
|
||||
formatted_sentence = sentence.strip().title()
|
||||
formatted_sentences.append(formatted_sentence)
|
||||
|
||||
formatted_title = '. '.join(formatted_sentences)
|
||||
|
||||
# Ensure it ends with proper punctuation
|
||||
if not formatted_title.endswith(('.', '!', '?')):
|
||||
formatted_title += '.'
|
||||
|
||||
# Limit length to reasonable blog title size
|
||||
if len(formatted_title) > 100:
|
||||
formatted_title = formatted_title[:97] + "..."
|
||||
|
||||
return formatted_title
|
||||
|
||||
def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str]) -> List[str]:
|
||||
"""
|
||||
Combine AI-generated titles with content angle titles, ensuring variety and quality.
|
||||
|
||||
Args:
|
||||
ai_titles: AI-generated title options
|
||||
content_angle_titles: Titles derived from content angles
|
||||
primary_keywords: Primary keywords for fallback generation
|
||||
|
||||
Returns:
|
||||
Combined list of title options (max 6 total)
|
||||
"""
|
||||
all_titles = []
|
||||
|
||||
# Add content angle titles first (these are research-based and valuable)
|
||||
for title in content_angle_titles[:3]: # Limit to top 3 content angles
|
||||
if title and title not in all_titles:
|
||||
all_titles.append(title)
|
||||
|
||||
# Add AI-generated titles
|
||||
for title in ai_titles:
|
||||
if title and title not in all_titles:
|
||||
all_titles.append(title)
|
||||
|
||||
# Note: Removed fallback titles as requested - only use research and AI-generated titles
|
||||
|
||||
# Limit to 6 titles maximum for UI usability
|
||||
final_titles = all_titles[:6]
|
||||
|
||||
logger.info(f"Combined title options: {len(final_titles)} total (AI: {len(ai_titles)}, Content angles: {len(content_angle_titles)})")
|
||||
return final_titles
|
||||
|
||||
def generate_fallback_titles(self, primary_keywords: List[str]) -> List[str]:
|
||||
"""Generate fallback titles when AI generation fails."""
|
||||
primary_keyword = primary_keywords[0] if primary_keywords else "Topic"
|
||||
return [
|
||||
f"The Complete Guide to {primary_keyword}",
|
||||
f"{primary_keyword}: Everything You Need to Know",
|
||||
f"How to Master {primary_keyword} in 2024"
|
||||
]
|
||||
@@ -12,10 +12,12 @@ from .research_service import ResearchService
|
||||
from .keyword_analyzer import KeywordAnalyzer
|
||||
from .competitor_analyzer import CompetitorAnalyzer
|
||||
from .content_angle_generator import ContentAngleGenerator
|
||||
from .data_filter import ResearchDataFilter
|
||||
|
||||
__all__ = [
|
||||
'ResearchService',
|
||||
'KeywordAnalyzer',
|
||||
'CompetitorAnalyzer',
|
||||
'ContentAngleGenerator'
|
||||
'ContentAngleGenerator',
|
||||
'ResearchDataFilter'
|
||||
]
|
||||
|
||||
519
backend/services/blog_writer/research/data_filter.py
Normal file
519
backend/services/blog_writer/research/data_filter.py
Normal file
@@ -0,0 +1,519 @@
|
||||
"""
|
||||
Research Data Filter - Filters and cleans research data for optimal AI processing.
|
||||
|
||||
This module provides intelligent filtering and cleaning of research data to:
|
||||
1. Remove low-quality sources and irrelevant content
|
||||
2. Optimize data for AI processing (reduce tokens, improve quality)
|
||||
3. Ensure only high-value insights are sent to AI prompts
|
||||
4. Maintain data integrity while improving processing efficiency
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import (
|
||||
BlogResearchResponse,
|
||||
ResearchSource,
|
||||
GroundingMetadata,
|
||||
GroundingChunk,
|
||||
GroundingSupport,
|
||||
Citation,
|
||||
)
|
||||
|
||||
|
||||
class ResearchDataFilter:
|
||||
"""Filters and cleans research data for optimal AI processing."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the research data filter with default settings."""
|
||||
# Be conservative but avoid over-filtering which can lead to empty UI
|
||||
self.min_credibility_score = 0.5
|
||||
self.min_excerpt_length = 20
|
||||
self.max_sources = 15
|
||||
self.max_grounding_chunks = 20
|
||||
self.max_content_gaps = 5
|
||||
self.max_keywords_per_category = 10
|
||||
self.min_grounding_confidence = 0.5
|
||||
self.max_source_age_days = 365 * 5 # allow up to 5 years if relevant
|
||||
|
||||
# Common stop words for keyword cleaning
|
||||
self.stop_words = {
|
||||
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
||||
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
|
||||
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'
|
||||
}
|
||||
|
||||
# Irrelevant source patterns
|
||||
self.irrelevant_patterns = [
|
||||
r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx)$', # Document files
|
||||
r'\.(jpg|jpeg|png|gif|svg|webp)$', # Image files
|
||||
r'\.(mp4|avi|mov|wmv|flv|webm)$', # Video files
|
||||
r'\.(mp3|wav|flac|aac)$', # Audio files
|
||||
r'\.(zip|rar|7z|tar|gz)$', # Archive files
|
||||
r'^https?://(www\.)?(facebook|twitter|instagram|linkedin|youtube)\.com', # Social media
|
||||
r'^https?://(www\.)?(amazon|ebay|etsy)\.com', # E-commerce
|
||||
r'^https?://(www\.)?(wikipedia)\.org', # Wikipedia (too generic)
|
||||
]
|
||||
|
||||
logger.info("✅ ResearchDataFilter initialized with quality thresholds")
|
||||
|
||||
def filter_research_data(self, research_data: BlogResearchResponse) -> BlogResearchResponse:
|
||||
"""
|
||||
Main filtering method that processes all research data components.
|
||||
|
||||
Args:
|
||||
research_data: Raw research data from the research service
|
||||
|
||||
Returns:
|
||||
Filtered and cleaned research data optimized for AI processing
|
||||
"""
|
||||
logger.info(f"Starting research data filtering for {len(research_data.sources)} sources")
|
||||
|
||||
# Track original counts for logging
|
||||
original_counts = {
|
||||
'sources': len(research_data.sources),
|
||||
'grounding_chunks': len(research_data.grounding_metadata.grounding_chunks) if research_data.grounding_metadata else 0,
|
||||
'grounding_supports': len(research_data.grounding_metadata.grounding_supports) if research_data.grounding_metadata else 0,
|
||||
'citations': len(research_data.grounding_metadata.citations) if research_data.grounding_metadata else 0,
|
||||
}
|
||||
|
||||
# Filter sources
|
||||
filtered_sources = self.filter_sources(research_data.sources)
|
||||
|
||||
# Filter grounding metadata
|
||||
filtered_grounding_metadata = self.filter_grounding_metadata(research_data.grounding_metadata)
|
||||
|
||||
# Clean keyword analysis
|
||||
cleaned_keyword_analysis = self.clean_keyword_analysis(research_data.keyword_analysis)
|
||||
|
||||
# Clean competitor analysis
|
||||
cleaned_competitor_analysis = self.clean_competitor_analysis(research_data.competitor_analysis)
|
||||
|
||||
# Filter content gaps
|
||||
filtered_content_gaps = self.filter_content_gaps(
|
||||
research_data.keyword_analysis.get('content_gaps', []),
|
||||
research_data
|
||||
)
|
||||
|
||||
# Update keyword analysis with filtered content gaps
|
||||
cleaned_keyword_analysis['content_gaps'] = filtered_content_gaps
|
||||
|
||||
# Create filtered research response
|
||||
filtered_research = BlogResearchResponse(
|
||||
success=research_data.success,
|
||||
sources=filtered_sources,
|
||||
keyword_analysis=cleaned_keyword_analysis,
|
||||
competitor_analysis=cleaned_competitor_analysis,
|
||||
suggested_angles=research_data.suggested_angles, # Keep as-is for now
|
||||
search_widget=research_data.search_widget,
|
||||
search_queries=research_data.search_queries,
|
||||
grounding_metadata=filtered_grounding_metadata,
|
||||
error_message=research_data.error_message
|
||||
)
|
||||
|
||||
# Log filtering results
|
||||
self._log_filtering_results(original_counts, filtered_research)
|
||||
|
||||
return filtered_research
|
||||
|
||||
def filter_sources(self, sources: List[ResearchSource]) -> List[ResearchSource]:
|
||||
"""
|
||||
Filter sources based on quality, relevance, and recency criteria.
|
||||
|
||||
Args:
|
||||
sources: List of research sources to filter
|
||||
|
||||
Returns:
|
||||
Filtered list of high-quality sources
|
||||
"""
|
||||
if not sources:
|
||||
return []
|
||||
|
||||
filtered_sources = []
|
||||
|
||||
for source in sources:
|
||||
# Quality filters
|
||||
if not self._is_source_high_quality(source):
|
||||
continue
|
||||
|
||||
# Relevance filters
|
||||
if not self._is_source_relevant(source):
|
||||
continue
|
||||
|
||||
# Recency filters
|
||||
if not self._is_source_recent(source):
|
||||
continue
|
||||
|
||||
filtered_sources.append(source)
|
||||
|
||||
# Sort by credibility score and limit to max_sources
|
||||
filtered_sources.sort(key=lambda s: s.credibility_score or 0.8, reverse=True)
|
||||
filtered_sources = filtered_sources[:self.max_sources]
|
||||
|
||||
# Fail-open: if everything was filtered out, return a trimmed set of original sources
|
||||
if not filtered_sources and sources:
|
||||
logger.warning("All sources filtered out by thresholds. Falling back to top sources without strict filters.")
|
||||
fallback = sorted(
|
||||
sources,
|
||||
key=lambda s: (s.credibility_score or 0.8),
|
||||
reverse=True
|
||||
)[: self.max_sources]
|
||||
return fallback
|
||||
|
||||
logger.info(f"Filtered sources: {len(sources)} → {len(filtered_sources)}")
|
||||
return filtered_sources
|
||||
|
||||
def filter_grounding_metadata(self, grounding_metadata: Optional[GroundingMetadata]) -> Optional[GroundingMetadata]:
|
||||
"""
|
||||
Filter grounding metadata to keep only high-confidence, relevant data.
|
||||
|
||||
Args:
|
||||
grounding_metadata: Raw grounding metadata to filter
|
||||
|
||||
Returns:
|
||||
Filtered grounding metadata with high-quality data only
|
||||
"""
|
||||
if not grounding_metadata:
|
||||
return None
|
||||
|
||||
# Filter grounding chunks by confidence
|
||||
filtered_chunks = []
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
if chunk.confidence_score and chunk.confidence_score >= self.min_grounding_confidence:
|
||||
filtered_chunks.append(chunk)
|
||||
|
||||
# Limit chunks to max_grounding_chunks
|
||||
filtered_chunks = filtered_chunks[:self.max_grounding_chunks]
|
||||
|
||||
# Filter grounding supports by confidence
|
||||
filtered_supports = []
|
||||
for support in grounding_metadata.grounding_supports:
|
||||
if support.confidence_scores and max(support.confidence_scores) >= self.min_grounding_confidence:
|
||||
filtered_supports.append(support)
|
||||
|
||||
# Filter citations by type and relevance
|
||||
filtered_citations = []
|
||||
for citation in grounding_metadata.citations:
|
||||
if self._is_citation_relevant(citation):
|
||||
filtered_citations.append(citation)
|
||||
|
||||
# Fail-open strategies to avoid empty UI:
|
||||
if not filtered_chunks and grounding_metadata.grounding_chunks:
|
||||
logger.warning("All grounding chunks filtered out. Falling back to first N chunks without confidence filter.")
|
||||
filtered_chunks = grounding_metadata.grounding_chunks[: self.max_grounding_chunks]
|
||||
if not filtered_supports and grounding_metadata.grounding_supports:
|
||||
logger.warning("All grounding supports filtered out. Falling back to first N supports without confidence filter.")
|
||||
filtered_supports = grounding_metadata.grounding_supports[: self.max_grounding_chunks]
|
||||
|
||||
# Create filtered grounding metadata
|
||||
filtered_metadata = GroundingMetadata(
|
||||
grounding_chunks=filtered_chunks,
|
||||
grounding_supports=filtered_supports,
|
||||
citations=filtered_citations,
|
||||
search_entry_point=grounding_metadata.search_entry_point,
|
||||
web_search_queries=grounding_metadata.web_search_queries
|
||||
)
|
||||
|
||||
logger.info(f"Filtered grounding metadata: {len(grounding_metadata.grounding_chunks)} chunks → {len(filtered_chunks)} chunks")
|
||||
return filtered_metadata
|
||||
|
||||
def clean_keyword_analysis(self, keyword_analysis: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Clean and deduplicate keyword analysis data.
|
||||
|
||||
Args:
|
||||
keyword_analysis: Raw keyword analysis data
|
||||
|
||||
Returns:
|
||||
Cleaned and deduplicated keyword analysis
|
||||
"""
|
||||
if not keyword_analysis:
|
||||
return {}
|
||||
|
||||
cleaned_analysis = {}
|
||||
|
||||
# Clean and deduplicate keyword lists
|
||||
keyword_categories = ['primary', 'secondary', 'long_tail', 'semantic_keywords', 'trending_terms']
|
||||
|
||||
for category in keyword_categories:
|
||||
if category in keyword_analysis and isinstance(keyword_analysis[category], list):
|
||||
cleaned_keywords = self._clean_keyword_list(keyword_analysis[category])
|
||||
cleaned_analysis[category] = cleaned_keywords[:self.max_keywords_per_category]
|
||||
|
||||
# Clean other fields
|
||||
other_fields = ['search_intent', 'difficulty', 'analysis_insights']
|
||||
for field in other_fields:
|
||||
if field in keyword_analysis:
|
||||
cleaned_analysis[field] = keyword_analysis[field]
|
||||
|
||||
# Clean content gaps separately (handled by filter_content_gaps)
|
||||
# Don't add content_gaps if it's empty to avoid adding empty lists
|
||||
if 'content_gaps' in keyword_analysis and keyword_analysis['content_gaps']:
|
||||
cleaned_analysis['content_gaps'] = keyword_analysis['content_gaps'] # Will be filtered later
|
||||
|
||||
logger.info(f"Cleaned keyword analysis: {len(keyword_analysis)} categories → {len(cleaned_analysis)} categories")
|
||||
return cleaned_analysis
|
||||
|
||||
def clean_competitor_analysis(self, competitor_analysis: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Clean and validate competitor analysis data.
|
||||
|
||||
Args:
|
||||
competitor_analysis: Raw competitor analysis data
|
||||
|
||||
Returns:
|
||||
Cleaned competitor analysis data
|
||||
"""
|
||||
if not competitor_analysis:
|
||||
return {}
|
||||
|
||||
cleaned_analysis = {}
|
||||
|
||||
# Clean competitor lists
|
||||
competitor_lists = ['top_competitors', 'opportunities', 'competitive_advantages']
|
||||
for field in competitor_lists:
|
||||
if field in competitor_analysis and isinstance(competitor_analysis[field], list):
|
||||
cleaned_list = [item.strip() for item in competitor_analysis[field] if item.strip()]
|
||||
cleaned_analysis[field] = cleaned_list[:10] # Limit to top 10
|
||||
|
||||
# Clean other fields
|
||||
other_fields = ['market_positioning', 'competitive_landscape', 'market_share']
|
||||
for field in other_fields:
|
||||
if field in competitor_analysis:
|
||||
cleaned_analysis[field] = competitor_analysis[field]
|
||||
|
||||
logger.info(f"Cleaned competitor analysis: {len(competitor_analysis)} fields → {len(cleaned_analysis)} fields")
|
||||
return cleaned_analysis
|
||||
|
||||
def filter_content_gaps(self, content_gaps: List[str], research_data: BlogResearchResponse) -> List[str]:
|
||||
"""
|
||||
Filter content gaps to keep only actionable, high-value ones.
|
||||
|
||||
Args:
|
||||
content_gaps: List of identified content gaps
|
||||
research_data: Research data for context
|
||||
|
||||
Returns:
|
||||
Filtered list of actionable content gaps
|
||||
"""
|
||||
if not content_gaps:
|
||||
return []
|
||||
|
||||
filtered_gaps = []
|
||||
|
||||
for gap in content_gaps:
|
||||
# Quality filters
|
||||
if not self._is_gap_high_quality(gap):
|
||||
continue
|
||||
|
||||
# Relevance filters
|
||||
if not self._is_gap_relevant_to_topic(gap, research_data):
|
||||
continue
|
||||
|
||||
# Actionability filters
|
||||
if not self._is_gap_actionable(gap):
|
||||
continue
|
||||
|
||||
filtered_gaps.append(gap)
|
||||
|
||||
# Limit to max_content_gaps
|
||||
filtered_gaps = filtered_gaps[:self.max_content_gaps]
|
||||
|
||||
logger.info(f"Filtered content gaps: {len(content_gaps)} → {len(filtered_gaps)}")
|
||||
return filtered_gaps
|
||||
|
||||
# Private helper methods
|
||||
|
||||
def _is_source_high_quality(self, source: ResearchSource) -> bool:
|
||||
"""Check if source meets quality criteria."""
|
||||
# Credibility score check
|
||||
if source.credibility_score and source.credibility_score < self.min_credibility_score:
|
||||
return False
|
||||
|
||||
# Excerpt length check
|
||||
if source.excerpt and len(source.excerpt) < self.min_excerpt_length:
|
||||
return False
|
||||
|
||||
# Title quality check
|
||||
if not source.title or len(source.title.strip()) < 10:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_source_relevant(self, source: ResearchSource) -> bool:
|
||||
"""Check if source is relevant (not irrelevant patterns)."""
|
||||
if not source.url:
|
||||
return True # Keep sources without URLs
|
||||
|
||||
# Check against irrelevant patterns
|
||||
for pattern in self.irrelevant_patterns:
|
||||
if re.search(pattern, source.url, re.IGNORECASE):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_source_recent(self, source: ResearchSource) -> bool:
|
||||
"""Check if source is recent enough."""
|
||||
if not source.published_at:
|
||||
return True # Keep sources without dates
|
||||
|
||||
try:
|
||||
# Parse date (assuming ISO format or common formats)
|
||||
published_date = self._parse_date(source.published_at)
|
||||
if published_date:
|
||||
cutoff_date = datetime.now() - timedelta(days=self.max_source_age_days)
|
||||
return published_date >= cutoff_date
|
||||
except Exception as e:
|
||||
logger.warning(f"Error parsing date '{source.published_at}': {e}")
|
||||
|
||||
return True # Keep sources with unparseable dates
|
||||
|
||||
def _is_citation_relevant(self, citation: Citation) -> bool:
|
||||
"""Check if citation is relevant and high-quality."""
|
||||
# Check citation type
|
||||
relevant_types = ['expert_opinion', 'statistical_data', 'recent_news', 'research_study']
|
||||
if citation.citation_type not in relevant_types:
|
||||
return False
|
||||
|
||||
# Check text quality
|
||||
if not citation.text or len(citation.text.strip()) < 20:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_gap_high_quality(self, gap: str) -> bool:
|
||||
"""Check if content gap is high quality."""
|
||||
gap = gap.strip()
|
||||
|
||||
# Length check
|
||||
if len(gap) < 10:
|
||||
return False
|
||||
|
||||
# Generic gap check
|
||||
generic_gaps = ['general', 'overview', 'introduction', 'basics', 'fundamentals']
|
||||
if gap.lower() in generic_gaps:
|
||||
return False
|
||||
|
||||
# Check for meaningful content
|
||||
if len(gap.split()) < 3:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _is_gap_relevant_to_topic(self, gap: str, research_data: BlogResearchResponse) -> bool:
|
||||
"""Check if content gap is relevant to the research topic."""
|
||||
# Simple relevance check - could be enhanced with more sophisticated matching
|
||||
primary_keywords = research_data.keyword_analysis.get('primary', [])
|
||||
|
||||
if not primary_keywords:
|
||||
return True # Keep gaps if no keywords available
|
||||
|
||||
gap_lower = gap.lower()
|
||||
for keyword in primary_keywords:
|
||||
if keyword.lower() in gap_lower:
|
||||
return True
|
||||
|
||||
# If no direct keyword match, check for common AI-related terms
|
||||
ai_terms = ['ai', 'artificial intelligence', 'machine learning', 'automation', 'technology', 'digital']
|
||||
for term in ai_terms:
|
||||
if term in gap_lower:
|
||||
return True
|
||||
|
||||
return True # Default to keeping gaps if no clear relevance check
|
||||
|
||||
def _is_gap_actionable(self, gap: str) -> bool:
|
||||
"""Check if content gap is actionable (can be addressed with content)."""
|
||||
gap_lower = gap.lower()
|
||||
|
||||
# Check for actionable indicators
|
||||
actionable_indicators = [
|
||||
'how to', 'guide', 'tutorial', 'steps', 'process', 'method',
|
||||
'best practices', 'tips', 'strategies', 'techniques', 'approach',
|
||||
'comparison', 'vs', 'versus', 'difference', 'pros and cons',
|
||||
'trends', 'future', '2024', '2025', 'emerging', 'new'
|
||||
]
|
||||
|
||||
for indicator in actionable_indicators:
|
||||
if indicator in gap_lower:
|
||||
return True
|
||||
|
||||
return True # Default to actionable if no specific indicators
|
||||
|
||||
def _clean_keyword_list(self, keywords: List[str]) -> List[str]:
|
||||
"""Clean and deduplicate a list of keywords."""
|
||||
cleaned_keywords = []
|
||||
seen_keywords = set()
|
||||
|
||||
for keyword in keywords:
|
||||
if not keyword or not isinstance(keyword, str):
|
||||
continue
|
||||
|
||||
# Clean keyword
|
||||
cleaned_keyword = keyword.strip().lower()
|
||||
|
||||
# Skip empty or too short keywords
|
||||
if len(cleaned_keyword) < 2:
|
||||
continue
|
||||
|
||||
# Skip stop words
|
||||
if cleaned_keyword in self.stop_words:
|
||||
continue
|
||||
|
||||
# Skip duplicates
|
||||
if cleaned_keyword in seen_keywords:
|
||||
continue
|
||||
|
||||
cleaned_keywords.append(cleaned_keyword)
|
||||
seen_keywords.add(cleaned_keyword)
|
||||
|
||||
return cleaned_keywords
|
||||
|
||||
def _parse_date(self, date_str: str) -> Optional[datetime]:
|
||||
"""Parse date string into datetime object."""
|
||||
if not date_str:
|
||||
return None
|
||||
|
||||
# Common date formats
|
||||
date_formats = [
|
||||
'%Y-%m-%d',
|
||||
'%Y-%m-%dT%H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%SZ',
|
||||
'%Y-%m-%dT%H:%M:%S.%fZ',
|
||||
'%B %d, %Y',
|
||||
'%b %d, %Y',
|
||||
'%d %B %Y',
|
||||
'%d %b %Y',
|
||||
'%m/%d/%Y',
|
||||
'%d/%m/%Y'
|
||||
]
|
||||
|
||||
for fmt in date_formats:
|
||||
try:
|
||||
return datetime.strptime(date_str, fmt)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def _log_filtering_results(self, original_counts: Dict[str, int], filtered_research: BlogResearchResponse):
|
||||
"""Log the results of filtering operations."""
|
||||
filtered_counts = {
|
||||
'sources': len(filtered_research.sources),
|
||||
'grounding_chunks': len(filtered_research.grounding_metadata.grounding_chunks) if filtered_research.grounding_metadata else 0,
|
||||
'grounding_supports': len(filtered_research.grounding_metadata.grounding_supports) if filtered_research.grounding_metadata else 0,
|
||||
'citations': len(filtered_research.grounding_metadata.citations) if filtered_research.grounding_metadata else 0,
|
||||
}
|
||||
|
||||
logger.info("📊 Research Data Filtering Results:")
|
||||
for key, original_count in original_counts.items():
|
||||
filtered_count = filtered_counts[key]
|
||||
reduction_percent = ((original_count - filtered_count) / original_count * 100) if original_count > 0 else 0
|
||||
logger.info(f" {key}: {original_count} → {filtered_count} ({reduction_percent:.1f}% reduction)")
|
||||
|
||||
# Log content gaps filtering
|
||||
original_gaps = len(filtered_research.keyword_analysis.get('content_gaps', []))
|
||||
logger.info(f" content_gaps: {original_gaps} → {len(filtered_research.keyword_analysis.get('content_gaps', []))}")
|
||||
|
||||
logger.info("✅ Research data filtering completed successfully")
|
||||
@@ -11,11 +11,16 @@ from models.blog_models import (
|
||||
BlogResearchRequest,
|
||||
BlogResearchResponse,
|
||||
ResearchSource,
|
||||
GroundingMetadata,
|
||||
GroundingChunk,
|
||||
GroundingSupport,
|
||||
Citation,
|
||||
)
|
||||
|
||||
from .keyword_analyzer import KeywordAnalyzer
|
||||
from .competitor_analyzer import CompetitorAnalyzer
|
||||
from .content_angle_generator import ContentAngleGenerator
|
||||
from .data_filter import ResearchDataFilter
|
||||
|
||||
|
||||
class ResearchService:
|
||||
@@ -25,6 +30,7 @@ class ResearchService:
|
||||
self.keyword_analyzer = KeywordAnalyzer()
|
||||
self.competitor_analyzer = CompetitorAnalyzer()
|
||||
self.content_angle_generator = ContentAngleGenerator()
|
||||
self.data_filter = ResearchDataFilter()
|
||||
|
||||
async def research(self, request: BlogResearchRequest) -> BlogResearchResponse:
|
||||
"""
|
||||
@@ -85,6 +91,9 @@ class ResearchService:
|
||||
# Extract sources from grounding metadata
|
||||
sources = self._extract_sources_from_grounding(gemini_result)
|
||||
|
||||
# Extract grounding metadata for detailed UI display
|
||||
grounding_metadata = self._extract_grounding_metadata(gemini_result)
|
||||
|
||||
# Extract search widget and queries for UI display
|
||||
search_widget = gemini_result.get("search_widget", "") or ""
|
||||
search_queries = gemini_result.get("search_queries", []) or []
|
||||
@@ -107,17 +116,31 @@ class ResearchService:
|
||||
# Add search widget and queries for UI display
|
||||
search_widget=search_widget if 'search_widget' in locals() else "",
|
||||
search_queries=search_queries if 'search_queries' in locals() else [],
|
||||
# Add grounding metadata for detailed UI display
|
||||
grounding_metadata=grounding_metadata,
|
||||
)
|
||||
|
||||
# Cache the successful result for future exact keyword matches
|
||||
# Filter and clean research data for optimal AI processing
|
||||
filtered_response = self.data_filter.filter_research_data(response)
|
||||
logger.info("Research data filtering completed successfully")
|
||||
|
||||
# Cache the successful result for future exact keyword matches (both caches)
|
||||
persistent_research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
# Also cache in memory for faster access
|
||||
research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=response.dict()
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
return response
|
||||
return filtered_response
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
@@ -142,27 +165,38 @@ class ResearchService:
|
||||
try:
|
||||
from services.llm_providers.gemini_grounded_provider import GeminiGroundedProvider
|
||||
from services.cache.research_cache import research_cache
|
||||
from api.blog_writer.router import _update_progress
|
||||
from services.cache.persistent_research_cache import persistent_research_cache
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
|
||||
topic = request.topic or ", ".join(request.keywords)
|
||||
industry = request.industry or (request.persona.industry if request.persona and request.persona.industry else "General")
|
||||
target_audience = getattr(request.persona, 'target_audience', 'General') if request.persona else 'General'
|
||||
|
||||
# Check cache first for exact keyword match
|
||||
await _update_progress(task_id, "🔍 Checking cache for existing research...")
|
||||
cached_result = research_cache.get_cached_result(
|
||||
# Check cache first for exact keyword match (try both caches)
|
||||
await task_manager.update_progress(task_id, "🔍 Checking cache for existing research...")
|
||||
|
||||
# Try persistent cache first (survives restarts)
|
||||
cached_result = persistent_research_cache.get_cached_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience
|
||||
)
|
||||
|
||||
# Fallback to in-memory cache
|
||||
if not cached_result:
|
||||
cached_result = research_cache.get_cached_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience
|
||||
)
|
||||
|
||||
if cached_result:
|
||||
await _update_progress(task_id, "✅ Found cached research results! Returning instantly...")
|
||||
await task_manager.update_progress(task_id, "✅ Found cached research results! Returning instantly...")
|
||||
logger.info(f"Returning cached research result for keywords: {request.keywords}")
|
||||
return BlogResearchResponse(**cached_result)
|
||||
|
||||
# Cache miss - proceed with API call
|
||||
await _update_progress(task_id, "🌐 Cache miss - connecting to Google Search grounding...")
|
||||
await task_manager.update_progress(task_id, "🌐 Cache miss - connecting to Google Search grounding...")
|
||||
logger.info(f"Cache miss - making API call for keywords: {request.keywords}")
|
||||
gemini = GeminiGroundedProvider()
|
||||
|
||||
@@ -185,7 +219,7 @@ class ResearchService:
|
||||
Structure your response with clear sections for each analysis area.
|
||||
"""
|
||||
|
||||
await _update_progress(task_id, "🤖 Making AI request to Gemini with Google Search grounding...")
|
||||
await task_manager.update_progress(task_id, "🤖 Making AI request to Gemini with Google Search grounding...")
|
||||
# Single Gemini call with native Google Search grounding - no fallbacks
|
||||
gemini_result = await gemini.generate_grounded_content(
|
||||
prompt=research_prompt,
|
||||
@@ -193,22 +227,25 @@ class ResearchService:
|
||||
max_tokens=2000
|
||||
)
|
||||
|
||||
await _update_progress(task_id, "📊 Processing research results and extracting insights...")
|
||||
await task_manager.update_progress(task_id, "📊 Processing research results and extracting insights...")
|
||||
# Extract sources from grounding metadata
|
||||
sources = self._extract_sources_from_grounding(gemini_result)
|
||||
|
||||
# Extract grounding metadata for detailed UI display
|
||||
grounding_metadata = self._extract_grounding_metadata(gemini_result)
|
||||
|
||||
# Extract search widget and queries for UI display
|
||||
search_widget = gemini_result.get("search_widget", "") or ""
|
||||
search_queries = gemini_result.get("search_queries", []) or []
|
||||
|
||||
await _update_progress(task_id, "🔍 Analyzing keywords and content angles...")
|
||||
await task_manager.update_progress(task_id, "🔍 Analyzing keywords and content angles...")
|
||||
# Parse the comprehensive response for different analysis components
|
||||
content = gemini_result.get("content", "")
|
||||
keyword_analysis = self.keyword_analyzer.analyze(content, request.keywords)
|
||||
competitor_analysis = self.competitor_analyzer.analyze(content)
|
||||
suggested_angles = self.content_angle_generator.generate(content, topic, industry)
|
||||
|
||||
await _update_progress(task_id, "💾 Caching results for future use...")
|
||||
await task_manager.update_progress(task_id, "💾 Caching results for future use...")
|
||||
logger.info(f"Research completed successfully with {len(sources)} sources and {len(search_queries)} search queries")
|
||||
|
||||
# Create the response
|
||||
@@ -221,17 +258,34 @@ class ResearchService:
|
||||
# Add search widget and queries for UI display
|
||||
search_widget=search_widget if 'search_widget' in locals() else "",
|
||||
search_queries=search_queries if 'search_queries' in locals() else [],
|
||||
# Add grounding metadata for detailed UI display
|
||||
grounding_metadata=grounding_metadata,
|
||||
# Preserve original user keywords for caching
|
||||
original_keywords=request.keywords,
|
||||
)
|
||||
|
||||
# Cache the successful result for future exact keyword matches
|
||||
# Filter and clean research data for optimal AI processing
|
||||
await task_manager.update_progress(task_id, "🔍 Filtering and cleaning research data...")
|
||||
filtered_response = self.data_filter.filter_research_data(response)
|
||||
logger.info("Research data filtering completed successfully")
|
||||
|
||||
# Cache the successful result for future exact keyword matches (both caches)
|
||||
persistent_research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
# Also cache in memory for faster access
|
||||
research_cache.cache_result(
|
||||
keywords=request.keywords,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
result=response.dict()
|
||||
result=filtered_response.dict()
|
||||
)
|
||||
|
||||
return response
|
||||
return filtered_response
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
@@ -261,8 +315,104 @@ class ResearchService:
|
||||
url=src.get("url", ""),
|
||||
excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
|
||||
credibility_score=float(src.get("credibility_score", 0.8)),
|
||||
published_at=str(src.get("publication_date", "2024-01-01"))
|
||||
published_at=str(src.get("publication_date", "2024-01-01")),
|
||||
index=src.get("index"),
|
||||
source_type=src.get("type", "web")
|
||||
)
|
||||
sources.append(source)
|
||||
|
||||
return sources
|
||||
|
||||
def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> GroundingMetadata:
|
||||
"""Extract detailed grounding metadata from Gemini result."""
|
||||
grounding_chunks = []
|
||||
grounding_supports = []
|
||||
citations = []
|
||||
|
||||
# Extract grounding chunks from the raw grounding metadata
|
||||
raw_grounding = gemini_result.get("grounding_metadata", {})
|
||||
|
||||
# Handle case where grounding_metadata might be a GroundingMetadata object
|
||||
if hasattr(raw_grounding, 'grounding_chunks'):
|
||||
raw_chunks = raw_grounding.grounding_chunks
|
||||
else:
|
||||
raw_chunks = raw_grounding.get("grounding_chunks", [])
|
||||
|
||||
for chunk in raw_chunks:
|
||||
if "web" in chunk:
|
||||
web_data = chunk["web"]
|
||||
grounding_chunk = GroundingChunk(
|
||||
title=web_data.get("title", "Untitled"),
|
||||
url=web_data.get("uri", ""),
|
||||
confidence_score=None # Will be set from supports
|
||||
)
|
||||
grounding_chunks.append(grounding_chunk)
|
||||
|
||||
# Extract grounding supports with confidence scores
|
||||
if hasattr(raw_grounding, 'grounding_supports'):
|
||||
raw_supports = raw_grounding.grounding_supports
|
||||
else:
|
||||
raw_supports = raw_grounding.get("grounding_supports", [])
|
||||
for support in raw_supports:
|
||||
# Handle both dictionary and GroundingSupport object formats
|
||||
if hasattr(support, 'confidence_scores'):
|
||||
confidence_scores = support.confidence_scores
|
||||
chunk_indices = support.grounding_chunk_indices
|
||||
segment_text = getattr(support, 'segment_text', '')
|
||||
start_index = getattr(support, 'start_index', None)
|
||||
end_index = getattr(support, 'end_index', None)
|
||||
else:
|
||||
confidence_scores = support.get("confidence_scores", [])
|
||||
chunk_indices = support.get("grounding_chunk_indices", [])
|
||||
segment = support.get("segment", {})
|
||||
segment_text = segment.get("text", "")
|
||||
start_index = segment.get("start_index")
|
||||
end_index = segment.get("end_index")
|
||||
|
||||
grounding_support = GroundingSupport(
|
||||
confidence_scores=confidence_scores,
|
||||
grounding_chunk_indices=chunk_indices,
|
||||
segment_text=segment_text,
|
||||
start_index=start_index,
|
||||
end_index=end_index
|
||||
)
|
||||
grounding_supports.append(grounding_support)
|
||||
|
||||
# Update confidence scores for chunks
|
||||
if confidence_scores and chunk_indices:
|
||||
avg_confidence = sum(confidence_scores) / len(confidence_scores)
|
||||
for idx in chunk_indices:
|
||||
if idx < len(grounding_chunks):
|
||||
grounding_chunks[idx].confidence_score = avg_confidence
|
||||
|
||||
# Extract citations from the raw result
|
||||
raw_citations = gemini_result.get("citations", [])
|
||||
for citation in raw_citations:
|
||||
citation_obj = Citation(
|
||||
citation_type=citation.get("type", "inline"),
|
||||
start_index=citation.get("start_index", 0),
|
||||
end_index=citation.get("end_index", 0),
|
||||
text=citation.get("text", ""),
|
||||
source_indices=citation.get("source_indices", []),
|
||||
reference=citation.get("reference", "")
|
||||
)
|
||||
citations.append(citation_obj)
|
||||
|
||||
# Extract search entry point and web search queries
|
||||
if hasattr(raw_grounding, 'search_entry_point'):
|
||||
search_entry_point = getattr(raw_grounding.search_entry_point, 'rendered_content', '') if raw_grounding.search_entry_point else ''
|
||||
else:
|
||||
search_entry_point = raw_grounding.get("search_entry_point", {}).get("rendered_content", "")
|
||||
|
||||
if hasattr(raw_grounding, 'web_search_queries'):
|
||||
web_search_queries = raw_grounding.web_search_queries
|
||||
else:
|
||||
web_search_queries = raw_grounding.get("web_search_queries", [])
|
||||
|
||||
return GroundingMetadata(
|
||||
grounding_chunks=grounding_chunks,
|
||||
grounding_supports=grounding_supports,
|
||||
citations=citations,
|
||||
search_entry_point=search_entry_point,
|
||||
web_search_queries=web_search_queries
|
||||
)
|
||||
|
||||
332
backend/services/cache/persistent_outline_cache.py
vendored
Normal file
332
backend/services/cache/persistent_outline_cache.py
vendored
Normal file
@@ -0,0 +1,332 @@
|
||||
"""
|
||||
Persistent Outline Cache Service
|
||||
|
||||
Provides database-backed caching for outline generation results to survive server restarts
|
||||
and provide better cache management across multiple instances.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import sqlite3
|
||||
from typing import Dict, Any, Optional, List
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class PersistentOutlineCache:
|
||||
"""Database-backed cache for outline generation results with exact parameter matching."""
|
||||
|
||||
def __init__(self, db_path: str = "outline_cache.db", max_cache_size: int = 500, cache_ttl_hours: int = 48):
|
||||
"""
|
||||
Initialize the persistent outline cache.
|
||||
|
||||
Args:
|
||||
db_path: Path to SQLite database file
|
||||
max_cache_size: Maximum number of cached entries
|
||||
cache_ttl_hours: Time-to-live for cache entries in hours (longer than research cache)
|
||||
"""
|
||||
self.db_path = db_path
|
||||
self.max_cache_size = max_cache_size
|
||||
self.cache_ttl = timedelta(hours=cache_ttl_hours)
|
||||
|
||||
# Ensure database directory exists
|
||||
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Initialize database
|
||||
self._init_database()
|
||||
|
||||
def _init_database(self):
|
||||
"""Initialize the SQLite database with required tables."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS outline_cache (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
cache_key TEXT UNIQUE NOT NULL,
|
||||
keywords TEXT NOT NULL,
|
||||
industry TEXT NOT NULL,
|
||||
target_audience TEXT NOT NULL,
|
||||
word_count INTEGER NOT NULL,
|
||||
custom_instructions TEXT,
|
||||
persona_data TEXT,
|
||||
result_data TEXT NOT NULL,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
expires_at TIMESTAMP NOT NULL,
|
||||
access_count INTEGER DEFAULT 0,
|
||||
last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
# Create indexes for better performance
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_outline_cache_key ON outline_cache(cache_key)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_outline_expires_at ON outline_cache(expires_at)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_outline_created_at ON outline_cache(created_at)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_outline_keywords ON outline_cache(keywords)")
|
||||
|
||||
conn.commit()
|
||||
|
||||
def _generate_cache_key(self, keywords: List[str], industry: str, target_audience: str,
|
||||
word_count: int, custom_instructions: str = None, persona_data: Dict = None) -> str:
|
||||
"""
|
||||
Generate a cache key based on exact parameter match.
|
||||
|
||||
Args:
|
||||
keywords: List of research keywords
|
||||
industry: Industry context
|
||||
target_audience: Target audience context
|
||||
word_count: Target word count for outline
|
||||
custom_instructions: Custom instructions for outline generation
|
||||
persona_data: Persona information
|
||||
|
||||
Returns:
|
||||
MD5 hash of the normalized parameters
|
||||
"""
|
||||
# Normalize and sort keywords for consistent hashing
|
||||
normalized_keywords = sorted([kw.lower().strip() for kw in keywords])
|
||||
normalized_industry = industry.lower().strip() if industry else "general"
|
||||
normalized_audience = target_audience.lower().strip() if target_audience else "general"
|
||||
normalized_instructions = custom_instructions.lower().strip() if custom_instructions else ""
|
||||
|
||||
# Normalize persona data
|
||||
normalized_persona = ""
|
||||
if persona_data:
|
||||
# Sort persona keys and values for consistent hashing
|
||||
persona_str = json.dumps(persona_data, sort_keys=True, default=str)
|
||||
normalized_persona = persona_str.lower()
|
||||
|
||||
# Create a consistent string representation
|
||||
cache_string = f"{normalized_keywords}|{normalized_industry}|{normalized_audience}|{word_count}|{normalized_instructions}|{normalized_persona}"
|
||||
|
||||
# Generate MD5 hash
|
||||
return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
|
||||
|
||||
def _cleanup_expired_entries(self):
|
||||
"""Remove expired cache entries from database."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute(
|
||||
"DELETE FROM outline_cache WHERE expires_at < ?",
|
||||
(datetime.now().isoformat(),)
|
||||
)
|
||||
deleted_count = cursor.rowcount
|
||||
if deleted_count > 0:
|
||||
logger.debug(f"Removed {deleted_count} expired outline cache entries")
|
||||
conn.commit()
|
||||
|
||||
def _evict_oldest_entries(self, num_to_evict: int):
|
||||
"""Evict the oldest cache entries when cache is full."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Get oldest entries by creation time
|
||||
cursor = conn.execute("""
|
||||
SELECT id FROM outline_cache
|
||||
ORDER BY created_at ASC
|
||||
LIMIT ?
|
||||
""", (num_to_evict,))
|
||||
|
||||
old_ids = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
if old_ids:
|
||||
placeholders = ','.join(['?' for _ in old_ids])
|
||||
conn.execute(f"DELETE FROM outline_cache WHERE id IN ({placeholders})", old_ids)
|
||||
logger.debug(f"Evicted {len(old_ids)} oldest outline cache entries")
|
||||
|
||||
conn.commit()
|
||||
|
||||
def get_cached_outline(self, keywords: List[str], industry: str, target_audience: str,
|
||||
word_count: int, custom_instructions: str = None, persona_data: Dict = None) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get cached outline result for exact parameter match.
|
||||
|
||||
Args:
|
||||
keywords: List of research keywords
|
||||
industry: Industry context
|
||||
target_audience: Target audience context
|
||||
word_count: Target word count for outline
|
||||
custom_instructions: Custom instructions for outline generation
|
||||
persona_data: Persona information
|
||||
|
||||
Returns:
|
||||
Cached outline result if found and valid, None otherwise
|
||||
"""
|
||||
cache_key = self._generate_cache_key(keywords, industry, target_audience, word_count, custom_instructions, persona_data)
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("""
|
||||
SELECT result_data, expires_at FROM outline_cache
|
||||
WHERE cache_key = ? AND expires_at > ?
|
||||
""", (cache_key, datetime.now().isoformat()))
|
||||
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row is None:
|
||||
logger.debug(f"Outline cache miss for keywords: {keywords}, word_count: {word_count}")
|
||||
return None
|
||||
|
||||
# Update access statistics
|
||||
conn.execute("""
|
||||
UPDATE outline_cache
|
||||
SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
|
||||
WHERE cache_key = ?
|
||||
""", (cache_key,))
|
||||
conn.commit()
|
||||
|
||||
try:
|
||||
result_data = json.loads(row[0])
|
||||
logger.info(f"Outline cache hit for keywords: {keywords}, word_count: {word_count} (saved expensive generation)")
|
||||
return result_data
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"Invalid JSON in outline cache for keywords: {keywords}")
|
||||
# Remove invalid entry
|
||||
conn.execute("DELETE FROM outline_cache WHERE cache_key = ?", (cache_key,))
|
||||
conn.commit()
|
||||
return None
|
||||
|
||||
def cache_outline(self, keywords: List[str], industry: str, target_audience: str,
|
||||
word_count: int, custom_instructions: str, persona_data: Dict, result: Dict[str, Any]):
|
||||
"""
|
||||
Cache an outline generation result.
|
||||
|
||||
Args:
|
||||
keywords: List of research keywords
|
||||
industry: Industry context
|
||||
target_audience: Target audience context
|
||||
word_count: Target word count for outline
|
||||
custom_instructions: Custom instructions for outline generation
|
||||
persona_data: Persona information
|
||||
result: Outline result to cache
|
||||
"""
|
||||
cache_key = self._generate_cache_key(keywords, industry, target_audience, word_count, custom_instructions, persona_data)
|
||||
|
||||
# Cleanup expired entries first
|
||||
self._cleanup_expired_entries()
|
||||
|
||||
# Check if cache is full and evict if necessary
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM outline_cache")
|
||||
current_count = cursor.fetchone()[0]
|
||||
|
||||
if current_count >= self.max_cache_size:
|
||||
num_to_evict = current_count - self.max_cache_size + 1
|
||||
self._evict_oldest_entries(num_to_evict)
|
||||
|
||||
# Store the result
|
||||
expires_at = datetime.now() + self.cache_ttl
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("""
|
||||
INSERT OR REPLACE INTO outline_cache
|
||||
(cache_key, keywords, industry, target_audience, word_count, custom_instructions, persona_data, result_data, expires_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
cache_key,
|
||||
json.dumps(keywords),
|
||||
industry,
|
||||
target_audience,
|
||||
word_count,
|
||||
custom_instructions or "",
|
||||
json.dumps(persona_data) if persona_data else "",
|
||||
json.dumps(result),
|
||||
expires_at.isoformat()
|
||||
))
|
||||
conn.commit()
|
||||
|
||||
logger.info(f"Cached outline result for keywords: {keywords}, word_count: {word_count}")
|
||||
|
||||
def get_cache_stats(self) -> Dict[str, Any]:
|
||||
"""Get cache statistics."""
|
||||
self._cleanup_expired_entries()
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Get basic stats
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM outline_cache")
|
||||
total_entries = cursor.fetchone()[0]
|
||||
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM outline_cache WHERE expires_at > ?", (datetime.now().isoformat(),))
|
||||
valid_entries = cursor.fetchone()[0]
|
||||
|
||||
# Get most accessed entries
|
||||
cursor = conn.execute("""
|
||||
SELECT keywords, industry, target_audience, word_count, access_count, created_at
|
||||
FROM outline_cache
|
||||
ORDER BY access_count DESC
|
||||
LIMIT 10
|
||||
""")
|
||||
top_entries = [
|
||||
{
|
||||
'keywords': json.loads(row[0]),
|
||||
'industry': row[1],
|
||||
'target_audience': row[2],
|
||||
'word_count': row[3],
|
||||
'access_count': row[4],
|
||||
'created_at': row[5]
|
||||
}
|
||||
for row in cursor.fetchall()
|
||||
]
|
||||
|
||||
# Get database size
|
||||
cursor = conn.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
|
||||
db_size_bytes = cursor.fetchone()[0]
|
||||
db_size_mb = db_size_bytes / (1024 * 1024)
|
||||
|
||||
return {
|
||||
'total_entries': total_entries,
|
||||
'valid_entries': valid_entries,
|
||||
'expired_entries': total_entries - valid_entries,
|
||||
'max_size': self.max_cache_size,
|
||||
'ttl_hours': self.cache_ttl.total_seconds() / 3600,
|
||||
'database_size_mb': round(db_size_mb, 2),
|
||||
'top_accessed_entries': top_entries
|
||||
}
|
||||
|
||||
def clear_cache(self):
|
||||
"""Clear all cached entries."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("DELETE FROM outline_cache")
|
||||
conn.commit()
|
||||
logger.info("Outline cache cleared")
|
||||
|
||||
def get_cache_entries(self, limit: int = 50) -> List[Dict[str, Any]]:
|
||||
"""Get recent cache entries for debugging."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("""
|
||||
SELECT keywords, industry, target_audience, word_count, custom_instructions, created_at, expires_at, access_count
|
||||
FROM outline_cache
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?
|
||||
""", (limit,))
|
||||
|
||||
return [
|
||||
{
|
||||
'keywords': json.loads(row[0]),
|
||||
'industry': row[1],
|
||||
'target_audience': row[2],
|
||||
'word_count': row[3],
|
||||
'custom_instructions': row[4],
|
||||
'created_at': row[5],
|
||||
'expires_at': row[6],
|
||||
'access_count': row[7]
|
||||
}
|
||||
for row in cursor.fetchall()
|
||||
]
|
||||
|
||||
def invalidate_cache_for_keywords(self, keywords: List[str]):
|
||||
"""
|
||||
Invalidate all cache entries for specific keywords.
|
||||
Useful when research data is updated.
|
||||
|
||||
Args:
|
||||
keywords: Keywords to invalidate cache for
|
||||
"""
|
||||
normalized_keywords = sorted([kw.lower().strip() for kw in keywords])
|
||||
keywords_json = json.dumps(normalized_keywords)
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("DELETE FROM outline_cache WHERE keywords = ?", (keywords_json,))
|
||||
deleted_count = cursor.rowcount
|
||||
conn.commit()
|
||||
|
||||
if deleted_count > 0:
|
||||
logger.info(f"Invalidated {deleted_count} outline cache entries for keywords: {keywords}")
|
||||
|
||||
|
||||
# Global persistent cache instance
|
||||
persistent_outline_cache = PersistentOutlineCache()
|
||||
283
backend/services/cache/persistent_research_cache.py
vendored
Normal file
283
backend/services/cache/persistent_research_cache.py
vendored
Normal file
@@ -0,0 +1,283 @@
|
||||
"""
|
||||
Persistent Research Cache Service
|
||||
|
||||
Provides database-backed caching for research results to survive server restarts
|
||||
and provide better cache management across multiple instances.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import sqlite3
|
||||
from typing import Dict, Any, Optional, List
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class PersistentResearchCache:
|
||||
"""Database-backed cache for research results with exact keyword matching."""
|
||||
|
||||
def __init__(self, db_path: str = "research_cache.db", max_cache_size: int = 1000, cache_ttl_hours: int = 24):
|
||||
"""
|
||||
Initialize the persistent research cache.
|
||||
|
||||
Args:
|
||||
db_path: Path to SQLite database file
|
||||
max_cache_size: Maximum number of cached entries
|
||||
cache_ttl_hours: Time-to-live for cache entries in hours
|
||||
"""
|
||||
self.db_path = db_path
|
||||
self.max_cache_size = max_cache_size
|
||||
self.cache_ttl = timedelta(hours=cache_ttl_hours)
|
||||
|
||||
# Ensure database directory exists
|
||||
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Initialize database
|
||||
self._init_database()
|
||||
|
||||
def _init_database(self):
|
||||
"""Initialize the SQLite database with required tables."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS research_cache (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
cache_key TEXT UNIQUE NOT NULL,
|
||||
keywords TEXT NOT NULL,
|
||||
industry TEXT NOT NULL,
|
||||
target_audience TEXT NOT NULL,
|
||||
result_data TEXT NOT NULL,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
expires_at TIMESTAMP NOT NULL,
|
||||
access_count INTEGER DEFAULT 0,
|
||||
last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
# Create indexes for better performance
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_cache_key ON research_cache(cache_key)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_expires_at ON research_cache(expires_at)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_created_at ON research_cache(created_at)")
|
||||
|
||||
conn.commit()
|
||||
|
||||
def _generate_cache_key(self, keywords: List[str], industry: str, target_audience: str) -> str:
|
||||
"""
|
||||
Generate a cache key based on exact keyword match.
|
||||
|
||||
Args:
|
||||
keywords: List of research keywords
|
||||
industry: Industry context
|
||||
target_audience: Target audience context
|
||||
|
||||
Returns:
|
||||
MD5 hash of the normalized parameters
|
||||
"""
|
||||
# Normalize and sort keywords for consistent hashing
|
||||
normalized_keywords = sorted([kw.lower().strip() for kw in keywords])
|
||||
normalized_industry = industry.lower().strip() if industry else "general"
|
||||
normalized_audience = target_audience.lower().strip() if target_audience else "general"
|
||||
|
||||
# Create a consistent string representation
|
||||
cache_string = f"{normalized_keywords}|{normalized_industry}|{normalized_audience}"
|
||||
|
||||
# Generate MD5 hash
|
||||
return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
|
||||
|
||||
def _cleanup_expired_entries(self):
|
||||
"""Remove expired cache entries from database."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute(
|
||||
"DELETE FROM research_cache WHERE expires_at < ?",
|
||||
(datetime.now().isoformat(),)
|
||||
)
|
||||
deleted_count = cursor.rowcount
|
||||
if deleted_count > 0:
|
||||
logger.debug(f"Removed {deleted_count} expired cache entries")
|
||||
conn.commit()
|
||||
|
||||
def _evict_oldest_entries(self, num_to_evict: int):
|
||||
"""Evict the oldest cache entries when cache is full."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Get oldest entries by creation time
|
||||
cursor = conn.execute("""
|
||||
SELECT id FROM research_cache
|
||||
ORDER BY created_at ASC
|
||||
LIMIT ?
|
||||
""", (num_to_evict,))
|
||||
|
||||
old_ids = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
if old_ids:
|
||||
placeholders = ','.join(['?' for _ in old_ids])
|
||||
conn.execute(f"DELETE FROM research_cache WHERE id IN ({placeholders})", old_ids)
|
||||
logger.debug(f"Evicted {len(old_ids)} oldest cache entries")
|
||||
|
||||
conn.commit()
|
||||
|
||||
def get_cached_result(self, keywords: List[str], industry: str, target_audience: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get cached research result for exact keyword match.
|
||||
|
||||
Args:
|
||||
keywords: List of research keywords
|
||||
industry: Industry context
|
||||
target_audience: Target audience context
|
||||
|
||||
Returns:
|
||||
Cached research result if found and valid, None otherwise
|
||||
"""
|
||||
cache_key = self._generate_cache_key(keywords, industry, target_audience)
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("""
|
||||
SELECT result_data, expires_at FROM research_cache
|
||||
WHERE cache_key = ? AND expires_at > ?
|
||||
""", (cache_key, datetime.now().isoformat()))
|
||||
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row is None:
|
||||
logger.debug(f"Cache miss for keywords: {keywords}")
|
||||
return None
|
||||
|
||||
# Update access statistics
|
||||
conn.execute("""
|
||||
UPDATE research_cache
|
||||
SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
|
||||
WHERE cache_key = ?
|
||||
""", (cache_key,))
|
||||
conn.commit()
|
||||
|
||||
try:
|
||||
result_data = json.loads(row[0])
|
||||
logger.info(f"Cache hit for keywords: {keywords} (saved API call)")
|
||||
return result_data
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"Invalid JSON in cache for keywords: {keywords}")
|
||||
# Remove invalid entry
|
||||
conn.execute("DELETE FROM research_cache WHERE cache_key = ?", (cache_key,))
|
||||
conn.commit()
|
||||
return None
|
||||
|
||||
def cache_result(self, keywords: List[str], industry: str, target_audience: str, result: Dict[str, Any]):
|
||||
"""
|
||||
Cache a research result.
|
||||
|
||||
Args:
|
||||
keywords: List of research keywords
|
||||
industry: Industry context
|
||||
target_audience: Target audience context
|
||||
result: Research result to cache
|
||||
"""
|
||||
cache_key = self._generate_cache_key(keywords, industry, target_audience)
|
||||
|
||||
# Cleanup expired entries first
|
||||
self._cleanup_expired_entries()
|
||||
|
||||
# Check if cache is full and evict if necessary
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM research_cache")
|
||||
current_count = cursor.fetchone()[0]
|
||||
|
||||
if current_count >= self.max_cache_size:
|
||||
num_to_evict = current_count - self.max_cache_size + 1
|
||||
self._evict_oldest_entries(num_to_evict)
|
||||
|
||||
# Store the result
|
||||
expires_at = datetime.now() + self.cache_ttl
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("""
|
||||
INSERT OR REPLACE INTO research_cache
|
||||
(cache_key, keywords, industry, target_audience, result_data, expires_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
cache_key,
|
||||
json.dumps(keywords),
|
||||
industry,
|
||||
target_audience,
|
||||
json.dumps(result),
|
||||
expires_at.isoformat()
|
||||
))
|
||||
conn.commit()
|
||||
|
||||
logger.info(f"Cached research result for keywords: {keywords}")
|
||||
|
||||
def get_cache_stats(self) -> Dict[str, Any]:
|
||||
"""Get cache statistics."""
|
||||
self._cleanup_expired_entries()
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Get basic stats
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM research_cache")
|
||||
total_entries = cursor.fetchone()[0]
|
||||
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM research_cache WHERE expires_at > ?", (datetime.now().isoformat(),))
|
||||
valid_entries = cursor.fetchone()[0]
|
||||
|
||||
# Get most accessed entries
|
||||
cursor = conn.execute("""
|
||||
SELECT keywords, industry, target_audience, access_count, created_at
|
||||
FROM research_cache
|
||||
ORDER BY access_count DESC
|
||||
LIMIT 10
|
||||
""")
|
||||
top_entries = [
|
||||
{
|
||||
'keywords': json.loads(row[0]),
|
||||
'industry': row[1],
|
||||
'target_audience': row[2],
|
||||
'access_count': row[3],
|
||||
'created_at': row[4]
|
||||
}
|
||||
for row in cursor.fetchall()
|
||||
]
|
||||
|
||||
# Get database size
|
||||
cursor = conn.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
|
||||
db_size_bytes = cursor.fetchone()[0]
|
||||
db_size_mb = db_size_bytes / (1024 * 1024)
|
||||
|
||||
return {
|
||||
'total_entries': total_entries,
|
||||
'valid_entries': valid_entries,
|
||||
'expired_entries': total_entries - valid_entries,
|
||||
'max_size': self.max_cache_size,
|
||||
'ttl_hours': self.cache_ttl.total_seconds() / 3600,
|
||||
'database_size_mb': round(db_size_mb, 2),
|
||||
'top_accessed_entries': top_entries
|
||||
}
|
||||
|
||||
def clear_cache(self):
|
||||
"""Clear all cached entries."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("DELETE FROM research_cache")
|
||||
conn.commit()
|
||||
logger.info("Research cache cleared")
|
||||
|
||||
def get_cache_entries(self, limit: int = 50) -> List[Dict[str, Any]]:
|
||||
"""Get recent cache entries for debugging."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("""
|
||||
SELECT keywords, industry, target_audience, created_at, expires_at, access_count
|
||||
FROM research_cache
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?
|
||||
""", (limit,))
|
||||
|
||||
return [
|
||||
{
|
||||
'keywords': json.loads(row[0]),
|
||||
'industry': row[1],
|
||||
'target_audience': row[2],
|
||||
'created_at': row[3],
|
||||
'expires_at': row[4],
|
||||
'access_count': row[5]
|
||||
}
|
||||
for row in cursor.fetchall()
|
||||
]
|
||||
|
||||
|
||||
# Global persistent cache instance
|
||||
persistent_research_cache = PersistentResearchCache()
|
||||
@@ -89,12 +89,13 @@ class GeminiGroundedProvider:
|
||||
logger.warning(f"URL Context tool not available in SDK version: {tool_err}")
|
||||
|
||||
# Apply mode presets (Draft vs Polished)
|
||||
model_id = "gemini-2.5-flash"
|
||||
# Use Gemini 2.0 Flash for better content generation with grounding
|
||||
model_id = "gemini-2.0-flash"
|
||||
if mode == "draft":
|
||||
model_id = "gemini-2.5-flash-lite"
|
||||
model_id = "gemini-2.0-flash"
|
||||
temperature = min(1.0, max(0.0, temperature))
|
||||
else:
|
||||
model_id = "gemini-2.5-flash"
|
||||
model_id = "gemini-2.0-flash"
|
||||
|
||||
# Configure generation settings
|
||||
config = types.GenerateContentConfig(
|
||||
@@ -189,7 +190,7 @@ class GeminiGroundedProvider:
|
||||
loop.run_in_executor(
|
||||
executor,
|
||||
lambda: self.client.models.generate_content(
|
||||
model="gemini-2.5-flash",
|
||||
model="gemini-2.0-flash",
|
||||
contents=grounded_prompt,
|
||||
config=config,
|
||||
)
|
||||
@@ -199,6 +200,10 @@ class GeminiGroundedProvider:
|
||||
|
||||
async def _make_api_request_with_model(self, grounded_prompt: str, config: Any, model_id: str, urls: Optional[List[str]] = None):
|
||||
"""Make the API request with explicit model id and optional URL injection."""
|
||||
logger.info(f"🔍 DEBUG: Making API request with model: {model_id}")
|
||||
logger.info(f"🔍 DEBUG: Prompt length: {len(grounded_prompt)} characters")
|
||||
logger.info(f"🔍 DEBUG: Prompt preview (first 300 chars): {grounded_prompt[:300]}...")
|
||||
|
||||
import concurrent.futures
|
||||
loop = asyncio.get_event_loop()
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
@@ -310,23 +315,70 @@ class GeminiGroundedProvider:
|
||||
Processed content with sources and citations
|
||||
"""
|
||||
try:
|
||||
# Extract the main content
|
||||
# Debug: Log response structure
|
||||
logger.info(f"🔍 DEBUG: Response type: {type(response)}")
|
||||
logger.info(f"🔍 DEBUG: Response has 'text': {hasattr(response, 'text')}")
|
||||
logger.info(f"🔍 DEBUG: Response has 'candidates': {hasattr(response, 'candidates')}")
|
||||
logger.info(f"🔍 DEBUG: Response has 'grounding_metadata': {hasattr(response, 'grounding_metadata')}")
|
||||
if hasattr(response, 'grounding_metadata'):
|
||||
logger.info(f"🔍 DEBUG: Grounding metadata: {response.grounding_metadata}")
|
||||
if hasattr(response, 'candidates') and response.candidates:
|
||||
logger.info(f"🔍 DEBUG: Number of candidates: {len(response.candidates)}")
|
||||
candidate = response.candidates[0]
|
||||
logger.info(f"🔍 DEBUG: Candidate type: {type(candidate)}")
|
||||
logger.info(f"🔍 DEBUG: Candidate has 'content': {hasattr(candidate, 'content')}")
|
||||
if hasattr(candidate, 'content') and candidate.content:
|
||||
logger.info(f"🔍 DEBUG: Content type: {type(candidate.content)}")
|
||||
# Check if content is a list or single object
|
||||
if hasattr(candidate.content, '__iter__') and not isinstance(candidate.content, str):
|
||||
try:
|
||||
content_length = len(candidate.content) if candidate.content else 0
|
||||
logger.info(f"🔍 DEBUG: Content is iterable, length: {content_length}")
|
||||
except TypeError:
|
||||
logger.info(f"🔍 DEBUG: Content is iterable but has no len() - treating as single object")
|
||||
for i, part in enumerate(candidate.content):
|
||||
logger.info(f"🔍 DEBUG: Part {i} type: {type(part)}")
|
||||
logger.info(f"🔍 DEBUG: Part {i} has 'text': {hasattr(part, 'text')}")
|
||||
if hasattr(part, 'text'):
|
||||
logger.info(f"🔍 DEBUG: Part {i} text length: {len(part.text) if part.text else 0}")
|
||||
else:
|
||||
logger.info(f"🔍 DEBUG: Content is single object, has 'text': {hasattr(candidate.content, 'text')}")
|
||||
if hasattr(candidate.content, 'text'):
|
||||
logger.info(f"🔍 DEBUG: Content text length: {len(candidate.content.text) if candidate.content.text else 0}")
|
||||
|
||||
# Extract the main content - prioritize response.text as it's more reliable
|
||||
content = ""
|
||||
if hasattr(response, 'text'):
|
||||
content = response.text
|
||||
logger.info(f"🔍 DEBUG: response.text exists, value: '{response.text}', type: {type(response.text)}")
|
||||
if response.text:
|
||||
content = response.text
|
||||
logger.info(f"🔍 DEBUG: Using response.text, length: {len(content)}")
|
||||
else:
|
||||
logger.info(f"🔍 DEBUG: response.text is empty or None")
|
||||
elif hasattr(response, 'candidates') and response.candidates:
|
||||
candidate = response.candidates[0]
|
||||
if hasattr(candidate, 'content') and candidate.content:
|
||||
# Extract text from content parts
|
||||
text_parts = []
|
||||
for part in candidate.content:
|
||||
if hasattr(part, 'text'):
|
||||
text_parts.append(part.text)
|
||||
content = " ".join(text_parts)
|
||||
# Handle both single Content object and list of parts
|
||||
if hasattr(candidate.content, '__iter__') and not isinstance(candidate.content, str):
|
||||
# Content is a list of parts
|
||||
text_parts = []
|
||||
for part in candidate.content:
|
||||
if hasattr(part, 'text'):
|
||||
text_parts.append(part.text)
|
||||
content = " ".join(text_parts)
|
||||
logger.info(f"🔍 DEBUG: Using candidate.content (list), extracted {len(text_parts)} parts, total length: {len(content)}")
|
||||
else:
|
||||
# Content is a single object
|
||||
if hasattr(candidate.content, 'text'):
|
||||
content = candidate.content.text
|
||||
logger.info(f"🔍 DEBUG: Using candidate.content (single), text length: {len(content)}")
|
||||
else:
|
||||
logger.warning("🔍 DEBUG: candidate.content has no 'text' attribute")
|
||||
|
||||
logger.info(f"Extracted content length: {len(content) if content else 0}")
|
||||
if not content:
|
||||
logger.warning("No content extracted from response")
|
||||
logger.warning("⚠️ No content extracted from Gemini response - using fallback content")
|
||||
logger.warning("⚠️ This indicates Google Search grounding is not working properly")
|
||||
content = "Generated content about the requested topic."
|
||||
|
||||
# Initialize result structure
|
||||
|
||||
@@ -440,7 +440,8 @@ def gemini_structured_json_response(prompt, schema, temperature=0.7, top_p=0.9,
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
def _repair_json_string(text: str) -> Optional[str]:
|
||||
# Removed JSON repair functions to avoid false positives
|
||||
def _removed_repair_json_string(text: str) -> Optional[str]:
|
||||
"""
|
||||
Attempt to repair common JSON issues in AI responses.
|
||||
"""
|
||||
@@ -489,13 +490,21 @@ def _repair_json_string(text: str) -> Optional[str]:
|
||||
fixed_lines.append(line)
|
||||
repaired = '\n'.join(fixed_lines)
|
||||
|
||||
# 3. Fix unescaped quotes in string values
|
||||
# This is complex - we'll use a simple approach
|
||||
# 3. Fix unterminated strings (common issue with AI responses)
|
||||
try:
|
||||
# Try to balance quotes by adding missing ones
|
||||
# Handle unterminated strings by finding the last incomplete string and closing it
|
||||
lines = repaired.split('\n')
|
||||
fixed_lines = []
|
||||
for line in lines:
|
||||
for i, line in enumerate(lines):
|
||||
stripped = line.strip()
|
||||
# Check for unterminated strings (line ends with quote but no closing quote)
|
||||
if stripped.endswith('"') and i < len(lines) - 1:
|
||||
next_line = lines[i + 1].strip()
|
||||
# If next line doesn't start with quote or closing bracket, we might have an unterminated string
|
||||
if not next_line.startswith('"') and not next_line.startswith(']') and not next_line.startswith('}'):
|
||||
# Check if this looks like an unterminated string value
|
||||
if ':' in line and not line.strip().endswith('",'):
|
||||
line = line + '",'
|
||||
# Count quotes in the line
|
||||
quote_count = line.count('"')
|
||||
if quote_count % 2 == 1: # Odd number of quotes
|
||||
@@ -518,7 +527,8 @@ def _repair_json_string(text: str) -> Optional[str]:
|
||||
return repaired
|
||||
|
||||
|
||||
def _extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
|
||||
# Removed partial JSON extraction to avoid false positives
|
||||
def _removed_extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Extract partial JSON from truncated responses.
|
||||
Attempts to salvage as much data as possible from incomplete JSON.
|
||||
@@ -572,26 +582,77 @@ def _extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
|
||||
# Try to extract individual fields as a last resort
|
||||
fields = {}
|
||||
|
||||
# Extract key-value pairs using regex
|
||||
kv_pattern = r'"([^"]+)"\s*:\s*"([^"]*)"'
|
||||
matches = re.findall(kv_pattern, json_text)
|
||||
for key, value in matches:
|
||||
fields[key] = value
|
||||
# Extract key-value pairs using regex (more comprehensive patterns)
|
||||
kv_patterns = [
|
||||
r'"([^"]+)"\s*:\s*"([^"]*)"', # "key": "value"
|
||||
r'"([^"]+)"\s*:\s*(\d+)', # "key": 123
|
||||
r'"([^"]+)"\s*:\s*(true|false)', # "key": true/false
|
||||
r'"([^"]+)"\s*:\s*null', # "key": null
|
||||
]
|
||||
|
||||
# Extract array fields
|
||||
for pattern in kv_patterns:
|
||||
matches = re.findall(pattern, json_text)
|
||||
for key, value in matches:
|
||||
if value == 'true':
|
||||
fields[key] = True
|
||||
elif value == 'false':
|
||||
fields[key] = False
|
||||
elif value == 'null':
|
||||
fields[key] = None
|
||||
elif value.isdigit():
|
||||
fields[key] = int(value)
|
||||
else:
|
||||
fields[key] = value
|
||||
|
||||
# Extract array fields (more robust)
|
||||
array_pattern = r'"([^"]+)"\s*:\s*\[([^\]]*)\]'
|
||||
array_matches = re.findall(array_pattern, json_text)
|
||||
for key, array_content in array_matches:
|
||||
# Parse array items
|
||||
# Parse array items more comprehensively
|
||||
items = []
|
||||
item_pattern = r'"([^"]*)"'
|
||||
item_matches = re.findall(item_pattern, array_content)
|
||||
items.extend(item_matches)
|
||||
fields[key] = items
|
||||
# Look for quoted strings, numbers, booleans, null
|
||||
item_patterns = [
|
||||
r'"([^"]*)"', # quoted strings
|
||||
r'(\d+)', # numbers
|
||||
r'(true|false)', # booleans
|
||||
r'(null)', # null
|
||||
]
|
||||
for pattern in item_patterns:
|
||||
item_matches = re.findall(pattern, array_content)
|
||||
for match in item_matches:
|
||||
if match == 'true':
|
||||
items.append(True)
|
||||
elif match == 'false':
|
||||
items.append(False)
|
||||
elif match == 'null':
|
||||
items.append(None)
|
||||
elif match.isdigit():
|
||||
items.append(int(match))
|
||||
else:
|
||||
items.append(match)
|
||||
if items:
|
||||
fields[key] = items
|
||||
|
||||
# Extract nested object fields (basic)
|
||||
object_pattern = r'"([^"]+)"\s*:\s*\{([^}]*)\}'
|
||||
object_matches = re.findall(object_pattern, json_text)
|
||||
for key, object_content in object_matches:
|
||||
# Simple nested object extraction
|
||||
nested_fields = {}
|
||||
nested_kv_matches = re.findall(r'"([^"]+)"\s*:\s*"([^"]*)"', object_content)
|
||||
for nested_key, nested_value in nested_kv_matches:
|
||||
nested_fields[nested_key] = nested_value
|
||||
if nested_fields:
|
||||
fields[key] = nested_fields
|
||||
|
||||
if fields:
|
||||
logger.info(f"Extracted {len(fields)} fields from truncated JSON")
|
||||
return fields
|
||||
logger.info(f"Extracted {len(fields)} fields from truncated JSON: {list(fields.keys())}")
|
||||
# Only return if we have a valid outline structure
|
||||
if 'outline' in fields and isinstance(fields['outline'], list):
|
||||
return {'outline': fields['outline']}
|
||||
else:
|
||||
logger.error("No valid 'outline' field found in partial JSON")
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
@@ -600,7 +661,8 @@ def _extract_partial_json(text: str) -> Optional[Dict[str, Any]]:
|
||||
return None
|
||||
|
||||
|
||||
def _extract_key_value_pairs(text: str) -> Optional[Dict[str, Any]]:
|
||||
# Removed key-value extraction to avoid false positives
|
||||
def _removed_extract_key_value_pairs(text: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Extract key-value pairs from malformed JSON text as a last resort.
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user