Base code
25
backend/services/blog_writer/outline/__init__.py
Normal file
@@ -0,0 +1,25 @@
"""
Outline module for AI Blog Writer.

This module handles all outline-related functionality including:
- AI-powered outline generation
- Outline refinement and optimization
- Section enhancement and rebalancing
- Strategic content planning
"""

from .outline_service import OutlineService
from .outline_generator import OutlineGenerator
from .outline_optimizer import OutlineOptimizer
from .section_enhancer import SectionEnhancer
from .source_mapper import SourceToSectionMapper
from .grounding_engine import GroundingContextEngine

__all__ = [
    'OutlineService',
    'OutlineGenerator',
    'OutlineOptimizer',
    'SectionEnhancer',
    'SourceToSectionMapper',
    'GroundingContextEngine'
]
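The package re-exports its public classes, so downstream code can import them from the package root. A minimal sketch, assuming the `backend` directory is the import root (as implied by the absolute imports used in the modules below):

from services.blog_writer.outline import OutlineService, GroundingContextEngine

service = OutlineService()          # full generate/refine workflow (async methods)
engine = GroundingContextEngine()   # standalone grounding-metadata analysis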
644
backend/services/blog_writer/outline/grounding_engine.py
Normal file
@@ -0,0 +1,644 @@
"""
Grounding Context Engine - Enhanced utilization of grounding metadata.

This module extracts and utilizes rich contextual information from Google Search
grounding metadata to enhance outline generation with authoritative insights,
temporal relevance, and content relationships.
"""

from typing import Dict, Any, List, Tuple, Optional
from collections import Counter, defaultdict
from datetime import datetime, timedelta
import re
from loguru import logger

from models.blog_models import (
    GroundingMetadata,
    GroundingChunk,
    GroundingSupport,
    Citation,
    BlogOutlineSection,
    ResearchSource,
)


class GroundingContextEngine:
    """Extract and utilize rich context from grounding metadata."""

    def __init__(self):
        """Initialize the grounding context engine."""
        self.min_confidence_threshold = 0.7
        self.high_confidence_threshold = 0.9
        self.max_contextual_insights = 10
        self.max_authority_sources = 5

        # Authority indicators for source scoring
        self.authority_indicators = {
            'high_authority': ['research', 'study', 'analysis', 'report', 'journal', 'academic', 'university', 'institute'],
            'medium_authority': ['guide', 'tutorial', 'best practices', 'expert', 'professional', 'industry'],
            'low_authority': ['blog', 'opinion', 'personal', 'review', 'commentary']
        }

        # Temporal relevance patterns
        self.temporal_patterns = {
            'recent': ['2024', '2025', 'latest', 'new', 'recent', 'current', 'updated'],
            'trending': ['trend', 'emerging', 'growing', 'increasing', 'rising'],
            'evergreen': ['fundamental', 'basic', 'principles', 'foundation', 'core']
        }

        logger.info("✅ GroundingContextEngine initialized with contextual analysis capabilities")

    def extract_contextual_insights(self, grounding_metadata: Optional[GroundingMetadata]) -> Dict[str, Any]:
        """
        Extract comprehensive contextual insights from grounding metadata.

        Args:
            grounding_metadata: Google Search grounding metadata

        Returns:
            Dictionary containing contextual insights and analysis
        """
        if not grounding_metadata:
            return self._get_empty_insights()

        logger.info("Extracting contextual insights from grounding metadata...")

        insights = {
            'confidence_analysis': self._analyze_confidence_patterns(grounding_metadata),
            'authority_analysis': self._analyze_source_authority(grounding_metadata),
            'temporal_analysis': self._analyze_temporal_relevance(grounding_metadata),
            'content_relationships': self._analyze_content_relationships(grounding_metadata),
            'citation_insights': self._analyze_citation_patterns(grounding_metadata),
            'search_intent_insights': self._analyze_search_intent(grounding_metadata),
            'quality_indicators': self._assess_quality_indicators(grounding_metadata)
        }

        logger.info(f"✅ Extracted {len(insights)} contextual insight categories")
        return insights

    def enhance_sections_with_grounding(
        self,
        sections: List[BlogOutlineSection],
        grounding_metadata: Optional[GroundingMetadata],
        insights: Dict[str, Any]
    ) -> List[BlogOutlineSection]:
        """
        Enhance outline sections using grounding metadata insights.

        Args:
            sections: List of outline sections to enhance
            grounding_metadata: Google Search grounding metadata
            insights: Extracted contextual insights

        Returns:
            Enhanced sections with grounding-driven improvements
        """
        if not grounding_metadata or not insights:
            return sections

        logger.info(f"Enhancing {len(sections)} sections with grounding insights...")

        enhanced_sections = []
        for section in sections:
            enhanced_section = self._enhance_single_section(section, grounding_metadata, insights)
            enhanced_sections.append(enhanced_section)

        logger.info("✅ Section enhancement with grounding insights completed")
        return enhanced_sections

    def get_authority_sources(self, grounding_metadata: Optional[GroundingMetadata]) -> List[Tuple[GroundingChunk, float]]:
        """
        Get high-authority sources from grounding metadata.

        Args:
            grounding_metadata: Google Search grounding metadata

        Returns:
            List of (chunk, authority_score) tuples sorted by authority
        """
        if not grounding_metadata:
            return []

        authority_sources = []
        for chunk in grounding_metadata.grounding_chunks:
            authority_score = self._calculate_chunk_authority(chunk)
            if authority_score >= 0.6:  # Only include sources with reasonable authority
                authority_sources.append((chunk, authority_score))

        # Sort by authority score (descending)
        authority_sources.sort(key=lambda x: x[1], reverse=True)

        return authority_sources[:self.max_authority_sources]

    def get_high_confidence_insights(self, grounding_metadata: Optional[GroundingMetadata]) -> List[str]:
        """
        Extract high-confidence insights from grounding supports.

        Args:
            grounding_metadata: Google Search grounding metadata

        Returns:
            List of high-confidence insights
        """
        if not grounding_metadata:
            return []

        high_confidence_insights = []
        for support in grounding_metadata.grounding_supports:
            if support.confidence_scores and max(support.confidence_scores) >= self.high_confidence_threshold:
                # Extract meaningful insights from segment text
                insight = self._extract_insight_from_segment(support.segment_text)
                if insight:
                    high_confidence_insights.append(insight)

        return high_confidence_insights[:self.max_contextual_insights]

    # Private helper methods

    def _get_empty_insights(self) -> Dict[str, Any]:
        """Return empty insights structure when no grounding metadata is available."""
        return {
            'confidence_analysis': {
                'average_confidence': 0.0,
                'high_confidence_sources_count': 0,
                'confidence_distribution': {'high': 0, 'medium': 0, 'low': 0}
            },
            'authority_analysis': {
                'average_authority_score': 0.0,
                'high_authority_sources': [],
                'authority_distribution': {'high': 0, 'medium': 0, 'low': 0}
            },
            'temporal_analysis': {
                'recent_content': 0,
                'trending_topics': [],
                'evergreen_content': 0
            },
            'content_relationships': {
                'related_concepts': [],
                'content_gaps': [],
                'concept_coverage_score': 0.0
            },
            'citation_insights': {
                'citation_types': {},
                'citation_density': 0.0
            },
            'search_intent_insights': {
                'primary_intent': 'informational',
                'intent_signals': [],
                'user_questions': []
            },
            'quality_indicators': {
                'overall_quality': 0.0,
                'quality_factors': []
            }
        }

    def _analyze_confidence_patterns(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
        """Analyze confidence patterns across grounding data."""
        all_confidences = []

        # Collect confidence scores from chunks
        for chunk in grounding_metadata.grounding_chunks:
            if chunk.confidence_score:
                all_confidences.append(chunk.confidence_score)

        # Collect confidence scores from supports
        for support in grounding_metadata.grounding_supports:
            all_confidences.extend(support.confidence_scores)

        if not all_confidences:
            return {
                'average_confidence': 0.0,
                'high_confidence_sources_count': 0,
                'confidence_distribution': {'high': 0, 'medium': 0, 'low': 0}
            }

        average_confidence = sum(all_confidences) / len(all_confidences)
        high_confidence_count = sum(1 for c in all_confidences if c >= self.high_confidence_threshold)

        return {
            'average_confidence': average_confidence,
            'high_confidence_sources_count': high_confidence_count,
            'confidence_distribution': self._get_confidence_distribution(all_confidences)
        }

    def _analyze_source_authority(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
        """Analyze source authority patterns."""
        authority_scores = []
        authority_distribution = defaultdict(int)
        scored_chunks = []

        for chunk in grounding_metadata.grounding_chunks:
            authority_score = self._calculate_chunk_authority(chunk)
            authority_scores.append(authority_score)
            scored_chunks.append((chunk, authority_score))

            # Categorize authority level
            if authority_score >= 0.8:
                authority_distribution['high'] += 1
            elif authority_score >= 0.6:
                authority_distribution['medium'] += 1
            else:
                authority_distribution['low'] += 1

        # Top-scoring sources for UI display
        high_authority_sources = [
            {'title': chunk.title, 'url': chunk.url, 'score': round(score, 2)}
            for chunk, score in sorted(scored_chunks, key=lambda pair: pair[1], reverse=True)
            if score >= 0.8
        ][:self.max_authority_sources]

        return {
            'average_authority_score': sum(authority_scores) / len(authority_scores) if authority_scores else 0.0,
            'high_authority_sources': high_authority_sources,
            'authority_distribution': dict(authority_distribution)
        }

def _analyze_temporal_relevance(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze temporal relevance of grounding content."""
|
||||
recent_content = 0
|
||||
trending_topics = []
|
||||
evergreen_content = 0
|
||||
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
chunk_text = f"{chunk.title} {chunk.url}".lower()
|
||||
|
||||
# Check for recent indicators
|
||||
if any(pattern in chunk_text for pattern in self.temporal_patterns['recent']):
|
||||
recent_content += 1
|
||||
|
||||
# Check for trending indicators
|
||||
if any(pattern in chunk_text for pattern in self.temporal_patterns['trending']):
|
||||
trending_topics.append(chunk.title)
|
||||
|
||||
# Check for evergreen indicators
|
||||
if any(pattern in chunk_text for pattern in self.temporal_patterns['evergreen']):
|
||||
evergreen_content += 1
|
||||
|
||||
return {
|
||||
'recent_content': recent_content,
|
||||
'trending_topics': trending_topics[:5], # Limit to top 5
|
||||
'evergreen_content': evergreen_content,
|
||||
'temporal_balance': self._calculate_temporal_balance(recent_content, evergreen_content)
|
||||
}
|
||||
|
||||
def _analyze_content_relationships(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze content relationships and identify gaps."""
|
||||
all_text = []
|
||||
|
||||
# Collect text from chunks
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
all_text.append(chunk.title)
|
||||
|
||||
# Collect text from supports
|
||||
for support in grounding_metadata.grounding_supports:
|
||||
all_text.append(support.segment_text)
|
||||
|
||||
# Extract related concepts
|
||||
related_concepts = self._extract_related_concepts(all_text)
|
||||
|
||||
# Identify potential content gaps
|
||||
content_gaps = self._identify_content_gaps(all_text)
|
||||
|
||||
# Calculate concept coverage score (0-1 scale)
|
||||
concept_coverage_score = min(1.0, len(related_concepts) / 10.0) if related_concepts else 0.0
|
||||
|
||||
return {
|
||||
'related_concepts': related_concepts,
|
||||
'content_gaps': content_gaps,
|
||||
'concept_coverage_score': concept_coverage_score,
|
||||
'gap_count': len(content_gaps)
|
||||
}
|
||||
|
||||
def _analyze_citation_patterns(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze citation patterns and types."""
|
||||
citation_types = Counter()
|
||||
total_citations = len(grounding_metadata.citations)
|
||||
|
||||
for citation in grounding_metadata.citations:
|
||||
citation_types[citation.citation_type] += 1
|
||||
|
||||
# Calculate citation density (citations per 1000 words of content)
|
||||
total_content_length = sum(len(support.segment_text) for support in grounding_metadata.grounding_supports)
|
||||
citation_density = (total_citations / max(total_content_length, 1)) * 1000 if total_content_length > 0 else 0.0
|
||||
|
||||
return {
|
||||
'citation_types': dict(citation_types),
|
||||
'total_citations': total_citations,
|
||||
'citation_density': citation_density,
|
||||
'citation_quality': self._assess_citation_quality(grounding_metadata.citations)
|
||||
}
|
||||
|
||||
def _analyze_search_intent(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
|
||||
"""Analyze search intent signals from grounding data."""
|
||||
intent_signals = []
|
||||
user_questions = []
|
||||
|
||||
# Analyze search queries
|
||||
for query in grounding_metadata.web_search_queries:
|
||||
query_lower = query.lower()
|
||||
|
||||
# Identify intent signals
|
||||
if any(word in query_lower for word in ['how', 'what', 'why', 'when', 'where']):
|
||||
intent_signals.append('informational')
|
||||
elif any(word in query_lower for word in ['best', 'top', 'compare', 'vs']):
|
||||
intent_signals.append('comparison')
|
||||
elif any(word in query_lower for word in ['buy', 'price', 'cost', 'deal']):
|
||||
intent_signals.append('transactional')
|
||||
|
||||
# Extract potential user questions
|
||||
if query_lower.startswith(('how to', 'what is', 'why does', 'when should')):
|
||||
user_questions.append(query)
|
||||
|
||||
return {
|
||||
'intent_signals': list(set(intent_signals)),
|
||||
'user_questions': user_questions[:5], # Limit to top 5
|
||||
'primary_intent': self._determine_primary_intent(intent_signals)
|
||||
}
|
||||
|
||||
    def _assess_quality_indicators(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
        """Assess overall quality indicators from grounding metadata."""
        quality_factors = []
        quality_score = 0.0

        # Factor 1: Confidence levels
        confidences = [chunk.confidence_score for chunk in grounding_metadata.grounding_chunks if chunk.confidence_score]
        if confidences:
            avg_confidence = sum(confidences) / len(confidences)
            quality_score += avg_confidence * 0.3
            quality_factors.append(f"Average confidence: {avg_confidence:.2f}")

        # Factor 2: Source diversity
        unique_domains = set()
        for chunk in grounding_metadata.grounding_chunks:
            try:
                domain = chunk.url.split('/')[2] if '://' in chunk.url else chunk.url.split('/')[0]
                unique_domains.add(domain)
            except Exception:
                continue

        diversity_score = min(len(unique_domains) / 5.0, 1.0)  # Normalize to 0-1
        quality_score += diversity_score * 0.2
        quality_factors.append(f"Source diversity: {len(unique_domains)} unique domains")

        # Factor 3: Content depth
        total_content_length = sum(len(support.segment_text) for support in grounding_metadata.grounding_supports)
        depth_score = min(total_content_length / 5000.0, 1.0)  # Normalize to 0-1
        quality_score += depth_score * 0.2
        quality_factors.append(f"Content depth: {total_content_length} characters")

        # Factor 4: Citation quality
        citation_quality = self._assess_citation_quality(grounding_metadata.citations)
        quality_score += citation_quality * 0.3
        quality_factors.append(f"Citation quality: {citation_quality:.2f}")

        return {
            'overall_quality': min(quality_score, 1.0),
            'quality_factors': quality_factors,
            'quality_grade': self._get_quality_grade(quality_score)
        }

    def _enhance_single_section(
        self,
        section: BlogOutlineSection,
        grounding_metadata: GroundingMetadata,
        insights: Dict[str, Any]
    ) -> BlogOutlineSection:
        """Enhance a single section using grounding insights."""
        # Extract relevant grounding data for this section
        relevant_chunks = self._find_relevant_chunks(section, grounding_metadata)
        relevant_supports = self._find_relevant_supports(section, grounding_metadata)

        # Enhance subheadings with high-confidence insights
        enhanced_subheadings = self._enhance_subheadings(section, relevant_supports, insights)

        # Enhance key points with authoritative insights
        enhanced_key_points = self._enhance_key_points(section, relevant_chunks, insights)

        # Enhance keywords with related concepts
        enhanced_keywords = self._enhance_keywords(section, insights)

        return BlogOutlineSection(
            id=section.id,
            heading=section.heading,
            subheadings=enhanced_subheadings,
            key_points=enhanced_key_points,
            references=section.references,
            target_words=section.target_words,
            keywords=enhanced_keywords
        )

    def _calculate_chunk_authority(self, chunk: GroundingChunk) -> float:
        """Calculate authority score for a grounding chunk."""
        authority_score = 0.5  # Base score

        chunk_text = f"{chunk.title} {chunk.url}".lower()

        # Check for authority indicators
        for level, indicators in self.authority_indicators.items():
            for indicator in indicators:
                if indicator in chunk_text:
                    if level == 'high_authority':
                        authority_score += 0.3
                    elif level == 'medium_authority':
                        authority_score += 0.2
                    else:  # low_authority
                        authority_score -= 0.1

        # Boost score based on confidence
        if chunk.confidence_score:
            authority_score += chunk.confidence_score * 0.2

        return min(max(authority_score, 0.0), 1.0)

    def _extract_insight_from_segment(self, segment_text: str) -> Optional[str]:
        """Extract meaningful insight from segment text."""
        if not segment_text or len(segment_text.strip()) < 20:
            return None

        # Clean and truncate insight
        insight = segment_text.strip()
        if len(insight) > 200:
            insight = insight[:200] + "..."

        return insight

    def _get_confidence_distribution(self, confidences: List[float]) -> Dict[str, int]:
        """Get distribution of confidence scores."""
        distribution = {'high': 0, 'medium': 0, 'low': 0}

        for confidence in confidences:
            if confidence >= 0.8:
                distribution['high'] += 1
            elif confidence >= 0.6:
                distribution['medium'] += 1
            else:
                distribution['low'] += 1

        return distribution

    def _calculate_temporal_balance(self, recent: int, evergreen: int) -> str:
        """Calculate temporal balance of content."""
        total = recent + evergreen
        if total == 0:
            return 'unknown'

        recent_ratio = recent / total
        if recent_ratio > 0.7:
            return 'recent_heavy'
        elif recent_ratio < 0.3:
            return 'evergreen_heavy'
        else:
            return 'balanced'

    def _extract_related_concepts(self, text_list: List[str]) -> List[str]:
        """Extract related concepts from text."""
        # Simple concept extraction - could be enhanced with NLP
        concepts = set()

        for text in text_list:
            # Extract capitalized words (potential concepts)
            words = re.findall(r'\b[A-Z][a-z]+\b', text)
            concepts.update(words)

        return list(concepts)[:10]  # Limit to top 10

    def _identify_content_gaps(self, text_list: List[str]) -> List[str]:
        """Identify potential content gaps."""
        # Simple gap identification - could be enhanced with more sophisticated analysis
        gaps = []

        # Look for common gap indicators
        gap_indicators = ['missing', 'lack of', 'not covered', 'gap', 'unclear', 'unexplained']

        for text in text_list:
            text_lower = text.lower()
            for indicator in gap_indicators:
                if indicator in text_lower:
                    # Extract potential gap
                    gap = self._extract_gap_from_text(text, indicator)
                    if gap:
                        gaps.append(gap)

        return gaps[:5]  # Limit to top 5

    def _extract_gap_from_text(self, text: str, indicator: str) -> Optional[str]:
        """Extract content gap from text containing gap indicator."""
        # Simple extraction - could be enhanced
        sentences = text.split('.')
        for sentence in sentences:
            if indicator in sentence.lower():
                return sentence.strip()
        return None

    def _assess_citation_quality(self, citations: List[Citation]) -> float:
        """Assess quality of citations."""
        if not citations:
            return 0.0

        quality_score = 0.0

        for citation in citations:
            # Check citation type
            if citation.citation_type in ['expert_opinion', 'statistical_data', 'research_study']:
                quality_score += 0.3
            elif citation.citation_type in ['recent_news', 'case_study']:
                quality_score += 0.2
            else:
                quality_score += 0.1

            # Check text quality
            if len(citation.text) > 20:
                quality_score += 0.1

        return min(quality_score / len(citations), 1.0)

    def _determine_primary_intent(self, intent_signals: List[str]) -> str:
        """Determine primary search intent from signals."""
        if not intent_signals:
            return 'informational'

        intent_counts = Counter(intent_signals)
        return intent_counts.most_common(1)[0][0]

    def _get_quality_grade(self, quality_score: float) -> str:
        """Get quality grade from score."""
        if quality_score >= 0.9:
            return 'A'
        elif quality_score >= 0.8:
            return 'B'
        elif quality_score >= 0.7:
            return 'C'
        elif quality_score >= 0.6:
            return 'D'
        else:
            return 'F'

    def _find_relevant_chunks(self, section: BlogOutlineSection, grounding_metadata: GroundingMetadata) -> List[GroundingChunk]:
        """Find grounding chunks relevant to the section."""
        relevant_chunks = []
        section_text = f"{section.heading} {' '.join(section.subheadings)} {' '.join(section.key_points)}".lower()

        for chunk in grounding_metadata.grounding_chunks:
            chunk_text = chunk.title.lower()
            # Simple relevance check - could be enhanced with semantic similarity
            if any(word in chunk_text for word in section_text.split() if len(word) > 3):
                relevant_chunks.append(chunk)

        return relevant_chunks

    def _find_relevant_supports(self, section: BlogOutlineSection, grounding_metadata: GroundingMetadata) -> List[GroundingSupport]:
        """Find grounding supports relevant to the section."""
        relevant_supports = []
        section_text = f"{section.heading} {' '.join(section.subheadings)} {' '.join(section.key_points)}".lower()

        for support in grounding_metadata.grounding_supports:
            support_text = support.segment_text.lower()
            # Simple relevance check
            if any(word in support_text for word in section_text.split() if len(word) > 3):
                relevant_supports.append(support)

        return relevant_supports

    def _enhance_subheadings(self, section: BlogOutlineSection, relevant_supports: List[GroundingSupport], insights: Dict[str, Any]) -> List[str]:
        """Enhance subheadings with grounding insights."""
        enhanced_subheadings = list(section.subheadings)

        # Add high-confidence insights as subheadings
        high_confidence_insights = self._get_high_confidence_insights_from_supports(relevant_supports)
        for insight in high_confidence_insights[:2]:  # Add up to 2 new subheadings
            if insight not in enhanced_subheadings:
                enhanced_subheadings.append(insight)

        return enhanced_subheadings

    def _enhance_key_points(self, section: BlogOutlineSection, relevant_chunks: List[GroundingChunk], insights: Dict[str, Any]) -> List[str]:
        """Enhance key points with authoritative insights."""
        enhanced_key_points = list(section.key_points)

        # Add insights from high-authority chunks
        for chunk in relevant_chunks:
            if chunk.confidence_score and chunk.confidence_score >= self.high_confidence_threshold:
                insight = f"Based on {chunk.title}: {self._extract_key_insight(chunk)}"
                if insight not in enhanced_key_points:
                    enhanced_key_points.append(insight)

        return enhanced_key_points

    def _enhance_keywords(self, section: BlogOutlineSection, insights: Dict[str, Any]) -> List[str]:
        """Enhance keywords with related concepts from grounding."""
        enhanced_keywords = list(section.keywords)

        # Add related concepts from grounding analysis
        related_concepts = insights.get('content_relationships', {}).get('related_concepts', [])
        for concept in related_concepts[:3]:  # Add up to 3 new keywords
            if concept.lower() not in [kw.lower() for kw in enhanced_keywords]:
                enhanced_keywords.append(concept)

        return enhanced_keywords

    def _get_high_confidence_insights_from_supports(self, supports: List[GroundingSupport]) -> List[str]:
        """Get high-confidence insights from grounding supports."""
        insights = []
        for support in supports:
            if support.confidence_scores and max(support.confidence_scores) >= self.high_confidence_threshold:
                insight = self._extract_insight_from_segment(support.segment_text)
                if insight:
                    insights.append(insight)
        return insights

    def _extract_key_insight(self, chunk: GroundingChunk) -> str:
        """Extract key insight from grounding chunk."""
        # Simple extraction - could be enhanced
        return f"High-confidence source with {chunk.confidence_score:.2f} confidence score"
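A minimal usage sketch of the engine defined above, assuming `research` carries the optional `grounding_metadata` produced by the research stage and `sections` is a list of BlogOutlineSection objects (both are placeholders here):

from services.blog_writer.outline.grounding_engine import GroundingContextEngine

engine = GroundingContextEngine()

# All public entry points tolerate missing metadata and fall back to empty results.
insights = engine.extract_contextual_insights(research.grounding_metadata)
top_sources = engine.get_authority_sources(research.grounding_metadata)       # [(chunk, score), ...]
key_facts = engine.get_high_confidence_insights(research.grounding_metadata)  # confidence >= 0.9

# Enhancement returns new BlogOutlineSection objects with enriched subheadings,
# key points, and keywords; the input sections are not mutated in place.
enhanced = engine.enhance_sections_with_grounding(sections, research.grounding_metadata, insights)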
94
backend/services/blog_writer/outline/metadata_collector.py
Normal file
@@ -0,0 +1,94 @@
"""
Metadata Collector - Handles collection and formatting of outline metadata.

Collects source mapping stats, grounding insights, optimization results, and research coverage.
"""

from typing import Dict, Any, List
from loguru import logger


class MetadataCollector:
    """Handles collection and formatting of various metadata types for UI display."""

    def __init__(self):
        """Initialize the metadata collector."""
        pass

    def collect_source_mapping_stats(self, mapped_sections, research):
        """Collect source mapping statistics for UI display."""
        from models.blog_models import SourceMappingStats

        total_sources = len(research.sources)
        total_mapped = sum(len(section.references) for section in mapped_sections)
        coverage_percentage = (total_mapped / total_sources * 100) if total_sources > 0 else 0.0

        # Calculate average relevance score (simplified)
        all_relevance_scores = []
        for section in mapped_sections:
            for ref in section.references:
                if hasattr(ref, 'credibility_score') and ref.credibility_score:
                    all_relevance_scores.append(ref.credibility_score)

        average_relevance = sum(all_relevance_scores) / len(all_relevance_scores) if all_relevance_scores else 0.0
        high_confidence_mappings = sum(1 for score in all_relevance_scores if score >= 0.8)

        return SourceMappingStats(
            total_sources_mapped=total_mapped,
            coverage_percentage=round(coverage_percentage, 1),
            average_relevance_score=round(average_relevance, 3),
            high_confidence_mappings=high_confidence_mappings
        )

    def collect_grounding_insights(self, grounding_insights):
        """Collect grounding insights for UI display."""
        from models.blog_models import GroundingInsights

        return GroundingInsights(
            confidence_analysis=grounding_insights.get('confidence_analysis'),
            authority_analysis=grounding_insights.get('authority_analysis'),
            temporal_analysis=grounding_insights.get('temporal_analysis'),
            content_relationships=grounding_insights.get('content_relationships'),
            citation_insights=grounding_insights.get('citation_insights'),
            search_intent_insights=grounding_insights.get('search_intent_insights'),
            quality_indicators=grounding_insights.get('quality_indicators')
        )

    def collect_optimization_results(self, optimized_sections, focus):
        """Collect optimization results for UI display."""
        from models.blog_models import OptimizationResults

        # Calculate a quality score based on section completeness
        total_sections = len(optimized_sections)
        complete_sections = sum(1 for section in optimized_sections
                                if section.heading and section.subheadings and section.key_points)

        quality_score = (complete_sections / total_sections * 10) if total_sections > 0 else 0.0

        improvements_made = [
            "Enhanced section headings for better SEO",
            "Optimized keyword distribution across sections",
            "Improved content flow and logical progression",
            "Balanced word count distribution",
            "Enhanced subheadings for better readability"
        ]

        return OptimizationResults(
            overall_quality_score=round(quality_score, 1),
            improvements_made=improvements_made,
            optimization_focus=focus
        )

    def collect_research_coverage(self, research):
        """Collect research coverage metrics for UI display."""
        from models.blog_models import ResearchCoverage

        sources_utilized = len(research.sources)
        content_gaps = research.keyword_analysis.get('content_gaps', [])
        competitive_advantages = research.competitor_analysis.get('competitive_advantages', [])

        return ResearchCoverage(
            sources_utilized=sources_utilized,
            content_gaps_identified=len(content_gaps),
            competitive_advantages=competitive_advantages[:5]  # Limit to top 5
        )
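A brief sketch of how the collector behaves, assuming `mapped_sections` and `research` come out of the generator pipeline (both placeholders). Note that `coverage_percentage` counts one hit per section reference, so a source cited in several sections can push the figure past 100%:

from services.blog_writer.outline.metadata_collector import MetadataCollector

collector = MetadataCollector()

# e.g. 3 research sources and sections referencing 2 + 2 + 1 of them:
# total_mapped = 5, coverage_percentage = round(5 / 3 * 100, 1) = 166.7
stats = collector.collect_source_mapping_stats(mapped_sections, research)
coverage = collector.collect_research_coverage(research)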
323
backend/services/blog_writer/outline/outline_generator.py
Normal file
@@ -0,0 +1,323 @@
"""
Outline Generator - AI-powered outline generation from research data.

Generates comprehensive, SEO-optimized outlines using research intelligence.
"""

from typing import Dict, Any, List, Tuple
import asyncio
from loguru import logger

from models.blog_models import (
    BlogOutlineRequest,
    BlogOutlineResponse,
    BlogOutlineSection,
)

from .source_mapper import SourceToSectionMapper
from .section_enhancer import SectionEnhancer
from .outline_optimizer import OutlineOptimizer
from .grounding_engine import GroundingContextEngine
from .title_generator import TitleGenerator
from .metadata_collector import MetadataCollector
from .prompt_builder import PromptBuilder
from .response_processor import ResponseProcessor
from .parallel_processor import ParallelProcessor


class OutlineGenerator:
    """Generates AI-powered outlines from research data."""

    def __init__(self):
        """Initialize the outline generator with all enhancement modules."""
        self.source_mapper = SourceToSectionMapper()
        self.section_enhancer = SectionEnhancer()
        self.outline_optimizer = OutlineOptimizer()
        self.grounding_engine = GroundingContextEngine()

        # Initialize extracted classes
        self.title_generator = TitleGenerator()
        self.metadata_collector = MetadataCollector()
        self.prompt_builder = PromptBuilder()
        self.response_processor = ResponseProcessor()
        self.parallel_processor = ParallelProcessor(self.source_mapper, self.grounding_engine)

    async def generate(self, request: BlogOutlineRequest, user_id: str) -> BlogOutlineResponse:
        """
        Generate AI-powered outline using research results.

        Args:
            request: Outline generation request with research data
            user_id: User ID (required for subscription checks and usage tracking)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")

        # Extract research insights
        research = request.research
        primary_keywords = research.keyword_analysis.get('primary', [])
        secondary_keywords = research.keyword_analysis.get('secondary', [])
        content_angles = research.suggested_angles
        sources = research.sources
        search_intent = research.keyword_analysis.get('search_intent', 'informational')

        # Check for custom instructions
        custom_instructions = getattr(request, 'custom_instructions', None)

        # Build comprehensive outline generation prompt with rich research data
        outline_prompt = self.prompt_builder.build_outline_prompt(
            primary_keywords, secondary_keywords, content_angles, sources,
            search_intent, request, custom_instructions
        )

        logger.info("Generating AI-powered outline using research results")

        # Define schema with proper property ordering (critical for Gemini API)
        outline_schema = self.prompt_builder.get_outline_schema()

        # Generate outline using structured JSON response with retry logic (user_id required)
        outline_data = await self.response_processor.generate_with_retry(outline_prompt, outline_schema, user_id)

        # Convert to BlogOutlineSection objects
        outline_sections = self.response_processor.convert_to_sections(outline_data, sources)

        # Run parallel processing for speed optimization (user_id required)
        mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing_async(
            outline_sections, research, user_id
        )

        # Enhance sections with grounding insights
        logger.info("Enhancing sections with grounding insights...")
        grounding_enhanced_sections = self.grounding_engine.enhance_sections_with_grounding(
            mapped_sections, research.grounding_metadata, grounding_insights
        )

        # Optimize outline for better flow, SEO, and engagement (user_id required)
        logger.info("Optimizing outline for better flow and engagement...")
        optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)

        # Rebalance word counts for optimal distribution
        target_words = request.word_count or 1500
        balanced_sections = self.outline_optimizer.rebalance_word_counts(optimized_sections, target_words)

        # Extract title options - combine AI-generated with content angles
        ai_title_options = outline_data.get('title_options', [])
        content_angle_titles = self.title_generator.extract_content_angle_titles(research)

        # Combine AI-generated titles with content angles
        title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)

        logger.info(f"Generated optimized outline with {len(balanced_sections)} sections and {len(title_options)} title options")

        # Collect metadata for enhanced UI
        source_mapping_stats = self.metadata_collector.collect_source_mapping_stats(mapped_sections, research)
        grounding_insights_data = self.metadata_collector.collect_grounding_insights(grounding_insights)
        optimization_results = self.metadata_collector.collect_optimization_results(optimized_sections, "comprehensive optimization")
        research_coverage = self.metadata_collector.collect_research_coverage(research)

        return BlogOutlineResponse(
            success=True,
            title_options=title_options,
            outline=balanced_sections,
            source_mapping_stats=source_mapping_stats,
            grounding_insights=grounding_insights_data,
            optimization_results=optimization_results,
            research_coverage=research_coverage
        )

    async def generate_with_progress(self, request: BlogOutlineRequest, task_id: str, user_id: str) -> BlogOutlineResponse:
        """
        Outline generation method with progress updates for real-time feedback.

        Args:
            request: Outline generation request with research data
            task_id: Task ID for progress updates
            user_id: User ID (required for subscription checks and usage tracking)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")

        from api.blog_writer.task_manager import task_manager

        # Extract research insights
        research = request.research
        primary_keywords = research.keyword_analysis.get('primary', [])
        secondary_keywords = research.keyword_analysis.get('secondary', [])
        content_angles = research.suggested_angles
        sources = research.sources
        search_intent = research.keyword_analysis.get('search_intent', 'informational')

        # Check for custom instructions
        custom_instructions = getattr(request, 'custom_instructions', None)

        await task_manager.update_progress(task_id, "📊 Analyzing research data and building content strategy...")

        # Build comprehensive outline generation prompt with rich research data
        outline_prompt = self.prompt_builder.build_outline_prompt(
            primary_keywords, secondary_keywords, content_angles, sources,
            search_intent, request, custom_instructions
        )

        await task_manager.update_progress(task_id, "🤖 Generating AI-powered outline with research insights...")

        # Define schema with proper property ordering (critical for Gemini API)
        outline_schema = self.prompt_builder.get_outline_schema()

        await task_manager.update_progress(task_id, "🔄 Making AI request to generate structured outline...")

        # Generate outline using structured JSON response with retry logic (user_id required for subscription checks)
        outline_data = await self.response_processor.generate_with_retry(outline_prompt, outline_schema, user_id, task_id)

        await task_manager.update_progress(task_id, "📝 Processing outline structure and validating sections...")

        # Convert to BlogOutlineSection objects
        outline_sections = self.response_processor.convert_to_sections(outline_data, sources)

        # Run parallel processing for speed optimization (user_id required for subscription checks)
        mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing(
            outline_sections, research, user_id, task_id
        )

        # Enhance sections with grounding insights (depends on both previous tasks)
        await task_manager.update_progress(task_id, "✨ Enhancing sections with grounding insights...")
        grounding_enhanced_sections = self.grounding_engine.enhance_sections_with_grounding(
            mapped_sections, research.grounding_metadata, grounding_insights
        )

        # Optimize outline for better flow, SEO, and engagement (user_id required for subscription checks)
        await task_manager.update_progress(task_id, "🎯 Optimizing outline for better flow and engagement...")
        optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)

        # Rebalance word counts for optimal distribution
        await task_manager.update_progress(task_id, "⚖️ Rebalancing word count distribution...")
        target_words = request.word_count or 1500
        balanced_sections = self.outline_optimizer.rebalance_word_counts(optimized_sections, target_words)

        # Extract title options - combine AI-generated with content angles
        ai_title_options = outline_data.get('title_options', [])
        content_angle_titles = self.title_generator.extract_content_angle_titles(research)

        # Combine AI-generated titles with content angles
        title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)

        await task_manager.update_progress(task_id, "✅ Outline generation and optimization completed successfully!")

        # Collect metadata for enhanced UI
        source_mapping_stats = self.metadata_collector.collect_source_mapping_stats(mapped_sections, research)
        grounding_insights_data = self.metadata_collector.collect_grounding_insights(grounding_insights)
        optimization_results = self.metadata_collector.collect_optimization_results(optimized_sections, "comprehensive optimization")
        research_coverage = self.metadata_collector.collect_research_coverage(research)

        return BlogOutlineResponse(
            success=True,
            title_options=title_options,
            outline=balanced_sections,
            source_mapping_stats=source_mapping_stats,
            grounding_insights=grounding_insights_data,
            optimization_results=optimization_results,
            research_coverage=research_coverage
        )

    async def enhance_section(self, section: BlogOutlineSection, focus: str = "general improvement") -> BlogOutlineSection:
        """
        Enhance a single section using AI with research context.

        Args:
            section: The section to enhance
            focus: Enhancement focus area (e.g., "SEO optimization", "engagement", "comprehensiveness")

        Returns:
            Enhanced section with improved content
        """
        logger.info(f"Enhancing section '{section.heading}' with focus: {focus}")
        enhanced_section = await self.section_enhancer.enhance(section, focus)
        logger.info(f"✅ Section enhancement completed for '{section.heading}'")
        return enhanced_section

    async def optimize_outline(self, outline: List[BlogOutlineSection], user_id: str, focus: str = "comprehensive optimization") -> List[BlogOutlineSection]:
        """
        Optimize an entire outline for better flow, SEO, and engagement.

        Args:
            outline: List of sections to optimize
            user_id: User ID (required for subscription checks and usage tracking)
            focus: Optimization focus area

        Returns:
            Optimized outline with improved flow and engagement
        """
        logger.info(f"Optimizing outline with {len(outline)} sections, focus: {focus}")
        optimized_outline = await self.outline_optimizer.optimize(outline, focus, user_id)
        logger.info(f"✅ Outline optimization completed for {len(optimized_outline)} sections")
        return optimized_outline

    def rebalance_outline_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
        """
        Rebalance word count distribution across outline sections.

        Args:
            outline: List of sections to rebalance
            target_words: Total target word count

        Returns:
            Outline with rebalanced word counts
        """
        logger.info(f"Rebalancing word counts for {len(outline)} sections, target: {target_words} words")
        rebalanced_outline = self.outline_optimizer.rebalance_word_counts(outline, target_words)
        logger.info(f"✅ Word count rebalancing completed")
        return rebalanced_outline

    def get_grounding_insights(self, research_data) -> Dict[str, Any]:
        """
        Get grounding metadata insights for research data.

        Args:
            research_data: Research data with grounding metadata

        Returns:
            Dictionary containing grounding insights and analysis
        """
        logger.info("Extracting grounding insights from research data...")
        insights = self.grounding_engine.extract_contextual_insights(research_data.grounding_metadata)
        logger.info(f"✅ Extracted {len(insights)} grounding insight categories")
        return insights

    def get_authority_sources(self, research_data) -> List[Tuple]:
        """
        Get high-authority sources from grounding metadata.

        Args:
            research_data: Research data with grounding metadata

        Returns:
            List of (chunk, authority_score) tuples sorted by authority
        """
        logger.info("Identifying high-authority sources from grounding metadata...")
        authority_sources = self.grounding_engine.get_authority_sources(research_data.grounding_metadata)
        logger.info(f"✅ Identified {len(authority_sources)} high-authority sources")
        return authority_sources

    def get_high_confidence_insights(self, research_data) -> List[str]:
        """
        Get high-confidence insights from grounding metadata.

        Args:
            research_data: Research data with grounding metadata

        Returns:
            List of high-confidence insights
        """
        logger.info("Extracting high-confidence insights from grounding metadata...")
        insights = self.grounding_engine.get_high_confidence_insights(research_data.grounding_metadata)
        logger.info(f"✅ Extracted {len(insights)} high-confidence insights")
        return insights
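A hedged end-to-end sketch of how the generator is driven, assuming a populated `BlogOutlineRequest` from the research stage; the wrapper function name and the user ID value are illustrative:

import asyncio

from models.blog_models import BlogOutlineRequest
from services.blog_writer.outline.outline_generator import OutlineGenerator

async def build_outline(request: BlogOutlineRequest, user_id: str):
    generator = OutlineGenerator()
    # Prompt building, the structured Gemini call, source mapping, grounding
    # enhancement, optimization, and word-count rebalancing all run inside generate().
    response = await generator.generate(request, user_id=user_id)
    for section in response.outline:
        print(section.heading, section.target_words)
    return response

# asyncio.run(build_outline(request, user_id="user-123"))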
137
backend/services/blog_writer/outline/outline_optimizer.py
Normal file
@@ -0,0 +1,137 @@
"""
Outline Optimizer - AI-powered outline optimization and rebalancing.

Optimizes outlines for better flow, SEO, and engagement.
"""

from typing import List
from loguru import logger

from models.blog_models import BlogOutlineSection


class OutlineOptimizer:
    """Optimizes outlines for better flow, SEO, and engagement."""

    async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str) -> List[BlogOutlineSection]:
        """Optimize entire outline for better flow, SEO, and engagement.

        Args:
            outline: List of outline sections to optimize
            focus: Optimization focus (e.g., "general optimization")
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            List of optimized outline sections

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for outline optimization (subscription checks and usage tracking)")

        outline_text = "\n".join([f"{i+1}. {s.heading}" for i, s in enumerate(outline)])

        optimization_prompt = f"""Optimize this blog outline for better flow, engagement, and SEO:

Current Outline:
{outline_text}

Optimization Focus: {focus}

Goals: Improve narrative flow, enhance SEO, increase engagement, ensure comprehensive coverage.

Return JSON format:
{{
    "outline": [
        {{
            "heading": "Optimized heading",
            "subheadings": ["subheading 1", "subheading 2"],
            "key_points": ["point 1", "point 2"],
            "target_words": 300,
            "keywords": ["keyword1", "keyword2"]
        }}
    ]
}}"""

        try:
            from services.llm_providers.main_text_generation import llm_text_gen

            optimization_schema = {
                "type": "object",
                "properties": {
                    "outline": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "heading": {"type": "string"},
                                "subheadings": {"type": "array", "items": {"type": "string"}},
                                "key_points": {"type": "array", "items": {"type": "string"}},
                                "target_words": {"type": "integer"},
                                "keywords": {"type": "array", "items": {"type": "string"}}
                            },
                            "required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
                        }
                    }
                },
                "required": ["outline"],
                "propertyOrdering": ["outline"]
            }

            optimized_data = llm_text_gen(
                prompt=optimization_prompt,
                json_struct=optimization_schema,
                system_prompt=None,
                user_id=user_id
            )

            # Handle the new schema format with "outline" wrapper
            if isinstance(optimized_data, dict) and 'outline' in optimized_data:
                optimized_sections = []
                for i, section_data in enumerate(optimized_data['outline']):
                    section = BlogOutlineSection(
                        id=f"s{i+1}",
                        heading=section_data.get('heading', f'Section {i+1}'),
                        subheadings=section_data.get('subheadings', []),
                        key_points=section_data.get('key_points', []),
                        references=outline[i].references if i < len(outline) else [],
                        target_words=section_data.get('target_words', 300),
                        keywords=section_data.get('keywords', [])
                    )
                    optimized_sections.append(section)
                logger.info(f"✅ Outline optimization completed: {len(optimized_sections)} sections optimized")
                return optimized_sections
            else:
                logger.warning(f"Invalid optimization response format: {type(optimized_data)}")

        except Exception as e:
            logger.warning(f"AI outline optimization failed: {e}")
            logger.info("Returning original outline without optimization")

        return outline

    def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
        """Rebalance word count distribution across sections."""
        total_sections = len(outline)
        if total_sections == 0:
            return outline

        # Calculate target distribution
        intro_words = int(target_words * 0.12)  # 12% for intro
        conclusion_words = int(target_words * 0.12)  # 12% for conclusion
        main_content_words = target_words - intro_words - conclusion_words

        # Distribute main content words across sections
        words_per_section = main_content_words // total_sections
        remainder = main_content_words % total_sections

        for i, section in enumerate(outline):
            if i == 0:  # First section (intro)
                section.target_words = intro_words
            elif i == total_sections - 1:  # Last section (conclusion)
                section.target_words = conclusion_words
            else:  # Main content sections
                section.target_words = words_per_section + (1 if i < remainder else 0)

        return outline
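A small worked example of `rebalance_word_counts` as written above, for a hypothetical five-section outline `sections` and a 1500-word target. Because the per-section share is divided across all sections while the first and last are then overwritten with the intro/conclusion budgets, the assigned targets sum below the requested total:

from services.blog_writer.outline.outline_optimizer import OutlineOptimizer

# target_words = 1500, len(sections) = 5
# intro_words       = int(1500 * 0.12)   = 180
# conclusion_words  = int(1500 * 0.12)   = 180
# main_content      = 1500 - 180 - 180   = 1140
# words_per_section = 1140 // 5 = 228, remainder = 0
# assigned targets -> [180, 228, 228, 228, 180]  (sums to 1044)
optimizer = OutlineOptimizer()
balanced = optimizer.rebalance_word_counts(sections, target_words=1500)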
268
backend/services/blog_writer/outline/outline_service.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""
|
||||
Outline Service - Core outline generation and management functionality.
|
||||
|
||||
Handles AI-powered outline generation, refinement, and optimization.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
import asyncio
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import (
|
||||
BlogOutlineRequest,
|
||||
BlogOutlineResponse,
|
||||
BlogOutlineRefineRequest,
|
||||
BlogOutlineSection,
|
||||
)
|
||||
|
||||
from .outline_generator import OutlineGenerator
|
||||
from .outline_optimizer import OutlineOptimizer
|
||||
from .section_enhancer import SectionEnhancer
|
||||
from services.cache.persistent_outline_cache import persistent_outline_cache
|
||||
|
||||
|
||||
class OutlineService:
|
||||
"""Service for generating and managing blog outlines using AI."""
|
||||
|
||||
def __init__(self):
|
||||
self.outline_generator = OutlineGenerator()
|
||||
self.outline_optimizer = OutlineOptimizer()
|
||||
self.section_enhancer = SectionEnhancer()
|
||||
|
||||
    async def generate_outline(self, request: BlogOutlineRequest, user_id: str) -> BlogOutlineResponse:
        """
        Stage 2: Content Planning with AI-generated outline using research results.
        Uses Gemini with research data to create comprehensive, SEO-optimized outline.

        Args:
            request: Outline generation request with research data
            user_id: User ID (required for subscription checks and usage tracking)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")

        # Extract cache parameters - use original user keywords for consistent caching
        keywords = request.research.original_keywords or request.research.keyword_analysis.get('primary', [])
        industry = getattr(request.persona, 'industry', 'general') if request.persona else 'general'
        target_audience = getattr(request.persona, 'target_audience', 'general') if request.persona else 'general'
        word_count = request.word_count or 1500
        custom_instructions = request.custom_instructions or ""
        persona_data = request.persona.dict() if request.persona else None

        # Check cache first
        cached_result = persistent_outline_cache.get_cached_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data
        )

        if cached_result:
            logger.info(f"Using cached outline for keywords: {keywords}")
            return BlogOutlineResponse(**cached_result)

        # Generate new outline if not cached (user_id required)
        logger.info(f"Generating new outline for keywords: {keywords}")
        result = await self.outline_generator.generate(request, user_id)

        # Cache the result
        persistent_outline_cache.cache_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data,
            result=result.dict()
        )

        return result

    async def generate_outline_with_progress(self, request: BlogOutlineRequest, task_id: str, user_id: str) -> BlogOutlineResponse:
        """
        Outline generation method with progress updates for real-time feedback.
        """
        # Extract cache parameters - use original user keywords for consistent caching
        keywords = request.research.original_keywords or request.research.keyword_analysis.get('primary', [])
        industry = getattr(request.persona, 'industry', 'general') if request.persona else 'general'
        target_audience = getattr(request.persona, 'target_audience', 'general') if request.persona else 'general'
        word_count = request.word_count or 1500
        custom_instructions = request.custom_instructions or ""
        persona_data = request.persona.dict() if request.persona else None

        # Check cache first
        cached_result = persistent_outline_cache.get_cached_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data
        )

        if cached_result:
            logger.info(f"Using cached outline for keywords: {keywords} (with progress updates)")
            # Update progress to show cache hit
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "✅ Using cached outline (saved generation time!)")
            return BlogOutlineResponse(**cached_result)

        # Generate new outline if not cached
        logger.info(f"Generating new outline for keywords: {keywords} (with progress updates)")
        result = await self.outline_generator.generate_with_progress(request, task_id, user_id)

        # Cache the result
        persistent_outline_cache.cache_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data,
            result=result.dict()
        )

        return result

    async def refine_outline(self, request: BlogOutlineRefineRequest) -> BlogOutlineResponse:
        """
        Refine an outline with HITL (Human-in-the-Loop) operations.
        Supports add, remove, move, merge, rename, and update operations.
        """
        outline = request.outline.copy()
        operation = request.operation.lower()
        section_id = request.section_id
        payload = request.payload or {}

        try:
            if operation == 'add':
                # Add new section
                new_section = BlogOutlineSection(
                    id=f"s{len(outline) + 1}",
                    heading=payload.get('heading', 'New Section'),
                    subheadings=payload.get('subheadings', []),
                    key_points=payload.get('key_points', []),
                    references=[],
                    target_words=payload.get('target_words', 300)
                )
                outline.append(new_section)
                logger.info(f"Added new section: {new_section.heading}")

            elif operation == 'remove' and section_id:
                # Remove section
                outline = [s for s in outline if s.id != section_id]
                logger.info(f"Removed section: {section_id}")

            elif operation == 'rename' and section_id:
                # Rename section
                for section in outline:
                    if section.id == section_id:
                        section.heading = payload.get('heading', section.heading)
                        break
                logger.info(f"Renamed section {section_id} to: {payload.get('heading')}")

            elif operation == 'move' and section_id:
                # Move section (reorder)
                direction = payload.get('direction', 'down')  # 'up' or 'down'
                current_index = next((i for i, s in enumerate(outline) if s.id == section_id), -1)

                if current_index != -1:
                    if direction == 'up' and current_index > 0:
                        outline[current_index], outline[current_index - 1] = outline[current_index - 1], outline[current_index]
                    elif direction == 'down' and current_index < len(outline) - 1:
                        outline[current_index], outline[current_index + 1] = outline[current_index + 1], outline[current_index]
                    logger.info(f"Moved section {section_id} {direction}")

            elif operation == 'merge' and section_id:
                # Merge with next section
                current_index = next((i for i, s in enumerate(outline) if s.id == section_id), -1)
                if current_index != -1 and current_index < len(outline) - 1:
                    current_section = outline[current_index]
                    next_section = outline[current_index + 1]

                    # Merge sections
                    current_section.heading = f"{current_section.heading} & {next_section.heading}"
                    current_section.subheadings.extend(next_section.subheadings)
                    current_section.key_points.extend(next_section.key_points)
                    current_section.references.extend(next_section.references)
                    current_section.target_words = (current_section.target_words or 0) + (next_section.target_words or 0)

                    # Remove the next section
                    outline.pop(current_index + 1)
                    logger.info(f"Merged section {section_id} with next section")

            elif operation == 'update' and section_id:
                # Update section details
                for section in outline:
                    if section.id == section_id:
                        if 'heading' in payload:
                            section.heading = payload['heading']
                        if 'subheadings' in payload:
                            section.subheadings = payload['subheadings']
                        if 'key_points' in payload:
                            section.key_points = payload['key_points']
                        if 'target_words' in payload:
                            section.target_words = payload['target_words']
                        break
                logger.info(f"Updated section {section_id}")

            # Reassign IDs to maintain order
            for i, section in enumerate(outline):
                section.id = f"s{i+1}"

            return BlogOutlineResponse(
                success=True,
                title_options=["Refined Outline"],
                outline=outline
            )

        except Exception as e:
            logger.error(f"Outline refinement failed: {e}")
            return BlogOutlineResponse(
                success=False,
                title_options=["Error"],
                outline=request.outline
            )

    async def enhance_section_with_ai(self, section: BlogOutlineSection, focus: str = "general improvement", user_id: str = None) -> BlogOutlineSection:
        """Enhance a section using AI with research context (user_id is required by the underlying enhancer)."""
        return await self.section_enhancer.enhance(section, focus, user_id)

    async def optimize_outline_with_ai(self, outline: List[BlogOutlineSection], focus: str = "general optimization") -> List[BlogOutlineSection]:
        """Optimize entire outline for better flow, SEO, and engagement."""
        return await self.outline_optimizer.optimize(outline, focus)

    def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
        """Rebalance word count distribution across sections."""
        return self.outline_optimizer.rebalance_word_counts(outline, target_words)

    # Cache Management Methods

    def get_outline_cache_stats(self) -> Dict[str, Any]:
        """Get outline cache statistics."""
        return persistent_outline_cache.get_cache_stats()

    def clear_outline_cache(self):
        """Clear all cached outline entries."""
        persistent_outline_cache.clear_cache()
        logger.info("Outline cache cleared")

    def invalidate_outline_cache_for_keywords(self, keywords: List[str]):
        """
        Invalidate outline cache entries for specific keywords.
        Useful when research data is updated.

        Args:
            keywords: Keywords to invalidate cache for
        """
        persistent_outline_cache.invalidate_cache_for_keywords(keywords)
        logger.info(f"Invalidated outline cache for keywords: {keywords}")

    def get_recent_outline_cache_entries(self, limit: int = 20) -> List[Dict[str, Any]]:
        """Get recent outline cache entries for debugging."""
        return persistent_outline_cache.get_cache_entries(limit)
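
    # Illustrative usage sketch (hypothetical caller, not part of the committed API):
    #
    #     service = OutlineService()
    #     response = await service.generate_outline(request, user_id="user-123")
    #     print(service.get_outline_cache_stats())
    #
    # Because the cache keys on keywords, industry, audience, word count, custom
    # instructions, and persona data, a repeated call with identical inputs is
    # served from persistent_outline_cache without another LLM call.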
121
backend/services/blog_writer/outline/parallel_processor.py
Normal file
@@ -0,0 +1,121 @@
"""
Parallel Processor - Handles parallel processing of outline generation tasks.

Manages concurrent execution of source mapping and grounding insights extraction.
"""

import asyncio
from typing import Tuple, Any
from loguru import logger


class ParallelProcessor:
    """Handles parallel processing of outline generation tasks for speed optimization."""

    def __init__(self, source_mapper, grounding_engine):
        """Initialize the parallel processor with required dependencies."""
        self.source_mapper = source_mapper
        self.grounding_engine = grounding_engine

    async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None) -> Tuple[Any, Any]:
        """
        Run source mapping and grounding insights extraction in parallel.

        Args:
            outline_sections: List of outline sections to process
            research: Research data object
            user_id: User ID (required for subscription checks and usage tracking)
            task_id: Optional task ID for progress updates

        Returns:
            Tuple of (mapped_sections, grounding_insights)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for parallel processing (subscription checks and usage tracking)")

        if task_id:
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "⚡ Running parallel processing for maximum speed...")

        logger.info("Running parallel processing for maximum speed...")

        # Run these tasks in parallel to save time
        source_mapping_task = asyncio.create_task(
            self._run_source_mapping(outline_sections, research, task_id, user_id)
        )

        grounding_insights_task = asyncio.create_task(
            self._run_grounding_insights_extraction(research, task_id)
        )

        # Wait for both parallel tasks to complete
        mapped_sections, grounding_insights = await asyncio.gather(
            source_mapping_task,
            grounding_insights_task
        )

        return mapped_sections, grounding_insights

    async def run_parallel_processing_async(self, outline_sections, research, user_id: str) -> Tuple[Any, Any]:
        """
        Run parallel processing without progress updates (for non-progress methods).

        Args:
            outline_sections: List of outline sections to process
            research: Research data object
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            Tuple of (mapped_sections, grounding_insights)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for parallel processing (subscription checks and usage tracking)")

        logger.info("Running parallel processing for maximum speed...")

        # Run these tasks in parallel to save time
        source_mapping_task = asyncio.create_task(
            self._run_source_mapping_async(outline_sections, research, user_id)
        )

        grounding_insights_task = asyncio.create_task(
            self._run_grounding_insights_extraction_async(research)
        )

        # Wait for both parallel tasks to complete
        mapped_sections, grounding_insights = await asyncio.gather(
            source_mapping_task,
            grounding_insights_task
        )

        return mapped_sections, grounding_insights

    async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str):
        """Run source mapping in parallel."""
        if task_id:
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "🔗 Applying intelligent source-to-section mapping...")
        return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)

    async def _run_grounding_insights_extraction(self, research, task_id):
        """Run grounding insights extraction in parallel."""
        if task_id:
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "🧠 Extracting grounding metadata insights...")
        return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)

    async def _run_source_mapping_async(self, outline_sections, research, user_id: str):
        """Run source mapping in parallel (async version without progress updates)."""
        logger.info("Applying intelligent source-to-section mapping...")
        return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)

    async def _run_grounding_insights_extraction_async(self, research):
        """Run grounding insights extraction in parallel (async version without progress updates)."""
        logger.info("Extracting grounding metadata insights...")
        return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)
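
    # Reviewer note (sketch, not in the original module): map_sources_to_sections and
    # extract_contextual_insights appear to be synchronous, so asyncio.gather here
    # mainly overlaps their progress updates rather than their CPU work. If true
    # concurrency were needed, one option would be to offload them to a worker thread,
    # for example:
    #
    #     mapped = await asyncio.to_thread(
    #         self.source_mapper.map_sources_to_sections, outline_sections, research, user_id
    #     )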
127
backend/services/blog_writer/outline/prompt_builder.py
Normal file
@@ -0,0 +1,127 @@
"""
Prompt Builder - Handles building of AI prompts for outline generation.

Constructs comprehensive prompts with research data, keywords, and strategic requirements.
"""

from typing import Dict, Any, List


class PromptBuilder:
    """Handles building of comprehensive AI prompts for outline generation."""

    def __init__(self):
        """Initialize the prompt builder."""
        pass

    def build_outline_prompt(self, primary_keywords: List[str], secondary_keywords: List[str],
                             content_angles: List[str], sources: List, search_intent: str,
                             request, custom_instructions: str = None) -> str:
        """Build the comprehensive outline generation prompt using filtered research data."""

        # Use the filtered research data (already cleaned by ResearchDataFilter)
        research = request.research

        primary_kw_text = ', '.join(primary_keywords) if primary_keywords else (request.topic or ', '.join(getattr(request.research, 'original_keywords', []) or ['the target topic']))
        secondary_kw_text = ', '.join(secondary_keywords) if secondary_keywords else "None provided"
        long_tail_text = ', '.join(research.keyword_analysis.get('long_tail', [])) if research and research.keyword_analysis else "None discovered"
        semantic_text = ', '.join(research.keyword_analysis.get('semantic_keywords', [])) if research and research.keyword_analysis else "None discovered"
        trending_text = ', '.join(research.keyword_analysis.get('trending_terms', [])) if research and research.keyword_analysis else "None discovered"
        content_gap_text = ', '.join(research.keyword_analysis.get('content_gaps', [])) if research and research.keyword_analysis else "None identified"
        content_angle_text = ', '.join(content_angles) if content_angles else "No explicit angles provided; infer compelling angles from research insights."
        competitor_text = ', '.join(research.competitor_analysis.get('top_competitors', [])) if research and research.competitor_analysis else "Not available"
        opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
        advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"

return f"""Create a comprehensive blog outline for: {primary_kw_text}
|
||||
|
||||
CONTEXT:
|
||||
Search Intent: {search_intent}
|
||||
Target: {request.word_count or 1500} words
|
||||
Industry: {getattr(request.persona, 'industry', 'General') if request.persona else 'General'}
|
||||
Audience: {getattr(request.persona, 'target_audience', 'General') if request.persona else 'General'}
|
||||
|
||||
KEYWORDS:
|
||||
Primary: {primary_kw_text}
|
||||
Secondary: {secondary_kw_text}
|
||||
Long-tail: {long_tail_text}
|
||||
Semantic: {semantic_text}
|
||||
Trending: {trending_text}
|
||||
Content Gaps: {content_gap_text}
|
||||
|
||||
CONTENT ANGLES / STORYLINES: {content_angle_text}
|
||||
|
||||
COMPETITIVE INTELLIGENCE:
|
||||
Top Competitors: {competitor_text}
|
||||
Market Opportunities: {opportunity_text}
|
||||
Competitive Advantages: {advantages_text}
|
||||
|
||||
RESEARCH SOURCES: {len(sources)} authoritative sources available
|
||||
|
||||
{f"CUSTOM INSTRUCTIONS: {custom_instructions}" if custom_instructions else ""}
|
||||
|
||||
STRATEGIC REQUIREMENTS:
|
||||
- Create SEO-optimized headings with natural keyword integration
|
||||
- Surface the strongest research-backed angles within the outline
|
||||
- Build logical narrative flow from problem to solution
|
||||
- Include data-driven insights from research sources
|
||||
- Address content gaps and market opportunities
|
||||
- Optimize for search intent and user questions
|
||||
- Ensure engaging, actionable content throughout
|
||||
|
||||
Return JSON format:
|
||||
{
|
||||
"title_options": [
|
||||
"Title option 1",
|
||||
"Title option 2",
|
||||
"Title option 3"
|
||||
],
|
||||
"outline": [
|
||||
{
|
||||
"heading": "Section heading with primary keyword",
|
||||
"subheadings": ["Subheading 1", "Subheading 2", "Subheading 3"],
|
||||
"key_points": ["Key point 1", "Key point 2", "Key point 3"],
|
||||
"target_words": 300,
|
||||
"keywords": ["primary keyword", "secondary keyword"]
|
||||
}
|
||||
]
|
||||
}"""
|
||||

    def get_outline_schema(self) -> Dict[str, Any]:
        """Get the structured JSON schema for outline generation."""
        return {
            "type": "object",
            "properties": {
                "title_options": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "outline": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "heading": {"type": "string"},
                            "subheadings": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "key_points": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "target_words": {"type": "integer"},
                            "keywords": {
                                "type": "array",
                                "items": {"type": "string"}
                            }
                        },
                        "required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
                    }
                }
            },
            "required": ["title_options", "outline"],
            "propertyOrdering": ["title_options", "outline"]
        }
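
    # Illustrative call (hypothetical values), showing how the builder is typically
    # paired with the schema for a structured LLM request:
    #
    #     builder = PromptBuilder()
    #     prompt = builder.build_outline_prompt(
    #         primary_keywords=["ai blog writing"],
    #         secondary_keywords=["content planning"],
    #         content_angles=["data-driven outlines"],
    #         sources=research.sources,
    #         search_intent="informational",
    #         request=request,
    #     )
    #     schema = builder.get_outline_schema()
    #
    # Note that the literal JSON example at the end of the prompt uses doubled
    # braces ({{ ... }}) so the f-string renders them as single braces.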
120
backend/services/blog_writer/outline/response_processor.py
Normal file
@@ -0,0 +1,120 @@
"""
Response Processor - Handles AI response processing and retry logic.

Processes AI responses, handles retries, and converts data to proper formats.
"""

from typing import Dict, Any, List
import asyncio
from loguru import logger

from models.blog_models import BlogOutlineSection


class ResponseProcessor:
    """Handles AI response processing, retry logic, and data conversion."""

    def __init__(self):
        """Initialize the response processor."""
        pass

    async def generate_with_retry(self, prompt: str, schema: Dict[str, Any], user_id: str, task_id: str = None) -> Dict[str, Any]:
        """Generate outline with retry logic for API failures.

        Args:
            prompt: The prompt for outline generation
            schema: JSON schema for structured response
            user_id: User ID (required for subscription checks and usage tracking)
            task_id: Optional task ID for progress updates

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")

        from services.llm_providers.main_text_generation import llm_text_gen
        from api.blog_writer.task_manager import task_manager

        max_retries = 2  # Conservative retry for expensive API calls
        retry_delay = 5  # 5 second delay between retries

        for attempt in range(max_retries + 1):
            try:
                if task_id:
                    await task_manager.update_progress(task_id, f"🤖 Calling AI API for outline generation (attempt {attempt + 1}/{max_retries + 1})...")

                outline_data = llm_text_gen(
                    prompt=prompt,
                    json_struct=schema,
                    system_prompt=None,
                    user_id=user_id
                )

                # Log response for debugging
                logger.info(f"AI response received: {type(outline_data)}")

                # Check for errors in the response
                if isinstance(outline_data, dict) and 'error' in outline_data:
                    error_msg = str(outline_data['error'])
                    if "503" in error_msg and "overloaded" in error_msg and attempt < max_retries:
                        if task_id:
                            await task_manager.update_progress(task_id, f"⚠️ AI service overloaded, retrying in {retry_delay} seconds...")
                        logger.warning(f"AI API overloaded, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
                        await asyncio.sleep(retry_delay)
                        continue
                    elif "No valid structured response content found" in error_msg and attempt < max_retries:
                        if task_id:
                            await task_manager.update_progress(task_id, f"⚠️ Invalid response format, retrying in {retry_delay} seconds...")
                        logger.warning(f"AI response parsing failed, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
                        await asyncio.sleep(retry_delay)
                        continue
                    else:
                        logger.error(f"AI structured response error: {outline_data['error']}")
                        raise ValueError(f"AI outline generation failed: {outline_data['error']}")

                # Validate required fields
                if not isinstance(outline_data, dict) or 'outline' not in outline_data or not isinstance(outline_data['outline'], list):
                    if attempt < max_retries:
                        if task_id:
                            await task_manager.update_progress(task_id, f"⚠️ Invalid response structure, retrying in {retry_delay} seconds...")
                        logger.warning(f"Invalid response structure, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
                        await asyncio.sleep(retry_delay)
                        continue
                    else:
                        raise ValueError("Invalid outline structure in AI response")

                # If we get here, the response is valid
                return outline_data

            except Exception as e:
                error_str = str(e)
                if ("503" in error_str or "overloaded" in error_str) and attempt < max_retries:
                    if task_id:
                        await task_manager.update_progress(task_id, f"⚠️ AI service error, retrying in {retry_delay} seconds...")
                    logger.warning(f"AI API error, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1}): {error_str}")
                    await asyncio.sleep(retry_delay)
                    continue
                else:
                    logger.error(f"Outline generation failed after {attempt + 1} attempts: {error_str}")
                    raise ValueError(f"AI outline generation failed: {error_str}")

    def convert_to_sections(self, outline_data: Dict[str, Any], sources: List) -> List[BlogOutlineSection]:
        """Convert outline data to BlogOutlineSection objects."""
        outline_sections = []
        for i, section_data in enumerate(outline_data.get('outline', [])):
            if not isinstance(section_data, dict) or 'heading' not in section_data:
                continue

            section = BlogOutlineSection(
                id=f"s{i+1}",
                heading=section_data.get('heading', f'Section {i+1}'),
                subheadings=section_data.get('subheadings', []),
                key_points=section_data.get('key_points', []),
                references=[],  # Will be populated by intelligent mapping
                target_words=section_data.get('target_words', 200),
                keywords=section_data.get('keywords', [])
            )
            outline_sections.append(section)

        return outline_sections
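
    # Hypothetical wiring sketch (names outside this module are assumed from the
    # surrounding package, not defined here): the generator builds a prompt and
    # schema, then delegates the retry-aware call and conversion to this class:
    #
    #     processor = ResponseProcessor()
    #     data = await processor.generate_with_retry(prompt, schema, user_id="user-123")
    #     sections = processor.convert_to_sections(data, sources=research.sources)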
96
backend/services/blog_writer/outline/section_enhancer.py
Normal file
@@ -0,0 +1,96 @@
"""
Section Enhancer - AI-powered section enhancement and improvement.

Enhances individual outline sections for better engagement and value.
"""

from loguru import logger

from models.blog_models import BlogOutlineSection


class SectionEnhancer:
    """Enhances individual outline sections using AI."""

    async def enhance(self, section: BlogOutlineSection, focus: str, user_id: str) -> BlogOutlineSection:
        """Enhance a section using AI with research context.

        Args:
            section: Outline section to enhance
            focus: Enhancement focus (e.g., "general improvement")
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            Enhanced outline section

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for section enhancement (subscription checks and usage tracking)")

        enhancement_prompt = f"""
        Enhance the following blog section to make it more engaging, comprehensive, and valuable:

        Current Section:
        Heading: {section.heading}
        Subheadings: {', '.join(section.subheadings)}
        Key Points: {', '.join(section.key_points)}
        Target Words: {section.target_words}
        Keywords: {', '.join(section.keywords)}

        Enhancement Focus: {focus}

        Improve:
        1. Make subheadings more specific and actionable
        2. Add more comprehensive key points with data/insights
        3. Include practical examples and case studies
        4. Address common questions and objections
        5. Optimize for SEO with better keyword integration

        Respond with JSON:
        {{
            "heading": "Enhanced heading",
            "subheadings": ["enhanced subheading 1", "enhanced subheading 2"],
            "key_points": ["enhanced point 1", "enhanced point 2"],
            "target_words": 400,
            "keywords": ["keyword1", "keyword2"]
        }}
        """

        try:
            from services.llm_providers.main_text_generation import llm_text_gen

            enhancement_schema = {
                "type": "object",
                "properties": {
                    "heading": {"type": "string"},
                    "subheadings": {"type": "array", "items": {"type": "string"}},
                    "key_points": {"type": "array", "items": {"type": "string"}},
                    "target_words": {"type": "integer"},
                    "keywords": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
            }

            enhanced_data = llm_text_gen(
                prompt=enhancement_prompt,
                json_struct=enhancement_schema,
                system_prompt=None,
                user_id=user_id
            )

            if isinstance(enhanced_data, dict) and 'error' not in enhanced_data:
                return BlogOutlineSection(
                    id=section.id,
                    heading=enhanced_data.get('heading', section.heading),
                    subheadings=enhanced_data.get('subheadings', section.subheadings),
                    key_points=enhanced_data.get('key_points', section.key_points),
                    references=section.references,
                    target_words=enhanced_data.get('target_words', section.target_words),
                    keywords=enhanced_data.get('keywords', section.keywords)
                )
        except Exception as e:
            logger.warning(f"AI section enhancement failed: {e}")

        return section
198
backend/services/blog_writer/outline/seo_title_generator.py
Normal file
@@ -0,0 +1,198 @@
"""
SEO Title Generator - Specialized service for generating SEO-optimized blog titles.

Generates 5 premium SEO-optimized titles using research data and outline context.
"""

from typing import Dict, Any, List
from loguru import logger

from models.blog_models import BlogResearchResponse, BlogOutlineSection


class SEOTitleGenerator:
    """Generates SEO-optimized blog titles using research and outline data."""

    def __init__(self):
        """Initialize the SEO title generator."""
        pass

    def build_title_prompt(
        self,
        research: BlogResearchResponse,
        outline: List[BlogOutlineSection],
        primary_keywords: List[str],
        secondary_keywords: List[str],
        content_angles: List[str],
        search_intent: str,
        word_count: int = 1500
    ) -> str:
        """Build a specialized prompt for SEO title generation."""

        # Extract key research insights
        keyword_analysis = research.keyword_analysis or {}
        competitor_analysis = research.competitor_analysis or {}

        primary_kw_text = ', '.join(primary_keywords) if primary_keywords else "the target topic"
        secondary_kw_text = ', '.join(secondary_keywords) if secondary_keywords else "None provided"
        long_tail_text = ', '.join(keyword_analysis.get('long_tail', [])) if keyword_analysis else "None discovered"
        semantic_text = ', '.join(keyword_analysis.get('semantic_keywords', [])) if keyword_analysis else "None discovered"
        trending_text = ', '.join(keyword_analysis.get('trending_terms', [])) if keyword_analysis else "None discovered"
        content_gap_text = ', '.join(keyword_analysis.get('content_gaps', [])) if keyword_analysis else "None identified"
        content_angle_text = ', '.join(content_angles) if content_angles else "No explicit angles provided"

        # Extract outline structure summary
        outline_summary = []
        for i, section in enumerate(outline[:5], 1):  # Limit to first 5 sections for context
            outline_summary.append(f"{i}. {section.heading}")
            if section.subheadings:
                outline_summary.append(f"   Subtopics: {', '.join(section.subheadings[:3])}")

        outline_text = '\n'.join(outline_summary) if outline_summary else "No outline available"

        return f"""Generate exactly 5 SEO-optimized blog titles for: {primary_kw_text}

RESEARCH CONTEXT:
Primary Keywords: {primary_kw_text}
Secondary Keywords: {secondary_kw_text}
Long-tail Keywords: {long_tail_text}
Semantic Keywords: {semantic_text}
Trending Terms: {trending_text}
Content Gaps: {content_gap_text}
Search Intent: {search_intent}
Content Angles: {content_angle_text}

OUTLINE STRUCTURE:
{outline_text}

COMPETITIVE INTELLIGENCE:
Top Competitors: {', '.join(competitor_analysis.get('top_competitors', [])) if competitor_analysis else 'Not available'}
Market Opportunities: {', '.join(competitor_analysis.get('opportunities', [])) if competitor_analysis else 'Not available'}

SEO REQUIREMENTS:
- Each title must be 50-65 characters (optimal for search engine display)
- Include the primary keyword within the first 55 characters
- Highlight a unique value proposition from the research angles
- Use power words that drive clicks (e.g., "Ultimate", "Complete", "Essential", "Proven")
- Avoid generic phrasing - be specific and benefit-focused
- Target the search intent: {search_intent}
- Ensure titles are compelling and click-worthy

Return ONLY a JSON array of exactly 5 titles:
[
  "Title 1 (50-65 chars)",
  "Title 2 (50-65 chars)",
  "Title 3 (50-65 chars)",
  "Title 4 (50-65 chars)",
  "Title 5 (50-65 chars)"
]"""

    def get_title_schema(self) -> Dict[str, Any]:
        """Get the JSON schema for title generation."""
        return {
            "type": "array",
            "items": {
                "type": "string",
                "minLength": 50,
                "maxLength": 65
            },
            "minItems": 5,
            "maxItems": 5
        }

    async def generate_seo_titles(
        self,
        research: BlogResearchResponse,
        outline: List[BlogOutlineSection],
        primary_keywords: List[str],
        secondary_keywords: List[str],
        content_angles: List[str],
        search_intent: str,
        word_count: int,
        user_id: str
    ) -> List[str]:
        """Generate SEO-optimized titles using research and outline data.

        Args:
            research: Research data with keywords and insights
            outline: Blog outline sections
            primary_keywords: Primary keywords for the blog
            secondary_keywords: Secondary keywords
            content_angles: Content angles from research
            search_intent: Search intent (informational, commercial, etc.)
            word_count: Target word count
            user_id: User ID for API calls

        Returns:
            List of 5 SEO-optimized titles
        """
        from services.llm_providers.main_text_generation import llm_text_gen

        if not user_id:
            raise ValueError("user_id is required for title generation")

        # Build specialized prompt
        prompt = self.build_title_prompt(
            research=research,
            outline=outline,
            primary_keywords=primary_keywords,
            secondary_keywords=secondary_keywords,
            content_angles=content_angles,
            search_intent=search_intent,
            word_count=word_count
        )

        # Get schema
        schema = self.get_title_schema()

        logger.info(f"Generating SEO-optimized titles for user {user_id}")

        try:
            # Generate titles using structured JSON response
            result = llm_text_gen(
                prompt=prompt,
                json_struct=schema,
                system_prompt="You are an expert SEO content strategist specializing in creating compelling, search-optimized blog titles.",
                user_id=user_id
            )

            # Handle response - could be array directly or wrapped in dict
            if isinstance(result, list):
                titles = result
            elif isinstance(result, dict):
                # Try common keys
                titles = result.get('titles', result.get('title_options', result.get('options', [])))
                if not titles and isinstance(result.get('response'), list):
                    titles = result['response']
            else:
                logger.warning(f"Unexpected title generation result type: {type(result)}")
                titles = []

            # Validate and clean titles
            cleaned_titles = []
            for title in titles:
                if isinstance(title, str) and len(title.strip()) >= 30:  # Minimum reasonable length
                    cleaned = title.strip()
                    if len(cleaned) <= 70:  # Allow slight overflow for quality
                        cleaned_titles.append(cleaned)

            # Ensure we have exactly 5 titles
            if len(cleaned_titles) < 5:
                logger.warning(f"Generated only {len(cleaned_titles)} titles, expected 5")
                # Pad with placeholder if needed (shouldn't happen with proper schema)
                while len(cleaned_titles) < 5:
                    cleaned_titles.append(f"{primary_keywords[0] if primary_keywords else 'Blog'} - Comprehensive Guide")

            # Return exactly 5 titles
            return cleaned_titles[:5]

        except Exception as e:
            logger.error(f"Failed to generate SEO titles: {e}")
            # Fallback: generate simple titles from keywords
            fallback_titles = []
            primary = primary_keywords[0] if primary_keywords else "Blog Post"
            for i in range(5):
                fallback_titles.append(f"{primary}: Complete Guide {i+1}")
            return fallback_titles
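
    # Hypothetical usage sketch (objects come from the surrounding pipeline, not
    # from this module):
    #
    #     generator = SEOTitleGenerator()
    #     titles = await generator.generate_seo_titles(
    #         research=research,
    #         outline=sections,
    #         primary_keywords=["ai blog writer"],
    #         secondary_keywords=["seo outline"],
    #         content_angles=research.suggested_angles,
    #         search_intent="informational",
    #         word_count=1500,
    #         user_id="user-123",
    #     )
    #     assert len(titles) == 5  # the method always returns exactly five titles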
690
backend/services/blog_writer/outline/source_mapper.py
Normal file
@@ -0,0 +1,690 @@
"""
Source-to-Section Mapper - Intelligent mapping of research sources to outline sections.

This module provides algorithmic mapping of research sources to specific outline sections
based on semantic similarity, keyword relevance, and contextual matching. Uses a hybrid
approach of algorithmic scoring followed by AI validation for optimal results.
"""

from typing import Dict, Any, List, Tuple, Optional
import re
from collections import Counter
from loguru import logger

from models.blog_models import (
    BlogOutlineSection,
    ResearchSource,
    BlogResearchResponse,
)


class SourceToSectionMapper:
    """Maps research sources to outline sections using intelligent algorithms."""

    def __init__(self):
        """Initialize the source-to-section mapper."""
        self.min_semantic_score = 0.3
        self.min_keyword_score = 0.2
        self.min_contextual_score = 0.2
        self.max_sources_per_section = 3
        self.min_total_score = 0.4

        # Weight factors for different scoring methods
        self.weights = {
            'semantic': 0.4,    # Semantic similarity weight
            'keyword': 0.3,     # Keyword matching weight
            'contextual': 0.3   # Contextual relevance weight
        }

        # Common stop words for text processing
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
            'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
            'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
            'over', 'under', 'again', 'further', 'then', 'once'
        }

        logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")

    def map_sources_to_sections(
        self,
        sections: List[BlogOutlineSection],
        research_data: BlogResearchResponse,
        user_id: str
    ) -> List[BlogOutlineSection]:
        """
        Map research sources to outline sections using intelligent algorithms.

        Args:
            sections: List of outline sections to map sources to
            research_data: Research data containing sources and metadata
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            List of outline sections with intelligently mapped sources

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for source mapping (subscription checks and usage tracking)")

        if not sections or not research_data.sources:
            logger.warning("No sections or sources to map")
            return sections

        logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")

        # Step 1: Algorithmic mapping
        mapping_results = self._algorithmic_source_mapping(sections, research_data)

        # Step 2: AI validation and improvement (single prompt, user_id required for subscription checks)
        validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)

        # Step 3: Apply validated mapping to sections
        mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)

        logger.info("✅ Source-to-section mapping completed successfully")
        return mapped_sections

    def _algorithmic_source_mapping(
        self,
        sections: List[BlogOutlineSection],
        research_data: BlogResearchResponse
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Perform algorithmic mapping of sources to sections.

        Args:
            sections: List of outline sections
            research_data: Research data with sources

        Returns:
            Dictionary mapping section IDs to list of (source, score) tuples
        """
        mapping_results = {}

        for section in sections:
            section_scores = []

            for source in research_data.sources:
                # Calculate multi-dimensional relevance score
                semantic_score = self._calculate_semantic_similarity(section, source)
                keyword_score = self._calculate_keyword_relevance(section, source, research_data)
                contextual_score = self._calculate_contextual_relevance(section, source, research_data)

                # Weighted total score
                total_score = (
                    semantic_score * self.weights['semantic'] +
                    keyword_score * self.weights['keyword'] +
                    contextual_score * self.weights['contextual']
                )

                # Only include sources that meet minimum threshold
                if total_score >= self.min_total_score:
                    section_scores.append((source, total_score))

            # Sort by score and limit to max sources per section
            section_scores.sort(key=lambda x: x[1], reverse=True)
            section_scores = section_scores[:self.max_sources_per_section]

            mapping_results[section.id] = section_scores

            logger.debug(f"Section '{section.heading}': {len(section_scores)} sources mapped")

        return mapping_results

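    # Worked example of the weighted score above (illustrative numbers only):
    # with semantic=0.50, keyword=0.40, contextual=0.30 and the default weights
    # (0.4, 0.3, 0.3), total = 0.50*0.4 + 0.40*0.3 + 0.30*0.3 = 0.41, which clears
    # the 0.4 min_total_score threshold, so that source is kept for the section.
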
    def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
        """
        Calculate semantic similarity between section and source.

        Args:
            section: Outline section
            source: Research source

        Returns:
            Semantic similarity score (0.0 to 1.0)
        """
        # Extract text content for comparison
        section_text = self._extract_section_text(section)
        source_text = self._extract_source_text(source)

        # Calculate word overlap
        section_words = self._extract_meaningful_words(section_text)
        source_words = self._extract_meaningful_words(source_text)

        if not section_words or not source_words:
            return 0.0

        # Calculate Jaccard similarity
        intersection = len(set(section_words) & set(source_words))
        union = len(set(section_words) | set(source_words))

        jaccard_similarity = intersection / union if union > 0 else 0.0

        # Boost score for exact phrase matches
        phrase_boost = self._calculate_phrase_similarity(section_text, source_text)

        # Combine Jaccard similarity with phrase boost
        semantic_score = min(1.0, jaccard_similarity + phrase_boost)

        return semantic_score

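    # Worked example of the Jaccard computation above (illustrative values):
    # section words {"outline", "seo", "blog"} vs. source words {"seo", "blog",
    # "titles", "research"} share 2 words out of 5 distinct words, so the base
    # similarity is 2/5 = 0.4; any phrase boost (capped at 0.3) is added on top
    # and the final score is clipped to 1.0.
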
    def _calculate_keyword_relevance(
        self,
        section: BlogOutlineSection,
        source: ResearchSource,
        research_data: BlogResearchResponse
    ) -> float:
        """
        Calculate keyword-based relevance between section and source.

        Args:
            section: Outline section
            source: Research source
            research_data: Research data with keyword analysis

        Returns:
            Keyword relevance score (0.0 to 1.0)
        """
        # Get section keywords
        section_keywords = set(section.keywords)
        if not section_keywords:
            # Extract keywords from section heading and content
            section_text = self._extract_section_text(section)
            section_keywords = set(self._extract_meaningful_words(section_text))

        # Get source keywords from title and excerpt
        source_text = f"{source.title} {source.excerpt or ''}"
        source_keywords = set(self._extract_meaningful_words(source_text))

        # Get research keywords for context
        research_keywords = set()
        for category in ['primary', 'secondary', 'long_tail', 'semantic_keywords']:
            research_keywords.update(research_data.keyword_analysis.get(category, []))

        # Calculate keyword overlap scores
        section_overlap = len(section_keywords & source_keywords) / len(section_keywords) if section_keywords else 0.0
        research_overlap = len(research_keywords & source_keywords) / len(research_keywords) if research_keywords else 0.0

        # Weighted combination
        keyword_score = (section_overlap * 0.7) + (research_overlap * 0.3)

        return min(1.0, keyword_score)

    def _calculate_contextual_relevance(
        self,
        section: BlogOutlineSection,
        source: ResearchSource,
        research_data: BlogResearchResponse
    ) -> float:
        """
        Calculate contextual relevance based on section content and source context.

        Args:
            section: Outline section
            source: Research source
            research_data: Research data with context

        Returns:
            Contextual relevance score (0.0 to 1.0)
        """
        contextual_score = 0.0

        # 1. Content angle matching
        section_text = self._extract_section_text(section).lower()
        source_text = f"{source.title} {source.excerpt or ''}".lower()

        # Check for content angle matches
        content_angles = research_data.suggested_angles
        for angle in content_angles:
            angle_words = self._extract_meaningful_words(angle.lower())
            if angle_words:
                section_angle_match = sum(1 for word in angle_words if word in section_text) / len(angle_words)
                source_angle_match = sum(1 for word in angle_words if word in source_text) / len(angle_words)
                contextual_score += (section_angle_match + source_angle_match) * 0.3

        # 2. Search intent alignment
        search_intent = research_data.keyword_analysis.get('search_intent', 'informational')
        intent_keywords = self._get_intent_keywords(search_intent)

        intent_score = 0.0
        for keyword in intent_keywords:
            if keyword in section_text or keyword in source_text:
                intent_score += 0.1

        contextual_score += min(0.3, intent_score)

        # 3. Industry/domain relevance
        if hasattr(research_data, 'industry') and research_data.industry:
            industry_words = self._extract_meaningful_words(research_data.industry.lower())
            industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
            contextual_score += industry_score * 0.2

        return min(1.0, contextual_score)

    def _ai_validate_mapping(
        self,
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse,
        user_id: str
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Use AI to validate and improve the algorithmic mapping results.

        Args:
            mapping_results: Algorithmic mapping results
            research_data: Research data for context
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            AI-validated and improved mapping results

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for AI validation (subscription checks and usage tracking)")

        try:
            logger.info("Starting AI validation of source-to-section mapping...")

            # Build AI validation prompt
            validation_prompt = self._build_validation_prompt(mapping_results, research_data)

            # Get AI validation response (user_id required for subscription checks)
            validation_response = self._get_ai_validation_response(validation_prompt, user_id)

            # Parse and apply AI validation results
            validated_mapping = self._parse_validation_response(validation_response, mapping_results, research_data)

            logger.info("✅ AI validation completed successfully")
            return validated_mapping

        except Exception as e:
            logger.warning(f"AI validation failed: {e}. Using algorithmic results as fallback.")
            return mapping_results

    def _apply_mapping_to_sections(
        self,
        sections: List[BlogOutlineSection],
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]
    ) -> List[BlogOutlineSection]:
        """
        Apply the mapping results to the outline sections.

        Args:
            sections: Original outline sections
            mapping_results: Mapping results from algorithmic/AI processing

        Returns:
            Sections with mapped sources
        """
        mapped_sections = []

        for section in sections:
            # Get mapped sources for this section
            mapped_sources = mapping_results.get(section.id, [])

            # Extract just the sources (without scores)
            section_sources = [source for source, score in mapped_sources]

            # Create new section with mapped sources
            mapped_section = BlogOutlineSection(
                id=section.id,
                heading=section.heading,
                subheadings=section.subheadings,
                key_points=section.key_points,
                references=section_sources,
                target_words=section.target_words,
                keywords=section.keywords
            )

            mapped_sections.append(mapped_section)

            logger.debug(f"Applied {len(section_sources)} sources to section '{section.heading}'")

        return mapped_sections

    # Helper methods

    def _extract_section_text(self, section: BlogOutlineSection) -> str:
        """Extract all text content from a section."""
        text_parts = [section.heading]
        text_parts.extend(section.subheadings)
        text_parts.extend(section.key_points)
        text_parts.extend(section.keywords)
        return " ".join(text_parts)

    def _extract_source_text(self, source: ResearchSource) -> str:
        """Extract all text content from a source."""
        text_parts = [source.title]
        if source.excerpt:
            text_parts.append(source.excerpt)
        return " ".join(text_parts)

    def _extract_meaningful_words(self, text: str) -> List[str]:
        """Extract meaningful words from text, removing stop words and cleaning."""
        if not text:
            return []

        # Clean and tokenize
        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())

        # Remove stop words and short words
        meaningful_words = [
            word for word in words
            if word not in self.stop_words and len(word) > 2
        ]

        return meaningful_words

    def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
        """Calculate phrase similarity boost score."""
        if not text1 or not text2:
            return 0.0

        text1_lower = text1.lower()
        text2_lower = text2.lower()

        # Look for 2-3 word phrases
        phrase_boost = 0.0

        # Extract 2-word phrases
        words1 = text1_lower.split()
        words2 = text2_lower.split()

        for i in range(len(words1) - 1):
            phrase = f"{words1[i]} {words1[i+1]}"
            if phrase in text2_lower:
                phrase_boost += 0.1

        # Extract 3-word phrases
        for i in range(len(words1) - 2):
            phrase = f"{words1[i]} {words1[i+1]} {words1[i+2]}"
            if phrase in text2_lower:
                phrase_boost += 0.15

        return min(0.3, phrase_boost)  # Cap at 0.3

    def _get_intent_keywords(self, search_intent: str) -> List[str]:
        """Get keywords associated with search intent."""
        intent_keywords = {
            'informational': ['what', 'how', 'why', 'guide', 'tutorial', 'explain', 'learn', 'understand'],
            'navigational': ['find', 'locate', 'search', 'where', 'site', 'website', 'page'],
            'transactional': ['buy', 'purchase', 'order', 'price', 'cost', 'deal', 'offer', 'discount'],
            'commercial': ['compare', 'review', 'best', 'top', 'vs', 'versus', 'alternative', 'option']
        }

        return intent_keywords.get(search_intent, [])

def get_mapping_statistics(self, mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]) -> Dict[str, Any]:
|
||||
"""
|
||||
Get statistics about the mapping results.
|
||||
|
||||
Args:
|
||||
mapping_results: Mapping results to analyze
|
||||
|
||||
Returns:
|
||||
Dictionary with mapping statistics
|
||||
"""
|
||||
total_sections = len(mapping_results)
|
||||
total_mappings = sum(len(sources) for sources in mapping_results.values())
|
||||
|
||||
# Calculate score distribution
|
||||
all_scores = []
|
||||
for sources in mapping_results.values():
|
||||
all_scores.extend([score for source, score in sources])
|
||||
|
||||
avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0
|
||||
max_score = max(all_scores) if all_scores else 0.0
|
||||
min_score = min(all_scores) if all_scores else 0.0
|
||||
|
||||
# Count sections with/without sources
|
||||
sections_with_sources = sum(1 for sources in mapping_results.values() if sources)
|
||||
sections_without_sources = total_sections - sections_with_sources
|
||||
|
||||
return {
|
||||
'total_sections': total_sections,
|
||||
'total_mappings': total_mappings,
|
||||
'sections_with_sources': sections_with_sources,
|
||||
'sections_without_sources': sections_without_sources,
|
||||
'average_score': avg_score,
|
||||
'max_score': max_score,
|
||||
'min_score': min_score,
|
||||
'mapping_coverage': sections_with_sources / total_sections if total_sections > 0 else 0.0
|
||||
}
|
||||
|
||||
def _build_validation_prompt(
|
||||
self,
|
||||
mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
|
||||
research_data: BlogResearchResponse
|
||||
) -> str:
|
||||
"""
|
||||
Build comprehensive AI validation prompt for source-to-section mapping.
|
||||
|
||||
Args:
|
||||
mapping_results: Algorithmic mapping results
|
||||
            research_data: Research data for context

        Returns:
            Formatted AI validation prompt
        """
        # Extract section information
        sections_info = []
        for section_id, sources in mapping_results.items():
            section_info = {
                'id': section_id,
                'sources': [
                    {
                        'title': source.title,
                        'url': source.url,
                        'excerpt': source.excerpt,
                        'credibility_score': source.credibility_score,
                        'algorithmic_score': score
                    }
                    for source, score in sources
                ]
            }
            sections_info.append(section_info)

        # Extract research context
        research_context = {
            'primary_keywords': research_data.keyword_analysis.get('primary', []),
            'secondary_keywords': research_data.keyword_analysis.get('secondary', []),
            'content_angles': research_data.suggested_angles,
            'search_intent': research_data.keyword_analysis.get('search_intent', 'informational'),
            'all_sources': [
                {
                    'title': source.title,
                    'url': source.url,
                    'excerpt': source.excerpt,
                    'credibility_score': source.credibility_score
                }
                for source in research_data.sources
            ]
        }

        prompt = f"""
You are an expert content strategist and SEO specialist. Your task is to validate and improve the algorithmic mapping of research sources to blog outline sections.

## CONTEXT
Research Topic: {', '.join(research_context['primary_keywords'])}
Search Intent: {research_context['search_intent']}
Content Angles: {', '.join(research_context['content_angles'])}

## ALGORITHMIC MAPPING RESULTS
The following sections have been algorithmically mapped with research sources:

{self._format_sections_for_prompt(sections_info)}

## AVAILABLE SOURCES
All available research sources:
{self._format_sources_for_prompt(research_context['all_sources'])}

## VALIDATION TASK
Please analyze the algorithmic mapping and provide improvements:

1. **Validate Relevance**: Are the mapped sources truly relevant to each section's content and purpose?
2. **Identify Gaps**: Are there better sources available that weren't mapped?
3. **Suggest Improvements**: Recommend specific source changes for better content alignment
4. **Quality Assessment**: Rate the overall mapping quality (1-10)

## RESPONSE FORMAT
Provide your analysis in the following JSON format:

```json
{{
  "overall_quality_score": 8,
  "section_improvements": [
    {{
      "section_id": "s1",
      "current_sources": ["source_title_1", "source_title_2"],
      "recommended_sources": ["better_source_1", "better_source_2", "better_source_3"],
      "reasoning": "Explanation of why these sources are better suited for this section",
      "confidence": 0.9
    }}
  ],
  "summary": "Overall assessment of the mapping quality and key improvements made"
}}
```

## GUIDELINES
- Prioritize sources that directly support the section's key points and subheadings
- Consider source credibility, recency, and content depth
- Ensure sources provide actionable insights for content creation
- Maintain diversity in source types and perspectives
- Focus on sources that enhance the section's value proposition

Analyze the mapping and provide your recommendations.
"""

        return prompt

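    # NOTE: a minimal usage sketch of the validation flow (hypothetical data; the prompt-builder
    # name is assumed, since its signature sits outside this hunk):
    #
    #   mapping_results = {"s1": [(some_research_source, 0.78)]}
    #   prompt = self._build_validation_prompt(mapping_results, research_data)  # assumed name
    #   raw = self._get_ai_validation_response(prompt, user_id="user-123")
    #   improved = self._parse_validation_response(raw, mapping_results, research_data)
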
    def _get_ai_validation_response(self, prompt: str, user_id: str) -> str:
        """
        Get AI validation response using LLM provider.

        Args:
            prompt: Validation prompt
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            AI validation response

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for AI validation response (subscription checks and usage tracking)")

        try:
            from services.llm_providers.main_text_generation import llm_text_gen

            response = llm_text_gen(
                prompt=prompt,
                json_struct=None,
                system_prompt=None,
                user_id=user_id
            )

            return response

        except Exception as e:
            logger.error(f"Failed to get AI validation response: {e}")
            raise

    def _parse_validation_response(
        self,
        response: str,
        original_mapping: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Parse AI validation response and apply improvements.

        Args:
            response: AI validation response
            original_mapping: Original algorithmic mapping
            research_data: Research data for context

        Returns:
            Improved mapping based on AI validation
        """
        try:
            import json
            import re

            # Extract JSON from response
            json_match = re.search(r'```json\s*(\{.*?\})\s*```', response, re.DOTALL)
            if not json_match:
                # Try to find JSON without code blocks (greedy match so nested objects are
                # captured whole; assumes the response contains a single JSON object)
                json_match = re.search(r'(\{.*\})', response, re.DOTALL)

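            # Illustration (hypothetical model output): the fenced pattern above matches text like
            #   "Here is my assessment:\n```json\n{ \"overall_quality_score\": 8, ... }\n```"
            # while the bare fallback only applies when the model returns a raw JSON object with no fence.
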
            if not json_match:
                logger.warning("Could not extract JSON from AI response")
                return original_mapping

            validation_data = json.loads(json_match.group(1))

            # Create source lookup for quick access
            source_lookup = {source.title: source for source in research_data.sources}

            # Apply AI improvements
            improved_mapping = {}

            for improvement in validation_data.get('section_improvements', []):
                section_id = improvement['section_id']
                recommended_titles = improvement['recommended_sources']

                # Map recommended titles to actual sources
                recommended_sources = []
                for title in recommended_titles:
                    if title in source_lookup:
                        source = source_lookup[title]
                        # Use high confidence score for AI-recommended sources
                        recommended_sources.append((source, 0.9))

                if recommended_sources:
                    improved_mapping[section_id] = recommended_sources
                else:
                    # Fallback to original mapping if no valid sources found
                    improved_mapping[section_id] = original_mapping.get(section_id, [])

            # Add sections not mentioned in AI response
            for section_id, sources in original_mapping.items():
                if section_id not in improved_mapping:
                    improved_mapping[section_id] = sources

            logger.info(f"AI validation applied: {len(validation_data.get('section_improvements', []))} sections improved")
            return improved_mapping

        except Exception as e:
            logger.warning(f"Failed to parse AI validation response: {e}")
            return original_mapping

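    # A minimal behavioural sketch (hypothetical data): when the model recommends a title that
    # matches a known source, the section is remapped with a 0.9 score; unknown titles and
    # sections the model does not mention keep their original algorithmic mapping.
    #
    #   response = '```json\n{"overall_quality_score": 7, "section_improvements": ' \
    #              '[{"section_id": "s1", "current_sources": [], ' \
    #              '"recommended_sources": ["Known Source Title"], ' \
    #              '"reasoning": "...", "confidence": 0.9}], "summary": "..."}\n```'
    #   improved = self._parse_validation_response(response, original_mapping, research_data)
    #   # improved["s1"] -> [(<the ResearchSource titled "Known Source Title">, 0.9)]
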
    def _format_sections_for_prompt(self, sections_info: List[Dict]) -> str:
        """Format sections information for AI prompt."""
        formatted = []
        for section in sections_info:
            section_text = f"**Section {section['id']}:**\n"
            section_text += f"Sources mapped: {len(section['sources'])}\n"
            for source in section['sources']:
                section_text += f"- {source['title']} (Score: {source['algorithmic_score']:.2f})\n"
            formatted.append(section_text)
        return "\n".join(formatted)

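    # Example output for one hypothetical entry (matching the f-strings above):
    #
    #   **Section s1:**
    #   Sources mapped: 1
    #   - AI Content Strategy Report (Score: 0.78)
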
    def _format_sources_for_prompt(self, sources: List[Dict]) -> str:
        """Format sources information for AI prompt."""
        formatted = []
        for i, source in enumerate(sources, 1):
            source_text = f"{i}. **{source['title']}**\n"
            source_text += f" URL: {source['url']}\n"
            source_text += f" Credibility: {source['credibility_score']}\n"
            if source['excerpt']:
                source_text += f" Excerpt: {source['excerpt'][:200]}...\n"
            formatted.append(source_text)
        return "\n".join(formatted)

123
backend/services/blog_writer/outline/title_generator.py
Normal file
123
backend/services/blog_writer/outline/title_generator.py
Normal file
@@ -0,0 +1,123 @@
"""
Title Generator - Handles title generation and formatting for blog outlines.

Extracts content angles from research data and combines them with AI-generated titles.
"""

from typing import List
from loguru import logger


class TitleGenerator:
    """Handles title generation, formatting, and combination logic."""

    def __init__(self):
        """Initialize the title generator."""
        pass

    def extract_content_angle_titles(self, research) -> List[str]:
        """
        Extract content angles from research data and convert them to blog titles.

        Args:
            research: BlogResearchResponse object containing suggested_angles

        Returns:
            List of title-formatted content angles
        """
        if not research or not hasattr(research, 'suggested_angles'):
            return []

        content_angles = research.suggested_angles or []
        if not content_angles:
            return []

        # Convert content angles to title format
        title_formatted_angles = []
        for angle in content_angles:
            if isinstance(angle, str) and angle.strip():
                # Clean and format the angle as a title
                formatted_angle = self._format_angle_as_title(angle.strip())
                if formatted_angle and formatted_angle not in title_formatted_angles:
                    title_formatted_angles.append(formatted_angle)

        logger.info(f"Extracted {len(title_formatted_angles)} content angle titles from research data")
        return title_formatted_angles

    def _format_angle_as_title(self, angle: str) -> str:
        """
        Format a content angle as a proper blog title.

        Args:
            angle: Raw content angle string

        Returns:
            Formatted title string
        """
        if not angle or len(angle.strip()) < 10:  # Too short to be a good title
            return ""

        # Clean up the angle
        cleaned_angle = angle.strip()

        # Apply title case to each sentence (str.title() capitalizes every word)
        sentences = cleaned_angle.split('. ')
        formatted_sentences = []
        for sentence in sentences:
            if sentence.strip():
                # Use title case for better formatting
                formatted_sentence = sentence.strip().title()
                formatted_sentences.append(formatted_sentence)

        formatted_title = '. '.join(formatted_sentences)

        # Ensure it ends with proper punctuation
        if not formatted_title.endswith(('.', '!', '?')):
            formatted_title += '.'

        # Limit length to reasonable blog title size
        if len(formatted_title) > 100:
            formatted_title = formatted_title[:97] + "..."

        return formatted_title

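    # Worked example (hypothetical input): "how AI tools reshape SEO workflows in practice"
    # -> "How Ai Tools Reshape Seo Workflows In Practice."
    # Note that str.title() lowercases the interior letters of acronyms such as "AI" and "SEO".
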
    def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str]) -> List[str]:
        """
        Combine AI-generated titles with content angle titles, ensuring variety and quality.

        Args:
            ai_titles: AI-generated title options
            content_angle_titles: Titles derived from content angles
            primary_keywords: Primary keywords (currently unused; fallback title generation was removed)

        Returns:
            Combined list of title options (max 6 total)
        """
        all_titles = []

        # Add content angle titles first (these are research-based and valuable)
        for title in content_angle_titles[:3]:  # Limit to top 3 content angles
            if title and title not in all_titles:
                all_titles.append(title)

        # Add AI-generated titles
        for title in ai_titles:
            if title and title not in all_titles:
                all_titles.append(title)

        # Note: Removed fallback titles as requested - only use research and AI-generated titles

        # Limit to 6 titles maximum for UI usability
        final_titles = all_titles[:6]

        logger.info(f"Combined title options: {len(final_titles)} total (AI: {len(ai_titles)}, Content angles: {len(content_angle_titles)})")
        return final_titles

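    # A minimal usage sketch (hypothetical titles): content-angle titles lead, duplicates are
    # dropped, and the combined list is capped at 6.
    #
    #   generator = TitleGenerator()
    #   generator.combine_title_options(
    #       ai_titles=["AI Title 1", "AI Title 2"],
    #       content_angle_titles=["Angle Title 1.", "Angle Title 2."],
    #       primary_keywords=["content marketing"],
    #   )
    #   # -> ["Angle Title 1.", "Angle Title 2.", "AI Title 1", "AI Title 2"]
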
    def generate_fallback_titles(self, primary_keywords: List[str]) -> List[str]:
        """Generate fallback titles when AI generation fails."""
        primary_keyword = primary_keywords[0] if primary_keywords else "Topic"
        return [
            f"The Complete Guide to {primary_keyword}",
            f"{primary_keyword}: Everything You Need to Know",
            f"How to Master {primary_keyword} in 2024"
        ]
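    # For example, generate_fallback_titles(["content marketing"]) returns:
    #   ["The Complete Guide to content marketing",
    #    "content marketing: Everything You Need to Know",
    #    "How to Master content marketing in 2024"]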