Base code

This commit is contained in:
Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
"""
Outline module for AI Blog Writer.
This module handles all outline-related functionality including:
- AI-powered outline generation
- Outline refinement and optimization
- Section enhancement and rebalancing
- Strategic content planning
"""
from .outline_service import OutlineService
from .outline_generator import OutlineGenerator
from .outline_optimizer import OutlineOptimizer
from .section_enhancer import SectionEnhancer
from .source_mapper import SourceToSectionMapper
from .grounding_engine import GroundingContextEngine
__all__ = [
'OutlineService',
'OutlineGenerator',
'OutlineOptimizer',
'SectionEnhancer',
'SourceToSectionMapper',
'GroundingContextEngine'
]

View File

@@ -0,0 +1,644 @@
"""
Grounding Context Engine - Enhanced utilization of grounding metadata.
This module extracts and utilizes rich contextual information from Google Search
grounding metadata to enhance outline generation with authoritative insights,
temporal relevance, and content relationships.
"""
from typing import Dict, Any, List, Tuple, Optional
from collections import Counter, defaultdict
from datetime import datetime, timedelta
import re
from loguru import logger
from models.blog_models import (
GroundingMetadata,
GroundingChunk,
GroundingSupport,
Citation,
BlogOutlineSection,
ResearchSource,
)
class GroundingContextEngine:
    """Extract and utilize rich context from grounding metadata."""

    def __init__(self):
        """Set up confidence thresholds, output caps, and keyword tables."""
        # Confidence cut-offs applied when filtering chunks/supports.
        self.min_confidence_threshold = 0.7
        self.high_confidence_threshold = 0.9
        # Hard caps on how much derived material is surfaced.
        self.max_contextual_insights = 10
        self.max_authority_sources = 5
        # Keyword table used to score how authoritative a source looks.
        self.authority_indicators = {
            'high_authority': [
                'research', 'study', 'analysis', 'report', 'journal',
                'academic', 'university', 'institute',
            ],
            'medium_authority': [
                'guide', 'tutorial', 'best practices', 'expert',
                'professional', 'industry',
            ],
            'low_authority': [
                'blog', 'opinion', 'personal', 'review', 'commentary',
            ],
        }
        # Keyword table used to classify the temporal flavor of content.
        self.temporal_patterns = {
            'recent': ['2024', '2025', 'latest', 'new', 'recent', 'current', 'updated'],
            'trending': ['trend', 'emerging', 'growing', 'increasing', 'rising'],
            'evergreen': ['fundamental', 'basic', 'principles', 'foundation', 'core'],
        }
        logger.info("✅ GroundingContextEngine initialized with contextual analysis capabilities")
def extract_contextual_insights(self, grounding_metadata: Optional[GroundingMetadata]) -> Dict[str, Any]:
    """
    Extract comprehensive contextual insights from grounding metadata.

    Args:
        grounding_metadata: Google Search grounding metadata (may be None).

    Returns:
        Dict with one entry per analysis category; a zeroed-out structure
        when no metadata is available.
    """
    if not grounding_metadata:
        return self._get_empty_insights()
    logger.info("Extracting contextual insights from grounding metadata...")
    # Each analyzer owns exactly one category; the keys form the schema
    # that downstream consumers rely on.
    analyzers = {
        'confidence_analysis': self._analyze_confidence_patterns,
        'authority_analysis': self._analyze_source_authority,
        'temporal_analysis': self._analyze_temporal_relevance,
        'content_relationships': self._analyze_content_relationships,
        'citation_insights': self._analyze_citation_patterns,
        'search_intent_insights': self._analyze_search_intent,
        'quality_indicators': self._assess_quality_indicators,
    }
    insights = {name: analyze(grounding_metadata) for name, analyze in analyzers.items()}
    logger.info(f"✅ Extracted {len(insights)} contextual insight categories")
    return insights
def enhance_sections_with_grounding(
    self,
    sections: List[BlogOutlineSection],
    grounding_metadata: Optional[GroundingMetadata],
    insights: Dict[str, Any]
) -> List[BlogOutlineSection]:
    """
    Enhance outline sections using grounding metadata insights.

    Args:
        sections: Outline sections to enhance.
        grounding_metadata: Google Search grounding metadata.
        insights: Insights previously produced by extract_contextual_insights.

    Returns:
        New list of enhanced sections; the input list is returned unchanged
        when metadata or insights are missing.
    """
    if not grounding_metadata or not insights:
        return sections
    logger.info(f"Enhancing {len(sections)} sections with grounding insights...")
    enhanced = [
        self._enhance_single_section(section, grounding_metadata, insights)
        for section in sections
    ]
    logger.info("✅ Section enhancement with grounding insights completed")
    return enhanced
def get_authority_sources(self, grounding_metadata: Optional[GroundingMetadata]) -> List[Tuple[GroundingChunk, float]]:
    """
    Get high-authority sources from grounding metadata.

    Args:
        grounding_metadata: Google Search grounding metadata.

    Returns:
        Up to ``max_authority_sources`` (chunk, authority_score) pairs,
        highest score first; only scores >= 0.6 are kept.
    """
    if not grounding_metadata:
        return []
    scored = [
        (chunk, self._calculate_chunk_authority(chunk))
        for chunk in grounding_metadata.grounding_chunks
    ]
    # Drop weakly-authoritative sources, then rank the survivors.
    ranked = sorted(
        ((chunk, score) for chunk, score in scored if score >= 0.6),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:self.max_authority_sources]
def get_high_confidence_insights(self, grounding_metadata: Optional[GroundingMetadata]) -> List[str]:
    """
    Extract high-confidence insights from grounding supports.

    Args:
        grounding_metadata: Google Search grounding metadata.

    Returns:
        Up to ``max_contextual_insights`` insights whose support's peak
        confidence clears ``high_confidence_threshold``.
    """
    if not grounding_metadata:
        return []
    collected = []
    for support in grounding_metadata.grounding_supports:
        scores = support.confidence_scores
        # Skip supports without scores or below the confidence bar.
        if not scores or max(scores) < self.high_confidence_threshold:
            continue
        insight = self._extract_insight_from_segment(support.segment_text)
        if insight:
            collected.append(insight)
    return collected[:self.max_contextual_insights]
# Private helper methods
def _get_empty_insights(self) -> Dict[str, Any]:
"""Return empty insights structure when no grounding metadata is available."""
return {
'confidence_analysis': {
'average_confidence': 0.0,
'high_confidence_sources_count': 0,
'confidence_distribution': {'high': 0, 'medium': 0, 'low': 0}
},
'authority_analysis': {
'average_authority_score': 0.0,
'high_authority_sources': [],
'authority_distribution': {'high': 0, 'medium': 0, 'low': 0}
},
'temporal_analysis': {
'recent_content': 0,
'trending_topics': [],
'evergreen_content': 0
},
'content_relationships': {
'related_concepts': [],
'content_gaps': [],
'concept_coverage_score': 0.0
},
'citation_insights': {
'citation_types': {},
'citation_density': 0.0
},
'search_intent_insights': {
'primary_intent': 'informational',
'intent_signals': [],
'user_questions': []
},
'quality_indicators': {
'overall_quality': 0.0,
'quality_factors': []
}
}
def _analyze_confidence_patterns(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
"""Analyze confidence patterns across grounding data."""
all_confidences = []
# Collect confidence scores from chunks
for chunk in grounding_metadata.grounding_chunks:
if chunk.confidence_score:
all_confidences.append(chunk.confidence_score)
# Collect confidence scores from supports
for support in grounding_metadata.grounding_supports:
all_confidences.extend(support.confidence_scores)
if not all_confidences:
return {
'average_confidence': 0.0,
'high_confidence_sources_count': 0,
'confidence_distribution': {'high': 0, 'medium': 0, 'low': 0}
}
average_confidence = sum(all_confidences) / len(all_confidences)
high_confidence_count = sum(1 for c in all_confidences if c >= self.high_confidence_threshold)
return {
'average_confidence': average_confidence,
'high_confidence_sources_count': high_confidence_count,
'confidence_distribution': self._get_confidence_distribution(all_confidences)
}
def _analyze_source_authority(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
"""Analyze source authority patterns."""
authority_scores = []
authority_distribution = defaultdict(int)
for chunk in grounding_metadata.grounding_chunks:
authority_score = self._calculate_chunk_authority(chunk)
authority_scores.append(authority_score)
# Categorize authority level
if authority_score >= 0.8:
authority_distribution['high'] += 1
elif authority_score >= 0.6:
authority_distribution['medium'] += 1
else:
authority_distribution['low'] += 1
return {
'average_authority_score': sum(authority_scores) / len(authority_scores) if authority_scores else 0.0,
'high_authority_sources': [{'title': 'High Authority Source', 'url': 'example.com', 'score': 0.9}], # Placeholder
'authority_distribution': dict(authority_distribution)
}
def _analyze_temporal_relevance(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
"""Analyze temporal relevance of grounding content."""
recent_content = 0
trending_topics = []
evergreen_content = 0
for chunk in grounding_metadata.grounding_chunks:
chunk_text = f"{chunk.title} {chunk.url}".lower()
# Check for recent indicators
if any(pattern in chunk_text for pattern in self.temporal_patterns['recent']):
recent_content += 1
# Check for trending indicators
if any(pattern in chunk_text for pattern in self.temporal_patterns['trending']):
trending_topics.append(chunk.title)
# Check for evergreen indicators
if any(pattern in chunk_text for pattern in self.temporal_patterns['evergreen']):
evergreen_content += 1
return {
'recent_content': recent_content,
'trending_topics': trending_topics[:5], # Limit to top 5
'evergreen_content': evergreen_content,
'temporal_balance': self._calculate_temporal_balance(recent_content, evergreen_content)
}
def _analyze_content_relationships(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
"""Analyze content relationships and identify gaps."""
all_text = []
# Collect text from chunks
for chunk in grounding_metadata.grounding_chunks:
all_text.append(chunk.title)
# Collect text from supports
for support in grounding_metadata.grounding_supports:
all_text.append(support.segment_text)
# Extract related concepts
related_concepts = self._extract_related_concepts(all_text)
# Identify potential content gaps
content_gaps = self._identify_content_gaps(all_text)
# Calculate concept coverage score (0-1 scale)
concept_coverage_score = min(1.0, len(related_concepts) / 10.0) if related_concepts else 0.0
return {
'related_concepts': related_concepts,
'content_gaps': content_gaps,
'concept_coverage_score': concept_coverage_score,
'gap_count': len(content_gaps)
}
def _analyze_citation_patterns(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
"""Analyze citation patterns and types."""
citation_types = Counter()
total_citations = len(grounding_metadata.citations)
for citation in grounding_metadata.citations:
citation_types[citation.citation_type] += 1
# Calculate citation density (citations per 1000 words of content)
total_content_length = sum(len(support.segment_text) for support in grounding_metadata.grounding_supports)
citation_density = (total_citations / max(total_content_length, 1)) * 1000 if total_content_length > 0 else 0.0
return {
'citation_types': dict(citation_types),
'total_citations': total_citations,
'citation_density': citation_density,
'citation_quality': self._assess_citation_quality(grounding_metadata.citations)
}
def _analyze_search_intent(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
"""Analyze search intent signals from grounding data."""
intent_signals = []
user_questions = []
# Analyze search queries
for query in grounding_metadata.web_search_queries:
query_lower = query.lower()
# Identify intent signals
if any(word in query_lower for word in ['how', 'what', 'why', 'when', 'where']):
intent_signals.append('informational')
elif any(word in query_lower for word in ['best', 'top', 'compare', 'vs']):
intent_signals.append('comparison')
elif any(word in query_lower for word in ['buy', 'price', 'cost', 'deal']):
intent_signals.append('transactional')
# Extract potential user questions
if query_lower.startswith(('how to', 'what is', 'why does', 'when should')):
user_questions.append(query)
return {
'intent_signals': list(set(intent_signals)),
'user_questions': user_questions[:5], # Limit to top 5
'primary_intent': self._determine_primary_intent(intent_signals)
}
def _assess_quality_indicators(self, grounding_metadata: GroundingMetadata) -> Dict[str, Any]:
"""Assess overall quality indicators from grounding metadata."""
quality_factors = []
quality_score = 0.0
# Factor 1: Confidence levels
confidences = [chunk.confidence_score for chunk in grounding_metadata.grounding_chunks if chunk.confidence_score]
if confidences:
avg_confidence = sum(confidences) / len(confidences)
quality_score += avg_confidence * 0.3
quality_factors.append(f"Average confidence: {avg_confidence:.2f}")
# Factor 2: Source diversity
unique_domains = set()
for chunk in grounding_metadata.grounding_chunks:
try:
domain = chunk.url.split('/')[2] if '://' in chunk.url else chunk.url.split('/')[0]
unique_domains.add(domain)
except:
continue
diversity_score = min(len(unique_domains) / 5.0, 1.0) # Normalize to 0-1
quality_score += diversity_score * 0.2
quality_factors.append(f"Source diversity: {len(unique_domains)} unique domains")
# Factor 3: Content depth
total_content_length = sum(len(support.segment_text) for support in grounding_metadata.grounding_supports)
depth_score = min(total_content_length / 5000.0, 1.0) # Normalize to 0-1
quality_score += depth_score * 0.2
quality_factors.append(f"Content depth: {total_content_length} characters")
# Factor 4: Citation quality
citation_quality = self._assess_citation_quality(grounding_metadata.citations)
quality_score += citation_quality * 0.3
quality_factors.append(f"Citation quality: {citation_quality:.2f}")
return {
'overall_quality': min(quality_score, 1.0),
'quality_factors': quality_factors,
'quality_grade': self._get_quality_grade(quality_score)
}
def _enhance_single_section(
    self,
    section: BlogOutlineSection,
    grounding_metadata: GroundingMetadata,
    insights: Dict[str, Any]
) -> BlogOutlineSection:
    """Return a copy of *section* enriched with grounding-driven content."""
    # Narrow the grounding data down to what is relevant for this section.
    relevant_chunks = self._find_relevant_chunks(section, grounding_metadata)
    relevant_supports = self._find_relevant_supports(section, grounding_metadata)
    # Build a new section rather than mutating the input; id, heading,
    # references, and target word count carry over untouched.
    return BlogOutlineSection(
        id=section.id,
        heading=section.heading,
        subheadings=self._enhance_subheadings(section, relevant_supports, insights),
        key_points=self._enhance_key_points(section, relevant_chunks, insights),
        references=section.references,
        target_words=section.target_words,
        keywords=self._enhance_keywords(section, insights)
    )
def _calculate_chunk_authority(self, chunk: GroundingChunk) -> float:
"""Calculate authority score for a grounding chunk."""
authority_score = 0.5 # Base score
chunk_text = f"{chunk.title} {chunk.url}".lower()
# Check for authority indicators
for level, indicators in self.authority_indicators.items():
for indicator in indicators:
if indicator in chunk_text:
if level == 'high_authority':
authority_score += 0.3
elif level == 'medium_authority':
authority_score += 0.2
else: # low_authority
authority_score -= 0.1
# Boost score based on confidence
if chunk.confidence_score:
authority_score += chunk.confidence_score * 0.2
return min(max(authority_score, 0.0), 1.0)
def _extract_insight_from_segment(self, segment_text: str) -> Optional[str]:
"""Extract meaningful insight from segment text."""
if not segment_text or len(segment_text.strip()) < 20:
return None
# Clean and truncate insight
insight = segment_text.strip()
if len(insight) > 200:
insight = insight[:200] + "..."
return insight
def _get_confidence_distribution(self, confidences: List[float]) -> Dict[str, int]:
"""Get distribution of confidence scores."""
distribution = {'high': 0, 'medium': 0, 'low': 0}
for confidence in confidences:
if confidence >= 0.8:
distribution['high'] += 1
elif confidence >= 0.6:
distribution['medium'] += 1
else:
distribution['low'] += 1
return distribution
def _calculate_temporal_balance(self, recent: int, evergreen: int) -> str:
"""Calculate temporal balance of content."""
total = recent + evergreen
if total == 0:
return 'unknown'
recent_ratio = recent / total
if recent_ratio > 0.7:
return 'recent_heavy'
elif recent_ratio < 0.3:
return 'evergreen_heavy'
else:
return 'balanced'
def _extract_related_concepts(self, text_list: List[str]) -> List[str]:
"""Extract related concepts from text."""
# Simple concept extraction - could be enhanced with NLP
concepts = set()
for text in text_list:
# Extract capitalized words (potential concepts)
words = re.findall(r'\b[A-Z][a-z]+\b', text)
concepts.update(words)
return list(concepts)[:10] # Limit to top 10
def _identify_content_gaps(self, text_list: List[str]) -> List[str]:
"""Identify potential content gaps."""
# Simple gap identification - could be enhanced with more sophisticated analysis
gaps = []
# Look for common gap indicators
gap_indicators = ['missing', 'lack of', 'not covered', 'gap', 'unclear', 'unexplained']
for text in text_list:
text_lower = text.lower()
for indicator in gap_indicators:
if indicator in text_lower:
# Extract potential gap
gap = self._extract_gap_from_text(text, indicator)
if gap:
gaps.append(gap)
return gaps[:5] # Limit to top 5
def _extract_gap_from_text(self, text: str, indicator: str) -> Optional[str]:
"""Extract content gap from text containing gap indicator."""
# Simple extraction - could be enhanced
sentences = text.split('.')
for sentence in sentences:
if indicator in sentence.lower():
return sentence.strip()
return None
def _assess_citation_quality(self, citations: List[Citation]) -> float:
"""Assess quality of citations."""
if not citations:
return 0.0
quality_score = 0.0
for citation in citations:
# Check citation type
if citation.citation_type in ['expert_opinion', 'statistical_data', 'research_study']:
quality_score += 0.3
elif citation.citation_type in ['recent_news', 'case_study']:
quality_score += 0.2
else:
quality_score += 0.1
# Check text quality
if len(citation.text) > 20:
quality_score += 0.1
return min(quality_score / len(citations), 1.0)
def _determine_primary_intent(self, intent_signals: List[str]) -> str:
"""Determine primary search intent from signals."""
if not intent_signals:
return 'informational'
intent_counts = Counter(intent_signals)
return intent_counts.most_common(1)[0][0]
def _get_quality_grade(self, quality_score: float) -> str:
"""Get quality grade from score."""
if quality_score >= 0.9:
return 'A'
elif quality_score >= 0.8:
return 'B'
elif quality_score >= 0.7:
return 'C'
elif quality_score >= 0.6:
return 'D'
else:
return 'F'
def _find_relevant_chunks(self, section: BlogOutlineSection, grounding_metadata: GroundingMetadata) -> List[GroundingChunk]:
"""Find grounding chunks relevant to the section."""
relevant_chunks = []
section_text = f"{section.heading} {' '.join(section.subheadings)} {' '.join(section.key_points)}".lower()
for chunk in grounding_metadata.grounding_chunks:
chunk_text = chunk.title.lower()
# Simple relevance check - could be enhanced with semantic similarity
if any(word in chunk_text for word in section_text.split() if len(word) > 3):
relevant_chunks.append(chunk)
return relevant_chunks
def _find_relevant_supports(self, section: BlogOutlineSection, grounding_metadata: GroundingMetadata) -> List[GroundingSupport]:
"""Find grounding supports relevant to the section."""
relevant_supports = []
section_text = f"{section.heading} {' '.join(section.subheadings)} {' '.join(section.key_points)}".lower()
for support in grounding_metadata.grounding_supports:
support_text = support.segment_text.lower()
# Simple relevance check
if any(word in support_text for word in section_text.split() if len(word) > 3):
relevant_supports.append(support)
return relevant_supports
def _enhance_subheadings(self, section: BlogOutlineSection, relevant_supports: List[GroundingSupport], insights: Dict[str, Any]) -> List[str]:
"""Enhance subheadings with grounding insights."""
enhanced_subheadings = list(section.subheadings)
# Add high-confidence insights as subheadings
high_confidence_insights = self._get_high_confidence_insights_from_supports(relevant_supports)
for insight in high_confidence_insights[:2]: # Add up to 2 new subheadings
if insight not in enhanced_subheadings:
enhanced_subheadings.append(insight)
return enhanced_subheadings
def _enhance_key_points(self, section: BlogOutlineSection, relevant_chunks: List[GroundingChunk], insights: Dict[str, Any]) -> List[str]:
"""Enhance key points with authoritative insights."""
enhanced_key_points = list(section.key_points)
# Add insights from high-authority chunks
for chunk in relevant_chunks:
if chunk.confidence_score and chunk.confidence_score >= self.high_confidence_threshold:
insight = f"Based on {chunk.title}: {self._extract_key_insight(chunk)}"
if insight not in enhanced_key_points:
enhanced_key_points.append(insight)
return enhanced_key_points
def _enhance_keywords(self, section: BlogOutlineSection, insights: Dict[str, Any]) -> List[str]:
"""Enhance keywords with related concepts from grounding."""
enhanced_keywords = list(section.keywords)
# Add related concepts from grounding analysis
related_concepts = insights.get('content_relationships', {}).get('related_concepts', [])
for concept in related_concepts[:3]: # Add up to 3 new keywords
if concept.lower() not in [kw.lower() for kw in enhanced_keywords]:
enhanced_keywords.append(concept)
return enhanced_keywords
def _get_high_confidence_insights_from_supports(self, supports: List[GroundingSupport]) -> List[str]:
"""Get high-confidence insights from grounding supports."""
insights = []
for support in supports:
if support.confidence_scores and max(support.confidence_scores) >= self.high_confidence_threshold:
insight = self._extract_insight_from_segment(support.segment_text)
if insight:
insights.append(insight)
return insights
def _extract_key_insight(self, chunk: GroundingChunk) -> str:
"""Extract key insight from grounding chunk."""
# Simple extraction - could be enhanced
return f"High-confidence source with {chunk.confidence_score:.2f} confidence score"

View File

@@ -0,0 +1,94 @@
"""
Metadata Collector - Handles collection and formatting of outline metadata.
Collects source mapping stats, grounding insights, optimization results, and research coverage.
"""
from typing import Dict, Any, List
from loguru import logger
class MetadataCollector:
    """Collects and formats outline metadata for UI display.

    Covers source mapping stats, grounding insights, optimization
    results, and research coverage. Model classes are imported lazily
    inside each method to avoid import cycles at module load.
    """

    def __init__(self):
        """Initialize the metadata collector (stateless)."""
        pass

    def collect_source_mapping_stats(self, mapped_sections, research):
        """Summarize how research sources were mapped onto outline sections."""
        from models.blog_models import SourceMappingStats
        total_sources = len(research.sources)
        total_mapped = sum(len(section.references) for section in mapped_sections)
        coverage = (total_mapped / total_sources * 100) if total_sources > 0 else 0.0
        # Reference credibility scores stand in for mapping relevance
        # (simplified proxy).
        scores = [
            ref.credibility_score
            for section in mapped_sections
            for ref in section.references
            if getattr(ref, 'credibility_score', None)
        ]
        average_relevance = sum(scores) / len(scores) if scores else 0.0
        return SourceMappingStats(
            total_sources_mapped=total_mapped,
            coverage_percentage=round(coverage, 1),
            average_relevance_score=round(average_relevance, 3),
            high_confidence_mappings=sum(1 for s in scores if s >= 0.8)
        )

    def collect_grounding_insights(self, grounding_insights):
        """Wrap the raw insights dict in the GroundingInsights model."""
        from models.blog_models import GroundingInsights
        # Field names mirror the insight categories produced by the
        # grounding engine; missing categories become None.
        categories = (
            'confidence_analysis', 'authority_analysis', 'temporal_analysis',
            'content_relationships', 'citation_insights',
            'search_intent_insights', 'quality_indicators',
        )
        return GroundingInsights(**{name: grounding_insights.get(name) for name in categories})

    def collect_optimization_results(self, optimized_sections, focus):
        """Score outline completeness (0-10) and list the applied improvements."""
        from models.blog_models import OptimizationResults
        total = len(optimized_sections)
        # A section is "complete" when heading, subheadings, and key points
        # are all non-empty.
        complete = sum(
            1 for section in optimized_sections
            if section.heading and section.subheadings and section.key_points
        )
        quality_score = (complete / total * 10) if total > 0 else 0.0
        improvements_made = [
            "Enhanced section headings for better SEO",
            "Optimized keyword distribution across sections",
            "Improved content flow and logical progression",
            "Balanced word count distribution",
            "Enhanced subheadings for better readability"
        ]
        return OptimizationResults(
            overall_quality_score=round(quality_score, 1),
            improvements_made=improvements_made,
            optimization_focus=focus
        )

    def collect_research_coverage(self, research):
        """Report how much of the research was utilized in the outline."""
        from models.blog_models import ResearchCoverage
        gaps = research.keyword_analysis.get('content_gaps', [])
        advantages = research.competitor_analysis.get('competitive_advantages', [])
        return ResearchCoverage(
            sources_utilized=len(research.sources),
            content_gaps_identified=len(gaps),
            competitive_advantages=advantages[:5]  # limit to top 5
        )

View File

@@ -0,0 +1,323 @@
"""
Outline Generator - AI-powered outline generation from research data.
Generates comprehensive, SEO-optimized outlines using research intelligence.
"""
from typing import Dict, Any, List, Tuple
import asyncio
from loguru import logger
from models.blog_models import (
BlogOutlineRequest,
BlogOutlineResponse,
BlogOutlineSection,
)
from .source_mapper import SourceToSectionMapper
from .section_enhancer import SectionEnhancer
from .outline_optimizer import OutlineOptimizer
from .grounding_engine import GroundingContextEngine
from .title_generator import TitleGenerator
from .metadata_collector import MetadataCollector
from .prompt_builder import PromptBuilder
from .response_processor import ResponseProcessor
from .parallel_processor import ParallelProcessor
class OutlineGenerator:
    """Generates AI-powered outlines from research data."""

    def __init__(self):
        """Wire up every enhancement module the generation pipeline uses."""
        # Core enhancement pipeline.
        self.source_mapper = SourceToSectionMapper()
        self.section_enhancer = SectionEnhancer()
        self.outline_optimizer = OutlineOptimizer()
        self.grounding_engine = GroundingContextEngine()
        # Extracted collaborators (titles, metadata, prompting, parsing).
        self.title_generator = TitleGenerator()
        self.metadata_collector = MetadataCollector()
        self.prompt_builder = PromptBuilder()
        self.response_processor = ResponseProcessor()
        # Runs source mapping and grounding analysis concurrently.
        self.parallel_processor = ParallelProcessor(self.source_mapper, self.grounding_engine)
async def generate(self, request: BlogOutlineRequest, user_id: str) -> BlogOutlineResponse:
    """
    Generate AI-powered outline using research results.

    Args:
        request: Outline generation request with research data.
        user_id: User ID (required for subscription checks and usage tracking).

    Returns:
        BlogOutlineResponse with title options, balanced sections, and
        UI metadata (mapping stats, grounding insights, optimization
        results, research coverage).

    Raises:
        ValueError: If user_id is not provided.
    """
    if not user_id:
        raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")
    # Pull the research signals the prompt builder needs.
    research = request.research
    primary_keywords = research.keyword_analysis.get('primary', [])
    secondary_keywords = research.keyword_analysis.get('secondary', [])
    content_angles = research.suggested_angles
    sources = research.sources
    search_intent = research.keyword_analysis.get('search_intent', 'informational')
    # Optional per-request custom instructions.
    custom_instructions = getattr(request, 'custom_instructions', None)
    # Build the generation prompt from the rich research data.
    outline_prompt = self.prompt_builder.build_outline_prompt(
        primary_keywords, secondary_keywords, content_angles, sources,
        search_intent, request, custom_instructions
    )
    logger.info("Generating AI-powered outline using research results")
    # Schema property ordering is critical for the Gemini API.
    outline_schema = self.prompt_builder.get_outline_schema()
    # Structured JSON generation with retry (user_id gates subscription checks).
    outline_data = await self.response_processor.generate_with_retry(outline_prompt, outline_schema, user_id)
    outline_sections = self.response_processor.convert_to_sections(outline_data, sources)
    # Source mapping + grounding-insight extraction run concurrently for speed.
    mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing_async(
        outline_sections, research, user_id
    )
    logger.info("Enhancing sections with grounding insights...")
    grounding_enhanced_sections = self.grounding_engine.enhance_sections_with_grounding(
        mapped_sections, research.grounding_metadata, grounding_insights
    )
    logger.info("Optimizing outline for better flow and engagement...")
    optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
    # Redistribute word counts toward the requested total (default 1500).
    target_words = request.word_count or 1500
    balanced_sections = self.outline_optimizer.rebalance_word_counts(optimized_sections, target_words)
    # Merge AI-generated title suggestions with research content angles.
    ai_title_options = outline_data.get('title_options', [])
    content_angle_titles = self.title_generator.extract_content_angle_titles(research)
    title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
    logger.info(f"Generated optimized outline with {len(balanced_sections)} sections and {len(title_options)} title options")
    # Metadata payloads for the enhanced UI.
    source_mapping_stats = self.metadata_collector.collect_source_mapping_stats(mapped_sections, research)
    grounding_insights_data = self.metadata_collector.collect_grounding_insights(grounding_insights)
    optimization_results = self.metadata_collector.collect_optimization_results(optimized_sections, "comprehensive optimization")
    research_coverage = self.metadata_collector.collect_research_coverage(research)
    return BlogOutlineResponse(
        success=True,
        title_options=title_options,
        outline=balanced_sections,
        source_mapping_stats=source_mapping_stats,
        grounding_insights=grounding_insights_data,
        optimization_results=optimization_results,
        research_coverage=research_coverage
    )
async def generate_with_progress(self, request: BlogOutlineRequest, task_id: str, user_id: str) -> BlogOutlineResponse:
    """
    Outline generation method with progress updates for real-time feedback.

    Pipeline (order matters — each stage consumes the previous stage's output):
    prompt build -> structured AI call with retry -> section conversion ->
    parallel source mapping + grounding-insight extraction -> grounding
    enhancement -> AI optimization -> word-count rebalancing -> title merge ->
    UI metadata collection. Each stage reports progress via the task manager.

    Args:
        request: Outline generation request with research data
        task_id: Task ID for progress updates
        user_id: User ID (required for subscription checks and usage tracking)

    Returns:
        BlogOutlineResponse with title options, balanced outline sections,
        and metadata collected for the enhanced UI.

    Raises:
        ValueError: If user_id is not provided
    """
    if not user_id:
        raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")
    # Imported lazily; presumably avoids a circular import with the API layer — TODO confirm.
    from api.blog_writer.task_manager import task_manager
    # Extract research insights
    research = request.research
    primary_keywords = research.keyword_analysis.get('primary', [])
    secondary_keywords = research.keyword_analysis.get('secondary', [])
    content_angles = research.suggested_angles
    sources = research.sources
    search_intent = research.keyword_analysis.get('search_intent', 'informational')
    # Check for custom instructions (optional attribute on the request, hence getattr)
    custom_instructions = getattr(request, 'custom_instructions', None)
    await task_manager.update_progress(task_id, "📊 Analyzing research data and building content strategy...")
    # Build comprehensive outline generation prompt with rich research data
    outline_prompt = self.prompt_builder.build_outline_prompt(
        primary_keywords, secondary_keywords, content_angles, sources,
        search_intent, request, custom_instructions
    )
    await task_manager.update_progress(task_id, "🤖 Generating AI-powered outline with research insights...")
    # Define schema with proper property ordering (critical for Gemini API)
    outline_schema = self.prompt_builder.get_outline_schema()
    await task_manager.update_progress(task_id, "🔄 Making AI request to generate structured outline...")
    # Generate outline using structured JSON response with retry logic (user_id required for subscription checks)
    outline_data = await self.response_processor.generate_with_retry(outline_prompt, outline_schema, user_id, task_id)
    await task_manager.update_progress(task_id, "📝 Processing outline structure and validating sections...")
    # Convert raw AI dict into BlogOutlineSection objects
    outline_sections = self.response_processor.convert_to_sections(outline_data, sources)
    # Run source mapping and grounding-insight extraction concurrently for speed
    # (user_id required for subscription checks)
    mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing(
        outline_sections, research, user_id, task_id
    )
    # Enhance sections with grounding insights (depends on both previous tasks)
    await task_manager.update_progress(task_id, "✨ Enhancing sections with grounding insights...")
    grounding_enhanced_sections = self.grounding_engine.enhance_sections_with_grounding(
        mapped_sections, research.grounding_metadata, grounding_insights
    )
    # Optimize outline for better flow, SEO, and engagement (user_id required for subscription checks)
    await task_manager.update_progress(task_id, "🎯 Optimizing outline for better flow and engagement...")
    optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
    # Rebalance word counts for optimal distribution across sections
    await task_manager.update_progress(task_id, "⚖️ Rebalancing word count distribution...")
    target_words = request.word_count or 1500
    balanced_sections = self.outline_optimizer.rebalance_word_counts(optimized_sections, target_words)
    # Extract title options - combine AI-generated with content angles
    ai_title_options = outline_data.get('title_options', [])
    content_angle_titles = self.title_generator.extract_content_angle_titles(research)
    # Combine AI-generated titles with content angles
    title_options = self.title_generator.combine_title_options(ai_title_options, content_angle_titles, primary_keywords)
    await task_manager.update_progress(task_id, "✅ Outline generation and optimization completed successfully!")
    # Collect metadata for enhanced UI. NOTE: source-mapping stats use the
    # pre-optimization mapped_sections, not the final balanced sections.
    source_mapping_stats = self.metadata_collector.collect_source_mapping_stats(mapped_sections, research)
    grounding_insights_data = self.metadata_collector.collect_grounding_insights(grounding_insights)
    optimization_results = self.metadata_collector.collect_optimization_results(optimized_sections, "comprehensive optimization")
    research_coverage = self.metadata_collector.collect_research_coverage(research)
    return BlogOutlineResponse(
        success=True,
        title_options=title_options,
        outline=balanced_sections,
        source_mapping_stats=source_mapping_stats,
        grounding_insights=grounding_insights_data,
        optimization_results=optimization_results,
        research_coverage=research_coverage
    )
async def enhance_section(self, section: BlogOutlineSection, focus: str = "general improvement") -> BlogOutlineSection:
    """Run AI enhancement on one outline section.

    Delegates to the section enhancer with the requested focus.

    Args:
        section: The section to enhance.
        focus: Enhancement focus area (e.g., "SEO optimization",
            "engagement", "comprehensiveness").

    Returns:
        The enhanced section with improved content.
    """
    logger.info(f"Enhancing section '{section.heading}' with focus: {focus}")
    result = await self.section_enhancer.enhance(section, focus)
    logger.info(f"✅ Section enhancement completed for '{section.heading}'")
    return result
async def optimize_outline(self, outline: List[BlogOutlineSection], focus: str = "comprehensive optimization", user_id: str = None) -> List[BlogOutlineSection]:
    """
    Optimize an entire outline for better flow, SEO, and engagement.

    Bug fix: OutlineOptimizer.optimize requires a user_id (subscription
    checks / usage tracking); the previous call omitted it and always
    raised TypeError. user_id is now accepted (trailing default keeps the
    signature backward-compatible) and forwarded; the optimizer raises a
    clear ValueError when it is missing.

    Args:
        outline: List of sections to optimize
        focus: Optimization focus area
        user_id: User ID (required for subscription checks and usage tracking)

    Returns:
        Optimized outline with improved flow and engagement
    """
    logger.info(f"Optimizing outline with {len(outline)} sections, focus: {focus}")
    optimized_outline = await self.outline_optimizer.optimize(outline, focus, user_id)
    logger.info(f"✅ Outline optimization completed for {len(optimized_outline)} sections")
    return optimized_outline
def rebalance_outline_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
    """Rebalance the word-count budget across the outline's sections.

    Delegates the distribution logic to the outline optimizer.

    Args:
        outline: Sections whose target word counts should be adjusted.
        target_words: Total target word count for the whole post.

    Returns:
        The outline with rebalanced per-section word counts.
    """
    logger.info(f"Rebalancing word counts for {len(outline)} sections, target: {target_words} words")
    rebalanced = self.outline_optimizer.rebalance_word_counts(outline, target_words)
    logger.info(f"✅ Word count rebalancing completed")
    return rebalanced
def get_grounding_insights(self, research_data) -> Dict[str, Any]:
    """Return grounding-metadata insights for the given research data.

    Args:
        research_data: Research data carrying grounding metadata.

    Returns:
        Dictionary of grounding insight categories produced by the
        grounding engine.
    """
    logger.info("Extracting grounding insights from research data...")
    categories = self.grounding_engine.extract_contextual_insights(research_data.grounding_metadata)
    logger.info(f"✅ Extracted {len(categories)} grounding insight categories")
    return categories
def get_authority_sources(self, research_data) -> List[Tuple]:
    """Return high-authority sources found in the grounding metadata.

    Args:
        research_data: Research data carrying grounding metadata.

    Returns:
        List of (chunk, authority_score) tuples sorted by authority.
    """
    logger.info("Identifying high-authority sources from grounding metadata...")
    ranked = self.grounding_engine.get_authority_sources(research_data.grounding_metadata)
    logger.info(f"✅ Identified {len(ranked)} high-authority sources")
    return ranked
def get_high_confidence_insights(self, research_data) -> List[str]:
    """Return the high-confidence insights from the grounding metadata.

    Args:
        research_data: Research data carrying grounding metadata.

    Returns:
        List of high-confidence insight strings.
    """
    logger.info("Extracting high-confidence insights from grounding metadata...")
    confident = self.grounding_engine.get_high_confidence_insights(research_data.grounding_metadata)
    logger.info(f"✅ Extracted {len(confident)} high-confidence insights")
    return confident

View File

@@ -0,0 +1,137 @@
"""
Outline Optimizer - AI-powered outline optimization and rebalancing.
Optimizes outlines for better flow, SEO, and engagement.
"""
from typing import List
from loguru import logger
from models.blog_models import BlogOutlineSection
class OutlineOptimizer:
    """Optimizes outlines for better flow, SEO, and engagement."""

    async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str = None) -> List[BlogOutlineSection]:
        """Optimize entire outline for better flow, SEO, and engagement.

        On any AI failure (or an unexpected response shape) the original
        outline is returned unchanged — optimization is best-effort.

        Args:
            outline: List of outline sections to optimize
            focus: Optimization focus (e.g., "general optimization")
            user_id: User ID (required for subscription checks and usage tracking).
                Defaults to None so callers that omit it get a clear ValueError
                instead of a TypeError.

        Returns:
            List of optimized outline sections (original outline on failure)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for outline optimization (subscription checks and usage tracking)")
        # Compact numbered list of headings for the prompt.
        outline_text = "\n".join([f"{i+1}. {s.heading}" for i, s in enumerate(outline)])
        optimization_prompt = f"""Optimize this blog outline for better flow, engagement, and SEO:
Current Outline:
{outline_text}
Optimization Focus: {focus}
Goals: Improve narrative flow, enhance SEO, increase engagement, ensure comprehensive coverage.
Return JSON format:
{{
"outline": [
{{
"heading": "Optimized heading",
"subheadings": ["subheading 1", "subheading 2"],
"key_points": ["point 1", "point 2"],
"target_words": 300,
"keywords": ["keyword1", "keyword2"]
}}
]
}}"""
        try:
            # Imported lazily to keep this module importable without the LLM stack.
            from services.llm_providers.main_text_generation import llm_text_gen
            optimization_schema = {
                "type": "object",
                "properties": {
                    "outline": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "heading": {"type": "string"},
                                "subheadings": {"type": "array", "items": {"type": "string"}},
                                "key_points": {"type": "array", "items": {"type": "string"}},
                                "target_words": {"type": "integer"},
                                "keywords": {"type": "array", "items": {"type": "string"}}
                            },
                            "required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
                        }
                    }
                },
                "required": ["outline"],
                "propertyOrdering": ["outline"]
            }
            optimized_data = llm_text_gen(
                prompt=optimization_prompt,
                json_struct=optimization_schema,
                system_prompt=None,
                user_id=user_id
            )
            # Handle the new schema format with "outline" wrapper
            if isinstance(optimized_data, dict) and 'outline' in optimized_data:
                optimized_sections = []
                for i, section_data in enumerate(optimized_data['outline']):
                    # References are not part of the AI response; carry them over
                    # positionally from the original outline where possible.
                    section = BlogOutlineSection(
                        id=f"s{i+1}",
                        heading=section_data.get('heading', f'Section {i+1}'),
                        subheadings=section_data.get('subheadings', []),
                        key_points=section_data.get('key_points', []),
                        references=outline[i].references if i < len(outline) else [],
                        target_words=section_data.get('target_words', 300),
                        keywords=section_data.get('keywords', [])
                    )
                    optimized_sections.append(section)
                logger.info(f"✅ Outline optimization completed: {len(optimized_sections)} sections optimized")
                return optimized_sections
            else:
                logger.warning(f"Invalid optimization response format: {type(optimized_data)}")
        except Exception as e:
            logger.warning(f"AI outline optimization failed: {e}")
            logger.info("Returning original outline without optimization")
        # Fallback: invalid response shape or AI failure — keep the original.
        return outline

    def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
        """Rebalance word count distribution across sections (in place).

        Allocates ~12% of the target to the intro (first section) and ~12%
        to the conclusion (last section), then spreads the remaining budget
        evenly over the body sections.

        Bug fix: the previous version divided the body budget by the TOTAL
        section count (including intro/conclusion), so the per-section sums
        never added up to target_words; 1- and 2-section outlines also only
        received 12%/24% of the target. Totals now sum exactly to target_words.

        Args:
            outline: Sections whose target_words will be set.
            target_words: Total target word count for the whole post.

        Returns:
            The same outline list with updated target_words.
        """
        total_sections = len(outline)
        if total_sections == 0:
            return outline
        if total_sections == 1:
            # Single section gets the whole budget.
            outline[0].target_words = target_words
            return outline
        if total_sections == 2:
            # No body sections: split the budget evenly.
            first_half = target_words // 2
            outline[0].target_words = first_half
            outline[1].target_words = target_words - first_half
            return outline
        intro_words = int(target_words * 0.12)  # 12% for intro
        conclusion_words = int(target_words * 0.12)  # 12% for conclusion
        body_words = target_words - intro_words - conclusion_words
        body_count = total_sections - 2  # sections between intro and conclusion
        words_per_section, remainder = divmod(body_words, body_count)
        for i, section in enumerate(outline):
            if i == 0:  # First section (intro)
                section.target_words = intro_words
            elif i == total_sections - 1:  # Last section (conclusion)
                section.target_words = conclusion_words
            else:  # Body sections; the first `remainder` ones absorb the leftover
                section.target_words = words_per_section + (1 if (i - 1) < remainder else 0)
        return outline

View File

@@ -0,0 +1,268 @@
"""
Outline Service - Core outline generation and management functionality.
Handles AI-powered outline generation, refinement, and optimization.
"""
from typing import Dict, Any, List
import asyncio
from loguru import logger
from models.blog_models import (
BlogOutlineRequest,
BlogOutlineResponse,
BlogOutlineRefineRequest,
BlogOutlineSection,
)
from .outline_generator import OutlineGenerator
from .outline_optimizer import OutlineOptimizer
from .section_enhancer import SectionEnhancer
from services.cache.persistent_outline_cache import persistent_outline_cache
class OutlineService:
    """Service for generating and managing blog outlines using AI.

    Thin orchestration layer: delegates generation to OutlineGenerator,
    optimization to OutlineOptimizer, and single-section enhancement to
    SectionEnhancer, adding a persistent cache keyed on research inputs.
    """

    def __init__(self):
        # Collaborators that perform the actual AI work.
        self.outline_generator = OutlineGenerator()
        self.outline_optimizer = OutlineOptimizer()
        self.section_enhancer = SectionEnhancer()

    async def generate_outline(self, request: BlogOutlineRequest, user_id: str) -> BlogOutlineResponse:
        """
        Stage 2: Content Planning with AI-generated outline using research results.
        Uses Gemini with research data to create comprehensive, SEO-optimized outline.

        Args:
            request: Outline generation request with research data
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            BlogOutlineResponse (served from the persistent cache when available)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")
        # Extract cache parameters - use original user keywords for consistent caching
        keywords = request.research.original_keywords or request.research.keyword_analysis.get('primary', [])
        industry = getattr(request.persona, 'industry', 'general') if request.persona else 'general'
        target_audience = getattr(request.persona, 'target_audience', 'general') if request.persona else 'general'
        word_count = request.word_count or 1500
        custom_instructions = request.custom_instructions or ""
        persona_data = request.persona.dict() if request.persona else None
        # Check cache first
        cached_result = persistent_outline_cache.get_cached_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data
        )
        if cached_result:
            logger.info(f"Using cached outline for keywords: {keywords}")
            return BlogOutlineResponse(**cached_result)
        # Generate new outline if not cached (user_id required)
        logger.info(f"Generating new outline for keywords: {keywords}")
        result = await self.outline_generator.generate(request, user_id)
        # Cache the result for subsequent identical requests
        persistent_outline_cache.cache_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data,
            result=result.dict()
        )
        return result

    async def generate_outline_with_progress(self, request: BlogOutlineRequest, task_id: str, user_id: str) -> BlogOutlineResponse:
        """
        Outline generation method with progress updates for real-time feedback.

        Same cache flow as generate_outline; cache hits are reported to the
        task manager so the UI sees immediate completion.

        Args:
            request: Outline generation request with research data
            task_id: Task ID for progress updates
            user_id: User ID (forwarded to the generator, which validates it)
        """
        # Extract cache parameters - use original user keywords for consistent caching
        keywords = request.research.original_keywords or request.research.keyword_analysis.get('primary', [])
        industry = getattr(request.persona, 'industry', 'general') if request.persona else 'general'
        target_audience = getattr(request.persona, 'target_audience', 'general') if request.persona else 'general'
        word_count = request.word_count or 1500
        custom_instructions = request.custom_instructions or ""
        persona_data = request.persona.dict() if request.persona else None
        # Check cache first
        cached_result = persistent_outline_cache.get_cached_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data
        )
        if cached_result:
            logger.info(f"Using cached outline for keywords: {keywords} (with progress updates)")
            # Update progress to show cache hit
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "✅ Using cached outline (saved generation time!)")
            return BlogOutlineResponse(**cached_result)
        # Generate new outline if not cached
        logger.info(f"Generating new outline for keywords: {keywords} (with progress updates)")
        result = await self.outline_generator.generate_with_progress(request, task_id, user_id)
        # Cache the result
        persistent_outline_cache.cache_outline(
            keywords=keywords,
            industry=industry,
            target_audience=target_audience,
            word_count=word_count,
            custom_instructions=custom_instructions,
            persona_data=persona_data,
            result=result.dict()
        )
        return result

    async def refine_outline(self, request: BlogOutlineRefineRequest) -> BlogOutlineResponse:
        """
        Refine outline with HITL (Human-in-the-Loop) operations.
        Supports add, remove, move, merge, rename, and update operations.

        On any failure, returns success=False with the original outline so
        the caller never loses the user's current state.
        """
        outline = request.outline.copy()
        operation = request.operation.lower()
        section_id = request.section_id
        payload = request.payload or {}
        try:
            if operation == 'add':
                # Add new section (id is provisional; all ids are reassigned below)
                new_section = BlogOutlineSection(
                    id=f"s{len(outline) + 1}",
                    heading=payload.get('heading', 'New Section'),
                    subheadings=payload.get('subheadings', []),
                    key_points=payload.get('key_points', []),
                    references=[],
                    target_words=payload.get('target_words', 300)
                )
                outline.append(new_section)
                logger.info(f"Added new section: {new_section.heading}")
            elif operation == 'remove' and section_id:
                # Remove section
                outline = [s for s in outline if s.id != section_id]
                logger.info(f"Removed section: {section_id}")
            elif operation == 'rename' and section_id:
                # Rename section
                for section in outline:
                    if section.id == section_id:
                        section.heading = payload.get('heading', section.heading)
                        break
                logger.info(f"Renamed section {section_id} to: {payload.get('heading')}")
            elif operation == 'move' and section_id:
                # Move section one position up or down (no-op at the boundaries)
                direction = payload.get('direction', 'down')  # 'up' or 'down'
                current_index = next((i for i, s in enumerate(outline) if s.id == section_id), -1)
                if current_index != -1:
                    if direction == 'up' and current_index > 0:
                        outline[current_index], outline[current_index - 1] = outline[current_index - 1], outline[current_index]
                    elif direction == 'down' and current_index < len(outline) - 1:
                        outline[current_index], outline[current_index + 1] = outline[current_index + 1], outline[current_index]
                    logger.info(f"Moved section {section_id} {direction}")
            elif operation == 'merge' and section_id:
                # Merge with the following section (no-op for the last section)
                current_index = next((i for i, s in enumerate(outline) if s.id == section_id), -1)
                if current_index != -1 and current_index < len(outline) - 1:
                    current_section = outline[current_index]
                    next_section = outline[current_index + 1]
                    # Concatenate headings and combine content/word budgets
                    current_section.heading = f"{current_section.heading} & {next_section.heading}"
                    current_section.subheadings.extend(next_section.subheadings)
                    current_section.key_points.extend(next_section.key_points)
                    current_section.references.extend(next_section.references)
                    current_section.target_words = (current_section.target_words or 0) + (next_section.target_words or 0)
                    # Remove the absorbed section
                    outline.pop(current_index + 1)
                    logger.info(f"Merged section {section_id} with next section")
            elif operation == 'update' and section_id:
                # Update only the fields present in the payload
                for section in outline:
                    if section.id == section_id:
                        if 'heading' in payload:
                            section.heading = payload['heading']
                        if 'subheadings' in payload:
                            section.subheadings = payload['subheadings']
                        if 'key_points' in payload:
                            section.key_points = payload['key_points']
                        if 'target_words' in payload:
                            section.target_words = payload['target_words']
                        break
                logger.info(f"Updated section {section_id}")
            # Reassign IDs to maintain order
            for i, section in enumerate(outline):
                section.id = f"s{i+1}"
            return BlogOutlineResponse(
                success=True,
                title_options=["Refined Outline"],
                outline=outline
            )
        except Exception as e:
            logger.error(f"Outline refinement failed: {e}")
            return BlogOutlineResponse(
                success=False,
                title_options=["Error"],
                outline=request.outline
            )

    async def enhance_section_with_ai(self, section: BlogOutlineSection, focus: str = "general improvement") -> BlogOutlineSection:
        """Enhance a section using AI with research context."""
        return await self.section_enhancer.enhance(section, focus)

    async def optimize_outline_with_ai(self, outline: List[BlogOutlineSection], focus: str = "general optimization", user_id: str = None) -> List[BlogOutlineSection]:
        """Optimize entire outline for better flow, SEO, and engagement.

        Bug fix: OutlineOptimizer.optimize requires a user_id; the previous
        call omitted it and always raised TypeError. user_id is accepted as
        a trailing default (backward-compatible) and forwarded; the
        optimizer raises a clear ValueError when it is missing.
        """
        return await self.outline_optimizer.optimize(outline, focus, user_id)

    def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
        """Rebalance word count distribution across sections."""
        return self.outline_optimizer.rebalance_word_counts(outline, target_words)

    # Cache Management Methods
    def get_outline_cache_stats(self) -> Dict[str, Any]:
        """Get outline cache statistics."""
        return persistent_outline_cache.get_cache_stats()

    def clear_outline_cache(self):
        """Clear all cached outline entries."""
        persistent_outline_cache.clear_cache()
        logger.info("Outline cache cleared")

    def invalidate_outline_cache_for_keywords(self, keywords: List[str]):
        """
        Invalidate outline cache entries for specific keywords.
        Useful when research data is updated.

        Args:
            keywords: Keywords to invalidate cache for
        """
        persistent_outline_cache.invalidate_cache_for_keywords(keywords)
        logger.info(f"Invalidated outline cache for keywords: {keywords}")

    def get_recent_outline_cache_entries(self, limit: int = 20) -> List[Dict[str, Any]]:
        """Get recent outline cache entries for debugging."""
        return persistent_outline_cache.get_cache_entries(limit)

View File

@@ -0,0 +1,121 @@
"""
Parallel Processor - Handles parallel processing of outline generation tasks.
Manages concurrent execution of source mapping and grounding insights extraction.
"""
import asyncio
from typing import Tuple, Any
from loguru import logger
class ParallelProcessor:
    """Handles parallel processing of outline generation tasks for speed optimization."""

    def __init__(self, source_mapper, grounding_engine):
        """Initialize the parallel processor with required dependencies."""
        self.source_mapper = source_mapper
        self.grounding_engine = grounding_engine

    async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None) -> Tuple[Any, Any]:
        """
        Run source mapping and grounding insights extraction in parallel.

        Args:
            outline_sections: List of outline sections to process
            research: Research data object
            user_id: User ID (required for subscription checks and usage tracking)
            task_id: Optional task ID for progress updates (None = no updates)

        Returns:
            Tuple of (mapped_sections, grounding_insights)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for parallel processing (subscription checks and usage tracking)")
        if task_id:
            # Imported lazily; presumably avoids a circular import with the API layer.
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "⚡ Running parallel processing for maximum speed...")
        logger.info("Running parallel processing for maximum speed...")
        # Schedule both independent stages concurrently to save wall-clock time.
        source_mapping_task = asyncio.create_task(
            self._run_source_mapping(outline_sections, research, task_id, user_id)
        )
        grounding_insights_task = asyncio.create_task(
            self._run_grounding_insights_extraction(research, task_id)
        )
        # Wait for both parallel tasks to complete
        mapped_sections, grounding_insights = await asyncio.gather(
            source_mapping_task,
            grounding_insights_task
        )
        return mapped_sections, grounding_insights

    async def run_parallel_processing_async(self, outline_sections, research, user_id: str) -> Tuple[Any, Any]:
        """
        Run parallel processing without progress updates (for non-progress methods).

        Consolidation: previously duplicated the whole orchestration of
        run_parallel_processing; now delegates to it with task_id=None,
        which suppresses all progress reporting.

        Args:
            outline_sections: List of outline sections to process
            research: Research data object
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            Tuple of (mapped_sections, grounding_insights)

        Raises:
            ValueError: If user_id is not provided
        """
        return await self.run_parallel_processing(outline_sections, research, user_id, task_id=None)

    async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str):
        """Run source mapping in parallel (progress reported only when task_id is set)."""
        if task_id:
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "🔗 Applying intelligent source-to-section mapping...")
        return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)

    async def _run_grounding_insights_extraction(self, research, task_id):
        """Run grounding insights extraction in parallel (progress reported only when task_id is set)."""
        if task_id:
            from api.blog_writer.task_manager import task_manager
            await task_manager.update_progress(task_id, "🧠 Extracting grounding metadata insights...")
        return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)

    async def _run_source_mapping_async(self, outline_sections, research, user_id: str):
        """Run source mapping (no progress updates). Retained for compatibility; prefer _run_source_mapping."""
        logger.info("Applying intelligent source-to-section mapping...")
        return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)

    async def _run_grounding_insights_extraction_async(self, research):
        """Run grounding insights extraction (no progress updates). Retained for compatibility; prefer _run_grounding_insights_extraction."""
        logger.info("Extracting grounding metadata insights...")
        return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)

View File

@@ -0,0 +1,127 @@
"""
Prompt Builder - Handles building of AI prompts for outline generation.
Constructs comprehensive prompts with research data, keywords, and strategic requirements.
"""
from typing import Dict, Any, List
class PromptBuilder:
    """Handles building of comprehensive AI prompts for outline generation."""

    def build_outline_prompt(self, primary_keywords: List[str], secondary_keywords: List[str],
                             content_angles: List[str], sources: List, search_intent: str,
                             request, custom_instructions: str = None) -> str:
        """Build the comprehensive outline generation prompt using filtered research data.

        Bug fix: the returned f-string's JSON example previously used single
        braces, which is invalid f-string syntax (literal braces must be
        escaped as '{{' / '}}', as the optimizer's prompt already does).

        Args:
            primary_keywords: Primary SEO keywords.
            secondary_keywords: Secondary SEO keywords.
            content_angles: Suggested content angles / storylines.
            sources: Research sources (only the count is embedded).
            search_intent: Detected search intent (e.g., "informational").
            request: Outline request carrying research, persona, word_count, topic.
            custom_instructions: Optional user instructions appended to the prompt.

        Returns:
            The full prompt string for the outline-generation LLM call.
        """
        # Use the filtered research data (already cleaned by ResearchDataFilter)
        research = request.research
        # Fall back to the topic / original keywords when no primary keywords exist.
        primary_kw_text = ', '.join(primary_keywords) if primary_keywords else (request.topic or ', '.join(getattr(request.research, 'original_keywords', []) or ['the target topic']))
        secondary_kw_text = ', '.join(secondary_keywords) if secondary_keywords else "None provided"
        long_tail_text = ', '.join(research.keyword_analysis.get('long_tail', [])) if research and research.keyword_analysis else "None discovered"
        semantic_text = ', '.join(research.keyword_analysis.get('semantic_keywords', [])) if research and research.keyword_analysis else "None discovered"
        trending_text = ', '.join(research.keyword_analysis.get('trending_terms', [])) if research and research.keyword_analysis else "None discovered"
        content_gap_text = ', '.join(research.keyword_analysis.get('content_gaps', [])) if research and research.keyword_analysis else "None identified"
        content_angle_text = ', '.join(content_angles) if content_angles else "No explicit angles provided; infer compelling angles from research insights."
        competitor_text = ', '.join(research.competitor_analysis.get('top_competitors', [])) if research and research.competitor_analysis else "Not available"
        opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
        advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"
        return f"""Create a comprehensive blog outline for: {primary_kw_text}
CONTEXT:
Search Intent: {search_intent}
Target: {request.word_count or 1500} words
Industry: {getattr(request.persona, 'industry', 'General') if request.persona else 'General'}
Audience: {getattr(request.persona, 'target_audience', 'General') if request.persona else 'General'}
KEYWORDS:
Primary: {primary_kw_text}
Secondary: {secondary_kw_text}
Long-tail: {long_tail_text}
Semantic: {semantic_text}
Trending: {trending_text}
Content Gaps: {content_gap_text}
CONTENT ANGLES / STORYLINES: {content_angle_text}
COMPETITIVE INTELLIGENCE:
Top Competitors: {competitor_text}
Market Opportunities: {opportunity_text}
Competitive Advantages: {advantages_text}
RESEARCH SOURCES: {len(sources)} authoritative sources available
{f"CUSTOM INSTRUCTIONS: {custom_instructions}" if custom_instructions else ""}
STRATEGIC REQUIREMENTS:
- Create SEO-optimized headings with natural keyword integration
- Surface the strongest research-backed angles within the outline
- Build logical narrative flow from problem to solution
- Include data-driven insights from research sources
- Address content gaps and market opportunities
- Optimize for search intent and user questions
- Ensure engaging, actionable content throughout
Return JSON format:
{{
"title_options": [
"Title option 1",
"Title option 2",
"Title option 3"
],
"outline": [
{{
"heading": "Section heading with primary keyword",
"subheadings": ["Subheading 1", "Subheading 2", "Subheading 3"],
"key_points": ["Key point 1", "Key point 2", "Key point 3"],
"target_words": 300,
"keywords": ["primary keyword", "secondary keyword"]
}}
]
}}"""

    def get_outline_schema(self) -> Dict[str, Any]:
        """Get the structured JSON schema for outline generation.

        propertyOrdering is included because the Gemini structured-output API
        uses it to fix the order of emitted properties.
        """
        return {
            "type": "object",
            "properties": {
                "title_options": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                },
                "outline": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "heading": {"type": "string"},
                            "subheadings": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "key_points": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "target_words": {"type": "integer"},
                            "keywords": {
                                "type": "array",
                                "items": {"type": "string"}
                            }
                        },
                        "required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
                    }
                }
            },
            "required": ["title_options", "outline"],
            "propertyOrdering": ["title_options", "outline"]
        }

View File

@@ -0,0 +1,120 @@
"""
Response Processor - Handles AI response processing and retry logic.
Processes AI responses, handles retries, and converts data to proper formats.
"""
from typing import Dict, Any, List
import asyncio
from loguru import logger
from models.blog_models import BlogOutlineSection
class ResponseProcessor:
    """Handles AI response processing, retry logic, and data conversion.

    Wraps the structured LLM call in a small, conservative retry loop
    (the call is expensive) and converts the resulting outline dict into
    BlogOutlineSection model objects.
    """

    def __init__(self):
        """Initialize the response processor (stateless; nothing to set up)."""
        pass

    async def generate_with_retry(self, prompt: str, schema: Dict[str, Any], user_id: str, task_id: str = None) -> Dict[str, Any]:
        """Generate outline with retry logic for API failures.

        Retries (up to 2 extra attempts, 5 s apart) on provider-overload
        (503) errors and on malformed/invalid structured responses; any
        other error fails immediately.

        Args:
            prompt: The prompt for outline generation
            schema: JSON schema for structured response
            user_id: User ID (required for subscription checks and usage tracking)
            task_id: Optional task ID for progress updates

        Returns:
            The validated response dict (guaranteed to contain an 'outline' list).

        Raises:
            ValueError: If user_id is not provided, or if generation still
                fails once retries are exhausted.
        """
        if not user_id:
            raise ValueError("user_id is required for outline generation (subscription checks and usage tracking)")
        # Imported lazily — presumably to avoid import cycles at module load; confirm.
        from services.llm_providers.main_text_generation import llm_text_gen
        from api.blog_writer.task_manager import task_manager
        max_retries = 2  # Conservative retry for expensive API calls
        retry_delay = 5  # 5 second delay between retries
        for attempt in range(max_retries + 1):
            try:
                if task_id:
                    await task_manager.update_progress(task_id, f"🤖 Calling AI API for outline generation (attempt {attempt + 1}/{max_retries + 1})...")
                # NOTE(review): llm_text_gen is invoked synchronously inside an
                # async method — confirm it is not blocking the event loop.
                outline_data = llm_text_gen(
                    prompt=prompt,
                    json_struct=schema,
                    system_prompt=None,
                    user_id=user_id
                )
                # Log response for debugging
                logger.info(f"AI response received: {type(outline_data)}")
                # Check for errors in the response (provider reports errors
                # inside the payload rather than raising).
                if isinstance(outline_data, dict) and 'error' in outline_data:
                    error_msg = str(outline_data['error'])
                    # Retryable: provider overload (requires both "503" and "overloaded").
                    if "503" in error_msg and "overloaded" in error_msg and attempt < max_retries:
                        if task_id:
                            await task_manager.update_progress(task_id, f"⚠️ AI service overloaded, retrying in {retry_delay} seconds...")
                        logger.warning(f"AI API overloaded, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
                        await asyncio.sleep(retry_delay)
                        continue
                    # Retryable: provider returned no parseable structured content.
                    elif "No valid structured response content found" in error_msg and attempt < max_retries:
                        if task_id:
                            await task_manager.update_progress(task_id, f"⚠️ Invalid response format, retrying in {retry_delay} seconds...")
                        logger.warning(f"AI response parsing failed, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
                        await asyncio.sleep(retry_delay)
                        continue
                    else:
                        logger.error(f"AI structured response error: {outline_data['error']}")
                        # NOTE(review): this raise is caught by the broad
                        # `except` below and re-wrapped, so the final message
                        # is double-prefixed — confirm whether intended.
                        raise ValueError(f"AI outline generation failed: {outline_data['error']}")
                # Validate required fields: response must be a dict with an 'outline' list.
                if not isinstance(outline_data, dict) or 'outline' not in outline_data or not isinstance(outline_data['outline'], list):
                    if attempt < max_retries:
                        if task_id:
                            await task_manager.update_progress(task_id, f"⚠️ Invalid response structure, retrying in {retry_delay} seconds...")
                        logger.warning(f"Invalid response structure, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1})")
                        await asyncio.sleep(retry_delay)
                        continue
                    else:
                        raise ValueError("Invalid outline structure in AI response")
                # If we get here, the response is valid
                return outline_data
            except Exception as e:
                error_str = str(e)
                # Retryable transport-level failures (note: OR here, vs AND above).
                if ("503" in error_str or "overloaded" in error_str) and attempt < max_retries:
                    if task_id:
                        await task_manager.update_progress(task_id, f"⚠️ AI service error, retrying in {retry_delay} seconds...")
                    logger.warning(f"AI API error, retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries + 1}): {error_str}")
                    await asyncio.sleep(retry_delay)
                    continue
                else:
                    logger.error(f"Outline generation failed after {attempt + 1} attempts: {error_str}")
                    raise ValueError(f"AI outline generation failed: {error_str}")

    def convert_to_sections(self, outline_data: Dict[str, Any], sources: List) -> List[BlogOutlineSection]:
        """Convert outline data to BlogOutlineSection objects.

        Skips malformed entries (non-dicts or entries without a 'heading').
        `sources` is currently unused here — references are populated later
        by the intelligent source-to-section mapping step.
        """
        outline_sections = []
        for i, section_data in enumerate(outline_data.get('outline', [])):
            if not isinstance(section_data, dict) or 'heading' not in section_data:
                continue
            section = BlogOutlineSection(
                id=f"s{i+1}",
                heading=section_data.get('heading', f'Section {i+1}'),
                subheadings=section_data.get('subheadings', []),
                key_points=section_data.get('key_points', []),
                references=[],  # Will be populated by intelligent mapping
                target_words=section_data.get('target_words', 200),
                keywords=section_data.get('keywords', [])
            )
            outline_sections.append(section)
        return outline_sections

View File

@@ -0,0 +1,96 @@
"""
Section Enhancer - AI-powered section enhancement and improvement.
Enhances individual outline sections for better engagement and value.
"""
from loguru import logger
from models.blog_models import BlogOutlineSection
class SectionEnhancer:
    """Enhances individual outline sections using AI.

    Single-shot LLM rewrite of one section (heading, subheadings, key
    points, keywords); falls back to the unmodified section on any failure.
    """

    async def enhance(self, section: BlogOutlineSection, focus: str, user_id: str) -> BlogOutlineSection:
        """Enhance a section using AI with research context.

        Args:
            section: Outline section to enhance
            focus: Enhancement focus (e.g., "general improvement")
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            Enhanced outline section, or the original section unchanged if
            the AI call fails or returns an error payload.

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for section enhancement (subscription checks and usage tracking)")
        enhancement_prompt = f"""
Enhance the following blog section to make it more engaging, comprehensive, and valuable:
Current Section:
Heading: {section.heading}
Subheadings: {', '.join(section.subheadings)}
Key Points: {', '.join(section.key_points)}
Target Words: {section.target_words}
Keywords: {', '.join(section.keywords)}
Enhancement Focus: {focus}
Improve:
1. Make subheadings more specific and actionable
2. Add more comprehensive key points with data/insights
3. Include practical examples and case studies
4. Address common questions and objections
5. Optimize for SEO with better keyword integration
Respond with JSON:
{{
"heading": "Enhanced heading",
"subheadings": ["enhanced subheading 1", "enhanced subheading 2"],
"key_points": ["enhanced point 1", "enhanced point 2"],
"target_words": 400,
"keywords": ["keyword1", "keyword2"]
}}
"""
        try:
            from services.llm_providers.main_text_generation import llm_text_gen
            # Structured-output schema mirroring the JSON example in the prompt.
            enhancement_schema = {
                "type": "object",
                "properties": {
                    "heading": {"type": "string"},
                    "subheadings": {"type": "array", "items": {"type": "string"}},
                    "key_points": {"type": "array", "items": {"type": "string"}},
                    "target_words": {"type": "integer"},
                    "keywords": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
            }
            enhanced_data = llm_text_gen(
                prompt=enhancement_prompt,
                json_struct=enhancement_schema,
                system_prompt=None,
                user_id=user_id
            )
            # Only adopt the AI output when it is a dict without an 'error' key;
            # each field falls back to the original section value if missing.
            if isinstance(enhanced_data, dict) and 'error' not in enhanced_data:
                return BlogOutlineSection(
                    id=section.id,
                    heading=enhanced_data.get('heading', section.heading),
                    subheadings=enhanced_data.get('subheadings', section.subheadings),
                    key_points=enhanced_data.get('key_points', section.key_points),
                    references=section.references,
                    target_words=enhanced_data.get('target_words', section.target_words),
                    keywords=enhanced_data.get('keywords', section.keywords)
                )
        except Exception as e:
            logger.warning(f"AI section enhancement failed: {e}")
        # Fallback: any failure (or an error payload) returns the section untouched.
        return section

View File

@@ -0,0 +1,198 @@
"""
SEO Title Generator - Specialized service for generating SEO-optimized blog titles.
Generates 5 premium SEO-optimized titles using research data and outline context.
"""
from typing import Dict, Any, List
from loguru import logger
from models.blog_models import BlogResearchResponse, BlogOutlineSection
class SEOTitleGenerator:
    """Generates SEO-optimized blog titles using research and outline data.

    Builds a single prompt from research insights plus an outline summary,
    requests exactly 5 titles as a structured JSON array, and degrades to
    keyword-derived fallback titles when the LLM call fails.
    """

    def __init__(self):
        """Initialize the SEO title generator (stateless)."""
        pass

    def build_title_prompt(
        self,
        research: BlogResearchResponse,
        outline: List[BlogOutlineSection],
        primary_keywords: List[str],
        secondary_keywords: List[str],
        content_angles: List[str],
        search_intent: str,
        word_count: int = 1500
    ) -> str:
        """Build a specialized prompt for SEO title generation.

        Flattens keyword analysis, competitor data, content angles, and a
        summary of the first 5 outline sections into one instruction prompt.

        NOTE(review): `word_count` is accepted but never referenced in the
        prompt text below — confirm whether it should be included.
        """
        # Extract key research insights (guard against None fields on the model).
        keyword_analysis = research.keyword_analysis or {}
        competitor_analysis = research.competitor_analysis or {}
        # Each block falls back to explicit placeholder text so the prompt
        # never contains empty fields.
        primary_kw_text = ', '.join(primary_keywords) if primary_keywords else "the target topic"
        secondary_kw_text = ', '.join(secondary_keywords) if secondary_keywords else "None provided"
        long_tail_text = ', '.join(keyword_analysis.get('long_tail', [])) if keyword_analysis else "None discovered"
        semantic_text = ', '.join(keyword_analysis.get('semantic_keywords', [])) if keyword_analysis else "None discovered"
        trending_text = ', '.join(keyword_analysis.get('trending_terms', [])) if keyword_analysis else "None discovered"
        content_gap_text = ', '.join(keyword_analysis.get('content_gaps', [])) if keyword_analysis else "None identified"
        content_angle_text = ', '.join(content_angles) if content_angles else "No explicit angles provided"
        # Extract outline structure summary
        outline_summary = []
        for i, section in enumerate(outline[:5], 1):  # Limit to first 5 sections for context
            outline_summary.append(f"{i}. {section.heading}")
            if section.subheadings:
                outline_summary.append(f" Subtopics: {', '.join(section.subheadings[:3])}")
        outline_text = '\n'.join(outline_summary) if outline_summary else "No outline available"
        return f"""Generate exactly 5 SEO-optimized blog titles for: {primary_kw_text}
RESEARCH CONTEXT:
Primary Keywords: {primary_kw_text}
Secondary Keywords: {secondary_kw_text}
Long-tail Keywords: {long_tail_text}
Semantic Keywords: {semantic_text}
Trending Terms: {trending_text}
Content Gaps: {content_gap_text}
Search Intent: {search_intent}
Content Angles: {content_angle_text}
OUTLINE STRUCTURE:
{outline_text}
COMPETITIVE INTELLIGENCE:
Top Competitors: {', '.join(competitor_analysis.get('top_competitors', [])) if competitor_analysis else 'Not available'}
Market Opportunities: {', '.join(competitor_analysis.get('opportunities', [])) if competitor_analysis else 'Not available'}
SEO REQUIREMENTS:
- Each title must be 50-65 characters (optimal for search engine display)
- Include the primary keyword within the first 55 characters
- Highlight a unique value proposition from the research angles
- Use power words that drive clicks (e.g., "Ultimate", "Complete", "Essential", "Proven")
- Avoid generic phrasing - be specific and benefit-focused
- Target the search intent: {search_intent}
- Ensure titles are compelling and click-worthy
Return ONLY a JSON array of exactly 5 titles:
[
"Title 1 (50-65 chars)",
"Title 2 (50-65 chars)",
"Title 3 (50-65 chars)",
"Title 4 (50-65 chars)",
"Title 5 (50-65 chars)"
]"""

    def get_title_schema(self) -> Dict[str, Any]:
        """Get the JSON schema for title generation.

        A flat array of exactly 5 strings, each 50-65 characters.
        """
        return {
            "type": "array",
            "items": {
                "type": "string",
                "minLength": 50,
                "maxLength": 65
            },
            "minItems": 5,
            "maxItems": 5
        }

    async def generate_seo_titles(
        self,
        research: BlogResearchResponse,
        outline: List[BlogOutlineSection],
        primary_keywords: List[str],
        secondary_keywords: List[str],
        content_angles: List[str],
        search_intent: str,
        word_count: int,
        user_id: str
    ) -> List[str]:
        """Generate SEO-optimized titles using research and outline data.

        Args:
            research: Research data with keywords and insights
            outline: Blog outline sections
            primary_keywords: Primary keywords for the blog
            secondary_keywords: Secondary keywords
            content_angles: Content angles from research
            search_intent: Search intent (informational, commercial, etc.)
            word_count: Target word count
            user_id: User ID for API calls

        Returns:
            List of exactly 5 SEO-optimized titles (fallback titles on failure).

        Raises:
            ValueError: If user_id is not provided.
        """
        from services.llm_providers.main_text_generation import llm_text_gen
        if not user_id:
            raise ValueError("user_id is required for title generation")
        # Build specialized prompt
        prompt = self.build_title_prompt(
            research=research,
            outline=outline,
            primary_keywords=primary_keywords,
            secondary_keywords=secondary_keywords,
            content_angles=content_angles,
            search_intent=search_intent,
            word_count=word_count
        )
        # Get schema
        schema = self.get_title_schema()
        logger.info(f"Generating SEO-optimized titles for user {user_id}")
        try:
            # Generate titles using structured JSON response
            result = llm_text_gen(
                prompt=prompt,
                json_struct=schema,
                system_prompt="You are an expert SEO content strategist specializing in creating compelling, search-optimized blog titles.",
                user_id=user_id
            )
            # Handle response - could be array directly or wrapped in dict
            # (provider behavior varies; probe common wrapper keys).
            if isinstance(result, list):
                titles = result
            elif isinstance(result, dict):
                # Try common keys
                titles = result.get('titles', result.get('title_options', result.get('options', [])))
                if not titles and isinstance(result.get('response'), list):
                    titles = result['response']
            else:
                logger.warning(f"Unexpected title generation result type: {type(result)}")
                titles = []
            # Validate and clean titles: drop non-strings, too-short (<30) and
            # too-long (>70) entries; bounds are looser than the schema to
            # tolerate slight overflow.
            cleaned_titles = []
            for title in titles:
                if isinstance(title, str) and len(title.strip()) >= 30:  # Minimum reasonable length
                    cleaned = title.strip()
                    # Ensure it's within reasonable bounds (allow slight overflow for quality)
                    if len(cleaned) <= 70:  # Allow slight overflow for quality
                        cleaned_titles.append(cleaned)
            # Ensure we have exactly 5 titles
            if len(cleaned_titles) < 5:
                logger.warning(f"Generated only {len(cleaned_titles)} titles, expected 5")
                # Pad with placeholder if needed (shouldn't happen with proper schema)
                # NOTE(review): this pads with identical strings — confirm duplicates are acceptable.
                while len(cleaned_titles) < 5:
                    cleaned_titles.append(f"{primary_keywords[0] if primary_keywords else 'Blog'} - Comprehensive Guide")
            # Return exactly 5 titles
            return cleaned_titles[:5]
        except Exception as e:
            logger.error(f"Failed to generate SEO titles: {e}")
            # Fallback: generate simple titles from keywords
            fallback_titles = []
            primary = primary_keywords[0] if primary_keywords else "Blog Post"
            for i in range(5):
                fallback_titles.append(f"{primary}: Complete Guide {i+1}")
            return fallback_titles

View File

@@ -0,0 +1,690 @@
"""
Source-to-Section Mapper - Intelligent mapping of research sources to outline sections.
This module provides algorithmic mapping of research sources to specific outline sections
based on semantic similarity, keyword relevance, and contextual matching. Uses a hybrid
approach of algorithmic scoring followed by AI validation for optimal results.
"""
from typing import Dict, Any, List, Tuple, Optional
import re
from collections import Counter
from loguru import logger
from models.blog_models import (
BlogOutlineSection,
ResearchSource,
BlogResearchResponse,
)
class SourceToSectionMapper:
"""Maps research sources to outline sections using intelligent algorithms."""
def __init__(self):
    """Set up scoring thresholds, dimension weights, and the stop-word list."""
    # Per-dimension floors (informational; only min_total_score actually gates).
    self.min_semantic_score = 0.3
    self.min_keyword_score = 0.2
    self.min_contextual_score = 0.2
    # Mapping limits: cap of 3 sources per section, and a combined-score floor.
    self.max_sources_per_section = 3
    self.min_total_score = 0.4
    # Relative importance of each scoring dimension (sums to 1.0).
    self.weights = {
        'semantic': 0.4,    # Semantic similarity weight
        'keyword': 0.3,     # Keyword matching weight
        'contextual': 0.3,  # Contextual relevance weight
    }
    # Words ignored by the tokenizer when comparing texts.
    self.stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
        'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that',
        'these', 'those', 'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many',
        'more', 'most', 'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no',
        'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there',
        'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    }
    logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")
def map_sources_to_sections(
    self,
    sections: List[BlogOutlineSection],
    research_data: BlogResearchResponse,
    user_id: str
) -> List[BlogOutlineSection]:
    """Attach the most relevant research sources to each outline section.

    Pipeline: algorithmic multi-dimensional scoring, then a single AI
    validation pass, then application of the final mapping.

    Args:
        sections: Outline sections to enrich.
        research_data: Research payload containing candidate sources.
        user_id: Required for subscription checks and usage tracking.

    Returns:
        Sections with their ``references`` populated (input returned
        unchanged when there is nothing to map).

    Raises:
        ValueError: If user_id is not provided.
    """
    if not user_id:
        raise ValueError("user_id is required for source mapping (subscription checks and usage tracking)")
    if not sections or not research_data.sources:
        logger.warning("No sections or sources to map")
        return sections
    logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")
    # 1) Score every (section, source) pair algorithmically.
    scored = self._algorithmic_source_mapping(sections, research_data)
    # 2) Single-prompt AI validation/improvement of the scored mapping.
    validated = self._ai_validate_mapping(scored, research_data, user_id)
    # 3) Write the winners into each section's references.
    enriched = self._apply_mapping_to_sections(sections, validated)
    logger.info("✅ Source-to-section mapping completed successfully")
    return enriched
def _algorithmic_source_mapping(
    self,
    sections: List[BlogOutlineSection],
    research_data: BlogResearchResponse
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
    """Score every source against every section and keep the best matches.

    Combines semantic, keyword, and contextual scores via ``self.weights``,
    filters by ``self.min_total_score``, and keeps at most
    ``self.max_sources_per_section`` sources per section.

    Returns:
        Mapping of section id -> list of (source, score) tuples, best first.
    """
    results: Dict[str, List[Tuple[ResearchSource, float]]] = {}
    for section in sections:
        scored: List[Tuple[ResearchSource, float]] = []
        for source in research_data.sources:
            # Weighted combination of the three relevance dimensions.
            total = (
                self._calculate_semantic_similarity(section, source) * self.weights['semantic']
                + self._calculate_keyword_relevance(section, source, research_data) * self.weights['keyword']
                + self._calculate_contextual_relevance(section, source, research_data) * self.weights['contextual']
            )
            if total >= self.min_total_score:
                scored.append((source, total))
        # Best-first, truncated to the per-section cap.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        scored = scored[:self.max_sources_per_section]
        results[section.id] = scored
        logger.debug(f"Section '{section.heading}': {len(scored)} sources mapped")
    return results
def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
"""
Calculate semantic similarity between section and source.
Args:
section: Outline section
source: Research source
Returns:
Semantic similarity score (0.0 to 1.0)
"""
# Extract text content for comparison
section_text = self._extract_section_text(section)
source_text = self._extract_source_text(source)
# Calculate word overlap
section_words = self._extract_meaningful_words(section_text)
source_words = self._extract_meaningful_words(source_text)
if not section_words or not source_words:
return 0.0
# Calculate Jaccard similarity
intersection = len(set(section_words) & set(source_words))
union = len(set(section_words) | set(source_words))
jaccard_similarity = intersection / union if union > 0 else 0.0
# Boost score for exact phrase matches
phrase_boost = self._calculate_phrase_similarity(section_text, source_text)
# Combine Jaccard similarity with phrase boost
semantic_score = min(1.0, jaccard_similarity + phrase_boost)
return semantic_score
def _calculate_keyword_relevance(
self,
section: BlogOutlineSection,
source: ResearchSource,
research_data: BlogResearchResponse
) -> float:
"""
Calculate keyword-based relevance between section and source.
Args:
section: Outline section
source: Research source
research_data: Research data with keyword analysis
Returns:
Keyword relevance score (0.0 to 1.0)
"""
# Get section keywords
section_keywords = set(section.keywords)
if not section_keywords:
# Extract keywords from section heading and content
section_text = self._extract_section_text(section)
section_keywords = set(self._extract_meaningful_words(section_text))
# Get source keywords from title and excerpt
source_text = f"{source.title} {source.excerpt or ''}"
source_keywords = set(self._extract_meaningful_words(source_text))
# Get research keywords for context
research_keywords = set()
for category in ['primary', 'secondary', 'long_tail', 'semantic_keywords']:
research_keywords.update(research_data.keyword_analysis.get(category, []))
# Calculate keyword overlap scores
section_overlap = len(section_keywords & source_keywords) / len(section_keywords) if section_keywords else 0.0
research_overlap = len(research_keywords & source_keywords) / len(research_keywords) if research_keywords else 0.0
# Weighted combination
keyword_score = (section_overlap * 0.7) + (research_overlap * 0.3)
return min(1.0, keyword_score)
def _calculate_contextual_relevance(
self,
section: BlogOutlineSection,
source: ResearchSource,
research_data: BlogResearchResponse
) -> float:
"""
Calculate contextual relevance based on section content and source context.
Args:
section: Outline section
source: Research source
research_data: Research data with context
Returns:
Contextual relevance score (0.0 to 1.0)
"""
contextual_score = 0.0
# 1. Content angle matching
section_text = self._extract_section_text(section).lower()
source_text = f"{source.title} {source.excerpt or ''}".lower()
# Check for content angle matches
content_angles = research_data.suggested_angles
for angle in content_angles:
angle_words = self._extract_meaningful_words(angle.lower())
if angle_words:
section_angle_match = sum(1 for word in angle_words if word in section_text) / len(angle_words)
source_angle_match = sum(1 for word in angle_words if word in source_text) / len(angle_words)
contextual_score += (section_angle_match + source_angle_match) * 0.3
# 2. Search intent alignment
search_intent = research_data.keyword_analysis.get('search_intent', 'informational')
intent_keywords = self._get_intent_keywords(search_intent)
intent_score = 0.0
for keyword in intent_keywords:
if keyword in section_text or keyword in source_text:
intent_score += 0.1
contextual_score += min(0.3, intent_score)
# 3. Industry/domain relevance
if hasattr(research_data, 'industry') and research_data.industry:
industry_words = self._extract_meaningful_words(research_data.industry.lower())
industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
contextual_score += industry_score * 0.2
return min(1.0, contextual_score)
def _ai_validate_mapping(
    self,
    mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
    research_data: BlogResearchResponse,
    user_id: str
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
    """Run one AI pass to validate/improve the algorithmic mapping.

    On any failure the algorithmic mapping is returned unchanged, so this
    step can only refine results, never lose them.

    Args:
        mapping_results: Algorithmic mapping results.
        research_data: Research data for context.
        user_id: Required for subscription checks and usage tracking.

    Raises:
        ValueError: If user_id is not provided.
    """
    if not user_id:
        raise ValueError("user_id is required for AI validation (subscription checks and usage tracking)")
    try:
        logger.info("Starting AI validation of source-to-section mapping...")
        prompt = self._build_validation_prompt(mapping_results, research_data)
        reply = self._get_ai_validation_response(prompt, user_id)
        improved = self._parse_validation_response(reply, mapping_results, research_data)
        logger.info("✅ AI validation completed successfully")
        return improved
    except Exception as e:
        # Best-effort step: degrade gracefully to the algorithmic mapping.
        logger.warning(f"AI validation failed: {e}. Using algorithmic results as fallback.")
        return mapping_results
def _apply_mapping_to_sections(
    self,
    sections: List[BlogOutlineSection],
    mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]
) -> List[BlogOutlineSection]:
    """Copy each section, filling ``references`` with its mapped sources.

    Sections absent from ``mapping_results`` get an empty reference list;
    scores are dropped — only the sources themselves are attached.
    """
    updated: List[BlogOutlineSection] = []
    for section in sections:
        chosen = [src for src, _score in mapping_results.get(section.id, [])]
        updated.append(
            BlogOutlineSection(
                id=section.id,
                heading=section.heading,
                subheadings=section.subheadings,
                key_points=section.key_points,
                references=chosen,
                target_words=section.target_words,
                keywords=section.keywords,
            )
        )
        logger.debug(f"Applied {len(chosen)} sources to section '{section.heading}'")
    return updated
# Helper methods
def _extract_section_text(self, section: BlogOutlineSection) -> str:
"""Extract all text content from a section."""
text_parts = [section.heading]
text_parts.extend(section.subheadings)
text_parts.extend(section.key_points)
text_parts.extend(section.keywords)
return " ".join(text_parts)
def _extract_source_text(self, source: ResearchSource) -> str:
"""Extract all text content from a source."""
text_parts = [source.title]
if source.excerpt:
text_parts.append(source.excerpt)
return " ".join(text_parts)
def _extract_meaningful_words(self, text: str) -> List[str]:
"""Extract meaningful words from text, removing stop words and cleaning."""
if not text:
return []
# Clean and tokenize
words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
# Remove stop words and short words
meaningful_words = [
word for word in words
if word not in self.stop_words and len(word) > 2
]
return meaningful_words
def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
"""Calculate phrase similarity boost score."""
if not text1 or not text2:
return 0.0
text1_lower = text1.lower()
text2_lower = text2.lower()
# Look for 2-3 word phrases
phrase_boost = 0.0
# Extract 2-word phrases
words1 = text1_lower.split()
words2 = text2_lower.split()
for i in range(len(words1) - 1):
phrase = f"{words1[i]} {words1[i+1]}"
if phrase in text2_lower:
phrase_boost += 0.1
# Extract 3-word phrases
for i in range(len(words1) - 2):
phrase = f"{words1[i]} {words1[i+1]} {words1[i+2]}"
if phrase in text2_lower:
phrase_boost += 0.15
return min(0.3, phrase_boost) # Cap at 0.3
def _get_intent_keywords(self, search_intent: str) -> List[str]:
"""Get keywords associated with search intent."""
intent_keywords = {
'informational': ['what', 'how', 'why', 'guide', 'tutorial', 'explain', 'learn', 'understand'],
'navigational': ['find', 'locate', 'search', 'where', 'site', 'website', 'page'],
'transactional': ['buy', 'purchase', 'order', 'price', 'cost', 'deal', 'offer', 'discount'],
'commercial': ['compare', 'review', 'best', 'top', 'vs', 'versus', 'alternative', 'option']
}
return intent_keywords.get(search_intent, [])
def get_mapping_statistics(self, mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]) -> Dict[str, Any]:
"""
Get statistics about the mapping results.
Args:
mapping_results: Mapping results to analyze
Returns:
Dictionary with mapping statistics
"""
total_sections = len(mapping_results)
total_mappings = sum(len(sources) for sources in mapping_results.values())
# Calculate score distribution
all_scores = []
for sources in mapping_results.values():
all_scores.extend([score for source, score in sources])
avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0
max_score = max(all_scores) if all_scores else 0.0
min_score = min(all_scores) if all_scores else 0.0
# Count sections with/without sources
sections_with_sources = sum(1 for sources in mapping_results.values() if sources)
sections_without_sources = total_sections - sections_with_sources
return {
'total_sections': total_sections,
'total_mappings': total_mappings,
'sections_with_sources': sections_with_sources,
'sections_without_sources': sections_without_sources,
'average_score': avg_score,
'max_score': max_score,
'min_score': min_score,
'mapping_coverage': sections_with_sources / total_sections if total_sections > 0 else 0.0
}
def _build_validation_prompt(
    self,
    mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
    research_data: BlogResearchResponse
) -> str:
    """
    Build comprehensive AI validation prompt for source-to-section mapping.

    Serializes the algorithmic mapping and the full source pool into a
    single prompt asking the model to validate/improve the mapping and
    reply in a fixed JSON format (consumed by _parse_validation_response).

    Args:
        mapping_results: Algorithmic mapping results
        research_data: Research data for context

    Returns:
        Formatted AI validation prompt
    """
    # Extract section information: one entry per section with its mapped
    # sources and their algorithmic scores.
    sections_info = []
    for section_id, sources in mapping_results.items():
        section_info = {
            'id': section_id,
            'sources': [
                {
                    'title': source.title,
                    'url': source.url,
                    'excerpt': source.excerpt,
                    'credibility_score': source.credibility_score,
                    'algorithmic_score': score
                }
                for source, score in sources
            ]
        }
        sections_info.append(section_info)
    # Extract research context, including the complete source pool so the
    # model can recommend sources the algorithm did not pick.
    research_context = {
        'primary_keywords': research_data.keyword_analysis.get('primary', []),
        'secondary_keywords': research_data.keyword_analysis.get('secondary', []),
        'content_angles': research_data.suggested_angles,
        'search_intent': research_data.keyword_analysis.get('search_intent', 'informational'),
        'all_sources': [
            {
                'title': source.title,
                'url': source.url,
                'excerpt': source.excerpt,
                'credibility_score': source.credibility_score
            }
            for source in research_data.sources
        ]
    }
    # NOTE: the JSON example below uses doubled braces because this is an
    # f-string; the model's "recommended_sources" must echo exact source
    # titles — the parser matches them by title lookup.
    prompt = f"""
You are an expert content strategist and SEO specialist. Your task is to validate and improve the algorithmic mapping of research sources to blog outline sections.
## CONTEXT
Research Topic: {', '.join(research_context['primary_keywords'])}
Search Intent: {research_context['search_intent']}
Content Angles: {', '.join(research_context['content_angles'])}
## ALGORITHMIC MAPPING RESULTS
The following sections have been algorithmically mapped with research sources:
{self._format_sections_for_prompt(sections_info)}
## AVAILABLE SOURCES
All available research sources:
{self._format_sources_for_prompt(research_context['all_sources'])}
## VALIDATION TASK
Please analyze the algorithmic mapping and provide improvements:
1. **Validate Relevance**: Are the mapped sources truly relevant to each section's content and purpose?
2. **Identify Gaps**: Are there better sources available that weren't mapped?
3. **Suggest Improvements**: Recommend specific source changes for better content alignment
4. **Quality Assessment**: Rate the overall mapping quality (1-10)
## RESPONSE FORMAT
Provide your analysis in the following JSON format:
```json
{{
"overall_quality_score": 8,
"section_improvements": [
{{
"section_id": "s1",
"current_sources": ["source_title_1", "source_title_2"],
"recommended_sources": ["better_source_1", "better_source_2", "better_source_3"],
"reasoning": "Explanation of why these sources are better suited for this section",
"confidence": 0.9
}}
],
"summary": "Overall assessment of the mapping quality and key improvements made"
}}
```
## GUIDELINES
- Prioritize sources that directly support the section's key points and subheadings
- Consider source credibility, recency, and content depth
- Ensure sources provide actionable insights for content creation
- Maintain diversity in source types and perspectives
- Focus on sources that enhance the section's value proposition
Analyze the mapping and provide your recommendations.
"""
    return prompt
def _get_ai_validation_response(self, prompt: str, user_id: str) -> str:
    """Send the validation prompt to the LLM and return its free-text reply.

    Args:
        prompt: Fully built validation prompt.
        user_id: Required for subscription checks and usage tracking.

    Returns:
        The raw AI response.

    Raises:
        ValueError: If user_id is not provided.
        Exception: Re-raised from the provider after logging.
    """
    if not user_id:
        raise ValueError("user_id is required for AI validation response (subscription checks and usage tracking)")
    try:
        # json_struct=None: no structured schema enforced — free-text reply.
        from services.llm_providers.main_text_generation import llm_text_gen
        return llm_text_gen(
            prompt=prompt,
            json_struct=None,
            system_prompt=None,
            user_id=user_id
        )
    except Exception as exc:
        logger.error(f"Failed to get AI validation response: {exc}")
        raise
def _parse_validation_response(
    self,
    response: str,
    original_mapping: Dict[str, List[Tuple[ResearchSource, float]]],
    research_data: BlogResearchResponse
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
    """
    Parse AI validation response and apply improvements.

    Extracts the JSON block from the model's reply, resolves recommended
    source titles back to ResearchSource objects, and merges them over the
    algorithmic mapping. Any parse failure falls back to the original
    mapping unchanged.

    Args:
        response: AI validation response
        original_mapping: Original algorithmic mapping
        research_data: Research data for context

    Returns:
        Improved mapping based on AI validation
    """
    try:
        import json
        import re
        # Extract JSON from response (prefer a fenced ```json block).
        json_match = re.search(r'```json\s*(\{.*?\})\s*```', response, re.DOTALL)
        if not json_match:
            # Try to find JSON without code blocks
            # NOTE(review): the non-greedy `.*?` stops at the FIRST `}`, which
            # truncates nested JSON objects in unfenced replies — confirm.
            json_match = re.search(r'(\{.*?\})', response, re.DOTALL)
        if not json_match:
            logger.warning("Could not extract JSON from AI response")
            return original_mapping
        validation_data = json.loads(json_match.group(1))
        # Create source lookup for quick access (keyed by exact title, so
        # recommendations must echo titles verbatim to resolve).
        source_lookup = {source.title: source for source in research_data.sources}
        # Apply AI improvements
        improved_mapping = {}
        for improvement in validation_data.get('section_improvements', []):
            section_id = improvement['section_id']
            recommended_titles = improvement['recommended_sources']
            # Map recommended titles to actual sources
            recommended_sources = []
            for title in recommended_titles:
                if title in source_lookup:
                    source = source_lookup[title]
                    # Use high confidence score for AI-recommended sources
                    recommended_sources.append((source, 0.9))
            if recommended_sources:
                improved_mapping[section_id] = recommended_sources
            else:
                # Fallback to original mapping if no valid sources found
                improved_mapping[section_id] = original_mapping.get(section_id, [])
        # Add sections not mentioned in AI response (keep algorithmic result).
        for section_id, sources in original_mapping.items():
            if section_id not in improved_mapping:
                improved_mapping[section_id] = sources
        logger.info(f"AI validation applied: {len(validation_data.get('section_improvements', []))} sections improved")
        return improved_mapping
    except Exception as e:
        logger.warning(f"Failed to parse AI validation response: {e}")
        return original_mapping
def _format_sections_for_prompt(self, sections_info: List[Dict]) -> str:
"""Format sections information for AI prompt."""
formatted = []
for section in sections_info:
section_text = f"**Section {section['id']}:**\n"
section_text += f"Sources mapped: {len(section['sources'])}\n"
for source in section['sources']:
section_text += f"- {source['title']} (Score: {source['algorithmic_score']:.2f})\n"
formatted.append(section_text)
return "\n".join(formatted)
def _format_sources_for_prompt(self, sources: List[Dict]) -> str:
"""Format sources information for AI prompt."""
formatted = []
for i, source in enumerate(sources, 1):
source_text = f"{i}. **{source['title']}**\n"
source_text += f" URL: {source['url']}\n"
source_text += f" Credibility: {source['credibility_score']}\n"
if source['excerpt']:
source_text += f" Excerpt: {source['excerpt'][:200]}...\n"
formatted.append(source_text)
return "\n".join(formatted)

View File

@@ -0,0 +1,123 @@
"""
Title Generator - Handles title generation and formatting for blog outlines.
Extracts content angles from research data and combines them with AI-generated titles.
"""
from typing import List
from loguru import logger
class TitleGenerator:
    """Handles title generation, formatting, and combination logic."""

    def __init__(self):
        """Initialize the title generator."""
        pass

    def extract_content_angle_titles(self, research) -> List[str]:
        """
        Extract content angles from research data and convert them to blog titles.

        Args:
            research: BlogResearchResponse object containing suggested_angles

        Returns:
            List of title-formatted content angles (deduplicated, order preserved)
        """
        if not research or not hasattr(research, 'suggested_angles'):
            return []
        content_angles = research.suggested_angles or []
        if not content_angles:
            return []
        # Convert content angles to title format, dropping blanks and duplicates.
        title_formatted_angles = []
        for angle in content_angles:
            if isinstance(angle, str) and angle.strip():
                formatted_angle = self._format_angle_as_title(angle.strip())
                if formatted_angle and formatted_angle not in title_formatted_angles:
                    title_formatted_angles.append(formatted_angle)
        logger.info(f"Extracted {len(title_formatted_angles)} content angle titles from research data")
        return title_formatted_angles

    def _format_angle_as_title(self, angle: str) -> str:
        """
        Format a content angle as a proper blog title.

        Args:
            angle: Raw content angle string

        Returns:
            Formatted title string, or "" when the angle is too short (< 10 chars)
            to make a useful title. Titles are capped at 100 characters.
        """
        if not angle or len(angle.strip()) < 10:  # Too short to be a good title
            return ""
        cleaned_angle = angle.strip()
        # Title-case each sentence independently.
        sentences = cleaned_angle.split('. ')
        formatted_sentences = [
            self._title_case(sentence.strip())
            for sentence in sentences
            if sentence.strip()
        ]
        formatted_title = '. '.join(formatted_sentences)
        # Ensure it ends with proper punctuation.
        if not formatted_title.endswith(('.', '!', '?')):
            formatted_title += '.'
        # Limit length to reasonable blog title size.
        if len(formatted_title) > 100:
            formatted_title = formatted_title[:97] + "..."
        return formatted_title

    @staticmethod
    def _title_case(text: str) -> str:
        """Capitalize the first letter of each space-separated word.

        Unlike ``str.title()``, this does not lowercase the remainder of a
        word ("AI" stays "AI") and does not mis-capitalize after apostrophes
        ("don't" -> "Don't", whereas str.title() yields "Don'T").
        """
        return ' '.join(word[:1].upper() + word[1:] for word in text.split(' '))

    def combine_title_options(self, ai_titles: List[str], content_angle_titles: List[str], primary_keywords: List[str]) -> List[str]:
        """
        Combine AI-generated titles with content angle titles, ensuring variety and quality.

        Args:
            ai_titles: AI-generated title options
            content_angle_titles: Titles derived from content angles
            primary_keywords: Primary keywords (currently unused; kept for API stability)

        Returns:
            Combined list of title options (max 6 total)
        """
        all_titles = []
        # Content angle titles first: these are research-based and valuable.
        for title in content_angle_titles[:3]:  # Limit to top 3 content angles
            if title and title not in all_titles:
                all_titles.append(title)
        # Then AI-generated titles, skipping duplicates.
        for title in ai_titles:
            if title and title not in all_titles:
                all_titles.append(title)
        # Note: fallback titles intentionally excluded - only research and
        # AI-generated titles are offered here.
        # Limit to 6 titles maximum for UI usability.
        final_titles = all_titles[:6]
        logger.info(f"Combined title options: {len(final_titles)} total (AI: {len(ai_titles)}, Content angles: {len(content_angle_titles)})")
        return final_titles

    def generate_fallback_titles(self, primary_keywords: List[str]) -> List[str]:
        """Generate fallback titles when AI generation fails.

        Args:
            primary_keywords: Primary keywords; the first one seeds the titles
                ("Topic" is used when the list is empty).

        Returns:
            Three generic, keyword-based title templates.
        """
        # Local import keeps the dependency on this rarely-used fallback path.
        from datetime import datetime

        primary_keyword = primary_keywords[0] if primary_keywords else "Topic"
        # Use the current year instead of a hard-coded one so titles never go stale.
        current_year = datetime.now().year
        return [
            f"The Complete Guide to {primary_keyword}",
            f"{primary_keyword}: Everything You Need to Know",
            f"How to Master {primary_keyword} in {current_year}"
        ]