chore: push all remaining changes
- Blog writer enhancements and bug fixes
- Wix integration improvements
- Frontend UI updates
- GSC dashboard docs cleanup
- Image studio assets
- LinkedIn requirements file
- Various dependency updates
This commit is contained in:
@@ -241,9 +241,23 @@ class GroundingContextEngine:
|
||||
else:
|
||||
authority_distribution['low'] += 1
|
||||
|
||||
# Extract actual high-authority sources from chunks
|
||||
high_authority_sources = []
|
||||
for chunk in grounding_metadata.grounding_chunks:
|
||||
chunk_authority = self._calculate_chunk_authority(chunk)
|
||||
if chunk_authority >= 0.8:
|
||||
high_authority_sources.append({
|
||||
'title': chunk.title if chunk.title else 'Unknown Source',
|
||||
'url': chunk.url if chunk.url else '',
|
||||
'score': round(chunk_authority, 3)
|
||||
})
|
||||
# Sort by authority score descending, keep top 5
|
||||
high_authority_sources.sort(key=lambda x: x['score'], reverse=True)
|
||||
high_authority_sources = high_authority_sources[:5]
|
||||
|
||||
return {
|
||||
'average_authority_score': sum(authority_scores) / len(authority_scores) if authority_scores else 0.0,
|
||||
'high_authority_sources': [{'title': 'High Authority Source', 'url': 'example.com', 'score': 0.9}], # Placeholder
|
||||
'high_authority_sources': high_authority_sources,
|
||||
'authority_distribution': dict(authority_distribution)
|
||||
}
|
||||
|
||||
|
||||
@@ -52,6 +52,44 @@ class OutlineGenerator:
|
||||
raw_analysis = research.keyword_analysis if research else {}
|
||||
return self.keyword_curator.curate(raw_analysis)
|
||||
|
||||
def _build_optimization_context(self, research) -> str:
|
||||
"""Build a compact research context for the outline optimizer.
|
||||
Provides keywords, competitor data, and top source summaries so
|
||||
the optimizer doesn't run blind to the research."""
|
||||
if not research:
|
||||
return ""
|
||||
parts = []
|
||||
kw = research.keyword_analysis if research.keyword_analysis else {}
|
||||
primary = kw.get('primary', [])
|
||||
if primary:
|
||||
parts.append(f"Primary keywords: {', '.join(primary[:5])}")
|
||||
search_intent = kw.get('search_intent', '')
|
||||
if search_intent:
|
||||
parts.append(f"Search intent: {search_intent}")
|
||||
comp = research.competitor_analysis if research.competitor_analysis else {}
|
||||
top_competitors = comp.get('top_competitors', [])
|
||||
if top_competitors:
|
||||
parts.append(f"Top competitors: {', '.join(str(c) for c in top_competitors[:5])}")
|
||||
content_gaps = kw.get('content_gaps', [])
|
||||
if content_gaps:
|
||||
parts.append(f"Content gaps: {'; '.join(str(g) for g in content_gaps[:5])}")
|
||||
opportunities = comp.get('opportunities', [])
|
||||
if opportunities:
|
||||
parts.append(f"Opportunities: {'; '.join(str(o) for o in opportunities[:5])}")
|
||||
sources = research.sources if research.sources else []
|
||||
if sources:
|
||||
top_sources = sorted(sources, key=lambda s: s.credibility_score or 0.8, reverse=True)[:5]
|
||||
source_lines = []
|
||||
for s in top_sources:
|
||||
line = f"- {s.title}"
|
||||
if s.summary:
|
||||
line += f": {s.summary[:150]}"
|
||||
elif s.excerpt:
|
||||
line += f": {s.excerpt[:150]}"
|
||||
source_lines.append(line)
|
||||
parts.append("Key research sources:\n" + "\n".join(source_lines))
|
||||
return "\n".join(parts)
|
||||
|
||||
async def generate(self, request: BlogOutlineRequest, user_id: str) -> BlogOutlineResponse:
|
||||
"""
|
||||
Generate AI-powered outline using research results.
|
||||
@@ -102,7 +140,7 @@ class OutlineGenerator:
|
||||
|
||||
# Run parallel processing for speed optimization (user_id required)
|
||||
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing_async(
|
||||
outline_sections, research, user_id
|
||||
outline_sections, research, user_id, competitive_advantage=selected_competitive_advantage or ""
|
||||
)
|
||||
|
||||
# Enhance sections with grounding insights
|
||||
@@ -113,7 +151,8 @@ class OutlineGenerator:
|
||||
|
||||
# Optimize outline for better flow, SEO, and engagement (user_id required)
|
||||
logger.info("Optimizing outline for better flow and engagement...")
|
||||
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
|
||||
optimization_context = self._build_optimization_context(research)
|
||||
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id, research_context=optimization_context)
|
||||
|
||||
# Rebalance word counts for optimal distribution
|
||||
target_words = request.word_count or 1500
|
||||
@@ -202,7 +241,7 @@ class OutlineGenerator:
|
||||
|
||||
# Run parallel processing for speed optimization (user_id required for subscription checks)
|
||||
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing(
|
||||
outline_sections, research, user_id, task_id
|
||||
outline_sections, research, user_id, task_id, competitive_advantage=selected_competitive_advantage or ""
|
||||
)
|
||||
|
||||
# Enhance sections with grounding insights (depends on both previous tasks)
|
||||
@@ -213,7 +252,8 @@ class OutlineGenerator:
|
||||
|
||||
# Optimize outline for better flow, SEO, and engagement (user_id required for subscription checks)
|
||||
await task_manager.update_progress(task_id, "🎯 Optimizing outline for better flow and engagement...")
|
||||
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
|
||||
optimization_context = self._build_optimization_context(research)
|
||||
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id, research_context=optimization_context)
|
||||
|
||||
# Rebalance word counts for optimal distribution
|
||||
await task_manager.update_progress(task_id, "⚖️ Rebalancing word count distribution...")
|
||||
|
||||
@@ -4,7 +4,7 @@ Outline Optimizer - AI-powered outline optimization and rebalancing.
|
||||
Optimizes outlines for better flow, SEO, and engagement.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from typing import List, Dict, Any, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import BlogOutlineSection
|
||||
@@ -13,13 +13,14 @@ from models.blog_models import BlogOutlineSection
|
||||
class OutlineOptimizer:
|
||||
"""Optimizes outlines for better flow, SEO, and engagement."""
|
||||
|
||||
async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str) -> List[BlogOutlineSection]:
|
||||
async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str, research_context: str = "") -> List[BlogOutlineSection]:
|
||||
"""Optimize entire outline for better flow, SEO, and engagement.
|
||||
|
||||
Args:
|
||||
outline: List of outline sections to optimize
|
||||
focus: Optimization focus (e.g., "general optimization")
|
||||
user_id: User ID (required for subscription checks and usage tracking)
|
||||
research_context: Optional research context to ground optimization
|
||||
|
||||
Returns:
|
||||
List of optimized outline sections
|
||||
@@ -40,19 +41,28 @@ Current Outline:
|
||||
Optimization Focus: {focus}
|
||||
|
||||
Goals: Improve narrative flow, enhance SEO, increase engagement, ensure comprehensive coverage.
|
||||
"""
|
||||
if research_context:
|
||||
optimization_prompt += f"""
|
||||
Research Context (use this to ground your optimization in real data):
|
||||
{research_context}
|
||||
|
||||
Ensure the optimized outline reflects the research insights above — headings should address the key topics, keywords should align with search intent, and sections should cover the most important angles from the research.
|
||||
"""
|
||||
|
||||
optimization_prompt += """
|
||||
Return JSON format:
|
||||
{{
|
||||
{
|
||||
"outline": [
|
||||
{{
|
||||
{
|
||||
"heading": "Optimized heading",
|
||||
"subheadings": ["subheading 1", "subheading 2"],
|
||||
"key_points": ["point 1", "point 2"],
|
||||
"target_words": 300,
|
||||
"keywords": ["keyword1", "keyword2"]
|
||||
}}
|
||||
}
|
||||
]
|
||||
}}"""
|
||||
}"""
|
||||
|
||||
try:
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
@@ -112,26 +122,34 @@ Return JSON format:
|
||||
return outline
|
||||
|
||||
def rebalance_word_counts(self, outline: List["BlogOutlineSection"], target_words: int) -> List["BlogOutlineSection"]:
    """Rebalance per-section word targets so they add up to ``target_words``.

    With three or more sections the first and last are treated as intro and
    conclusion and receive 12% of the budget each; the sections in between
    share the rest in proportion to ``1.0 + 0.5 * len(section.references)``,
    so sections backed by more sources get more words. With one or two
    sections there is no intro/conclusion slot, so the whole budget is
    shared by weight.

    Words lost to integer rounding are handed back one at a time to the
    sections with the largest fractional share (earlier sections win ties),
    which keeps the section targets summing exactly to the budget.

    Args:
        outline: Sections to update; ``target_words`` is set on each one
            in place.
        target_words: Total word budget for the whole outline.

    Returns:
        The same ``outline`` list that was passed in.
    """
    total_sections = len(outline)
    if total_sections == 0:
        return outline

    budget = int(target_words)

    if total_sections > 2:
        intro_words = int(budget * 0.12)
        conclusion_words = int(budget * 0.12)
        outline[0].target_words = intro_words
        outline[-1].target_words = conclusion_words
        main_sections = outline[1:-1]
        main_content_words = budget - intro_words - conclusion_words
    else:
        # No dedicated intro/conclusion, so nothing is held back for them.
        main_sections = outline
        main_content_words = budget

    # Integer weights (2 + refs) are exactly 2 * (1.0 + 0.5 * refs); using
    # integers keeps the split free of floating-point rounding.
    weights = [2 + len(getattr(section, 'references', None) or []) for section in main_sections]
    total_weight = sum(weights)

    shares = [main_content_words * w // total_weight for w in weights]
    leftover = main_content_words - sum(shares)

    # Largest-remainder allocation; sorted() is stable, so ties go to the
    # earlier section.
    by_fraction = sorted(
        range(len(weights)),
        key=lambda idx: -((main_content_words * weights[idx]) % total_weight),
    )
    for idx in by_fraction[:leftover]:
        shares[idx] += 1

    for section, words in zip(main_sections, shares):
        section.target_words = words

    return outline
|
||||
|
||||
@@ -233,9 +233,9 @@ class OutlineService:
|
||||
"""Enhance a section using AI with research context."""
|
||||
return await self.section_enhancer.enhance(section, focus)
|
||||
|
||||
async def optimize_outline_with_ai(self, outline: List["BlogOutlineSection"], focus: str = "general optimization", research_context: str = "", user_id: str = None) -> List["BlogOutlineSection"]:
    """Optimize an entire outline for better flow, SEO, and engagement.

    Delegates to ``self.outline_optimizer.optimize``. That method takes
    ``user_id`` as a required positional argument, so it is accepted here
    and forwarded; without it the delegated call could never succeed.

    Args:
        outline: Outline sections to optimize.
        focus: Optimization focus passed through to the optimizer.
        research_context: Optional research context passed through to the
            optimizer.
        user_id: ID of the requesting user. Declared last with a default so
            existing positional callers keep working, but it must be set.

    Returns:
        Whatever the optimizer returns for the given outline.

    Raises:
        ValueError: If ``user_id`` is missing or empty.
    """
    if not user_id:
        raise ValueError("user_id is required for outline optimization")
    return await self.outline_optimizer.optimize(
        outline, focus, user_id, research_context=research_context
    )
|
||||
|
||||
def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
|
||||
"""Rebalance word count distribution across sections."""
|
||||
|
||||
@@ -17,7 +17,7 @@ class ParallelProcessor:
|
||||
self.source_mapper = source_mapper
|
||||
self.grounding_engine = grounding_engine
|
||||
|
||||
async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None) -> Tuple[Any, Any]:
|
||||
async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None, competitive_advantage: str = "") -> Tuple[Any, Any]:
|
||||
"""
|
||||
Run source mapping and grounding insights extraction in parallel.
|
||||
|
||||
@@ -26,6 +26,7 @@ class ParallelProcessor:
|
||||
research: Research data object
|
||||
user_id: User ID (required for subscription checks and usage tracking)
|
||||
task_id: Optional task ID for progress updates
|
||||
competitive_advantage: Selected competitive advantage for preferential source matching
|
||||
|
||||
Returns:
|
||||
Tuple of (mapped_sections, grounding_insights)
|
||||
@@ -44,7 +45,7 @@ class ParallelProcessor:
|
||||
|
||||
# Run these tasks in parallel to save time
|
||||
source_mapping_task = asyncio.create_task(
|
||||
self._run_source_mapping(outline_sections, research, task_id, user_id)
|
||||
self._run_source_mapping(outline_sections, research, task_id, user_id, competitive_advantage)
|
||||
)
|
||||
|
||||
grounding_insights_task = asyncio.create_task(
|
||||
@@ -59,7 +60,7 @@ class ParallelProcessor:
|
||||
|
||||
return mapped_sections, grounding_insights
|
||||
|
||||
async def run_parallel_processing_async(self, outline_sections, research, user_id: str) -> Tuple[Any, Any]:
|
||||
async def run_parallel_processing_async(self, outline_sections, research, user_id: str, competitive_advantage: str = "") -> Tuple[Any, Any]:
|
||||
"""
|
||||
Run parallel processing without progress updates (for non-progress methods).
|
||||
|
||||
@@ -67,6 +68,7 @@ class ParallelProcessor:
|
||||
outline_sections: List of outline sections to process
|
||||
research: Research data object
|
||||
user_id: User ID (required for subscription checks and usage tracking)
|
||||
competitive_advantage: Selected competitive advantage for preferential source matching
|
||||
|
||||
Returns:
|
||||
Tuple of (mapped_sections, grounding_insights)
|
||||
@@ -81,7 +83,7 @@ class ParallelProcessor:
|
||||
|
||||
# Run these tasks in parallel to save time
|
||||
source_mapping_task = asyncio.create_task(
|
||||
self._run_source_mapping_async(outline_sections, research, user_id)
|
||||
self._run_source_mapping_async(outline_sections, research, user_id, competitive_advantage)
|
||||
)
|
||||
|
||||
grounding_insights_task = asyncio.create_task(
|
||||
@@ -96,12 +98,12 @@ class ParallelProcessor:
|
||||
|
||||
return mapped_sections, grounding_insights
|
||||
|
||||
async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str):
|
||||
async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str, competitive_advantage: str = ""):
|
||||
"""Run source mapping in parallel."""
|
||||
if task_id:
|
||||
from api.blog_writer.task_manager import task_manager
|
||||
await task_manager.update_progress(task_id, "🔗 Applying intelligent source-to-section mapping...")
|
||||
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)
|
||||
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id, competitive_advantage=competitive_advantage)
|
||||
|
||||
async def _run_grounding_insights_extraction(self, research, task_id):
|
||||
"""Run grounding insights extraction in parallel."""
|
||||
@@ -110,10 +112,10 @@ class ParallelProcessor:
|
||||
await task_manager.update_progress(task_id, "🧠 Extracting grounding metadata insights...")
|
||||
return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)
|
||||
|
||||
async def _run_source_mapping_async(self, outline_sections, research, user_id: str, competitive_advantage: str = ""):
    """Map research sources onto outline sections without progress updates.

    Logs one info line, then calls the source mapper (without awaiting it)
    and returns its result unchanged.
    """
    logger.info("Applying intelligent source-to-section mapping...")
    mapper = self.source_mapper
    return mapper.map_sources_to_sections(
        outline_sections,
        research,
        user_id,
        competitive_advantage=competitive_advantage,
    )
|
||||
|
||||
async def _run_grounding_insights_extraction_async(self, research):
|
||||
"""Run grounding insights extraction in parallel (async version without progress updates)."""
|
||||
|
||||
@@ -37,27 +37,60 @@ class PromptBuilder:
|
||||
opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
|
||||
advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"
|
||||
competitor_headings_text = ', '.join(research.competitor_analysis.get('competitor_headings', [])[:3]) if research and research.competitor_analysis and research.competitor_analysis.get('competitor_headings') else ""
|
||||
content_gaps_text = ', '.join(research.competitor_analysis.get('content_gaps', [])) if research and research.competitor_analysis and research.competitor_analysis.get('content_gaps') else ""
|
||||
industry_leaders_text = ', '.join(research.competitor_analysis.get('industry_leaders', [])) if research and research.competitor_analysis and research.competitor_analysis.get('industry_leaders') else ""
|
||||
|
||||
# Extract additional UI-mapped context fields
|
||||
analysis_insights_text = (research.keyword_analysis.get('analysis_insights', '') or '') if research and research.keyword_analysis else ''
|
||||
market_positioning_text = (research.competitor_analysis.get('market_positioning', '') or '') if research and research.competitor_analysis else ''
|
||||
difficulty_score = research.keyword_analysis.get('difficulty', None) if research and research.keyword_analysis else None
|
||||
|
||||
# Extract search queries as intent signals
|
||||
search_queries_text = ', '.join(research.search_queries) if research and hasattr(research, 'search_queries') and research.search_queries else ""
|
||||
|
||||
# Extract top 3 authoritative source excerpts as factual data points
|
||||
# Build numbered source list — all sources with index, title, excerpt, and highlights
|
||||
# The LLM will reference these indices when assigning sources to sections
|
||||
source_list_text = ""
|
||||
if sources:
|
||||
source_lines = []
|
||||
for i, src in enumerate(sources, 1):
|
||||
title = getattr(src, 'title', '') or ''
|
||||
excerpt = getattr(src, 'excerpt', '') or ''
|
||||
highlights = getattr(src, 'highlights', []) or []
|
||||
summary = getattr(src, 'summary', '') or ''
|
||||
source_type = getattr(src, 'source_type', '') or ''
|
||||
author = getattr(src, 'author', '') or ''
|
||||
|
||||
line = f" [{i}] {title}"
|
||||
if source_type:
|
||||
line += f" [{source_type}]"
|
||||
if author:
|
||||
line += f" by {author}"
|
||||
if summary:
|
||||
line += f" — {summary[:1000]}"
|
||||
elif excerpt:
|
||||
line += f" — {excerpt[:1000]}"
|
||||
if highlights:
|
||||
line += f" | Key findings: {'; '.join(h[:250] for h in highlights[:3])}"
|
||||
source_lines.append(line)
|
||||
if source_lines:
|
||||
source_list_text = "RESEARCH SOURCES (numbered for reference):\n" + "\n".join(source_lines)
|
||||
|
||||
# Top factual excerpts for depth (keep as supplement)
|
||||
source_excerpts_text = ""
|
||||
if sources:
|
||||
sorted_sources = sorted(
|
||||
[s for s in sources if (s.excerpt or s.summary)],
|
||||
key=lambda s: s.credibility_score or 0.8, reverse=True
|
||||
)[:3]
|
||||
)[:5]
|
||||
excerpts = []
|
||||
for i, src in enumerate(sorted_sources, 1):
|
||||
excerpt = src.excerpt or src.summary or ""
|
||||
if len(excerpt) > 300:
|
||||
excerpt = excerpt[:297] + "..."
|
||||
if len(excerpt) > 500:
|
||||
excerpt = excerpt[:497] + "..."
|
||||
excerpts.append(f" {i}. \"{src.title}\" — {excerpt}")
|
||||
if excerpts:
|
||||
source_excerpts_text = "FACTUAL DATA POINTS FROM RESEARCH:\n" + "\n".join(excerpts)
|
||||
source_excerpts_text = "DETAILED FACTS FROM TOP SOURCES:\n" + "\n".join(excerpts)
|
||||
|
||||
# Extract recency: newest source publication date
|
||||
newest_date_str = ""
|
||||
@@ -76,12 +109,12 @@ class PromptBuilder:
|
||||
grounding_evidence_text = ""
|
||||
if research and research.grounding_metadata and research.grounding_metadata.grounding_supports:
|
||||
supports = research.grounding_metadata.grounding_supports
|
||||
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:3]
|
||||
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:5]
|
||||
if top_supports:
|
||||
evidence_parts = []
|
||||
for i, s in enumerate(top_supports, 1):
|
||||
text = s.segment_text[:250]
|
||||
if len(s.segment_text) > 250:
|
||||
text = s.segment_text[:400]
|
||||
if len(s.segment_text) > 400:
|
||||
text += "..."
|
||||
evidence_parts.append(f" {i}. {text}")
|
||||
grounding_evidence_text = "VERIFIED EVIDENCE (high-confidence snippets):\n" + "\n".join(evidence_parts)
|
||||
@@ -151,8 +184,11 @@ Market Opportunities: {opportunity_text}
|
||||
Competitive Advantages: {advantages_text}
|
||||
{f"Market Positioning: {market_positioning_text}" if market_positioning_text else ""}
|
||||
{f"Competitor Headings (AVOID duplicating): {competitor_headings_text}" if competitor_headings_text else ""}
|
||||
{f"Content Gaps (MUST address these gaps): {content_gaps_text}" if content_gaps_text else ""}
|
||||
{f"Industry Leaders: {industry_leaders_text}" if industry_leaders_text else ""}
|
||||
{f"Search Intent Signals: {search_queries_text}" if search_queries_text else ""}
|
||||
|
||||
RESEARCH SOURCES: {len(sources)} authoritative sources available
|
||||
{source_list_text}
|
||||
{newest_date_str}
|
||||
|
||||
{source_excerpts_text}
|
||||
@@ -168,8 +204,9 @@ STRATEGIC REQUIREMENTS:
|
||||
- Create SEO-optimized headings with natural keyword integration
|
||||
- Surface the strongest research-backed angles within the outline
|
||||
- Build logical narrative flow from problem to solution
|
||||
- Include data-driven insights from research sources
|
||||
- Address content gaps and market opportunities
|
||||
- Include data-driven insights from research sources — use the numbered sources above
|
||||
- For each section, assign the most relevant source indices using the [N] numbers above
|
||||
- Address content gaps and market opportunities — if content gaps are listed, dedicate sections to fill those gaps
|
||||
- Optimize for search intent and user questions
|
||||
- Ensure engaging, actionable content throughout
|
||||
|
||||
@@ -186,7 +223,8 @@ Return JSON format:
|
||||
"subheadings": ["Subheading 1", "Subheading 2", "Subheading 3"],
|
||||
"key_points": ["Key point 1", "Key point 2", "Key point 3"],
|
||||
"target_words": 300,
|
||||
"keywords": ["keyword 1", "keyword 2"]
|
||||
"keywords": ["keyword 1", "keyword 2"],
|
||||
"source_indices": [1, 3, 5]
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
@@ -220,9 +258,14 @@ Return JSON format:
|
||||
"keywords": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"source_indices": {
|
||||
"type": "array",
|
||||
"items": {"type": "integer"},
|
||||
"description": "Indices of research sources (from the numbered list above) that support this section"
|
||||
}
|
||||
},
|
||||
"required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
|
||||
"required": ["heading", "subheadings", "key_points", "target_words", "keywords", "source_indices"]
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -100,18 +100,37 @@ class ResponseProcessor:
|
||||
raise ValueError(f"AI outline generation failed: {error_str}")
|
||||
|
||||
def convert_to_sections(self, outline_data: Dict[str, Any], sources: List) -> List[BlogOutlineSection]:
|
||||
"""Convert outline data to BlogOutlineSection objects."""
|
||||
"""Convert outline data to BlogOutlineSection objects.
|
||||
|
||||
If the LLM assigned source_indices to sections, populate references
|
||||
directly from those indices. Indices are 1-based (matching the [N]
|
||||
labels in the prompt) — converted to 0-based for list access.
|
||||
Sections without source_indices will be populated by the algorithmic
|
||||
source mapper in a later step.
|
||||
"""
|
||||
outline_sections = []
|
||||
for i, section_data in enumerate(outline_data.get('outline', [])):
|
||||
if not isinstance(section_data, dict) or 'heading' not in section_data:
|
||||
continue
|
||||
|
||||
|
||||
# Parse LLM-assigned source indices (1-based)
|
||||
raw_indices = section_data.get('source_indices', [])
|
||||
section_refs = []
|
||||
if raw_indices and sources:
|
||||
for idx in raw_indices:
|
||||
try:
|
||||
source_idx = int(idx) - 1 # Convert 1-based → 0-based
|
||||
if 0 <= source_idx < len(sources):
|
||||
section_refs.append(sources[source_idx])
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
section = BlogOutlineSection(
|
||||
id=f"s{i+1}",
|
||||
heading=section_data.get('heading', f'Section {i+1}'),
|
||||
subheadings=section_data.get('subheadings', []),
|
||||
key_points=section_data.get('key_points', []),
|
||||
references=[], # Will be populated by intelligent mapping
|
||||
references=section_refs, # LLM-assigned if provided, else []
|
||||
target_words=section_data.get('target_words', 200),
|
||||
keywords=section_data.get('keywords', [])
|
||||
)
|
||||
|
||||
@@ -41,10 +41,33 @@ class SourceToSectionMapper:
|
||||
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
||||
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
|
||||
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
|
||||
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'how', 'much', 'many', 'more', 'most',
|
||||
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
|
||||
'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
|
||||
'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
|
||||
'over', 'under', 'again', 'further', 'then', 'once'
|
||||
'over', 'under', 'again', 'further', 'then', 'once', 'also', 'into', 'about', 'between',
|
||||
'through', 'during', 'before', 'after', 'above', 'below', 'from', 'since', 'until', 'while',
|
||||
'because', 'however', 'therefore', 'thus', 'hence', 'yet', 'still', 'already', 'even'
|
||||
}
|
||||
|
||||
# Common abbreviation/synonym pairs for fuzzy matching
|
||||
self._synonym_map = {
|
||||
'ai': ['artificial intelligence', 'machine intelligence'],
|
||||
'ml': ['machine learning'],
|
||||
'dl': ['deep learning'],
|
||||
'nlp': ['natural language processing'],
|
||||
'iot': ['internet of things'],
|
||||
'saas': ['software as a service'],
|
||||
'b2b': ['business to business'],
|
||||
'b2c': ['business to consumer'],
|
||||
'cx': ['customer experience'],
|
||||
'ux': ['user experience'],
|
||||
'roi': ['return on investment'],
|
||||
'kpi': ['key performance indicator'],
|
||||
'crm': ['customer relationship management'],
|
||||
'erp': ['enterprise resource planning'],
|
||||
'seo': ['search engine optimization'],
|
||||
'cto': ['chief technology officer'],
|
||||
'vp': ['vice president'],
|
||||
}
|
||||
|
||||
logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")
|
||||
@@ -53,15 +76,21 @@ class SourceToSectionMapper:
|
||||
self,
|
||||
sections: List[BlogOutlineSection],
|
||||
research_data: BlogResearchResponse,
|
||||
user_id: str
|
||||
user_id: str,
|
||||
competitive_advantage: str = ""
|
||||
) -> List[BlogOutlineSection]:
|
||||
"""
|
||||
Map research sources to outline sections using intelligent algorithms.
|
||||
|
||||
Sections that already have LLM-assigned references (from source_indices
|
||||
in the outline prompt) are preserved. Algorithmic mapping fills gaps
|
||||
for sections without LLM-assigned sources.
|
||||
|
||||
Args:
|
||||
sections: List of outline sections to map sources to
|
||||
research_data: Research data containing sources and metadata
|
||||
user_id: User ID (required for subscription checks and usage tracking)
|
||||
competitive_advantage: Selected competitive advantage to preferentially match
|
||||
|
||||
Returns:
|
||||
List of outline sections with intelligently mapped sources
|
||||
@@ -76,16 +105,39 @@ class SourceToSectionMapper:
|
||||
logger.warning("No sections or sources to map")
|
||||
return sections
|
||||
|
||||
logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")
|
||||
# Separate sections with LLM-assigned references from those without
|
||||
sections_with_refs = [s for s in sections if s.references]
|
||||
sections_without_refs = [s for s in sections if not s.references]
|
||||
|
||||
# Step 1: Algorithmic mapping
|
||||
mapping_results = self._algorithmic_source_mapping(sections, research_data)
|
||||
logger.info(
|
||||
f"Mapping {len(research_data.sources)} sources to {len(sections)} sections "
|
||||
f"({len(sections_with_refs)} with LLM-assigned references, "
|
||||
f"{len(sections_without_refs)} need algorithmic mapping)"
|
||||
)
|
||||
|
||||
# Step 2: AI validation and improvement (single prompt, user_id required for subscription checks)
|
||||
validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
|
||||
if sections_without_refs:
|
||||
# Step 1: Algorithmic mapping for sections without LLM-assigned references
|
||||
mapping_results = self._algorithmic_source_mapping(sections_without_refs, research_data, competitive_advantage)
|
||||
|
||||
# Step 2: AI validation and improvement
|
||||
validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
|
||||
|
||||
# Step 3: Apply mapping only to sections that need it
|
||||
mapped_sections_with = self._apply_mapping_to_sections(sections_without_refs, validated_mapping)
|
||||
else:
|
||||
mapped_sections_with = []
|
||||
|
||||
# Step 3: Apply validated mapping to sections
|
||||
mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)
|
||||
# Combine: keep LLM-assigned sections as-is, add algorithmically mapped ones
|
||||
mapped_sections = list(sections_with_refs) + mapped_sections_with
|
||||
|
||||
# Preserve original ordering
|
||||
original_ids = [s.id for s in sections]
|
||||
mapped_sections.sort(key=lambda s: original_ids.index(s.id) if s.id in original_ids else 999)
|
||||
|
||||
# Warn if any section still has zero references
|
||||
for s in mapped_sections:
|
||||
if not s.references:
|
||||
logger.warning(f"Section '{s.heading}' (id={s.id}) has ZERO sources — content generator will use keyword-based fallback")
|
||||
|
||||
logger.info("✅ Source-to-section mapping completed successfully")
|
||||
return mapped_sections
|
||||
@@ -93,7 +145,8 @@ class SourceToSectionMapper:
|
||||
def _algorithmic_source_mapping(
|
||||
self,
|
||||
sections: List[BlogOutlineSection],
|
||||
research_data: BlogResearchResponse
|
||||
research_data: BlogResearchResponse,
|
||||
competitive_advantage: str = ""
|
||||
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
|
||||
"""
|
||||
Perform algorithmic mapping of sources to sections.
|
||||
@@ -101,6 +154,7 @@ class SourceToSectionMapper:
|
||||
Args:
|
||||
sections: List of outline sections
|
||||
research_data: Research data with sources
|
||||
competitive_advantage: Selected competitive advantage to boost matching
|
||||
|
||||
Returns:
|
||||
Dictionary mapping section IDs to list of (source, score) tuples
|
||||
@@ -114,7 +168,7 @@ class SourceToSectionMapper:
|
||||
# Calculate multi-dimensional relevance score
|
||||
semantic_score = self._calculate_semantic_similarity(section, source)
|
||||
keyword_score = self._calculate_keyword_relevance(section, source, research_data)
|
||||
contextual_score = self._calculate_contextual_relevance(section, source, research_data)
|
||||
contextual_score = self._calculate_contextual_relevance(section, source, research_data, competitive_advantage)
|
||||
|
||||
# Weighted total score
|
||||
total_score = (
|
||||
@@ -140,38 +194,54 @@ class SourceToSectionMapper:
|
||||
def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
    """
    Calculate semantic similarity between section and source.

    Uses word overlap, stem matching, bigram overlap, title-boost, and synonym expansion:

    1. Jaccard overlap of the meaningful words of both texts.
    2. Jaccard overlap of the stemmed words (catches word variants).
    3. Shared bigrams: 0.1 per shared bigram, capped at 0.3.
    4. Title boost: half the Jaccard overlap between the section heading
       and the source title, capped at 0.3.
    5. Abbreviation/synonym matches from ``_calculate_synonym_overlap``.

    The larger of (1) and (2) is the base; (3)-(5) are added on top and the
    sum is capped at 1.0.

    Args:
        section: Outline section
        source: Research source

    Returns:
        Semantic similarity score (0.0 to 1.0); 0.0 when either text yields
        no meaningful words.
    """

    def jaccard(left: set, right: set) -> float:
        # Jaccard index |A ∩ B| / |A ∪ B|, defined as 0.0 for two empty sets.
        union = left | right
        return len(left & right) / len(union) if union else 0.0

    section_text = self._extract_section_text(section)
    source_text = self._extract_source_text(source)

    section_words = self._extract_meaningful_words(section_text)
    source_words = self._extract_meaningful_words(source_text)

    if not section_words or not source_words:
        return 0.0

    section_set = set(section_words)
    source_set = set(source_words)

    # 1. Jaccard similarity on raw words
    word_similarity = jaccard(section_set, source_set)

    # 2. Stem matching — catches word variants. Stemming the unique words is
    #    enough: duplicates cannot change the resulting set of stems.
    section_stems = {self._stem_word(w) for w in section_set}
    source_stems = {self._stem_word(w) for w in source_set}
    stem_similarity = jaccard(section_stems, source_stems)

    # 3. Bigram overlap — catches multi-word concepts (e.g., "machine learning").
    #    No shared bigrams (including "no bigrams at all") naturally scores 0.0.
    section_bigrams = set(self._extract_bigrams(section_text))
    source_bigrams = set(self._extract_bigrams(source_text))
    bigram_score = min(0.3, len(section_bigrams & source_bigrams) * 0.1)

    # 4. Title-boost — section heading matching source title is a strong signal
    heading = (section.heading or '').lower()
    source_title = (source.title or '').lower()
    heading_words = set(self._extract_meaningful_words(heading))
    title_words = set(self._extract_meaningful_words(source_title))
    title_boost = min(0.3, jaccard(heading_words, title_words) * 0.5)

    # 5. Synonym expansion — expand abbreviations and match across synonym pairs
    synonym_score = self._calculate_synonym_overlap(section_words, source_words)

    # Combine: Jaccard + stem give base, bigram + title + synonyms boost
    base_similarity = max(word_similarity, stem_similarity)
    return min(1.0, base_similarity + bigram_score + title_boost + synonym_score)
|
||||
|
||||
def _calculate_keyword_relevance(
|
||||
self,
|
||||
@@ -219,7 +289,8 @@ class SourceToSectionMapper:
|
||||
self,
|
||||
section: BlogOutlineSection,
|
||||
source: ResearchSource,
|
||||
research_data: BlogResearchResponse
|
||||
research_data: BlogResearchResponse,
|
||||
competitive_advantage: str = ""
|
||||
) -> float:
|
||||
"""
|
||||
Calculate contextual relevance based on section content and source context.
|
||||
@@ -228,6 +299,7 @@ class SourceToSectionMapper:
|
||||
section: Outline section
|
||||
source: Research source
|
||||
research_data: Research data with context
|
||||
competitive_advantage: Selected competitive advantage to boost matching
|
||||
|
||||
Returns:
|
||||
Contextual relevance score (0.0 to 1.0)
|
||||
@@ -264,6 +336,15 @@ class SourceToSectionMapper:
|
||||
industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
|
||||
contextual_score += industry_score * 0.2
|
||||
|
||||
# 4. Competitive advantage boost — sources that match the advantage get a score lift
|
||||
if competitive_advantage:
|
||||
advantage_words = set(self._extract_meaningful_words(competitive_advantage.lower()))
|
||||
if advantage_words:
|
||||
advantage_in_section = sum(1 for w in advantage_words if w in section_text) / len(advantage_words)
|
||||
advantage_in_source = sum(1 for w in advantage_words if w in source_text) / len(advantage_words)
|
||||
if advantage_in_section > 0.3 and advantage_in_source > 0.3:
|
||||
contextual_score += 0.25 * (advantage_in_section + advantage_in_source)
|
||||
|
||||
return min(1.0, contextual_score)
|
||||
|
||||
def _ai_validate_mapping(
|
||||
@@ -360,10 +441,15 @@ class SourceToSectionMapper:
|
||||
return " ".join(text_parts)
|
||||
|
||||
def _extract_source_text(self, source: ResearchSource) -> str:
    """Extract all text content from a source, including full text for better matching.

    The parts are joined with single spaces in this order: title, summary,
    excerpt, then the first 500 characters of the optional ``content``
    attribute. Empty/missing summary, excerpt and content are skipped.

    Args:
        source: Research source

    Returns:
        The concatenated text. A ``None`` title contributes an empty string
        rather than making the join raise ``TypeError``.
    """
    # The title always occupies the first slot; fall back to "" so that
    # str.join never sees None.
    text_parts = [source.title or ""]
    text_parts.extend(part for part in (source.summary, source.excerpt) if part)

    # `content` is read defensively because not every source object carries it;
    # only a 500-character prefix is used to keep the matching text bounded.
    content = getattr(source, 'content', '') or ''
    if content:
        text_parts.append(content[:500])
    return " ".join(text_parts)
|
||||
|
||||
def _extract_meaningful_words(self, text: str) -> List[str]:
|
||||
@@ -382,6 +468,41 @@ class SourceToSectionMapper:
|
||||
|
||||
return meaningful_words
|
||||
|
||||
def _stem_word(self, word: str) -> str:
    """Rudimentary suffix-stripping stemmer for English words.

    Strips the first suffix (in the fixed priority order below) that leaves
    at least three characters. Two refinements keep common variants aligned:

    * a bare plural ``s`` is not stripped from words ending in ``ss``
      ("class", "process"), so they match "classes" / "processing";
    * a doubled final consonant left behind by ``ing`` / ``ed`` / ``er`` is
      collapsed ("running" -> "run", "stopped" -> "stop"), except for
      ``l``, ``s`` and ``z`` ("telling" -> "tell", "passing" -> "pass").

    Args:
        word: A single word; callers pass lower-cased tokens.

    Returns:
        The stem, or the word unchanged when it is three characters or
        shorter or no suffix applies.
    """
    if len(word) <= 3:
        return word

    # Order matters: longer/more specific suffixes are tried first.
    suffixes = (
        'ization', 'ation', 'tion', 'sion', 'ment', 'ness', 'ity', 'ing',
        'able', 'ible', 'ful', 'less', 'ous', 'ive', 'ally', 'ly', 'er',
        'ed', 'es', 's',
    )
    for suffix in suffixes:
        if not word.endswith(suffix) or len(word) - len(suffix) < 3:
            continue
        if suffix == 's' and word.endswith('ss'):
            # "-ss" is part of the stem, not a plural marker.
            return word
        stem = word[:-len(suffix)]
        if (
            suffix in ('ing', 'ed', 'er')
            and len(stem) > 3  # never shrink a stem below three characters
            and stem[-1] == stem[-2]
            and stem[-1] not in 'aeioulsz'
        ):
            # Undo consonant doubling introduced by the inflection.
            stem = stem[:-1]
        return stem
    return word
|
||||
|
||||
def _extract_bigrams(self, text: str) -> List[str]:
    """Extract meaningful two-word phrases from text.

    Args:
        text: Text to tokenize with ``_extract_meaningful_words``.

    Returns:
        Every pair of adjacent meaningful words joined by a single space,
        in order of appearance; an empty list when fewer than two words
        are available.
    """
    words = self._extract_meaningful_words(text)
    # Pairing the list with itself shifted by one yields each adjacent pair
    # and is naturally empty for zero or one word.
    return [f"{first} {second}" for first, second in zip(words, words[1:])]
|
||||
|
||||
def _calculate_synonym_overlap(self, section_words: List[str], source_words: List[str]) -> float:
    """Score overlap via abbreviation/synonym expansion.

    For every ``abbreviation -> expansions`` entry in ``self._synonym_map``,
    a match is counted when the abbreviation appears on one side and all
    words of one of its expansions appear on the other side.

    Args:
        section_words: Meaningful words of the section.
        source_words: Meaningful words of the source.

    Returns:
        0.05 per match, capped at 0.2.
    """
    section_set = set(section_words)
    source_set = set(source_words)
    extra_matches = 0

    # NOTE(review): assumes self._synonym_map maps str -> iterable of
    # space-separated expansion strings — confirm against its definition.
    for abbr, expansions in self._synonym_map.items():
        abbr_in_section = abbr in section_set
        abbr_in_source = abbr in source_set
        if not (abbr_in_section or abbr_in_source):
            # Without the abbreviation on either side no expansion can match.
            continue
        for expansion in expansions:
            exp_words = set(expansion.split())
            if not exp_words:
                # An empty expansion is a subset of everything and would
                # otherwise register as a spurious match.
                continue
            if (abbr_in_section and exp_words <= source_set) or (
                abbr_in_source and exp_words <= section_set
            ):
                extra_matches += 1

    return min(0.2, extra_matches * 0.05)
|
||||
|
||||
def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
|
||||
"""Calculate phrase similarity boost score."""
|
||||
if not text1 or not text2:
|
||||
|
||||
Reference in New Issue
Block a user