chore: push all remaining changes

- Blog writer enhancements and bug fixes
- Wix integration improvements
- Frontend UI updates
- GSC dashboard docs cleanup
- Image studio assets
- LinkedIn requirements file
- Various dependency updates
This commit is contained in:
ajaysi
2026-06-12 20:32:03 +05:30
parent 63a0df2536
commit d90d441019
78 changed files with 3963 additions and 2899 deletions

View File

@@ -241,9 +241,23 @@ class GroundingContextEngine:
else:
authority_distribution['low'] += 1
# Extract actual high-authority sources from chunks
high_authority_sources = []
for chunk in grounding_metadata.grounding_chunks:
chunk_authority = self._calculate_chunk_authority(chunk)
if chunk_authority >= 0.8:
high_authority_sources.append({
'title': chunk.title if chunk.title else 'Unknown Source',
'url': chunk.url if chunk.url else '',
'score': round(chunk_authority, 3)
})
# Sort by authority score descending, keep top 5
high_authority_sources.sort(key=lambda x: x['score'], reverse=True)
high_authority_sources = high_authority_sources[:5]
return {
'average_authority_score': sum(authority_scores) / len(authority_scores) if authority_scores else 0.0,
'high_authority_sources': [{'title': 'High Authority Source', 'url': 'example.com', 'score': 0.9}], # Placeholder
'high_authority_sources': high_authority_sources,
'authority_distribution': dict(authority_distribution)
}

View File

@@ -52,6 +52,44 @@ class OutlineGenerator:
raw_analysis = research.keyword_analysis if research else {}
return self.keyword_curator.curate(raw_analysis)
def _build_optimization_context(self, research) -> str:
"""Build a compact research context for the outline optimizer.
Provides keywords, competitor data, and top source summaries so
the optimizer doesn't run blind to the research."""
if not research:
return ""
parts = []
kw = research.keyword_analysis if research.keyword_analysis else {}
primary = kw.get('primary', [])
if primary:
parts.append(f"Primary keywords: {', '.join(primary[:5])}")
search_intent = kw.get('search_intent', '')
if search_intent:
parts.append(f"Search intent: {search_intent}")
comp = research.competitor_analysis if research.competitor_analysis else {}
top_competitors = comp.get('top_competitors', [])
if top_competitors:
parts.append(f"Top competitors: {', '.join(str(c) for c in top_competitors[:5])}")
content_gaps = kw.get('content_gaps', [])
if content_gaps:
parts.append(f"Content gaps: {'; '.join(str(g) for g in content_gaps[:5])}")
opportunities = comp.get('opportunities', [])
if opportunities:
parts.append(f"Opportunities: {'; '.join(str(o) for o in opportunities[:5])}")
sources = research.sources if research.sources else []
if sources:
top_sources = sorted(sources, key=lambda s: s.credibility_score or 0.8, reverse=True)[:5]
source_lines = []
for s in top_sources:
line = f"- {s.title}"
if s.summary:
line += f": {s.summary[:150]}"
elif s.excerpt:
line += f": {s.excerpt[:150]}"
source_lines.append(line)
parts.append("Key research sources:\n" + "\n".join(source_lines))
return "\n".join(parts)
async def generate(self, request: BlogOutlineRequest, user_id: str) -> BlogOutlineResponse:
"""
Generate AI-powered outline using research results.
@@ -102,7 +140,7 @@ class OutlineGenerator:
# Run parallel processing for speed optimization (user_id required)
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing_async(
outline_sections, research, user_id
outline_sections, research, user_id, competitive_advantage=selected_competitive_advantage or ""
)
# Enhance sections with grounding insights
@@ -113,7 +151,8 @@ class OutlineGenerator:
# Optimize outline for better flow, SEO, and engagement (user_id required)
logger.info("Optimizing outline for better flow and engagement...")
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
optimization_context = self._build_optimization_context(research)
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id, research_context=optimization_context)
# Rebalance word counts for optimal distribution
target_words = request.word_count or 1500
@@ -202,7 +241,7 @@ class OutlineGenerator:
# Run parallel processing for speed optimization (user_id required for subscription checks)
mapped_sections, grounding_insights = await self.parallel_processor.run_parallel_processing(
outline_sections, research, user_id, task_id
outline_sections, research, user_id, task_id, competitive_advantage=selected_competitive_advantage or ""
)
# Enhance sections with grounding insights (depends on both previous tasks)
@@ -213,7 +252,8 @@ class OutlineGenerator:
# Optimize outline for better flow, SEO, and engagement (user_id required for subscription checks)
await task_manager.update_progress(task_id, "🎯 Optimizing outline for better flow and engagement...")
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id)
optimization_context = self._build_optimization_context(research)
optimized_sections = await self.outline_optimizer.optimize(grounding_enhanced_sections, "comprehensive optimization", user_id, research_context=optimization_context)
# Rebalance word counts for optimal distribution
await task_manager.update_progress(task_id, "⚖️ Rebalancing word count distribution...")

View File

@@ -4,7 +4,7 @@ Outline Optimizer - AI-powered outline optimization and rebalancing.
Optimizes outlines for better flow, SEO, and engagement.
"""
from typing import List
from typing import List, Dict, Any, Optional
from loguru import logger
from models.blog_models import BlogOutlineSection
@@ -13,13 +13,14 @@ from models.blog_models import BlogOutlineSection
class OutlineOptimizer:
"""Optimizes outlines for better flow, SEO, and engagement."""
async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str) -> List[BlogOutlineSection]:
async def optimize(self, outline: List[BlogOutlineSection], focus: str, user_id: str, research_context: str = "") -> List[BlogOutlineSection]:
"""Optimize entire outline for better flow, SEO, and engagement.
Args:
outline: List of outline sections to optimize
focus: Optimization focus (e.g., "general optimization")
user_id: User ID (required for subscription checks and usage tracking)
research_context: Optional research context to ground optimization
Returns:
List of optimized outline sections
@@ -40,19 +41,28 @@ Current Outline:
Optimization Focus: {focus}
Goals: Improve narrative flow, enhance SEO, increase engagement, ensure comprehensive coverage.
"""
if research_context:
optimization_prompt += f"""
Research Context (use this to ground your optimization in real data):
{research_context}
Ensure the optimized outline reflects the research insights above — headings should address the key topics, keywords should align with search intent, and sections should cover the most important angles from the research.
"""
optimization_prompt += """
Return JSON format:
{{
{
"outline": [
{{
{
"heading": "Optimized heading",
"subheadings": ["subheading 1", "subheading 2"],
"key_points": ["point 1", "point 2"],
"target_words": 300,
"keywords": ["keyword1", "keyword2"]
}}
}
]
}}"""
}"""
try:
from services.llm_providers.main_text_generation import llm_text_gen
@@ -112,26 +122,34 @@ Return JSON format:
return outline
def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
    """Rebalance word count distribution across sections, weighting by source count.

    With more than two sections, the first and last sections are treated as
    intro and conclusion and receive 12% of ``target_words`` each; the rest
    is split across the middle sections. With one or two sections there is
    no intro/conclusion reservation and the whole budget is split across all
    sections.

    Args:
        outline: Sections whose ``target_words`` is overwritten in place.
        target_words: Total word budget for the whole outline.

    Returns:
        The same ``outline`` list, with updated ``target_words``.
    """
    total_sections = len(outline)
    if total_sections == 0:
        return outline

    has_bookends = total_sections > 2
    if has_bookends:
        intro_words = int(target_words * 0.12)
        conclusion_words = int(target_words * 0.12)
        main_sections = outline[1:-1]
    else:
        # No section is given the intro/conclusion share, so nothing may be
        # subtracted from the budget (otherwise 24% of the words are lost).
        intro_words = conclusion_words = 0
        main_sections = outline
    main_content_words = target_words - intro_words - conclusion_words

    # Weight sections by research density: each reference adds 0.5 to a base of 1.0.
    source_weights = [
        1.0 + len(getattr(section, 'references', []) or []) * 0.5
        for section in main_sections
    ]
    # main_sections is never empty here and every weight is >= 1.0.
    total_weight = sum(source_weights)

    if has_bookends:
        outline[0].target_words = intro_words
        outline[-1].target_words = conclusion_words

    for section, weight in zip(main_sections, source_weights):
        section.target_words = int(main_content_words * weight / total_weight)

    # int() truncates each share; hand the leftover words out one at a time
    # so the section totals add up to target_words.
    allocated = sum(section.target_words for section in main_sections)
    leftover = max(0, main_content_words - allocated)
    for section in main_sections[:leftover]:
        section.target_words += 1

    return outline

View File

@@ -233,9 +233,9 @@ class OutlineService:
"""Enhance a section using AI with research context."""
return await self.section_enhancer.enhance(section, focus)
async def optimize_outline_with_ai(self, outline: List[BlogOutlineSection], focus: str = "general optimization", research_context: str = "", user_id: str = None) -> List[BlogOutlineSection]:
    """Optimize entire outline for better flow, SEO, and engagement.

    Args:
        outline: Outline sections to optimize.
        focus: Optimization focus passed through to the optimizer.
        research_context: Optional research context to ground the optimization.
        user_id: User ID forwarded to the optimizer, whose ``optimize``
            signature takes it as a required positional argument.

    Returns:
        Whatever the optimizer returns for the given outline.
    """
    # optimize(outline, focus, user_id, research_context="") needs user_id
    # positionally; omitting it raised TypeError on every call.
    return await self.outline_optimizer.optimize(outline, focus, user_id, research_context=research_context)
def rebalance_word_counts(self, outline: List[BlogOutlineSection], target_words: int) -> List[BlogOutlineSection]:
"""Rebalance word count distribution across sections."""

View File

@@ -17,7 +17,7 @@ class ParallelProcessor:
self.source_mapper = source_mapper
self.grounding_engine = grounding_engine
async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None) -> Tuple[Any, Any]:
async def run_parallel_processing(self, outline_sections, research, user_id: str, task_id: str = None, competitive_advantage: str = "") -> Tuple[Any, Any]:
"""
Run source mapping and grounding insights extraction in parallel.
@@ -26,6 +26,7 @@ class ParallelProcessor:
research: Research data object
user_id: User ID (required for subscription checks and usage tracking)
task_id: Optional task ID for progress updates
competitive_advantage: Selected competitive advantage for preferential source matching
Returns:
Tuple of (mapped_sections, grounding_insights)
@@ -44,7 +45,7 @@ class ParallelProcessor:
# Run these tasks in parallel to save time
source_mapping_task = asyncio.create_task(
self._run_source_mapping(outline_sections, research, task_id, user_id)
self._run_source_mapping(outline_sections, research, task_id, user_id, competitive_advantage)
)
grounding_insights_task = asyncio.create_task(
@@ -59,7 +60,7 @@ class ParallelProcessor:
return mapped_sections, grounding_insights
async def run_parallel_processing_async(self, outline_sections, research, user_id: str) -> Tuple[Any, Any]:
async def run_parallel_processing_async(self, outline_sections, research, user_id: str, competitive_advantage: str = "") -> Tuple[Any, Any]:
"""
Run parallel processing without progress updates (for non-progress methods).
@@ -67,6 +68,7 @@ class ParallelProcessor:
outline_sections: List of outline sections to process
research: Research data object
user_id: User ID (required for subscription checks and usage tracking)
competitive_advantage: Selected competitive advantage for preferential source matching
Returns:
Tuple of (mapped_sections, grounding_insights)
@@ -81,7 +83,7 @@ class ParallelProcessor:
# Run these tasks in parallel to save time
source_mapping_task = asyncio.create_task(
self._run_source_mapping_async(outline_sections, research, user_id)
self._run_source_mapping_async(outline_sections, research, user_id, competitive_advantage)
)
grounding_insights_task = asyncio.create_task(
@@ -96,12 +98,12 @@ class ParallelProcessor:
return mapped_sections, grounding_insights
# Coroutine wrapper around the source mapper, used as one of the tasks
# scheduled by run_parallel_processing.
async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str):
async def _run_source_mapping(self, outline_sections, research, task_id, user_id: str, competitive_advantage: str = ""):
"""Run source mapping in parallel."""
# Progress is only reported when a task_id was supplied.
if task_id:
# Imported locally, inside the task_id branch only.
from api.blog_writer.task_manager import task_manager
await task_manager.update_progress(task_id, "🔗 Applying intelligent source-to-section mapping...")
# NOTE(review): map_sources_to_sections is called without await, so it
# appears to run synchronously inside this coroutine — confirm whether it
# blocks the event loop while the grounding task is pending.
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id, competitive_advantage=competitive_advantage)
async def _run_grounding_insights_extraction(self, research, task_id):
"""Run grounding insights extraction in parallel."""
@@ -110,10 +112,10 @@ class ParallelProcessor:
await task_manager.update_progress(task_id, "🧠 Extracting grounding metadata insights...")
return self.grounding_engine.extract_contextual_insights(research.grounding_metadata)
# Variant of the source-mapping coroutine that logs instead of sending
# task progress updates; competitive_advantage is forwarded to the mapper.
async def _run_source_mapping_async(self, outline_sections, research, user_id: str):
async def _run_source_mapping_async(self, outline_sections, research, user_id: str, competitive_advantage: str = ""):
"""Run source mapping in parallel (async version without progress updates)."""
logger.info("Applying intelligent source-to-section mapping...")
# NOTE(review): the mapper call is not awaited, so it appears to execute
# synchronously within this coroutine — confirm that is intended.
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id)
return self.source_mapper.map_sources_to_sections(outline_sections, research, user_id, competitive_advantage=competitive_advantage)
async def _run_grounding_insights_extraction_async(self, research):
"""Run grounding insights extraction in parallel (async version without progress updates)."""

View File

@@ -37,27 +37,60 @@ class PromptBuilder:
opportunity_text = ', '.join(research.competitor_analysis.get('opportunities', [])) if research and research.competitor_analysis else "Not available"
advantages_text = ', '.join(research.competitor_analysis.get('competitive_advantages', [])) if research and research.competitor_analysis else "Not available"
competitor_headings_text = ', '.join(research.competitor_analysis.get('competitor_headings', [])[:3]) if research and research.competitor_analysis and research.competitor_analysis.get('competitor_headings') else ""
content_gaps_text = ', '.join(research.competitor_analysis.get('content_gaps', [])) if research and research.competitor_analysis and research.competitor_analysis.get('content_gaps') else ""
industry_leaders_text = ', '.join(research.competitor_analysis.get('industry_leaders', [])) if research and research.competitor_analysis and research.competitor_analysis.get('industry_leaders') else ""
# Extract additional UI-mapped context fields
analysis_insights_text = (research.keyword_analysis.get('analysis_insights', '') or '') if research and research.keyword_analysis else ''
market_positioning_text = (research.competitor_analysis.get('market_positioning', '') or '') if research and research.competitor_analysis else ''
difficulty_score = research.keyword_analysis.get('difficulty', None) if research and research.keyword_analysis else None
# Extract search queries as intent signals
search_queries_text = ', '.join(research.search_queries) if research and hasattr(research, 'search_queries') and research.search_queries else ""
# Extract top 3 authoritative source excerpts as factual data points
# Build numbered source list — all sources with index, title, excerpt, and highlights
# The LLM will reference these indices when assigning sources to sections
source_list_text = ""
if sources:
source_lines = []
for i, src in enumerate(sources, 1):
title = getattr(src, 'title', '') or ''
excerpt = getattr(src, 'excerpt', '') or ''
highlights = getattr(src, 'highlights', []) or []
summary = getattr(src, 'summary', '') or ''
source_type = getattr(src, 'source_type', '') or ''
author = getattr(src, 'author', '') or ''
line = f" [{i}] {title}"
if source_type:
line += f" [{source_type}]"
if author:
line += f" by {author}"
if summary:
line += f"{summary[:1000]}"
elif excerpt:
line += f"{excerpt[:1000]}"
if highlights:
line += f" | Key findings: {'; '.join(h[:250] for h in highlights[:3])}"
source_lines.append(line)
if source_lines:
source_list_text = "RESEARCH SOURCES (numbered for reference):\n" + "\n".join(source_lines)
# Top factual excerpts for depth (keep as supplement)
source_excerpts_text = ""
if sources:
sorted_sources = sorted(
[s for s in sources if (s.excerpt or s.summary)],
key=lambda s: s.credibility_score or 0.8, reverse=True
)[:3]
)[:5]
excerpts = []
for i, src in enumerate(sorted_sources, 1):
excerpt = src.excerpt or src.summary or ""
if len(excerpt) > 300:
excerpt = excerpt[:297] + "..."
if len(excerpt) > 500:
excerpt = excerpt[:497] + "..."
excerpts.append(f" {i}. \"{src.title}\"{excerpt}")
if excerpts:
source_excerpts_text = "FACTUAL DATA POINTS FROM RESEARCH:\n" + "\n".join(excerpts)
source_excerpts_text = "DETAILED FACTS FROM TOP SOURCES:\n" + "\n".join(excerpts)
# Extract recency: newest source publication date
newest_date_str = ""
@@ -76,12 +109,12 @@ class PromptBuilder:
grounding_evidence_text = ""
if research and research.grounding_metadata and research.grounding_metadata.grounding_supports:
supports = research.grounding_metadata.grounding_supports
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:3]
top_supports = [s for s in supports if s.segment_text and len(s.segment_text) > 20][:5]
if top_supports:
evidence_parts = []
for i, s in enumerate(top_supports, 1):
text = s.segment_text[:250]
if len(s.segment_text) > 250:
text = s.segment_text[:400]
if len(s.segment_text) > 400:
text += "..."
evidence_parts.append(f" {i}. {text}")
grounding_evidence_text = "VERIFIED EVIDENCE (high-confidence snippets):\n" + "\n".join(evidence_parts)
@@ -151,8 +184,11 @@ Market Opportunities: {opportunity_text}
Competitive Advantages: {advantages_text}
{f"Market Positioning: {market_positioning_text}" if market_positioning_text else ""}
{f"Competitor Headings (AVOID duplicating): {competitor_headings_text}" if competitor_headings_text else ""}
{f"Content Gaps (MUST address these gaps): {content_gaps_text}" if content_gaps_text else ""}
{f"Industry Leaders: {industry_leaders_text}" if industry_leaders_text else ""}
{f"Search Intent Signals: {search_queries_text}" if search_queries_text else ""}
RESEARCH SOURCES: {len(sources)} authoritative sources available
{source_list_text}
{newest_date_str}
{source_excerpts_text}
@@ -168,8 +204,9 @@ STRATEGIC REQUIREMENTS:
- Create SEO-optimized headings with natural keyword integration
- Surface the strongest research-backed angles within the outline
- Build logical narrative flow from problem to solution
- Include data-driven insights from research sources
- Address content gaps and market opportunities
- Include data-driven insights from research sources — use the numbered sources above
- For each section, assign the most relevant source indices using the [N] numbers above
- Address content gaps and market opportunities — if content gaps are listed, dedicate sections to fill those gaps
- Optimize for search intent and user questions
- Ensure engaging, actionable content throughout
@@ -186,7 +223,8 @@ Return JSON format:
"subheadings": ["Subheading 1", "Subheading 2", "Subheading 3"],
"key_points": ["Key point 1", "Key point 2", "Key point 3"],
"target_words": 300,
"keywords": ["keyword 1", "keyword 2"]
"keywords": ["keyword 1", "keyword 2"],
"source_indices": [1, 3, 5]
}}
]
}}"""
@@ -220,9 +258,14 @@ Return JSON format:
"keywords": {
"type": "array",
"items": {"type": "string"}
},
"source_indices": {
"type": "array",
"items": {"type": "integer"},
"description": "Indices of research sources (from the numbered list above) that support this section"
}
},
"required": ["heading", "subheadings", "key_points", "target_words", "keywords"]
"required": ["heading", "subheadings", "key_points", "target_words", "keywords", "source_indices"]
}
}
},

View File

@@ -100,18 +100,37 @@ class ResponseProcessor:
raise ValueError(f"AI outline generation failed: {error_str}")
def convert_to_sections(self, outline_data: Dict[str, Any], sources: List) -> List[BlogOutlineSection]:
"""Convert outline data to BlogOutlineSection objects."""
"""Convert outline data to BlogOutlineSection objects.
If the LLM assigned source_indices to sections, populate references
directly from those indices. Indices are 1-based (matching the [N]
labels in the prompt) — converted to 0-based for list access.
Sections without source_indices will be populated by the algorithmic
source mapper in a later step.
"""
outline_sections = []
for i, section_data in enumerate(outline_data.get('outline', [])):
if not isinstance(section_data, dict) or 'heading' not in section_data:
continue
# Parse LLM-assigned source indices (1-based)
raw_indices = section_data.get('source_indices', [])
section_refs = []
if raw_indices and sources:
for idx in raw_indices:
try:
source_idx = int(idx) - 1 # Convert 1-based → 0-based
if 0 <= source_idx < len(sources):
section_refs.append(sources[source_idx])
except (ValueError, TypeError):
pass
section = BlogOutlineSection(
id=f"s{i+1}",
heading=section_data.get('heading', f'Section {i+1}'),
subheadings=section_data.get('subheadings', []),
key_points=section_data.get('key_points', []),
references=[], # Will be populated by intelligent mapping
references=section_refs, # LLM-assigned if provided, else []
target_words=section_data.get('target_words', 200),
keywords=section_data.get('keywords', [])
)

View File

@@ -41,10 +41,33 @@ class SourceToSectionMapper:
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'how', 'much', 'many', 'more', 'most',
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
'over', 'under', 'again', 'further', 'then', 'once'
'over', 'under', 'again', 'further', 'then', 'once', 'also', 'into', 'about', 'between',
'through', 'during', 'before', 'after', 'above', 'below', 'from', 'since', 'until', 'while',
'because', 'however', 'therefore', 'thus', 'hence', 'yet', 'still', 'already', 'even'
}
# Common abbreviation/synonym pairs for fuzzy matching
self._synonym_map = {
'ai': ['artificial intelligence', 'machine intelligence'],
'ml': ['machine learning'],
'dl': ['deep learning'],
'nlp': ['natural language processing'],
'iot': ['internet of things'],
'saas': ['software as a service'],
'b2b': ['business to business'],
'b2c': ['business to consumer'],
'cx': ['customer experience'],
'ux': ['user experience'],
'roi': ['return on investment'],
'kpi': ['key performance indicator'],
'crm': ['customer relationship management'],
'erp': ['enterprise resource planning'],
'seo': ['search engine optimization'],
'cto': ['chief technology officer'],
'vp': ['vice president'],
}
logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")
@@ -53,15 +76,21 @@ class SourceToSectionMapper:
self,
sections: List[BlogOutlineSection],
research_data: BlogResearchResponse,
user_id: str
user_id: str,
competitive_advantage: str = ""
) -> List[BlogOutlineSection]:
"""
Map research sources to outline sections using intelligent algorithms.
Sections that already have LLM-assigned references (from source_indices
in the outline prompt) are preserved. Algorithmic mapping fills gaps
for sections without LLM-assigned sources.
Args:
sections: List of outline sections to map sources to
research_data: Research data containing sources and metadata
user_id: User ID (required for subscription checks and usage tracking)
competitive_advantage: Selected competitive advantage to preferentially match
Returns:
List of outline sections with intelligently mapped sources
@@ -76,16 +105,39 @@ class SourceToSectionMapper:
logger.warning("No sections or sources to map")
return sections
logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")
# Separate sections with LLM-assigned references from those without
sections_with_refs = [s for s in sections if s.references]
sections_without_refs = [s for s in sections if not s.references]
# Step 1: Algorithmic mapping
mapping_results = self._algorithmic_source_mapping(sections, research_data)
logger.info(
f"Mapping {len(research_data.sources)} sources to {len(sections)} sections "
f"({len(sections_with_refs)} with LLM-assigned references, "
f"{len(sections_without_refs)} need algorithmic mapping)"
)
# Step 2: AI validation and improvement (single prompt, user_id required for subscription checks)
validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
if sections_without_refs:
# Step 1: Algorithmic mapping for sections without LLM-assigned references
mapping_results = self._algorithmic_source_mapping(sections_without_refs, research_data, competitive_advantage)
# Step 2: AI validation and improvement
validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
# Step 3: Apply mapping only to sections that need it
mapped_sections_with = self._apply_mapping_to_sections(sections_without_refs, validated_mapping)
else:
mapped_sections_with = []
# Step 3: Apply validated mapping to sections
mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)
# Combine: keep LLM-assigned sections as-is, add algorithmically mapped ones
mapped_sections = list(sections_with_refs) + mapped_sections_with
# Preserve original ordering
original_ids = [s.id for s in sections]
mapped_sections.sort(key=lambda s: original_ids.index(s.id) if s.id in original_ids else 999)
# Warn if any section still has zero references
for s in mapped_sections:
if not s.references:
logger.warning(f"Section '{s.heading}' (id={s.id}) has ZERO sources — content generator will use keyword-based fallback")
logger.info("✅ Source-to-section mapping completed successfully")
return mapped_sections
@@ -93,7 +145,8 @@ class SourceToSectionMapper:
def _algorithmic_source_mapping(
self,
sections: List[BlogOutlineSection],
research_data: BlogResearchResponse
research_data: BlogResearchResponse,
competitive_advantage: str = ""
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
"""
Perform algorithmic mapping of sources to sections.
@@ -101,6 +154,7 @@ class SourceToSectionMapper:
Args:
sections: List of outline sections
research_data: Research data with sources
competitive_advantage: Selected competitive advantage to boost matching
Returns:
Dictionary mapping section IDs to list of (source, score) tuples
@@ -114,7 +168,7 @@ class SourceToSectionMapper:
# Calculate multi-dimensional relevance score
semantic_score = self._calculate_semantic_similarity(section, source)
keyword_score = self._calculate_keyword_relevance(section, source, research_data)
contextual_score = self._calculate_contextual_relevance(section, source, research_data)
contextual_score = self._calculate_contextual_relevance(section, source, research_data, competitive_advantage)
# Weighted total score
total_score = (
@@ -140,38 +194,54 @@ class SourceToSectionMapper:
def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
"""
Calculate semantic similarity between section and source.
Args:
section: Outline section
source: Research source
Returns:
Semantic similarity score (0.0 to 1.0)
Uses word overlap, stem matching, bigram overlap, title-boost, and synonym expansion.
"""
# Extract text content for comparison
section_text = self._extract_section_text(section)
source_text = self._extract_source_text(source)
# Calculate word overlap
section_words = self._extract_meaningful_words(section_text)
source_words = self._extract_meaningful_words(source_text)
if not section_words or not source_words:
return 0.0
# Calculate Jaccard similarity
intersection = len(set(section_words) & set(source_words))
union = len(set(section_words) | set(source_words))
section_set = set(section_words)
source_set = set(source_words)
jaccard_similarity = intersection / union if union > 0 else 0.0
# 1. Jaccard similarity on raw words
intersection = len(section_set & source_set)
union = len(section_set | source_set)
jaccard = intersection / union if union > 0 else 0.0
# Boost score for exact phrase matches
phrase_boost = self._calculate_phrase_similarity(section_text, source_text)
# 2. Stem matching — catches word variants (e.g., "running" vs "runs")
section_stems = set(self._stem_word(w) for w in section_words)
source_stems = set(self._stem_word(w) for w in source_words)
stem_intersection = len(section_stems & source_stems)
stem_union = len(section_stems | source_stems)
stem_similarity = stem_intersection / stem_union if stem_union > 0 else 0.0
# Combine Jaccard similarity with phrase boost
semantic_score = min(1.0, jaccard_similarity + phrase_boost)
# 3. Bigram overlap — catches multi-word concepts (e.g., "machine learning")
section_bigrams = set(self._extract_bigrams(section_text))
source_bigrams = set(self._extract_bigrams(source_text))
bigram_overlap = len(section_bigrams & source_bigrams)
bigram_score = min(0.3, bigram_overlap * 0.1) if (section_bigrams or source_bigrams) else 0.0
return semantic_score
# 4. Title-boost — section heading matching source title is a strong signal
heading = (section.heading or '').lower()
source_title = (source.title or '').lower()
heading_words = set(self._extract_meaningful_words(heading))
title_words = set(self._extract_meaningful_words(source_title))
title_overlap = len(heading_words & title_words) / len(heading_words | title_words) if (heading_words or title_words) else 0.0
title_boost = min(0.3, title_overlap * 0.5)
# 5. Synonym expansion — expand abbreviations and match across synonym pairs
synonym_score = self._calculate_synonym_overlap(section_words, source_words)
# Combine: Jaccard + stem give base, bigram + title + synonyms boost
base_similarity = max(jaccard, stem_similarity)
combined = min(1.0, base_similarity + bigram_score + title_boost + synonym_score + 0.0)
return combined
def _calculate_keyword_relevance(
self,
@@ -219,7 +289,8 @@ class SourceToSectionMapper:
self,
section: BlogOutlineSection,
source: ResearchSource,
research_data: BlogResearchResponse
research_data: BlogResearchResponse,
competitive_advantage: str = ""
) -> float:
"""
Calculate contextual relevance based on section content and source context.
@@ -228,6 +299,7 @@ class SourceToSectionMapper:
section: Outline section
source: Research source
research_data: Research data with context
competitive_advantage: Selected competitive advantage to boost matching
Returns:
Contextual relevance score (0.0 to 1.0)
@@ -264,6 +336,15 @@ class SourceToSectionMapper:
industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
contextual_score += industry_score * 0.2
# 4. Competitive advantage boost — sources that match the advantage get a score lift
if competitive_advantage:
advantage_words = set(self._extract_meaningful_words(competitive_advantage.lower()))
if advantage_words:
advantage_in_section = sum(1 for w in advantage_words if w in section_text) / len(advantage_words)
advantage_in_source = sum(1 for w in advantage_words if w in source_text) / len(advantage_words)
if advantage_in_section > 0.3 and advantage_in_source > 0.3:
contextual_score += 0.25 * (advantage_in_section + advantage_in_source)
return min(1.0, contextual_score)
def _ai_validate_mapping(
@@ -360,10 +441,15 @@ class SourceToSectionMapper:
return " ".join(text_parts)
def _extract_source_text(self, source: ResearchSource) -> str:
"""Extract all text content from a source."""
"""Extract all text content from a source, including full text for better matching."""
text_parts = [source.title]
if source.summary:
text_parts.append(source.summary)
if source.excerpt:
text_parts.append(source.excerpt)
content = getattr(source, 'content', '') or ''
if content:
text_parts.append(content[:500])
return " ".join(text_parts)
def _extract_meaningful_words(self, text: str) -> List[str]:
@@ -382,6 +468,41 @@ class SourceToSectionMapper:
return meaningful_words
def _stem_word(self, word: str) -> str:
"""Rudimentary suffix-stripping stemmer for English words."""
if len(word) <= 3:
return word
for suffix in ['ization', 'ation', 'tion', 'sion', 'ment', 'ness', 'ity', 'ing', 'able', 'ible', 'ful', 'less', 'ous', 'ive', 'ally', 'ly', 'er', 'ed', 'es', 's']:
if word.endswith(suffix) and len(word) - len(suffix) >= 3:
return word[:-len(suffix)]
return word
def _extract_bigrams(self, text: str) -> List[str]:
"""Extract meaningful two-word phrases from text."""
words = self._extract_meaningful_words(text)
if len(words) < 2:
return []
return [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]
def _calculate_synonym_overlap(self, section_words: List[str], source_words: List[str]) -> float:
"""Score overlap via abbreviation/synonym expansion."""
section_set = set(section_words)
source_set = set(source_words)
extra_matches = 0
total_terms = len(section_set | source_set) or 1
for abbr, expansions in self._synonym_map.items():
abbr_in_section = abbr in section_set
abbr_in_source = abbr in source_set
for expansion in expansions:
exp_words = set(expansion.split())
exp_in_section = exp_words.issubset(section_set)
exp_in_source = exp_words.issubset(source_set)
if (abbr_in_section and exp_in_source) or (abbr_in_source and exp_in_section):
extra_matches += 1
return min(0.2, extra_matches * 0.05)
def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
"""Calculate phrase similarity boost score."""
if not text1 or not text2: