# Source file: ALwrity/backend/services/blog_writer/outline/source_mapper.py (Python; listed as 691 lines, 27 KiB)
"""
Source-to-Section Mapper - Intelligent mapping of research sources to outline sections.
This module provides algorithmic mapping of research sources to specific outline sections
based on semantic similarity, keyword relevance, and contextual matching. Uses a hybrid
approach of algorithmic scoring followed by AI validation for optimal results.
"""
from typing import Dict, Any, List, Tuple, Optional
import re
from collections import Counter
from loguru import logger
from models.blog_models import (
BlogOutlineSection,
ResearchSource,
BlogResearchResponse,
)
class SourceToSectionMapper:
    """Maps research sources to outline sections using intelligent algorithms.

    The mapping runs in three steps:

    1. Every (section, source) pair gets a weighted score built from a
       semantic, a keyword and a contextual component.
    2. The resulting mapping is sent to an LLM for validation; any failure in
       that step falls back to the algorithmic mapping.
    3. The final mapping is written into fresh ``BlogOutlineSection`` objects.
    """

    def __init__(self):
        """Initialize the source-to-section mapper."""
        # NOTE: the three per-dimension minimums below are not read by any
        # method of this class; only ``min_total_score`` gates a mapping. They
        # are kept because external code may read them.
        self.min_semantic_score = 0.3
        self.min_keyword_score = 0.2
        self.min_contextual_score = 0.2
        self.max_sources_per_section = 3
        self.min_total_score = 0.4

        # Weight factors for different scoring methods
        self.weights = {
            'semantic': 0.4,    # Semantic similarity weight
            'keyword': 0.3,     # Keyword matching weight
            'contextual': 0.3   # Contextual relevance weight
        }

        # Common stop words for text processing
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
            'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
            'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
            'over', 'under', 'again', 'further', 'then', 'once'
        }

        logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")

    def map_sources_to_sections(
        self,
        sections: List[BlogOutlineSection],
        research_data: BlogResearchResponse,
        user_id: str
    ) -> List[BlogOutlineSection]:
        """
        Map research sources to outline sections using intelligent algorithms.

        Args:
            sections: List of outline sections to map sources to
            research_data: Research data containing sources and metadata
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            List of outline sections with intelligently mapped sources. When
            there are no sections or no sources, ``sections`` is returned
            unchanged.

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for source mapping (subscription checks and usage tracking)")

        if not sections or not research_data.sources:
            logger.warning("No sections or sources to map")
            return sections

        logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")

        # Step 1: Algorithmic mapping
        mapping_results = self._algorithmic_source_mapping(sections, research_data)

        # Step 2: AI validation and improvement (single prompt, user_id required for subscription checks).
        # The sections are passed along so the prompt can describe them by heading, not just by ID.
        validated_mapping = self._ai_validate_mapping(
            mapping_results, research_data, user_id, sections=sections
        )

        # Step 3: Apply validated mapping to sections
        mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)

        logger.info("✅ Source-to-section mapping completed successfully")
        return mapped_sections

    def _algorithmic_source_mapping(
        self,
        sections: List[BlogOutlineSection],
        research_data: BlogResearchResponse
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Perform algorithmic mapping of sources to sections.

        Args:
            sections: List of outline sections
            research_data: Research data with sources

        Returns:
            Dictionary mapping section IDs to list of (source, score) tuples,
            best score first, at most ``max_sources_per_section`` per section.
            Every section gets an entry, possibly an empty list.
        """
        mapping_results = {}

        for section in sections:
            section_scores = []

            for source in research_data.sources:
                # Calculate multi-dimensional relevance score
                semantic_score = self._calculate_semantic_similarity(section, source)
                keyword_score = self._calculate_keyword_relevance(section, source, research_data)
                contextual_score = self._calculate_contextual_relevance(section, source, research_data)

                # Weighted total score
                total_score = (
                    semantic_score * self.weights['semantic'] +
                    keyword_score * self.weights['keyword'] +
                    contextual_score * self.weights['contextual']
                )

                # Only include sources that meet minimum threshold
                if total_score >= self.min_total_score:
                    section_scores.append((source, total_score))

            # Sort by score and limit to max sources per section
            section_scores.sort(key=lambda pair: pair[1], reverse=True)
            section_scores = section_scores[:self.max_sources_per_section]

            mapping_results[section.id] = section_scores
            logger.debug(f"Section '{section.heading}': {len(section_scores)} sources mapped")

        return mapping_results

    def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
        """
        Calculate semantic similarity between section and source.

        The score is the Jaccard similarity of the two meaningful-word sets
        plus a capped boost for shared 2- and 3-word phrases.

        Args:
            section: Outline section
            source: Research source

        Returns:
            Semantic similarity score (0.0 to 1.0)
        """
        section_text = self._extract_section_text(section)
        source_text = self._extract_source_text(source)

        section_words = set(self._extract_meaningful_words(section_text))
        source_words = set(self._extract_meaningful_words(source_text))

        if not section_words or not source_words:
            return 0.0

        # Both sets are non-empty here, so the union cannot be empty.
        jaccard_similarity = len(section_words & source_words) / len(section_words | source_words)

        # Boost score for exact phrase matches
        phrase_boost = self._calculate_phrase_similarity(section_text, source_text)

        return min(1.0, jaccard_similarity + phrase_boost)

    def _calculate_keyword_relevance(
        self,
        section: BlogOutlineSection,
        source: ResearchSource,
        research_data: BlogResearchResponse
    ) -> float:
        """
        Calculate keyword-based relevance between section and source.

        Section keywords and research keywords are broken into lowercased
        meaningful words before being compared with the source's words, so
        multi-word or mixed-case keywords such as ``"Content Marketing"`` can
        match a source that mentions "content" and "marketing".

        Args:
            section: Outline section
            source: Research source
            research_data: Research data with keyword analysis

        Returns:
            Keyword relevance score (0.0 to 1.0)
        """
        section_keywords = self._keyword_tokens(section.keywords)
        if not section_keywords:
            # No usable keywords: fall back to the words of the whole section.
            section_keywords = set(self._extract_meaningful_words(self._extract_section_text(section)))

        source_keywords = set(self._extract_meaningful_words(self._extract_source_text(source)))

        # Get research keywords for context
        keyword_analysis = research_data.keyword_analysis or {}
        research_keywords = set()
        for category in ('primary', 'secondary', 'long_tail', 'semantic_keywords'):
            research_keywords |= self._keyword_tokens(keyword_analysis.get(category))

        # Fraction of each keyword set that also appears in the source
        section_overlap = (
            len(section_keywords & source_keywords) / len(section_keywords) if section_keywords else 0.0
        )
        research_overlap = (
            len(research_keywords & source_keywords) / len(research_keywords) if research_keywords else 0.0
        )

        # Weighted combination
        keyword_score = (section_overlap * 0.7) + (research_overlap * 0.3)
        return min(1.0, keyword_score)

    def _calculate_contextual_relevance(
        self,
        section: BlogOutlineSection,
        source: ResearchSource,
        research_data: BlogResearchResponse
    ) -> float:
        """
        Calculate contextual relevance based on section content and source context.

        Three contributions are summed and capped at 1.0: suggested content
        angles, search-intent vocabulary, and (when present) the industry.

        Args:
            section: Outline section
            source: Research source
            research_data: Research data with context

        Returns:
            Contextual relevance score (0.0 to 1.0)
        """
        contextual_score = 0.0

        section_text = self._extract_section_text(section).lower()
        source_text = self._extract_source_text(source).lower()

        # 1. Content angle matching. Substring matching is kept on purpose:
        # it lets "market" match "marketing".
        for angle in research_data.suggested_angles or []:
            angle_words = self._extract_meaningful_words(str(angle))
            if not angle_words:
                continue
            section_angle_match = sum(1 for word in angle_words if word in section_text) / len(angle_words)
            source_angle_match = sum(1 for word in angle_words if word in source_text) / len(angle_words)
            contextual_score += (section_angle_match + source_angle_match) * 0.3

        # 2. Search intent alignment. Intent words are short ("how", "top",
        # "vs"), so they are matched as whole words; substring matching would
        # count "laptop" as "top" and "canvas" as "vs".
        keyword_analysis = research_data.keyword_analysis or {}
        search_intent = keyword_analysis.get('search_intent', 'informational')
        intent_keywords = self._get_intent_keywords(search_intent)
        present_words = set(re.findall(r'\w+', section_text)) | set(re.findall(r'\w+', source_text))

        intent_score = 0.0
        for keyword in intent_keywords:
            if keyword in present_words:
                intent_score += 0.1
        contextual_score += min(0.3, intent_score)

        # 3. Industry/domain relevance (the attribute is optional)
        industry = getattr(research_data, 'industry', None)
        if industry:
            industry_words = self._extract_meaningful_words(str(industry))
            if industry_words:
                industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words)
                contextual_score += industry_score * 0.2

        return min(1.0, contextual_score)

    def _ai_validate_mapping(
        self,
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse,
        user_id: str,
        sections: Optional[List[BlogOutlineSection]] = None
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Use AI to validate and improve the algorithmic mapping results.

        Args:
            mapping_results: Algorithmic mapping results
            research_data: Research data for context
            user_id: User ID (required for subscription checks and usage tracking)
            sections: Optional outline sections; when given, their headings and
                key points are included in the prompt

        Returns:
            AI-validated and improved mapping results, or ``mapping_results``
            unchanged if any part of the AI step fails

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for AI validation (subscription checks and usage tracking)")

        # Deliberately broad: AI validation is best-effort and must never
        # break the algorithmic result.
        try:
            logger.info("Starting AI validation of source-to-section mapping...")

            validation_prompt = self._build_validation_prompt(mapping_results, research_data, sections=sections)

            # Get AI validation response (user_id required for subscription checks)
            validation_response = self._get_ai_validation_response(validation_prompt, user_id)

            validated_mapping = self._parse_validation_response(validation_response, mapping_results, research_data)

            logger.info("✅ AI validation completed successfully")
            return validated_mapping

        except Exception as e:
            logger.warning(f"AI validation failed: {e}. Using algorithmic results as fallback.")
            return mapping_results

    def _apply_mapping_to_sections(
        self,
        sections: List[BlogOutlineSection],
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]
    ) -> List[BlogOutlineSection]:
        """
        Apply the mapping results to the outline sections.

        Args:
            sections: Original outline sections
            mapping_results: Mapping results from algorithmic/AI processing

        Returns:
            New section objects whose ``references`` hold the mapped sources
            (an empty list when nothing was mapped to a section)
        """
        mapped_sections = []

        for section in sections:
            mapped_sources = mapping_results.get(section.id, [])

            # Extract just the sources (without scores)
            section_sources = [source for source, _ in mapped_sources]

            # Create new section with mapped sources
            mapped_section = BlogOutlineSection(
                id=section.id,
                heading=section.heading,
                subheadings=section.subheadings,
                key_points=section.key_points,
                references=section_sources,
                target_words=section.target_words,
                keywords=section.keywords
            )

            mapped_sections.append(mapped_section)
            logger.debug(f"Applied {len(section_sources)} sources to section '{section.heading}'")

        return mapped_sections

    # Helper methods

    def _extract_section_text(self, section: BlogOutlineSection) -> str:
        """Extract all text content from a section (heading, subheadings, key points, keywords)."""
        text_parts = [section.heading]
        for group in (section.subheadings, section.key_points, section.keywords):
            text_parts.extend(group or [])
        # Skip empty/None entries so a missing field cannot break the join.
        return " ".join(str(part) for part in text_parts if part)

    def _extract_source_text(self, source: ResearchSource) -> str:
        """Extract all text content from a source (title plus excerpt when present)."""
        return " ".join(str(part) for part in (source.title, source.excerpt) if part)

    def _extract_meaningful_words(self, text: str) -> List[str]:
        """Extract meaningful words from text, removing stop words and cleaning.

        Returns lowercased alphabetic words longer than two characters that
        are not in ``self.stop_words``, in order of appearance (duplicates kept).
        """
        if not text:
            return []

        # Clean and tokenize
        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())

        # Remove stop words and short words
        return [word for word in words if word not in self.stop_words and len(word) > 2]

    def _keyword_tokens(self, keywords) -> set:
        """Break a list of keyword phrases (or a single phrase) into a set of meaningful words."""
        if not keywords:
            return set()
        if isinstance(keywords, str):
            keywords = [keywords]
        tokens = set()
        for keyword in keywords:
            tokens.update(self._extract_meaningful_words(str(keyword)))
        return tokens

    def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
        """Calculate phrase similarity boost score.

        Each 2-word phrase of ``text1`` that also occurs in ``text2`` adds 0.1
        and each shared 3-word phrase adds 0.15; the total is capped at 0.3.
        Phrases are compared word by word, ignoring case and punctuation, so
        "art work" does not match "smart working".
        """
        if not text1 or not text2:
            return 0.0

        words1 = re.findall(r'\w+', text1.lower())
        words2 = re.findall(r'\w+', text2.lower())

        bigrams2 = set(zip(words2, words2[1:]))
        trigrams2 = set(zip(words2, words2[1:], words2[2:]))

        phrase_boost = 0.0

        # 2-word phrases
        for bigram in zip(words1, words1[1:]):
            if bigram in bigrams2:
                phrase_boost += 0.1

        # 3-word phrases
        for trigram in zip(words1, words1[1:], words1[2:]):
            if trigram in trigrams2:
                phrase_boost += 0.15

        return min(0.3, phrase_boost)  # Cap at 0.3

    def _get_intent_keywords(self, search_intent: str) -> List[str]:
        """Get keywords associated with search intent (empty list for an unknown intent)."""
        intent_keywords = {
            'informational': ['what', 'how', 'why', 'guide', 'tutorial', 'explain', 'learn', 'understand'],
            'navigational': ['find', 'locate', 'search', 'where', 'site', 'website', 'page'],
            'transactional': ['buy', 'purchase', 'order', 'price', 'cost', 'deal', 'offer', 'discount'],
            'commercial': ['compare', 'review', 'best', 'top', 'vs', 'versus', 'alternative', 'option']
        }
        # Tolerate labels such as "Commercial" or " informational ".
        normalized_intent = str(search_intent).strip().lower()
        return intent_keywords.get(normalized_intent, [])

    def get_mapping_statistics(self, mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]) -> Dict[str, Any]:
        """
        Get statistics about the mapping results.

        Args:
            mapping_results: Mapping results to analyze

        Returns:
            Dictionary with mapping statistics (counts, score min/max/average
            and the fraction of sections that have at least one source)
        """
        total_sections = len(mapping_results)
        total_mappings = sum(len(sources) for sources in mapping_results.values())

        # Calculate score distribution
        all_scores = [score for sources in mapping_results.values() for _, score in sources]

        avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0
        max_score = max(all_scores) if all_scores else 0.0
        min_score = min(all_scores) if all_scores else 0.0

        # Count sections with/without sources
        sections_with_sources = sum(1 for sources in mapping_results.values() if sources)
        sections_without_sources = total_sections - sections_with_sources

        return {
            'total_sections': total_sections,
            'total_mappings': total_mappings,
            'sections_with_sources': sections_with_sources,
            'sections_without_sources': sections_without_sources,
            'average_score': avg_score,
            'max_score': max_score,
            'min_score': min_score,
            'mapping_coverage': sections_with_sources / total_sections if total_sections > 0 else 0.0
        }

    def _build_validation_prompt(
        self,
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse,
        sections: Optional[List[BlogOutlineSection]] = None
    ) -> str:
        """
        Build comprehensive AI validation prompt for source-to-section mapping.

        Args:
            mapping_results: Algorithmic mapping results
            research_data: Research data for context
            sections: Optional outline sections. When given, each section's
                heading and key points are listed next to its ID so the model
                has something to judge relevance against.

        Returns:
            Formatted AI validation prompt
        """
        sections_by_id = {section.id: section for section in (sections or [])}

        # Extract section information
        sections_info = []
        for section_id, sources in mapping_results.items():
            section = sections_by_id.get(section_id)
            sections_info.append({
                'id': section_id,
                'heading': getattr(section, 'heading', None),
                'key_points': getattr(section, 'key_points', None),
                'sources': [
                    {
                        'title': source.title,
                        'url': source.url,
                        'excerpt': source.excerpt,
                        'credibility_score': source.credibility_score,
                        'algorithmic_score': score
                    }
                    for source, score in sources
                ]
            })

        # Extract research context
        keyword_analysis = research_data.keyword_analysis or {}
        all_sources = [
            {
                'title': source.title,
                'url': source.url,
                'excerpt': source.excerpt,
                'credibility_score': source.credibility_score
            }
            for source in research_data.sources
        ]

        topic = ', '.join(str(keyword) for keyword in (keyword_analysis.get('primary') or []))
        search_intent = keyword_analysis.get('search_intent', 'informational')
        angles = ', '.join(str(angle) for angle in (research_data.suggested_angles or []))
        sections_block = self._format_sections_for_prompt(sections_info)
        sources_block = self._format_sources_for_prompt(all_sources)

        # The prompt text is left-aligned on purpose: leading whitespace would
        # become part of the string sent to the model.
        prompt = f"""
You are an expert content strategist and SEO specialist. Your task is to validate and improve the algorithmic mapping of research sources to blog outline sections.
## CONTEXT
Research Topic: {topic}
Search Intent: {search_intent}
Content Angles: {angles}
## ALGORITHMIC MAPPING RESULTS
The following sections have been algorithmically mapped with research sources:
{sections_block}
## AVAILABLE SOURCES
All available research sources:
{sources_block}
## VALIDATION TASK
Please analyze the algorithmic mapping and provide improvements:
1. **Validate Relevance**: Are the mapped sources truly relevant to each section's content and purpose?
2. **Identify Gaps**: Are there better sources available that weren't mapped?
3. **Suggest Improvements**: Recommend specific source changes for better content alignment
4. **Quality Assessment**: Rate the overall mapping quality (1-10)
## RESPONSE FORMAT
Provide your analysis in the following JSON format:
```json
{{
"overall_quality_score": 8,
"section_improvements": [
{{
"section_id": "s1",
"current_sources": ["source_title_1", "source_title_2"],
"recommended_sources": ["better_source_1", "better_source_2", "better_source_3"],
"reasoning": "Explanation of why these sources are better suited for this section",
"confidence": 0.9
}}
],
"summary": "Overall assessment of the mapping quality and key improvements made"
}}
```
## GUIDELINES
- Prioritize sources that directly support the section's key points and subheadings
- Consider source credibility, recency, and content depth
- Ensure sources provide actionable insights for content creation
- Maintain diversity in source types and perspectives
- Focus on sources that enhance the section's value proposition
Analyze the mapping and provide your recommendations.
"""
        return prompt

    def _get_ai_validation_response(self, prompt: str, user_id: str) -> str:
        """
        Get AI validation response using LLM provider.

        Args:
            prompt: Validation prompt
            user_id: User ID (required for subscription checks and usage tracking)

        Returns:
            AI validation response

        Raises:
            ValueError: If user_id is not provided
            Exception: Any error from importing or calling the LLM provider is
                logged and re-raised
        """
        if not user_id:
            raise ValueError("user_id is required for AI validation response (subscription checks and usage tracking)")

        try:
            # Imported lazily, as in the original code.
            from services.llm_providers.main_text_generation import llm_text_gen

            response = llm_text_gen(
                prompt=prompt,
                json_struct=None,
                system_prompt=None,
                user_id=user_id
            )
            return response

        except Exception as e:
            logger.error(f"Failed to get AI validation response: {e}")
            raise

    @staticmethod
    def _extract_json_object(response: str) -> Optional[Dict[str, Any]]:
        """Return the first JSON object found in ``response``, or None.

        A JSON object inside a ```json fence is preferred; otherwise the whole
        text is scanned. Decoding uses ``JSONDecoder.raw_decode`` from each
        candidate ``{`` so nested objects are handled, which a non-greedy
        ``\\{.*?\\}`` regex cannot do.
        """
        import json

        if not isinstance(response, str) or not response:
            return None

        decoder = json.JSONDecoder()
        fence = re.search(r'```json', response)
        # Try right after the fence first, then the whole response.
        scan_starts = [fence.end(), 0] if fence else [0]

        for scan_start in scan_starts:
            brace_at = response.find('{', scan_start)
            while brace_at != -1:
                try:
                    payload, _ = decoder.raw_decode(response, brace_at)
                except ValueError:
                    # Not valid JSON from this brace; try the next one.
                    payload = None
                if isinstance(payload, dict):
                    return payload
                brace_at = response.find('{', brace_at + 1)

        return None

    def _parse_validation_response(
        self,
        response: str,
        original_mapping: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Parse AI validation response and apply improvements.

        Malformed entries in ``section_improvements`` are skipped individually
        rather than discarding the whole response. Recommended titles are
        matched exactly first, then ignoring case and surrounding/repeated
        whitespace. Per section, duplicates are dropped and at most
        ``max_sources_per_section`` sources are kept.

        Args:
            response: AI validation response
            original_mapping: Original algorithmic mapping
            research_data: Research data for context

        Returns:
            Improved mapping based on AI validation; ``original_mapping`` if
            the response cannot be parsed
        """
        # Broad on purpose: an unusable AI answer must degrade to the
        # algorithmic mapping, never raise.
        try:
            validation_data = self._extract_json_object(response)
            if validation_data is None:
                logger.warning("Could not extract JSON from AI response")
                return original_mapping

            improvements = validation_data.get('section_improvements') or []
            if not isinstance(improvements, list):
                improvements = []

            def normalize_title(title) -> str:
                return " ".join(str(title).split()).casefold()

            # Create source lookups for quick access
            source_lookup = {source.title: source for source in research_data.sources}
            normalized_lookup = {}
            for source in research_data.sources:
                normalized_lookup.setdefault(normalize_title(source.title), source)

            # Apply AI improvements
            improved_mapping = {}
            for improvement in improvements:
                if not isinstance(improvement, dict):
                    continue
                section_id = improvement.get('section_id')
                if not isinstance(section_id, (str, int)):
                    continue

                recommended_titles = improvement.get('recommended_sources') or []
                if isinstance(recommended_titles, str):
                    recommended_titles = [recommended_titles]

                # Map recommended titles to actual sources
                recommended_sources = []
                already_added = set()
                for title in recommended_titles:
                    if not isinstance(title, str):
                        continue
                    source = source_lookup.get(title)
                    if source is None:
                        source = normalized_lookup.get(normalize_title(title))
                    if source is None or id(source) in already_added:
                        continue
                    already_added.add(id(source))
                    # Use high confidence score for AI-recommended sources
                    recommended_sources.append((source, 0.9))
                    if len(recommended_sources) >= self.max_sources_per_section:
                        break

                if recommended_sources:
                    improved_mapping[section_id] = recommended_sources
                else:
                    # Fallback to original mapping if no valid sources found
                    improved_mapping[section_id] = original_mapping.get(section_id, [])

            # Add sections not mentioned in AI response
            for section_id, sources in original_mapping.items():
                if section_id not in improved_mapping:
                    improved_mapping[section_id] = sources

            logger.info(f"AI validation applied: {len(improvements)} sections improved")
            return improved_mapping

        except Exception as e:
            logger.warning(f"Failed to parse AI validation response: {e}")
            return original_mapping

    def _format_sections_for_prompt(self, sections_info: List[Dict]) -> str:
        """Format sections information for AI prompt.

        Each entry needs ``id`` and ``sources``; ``heading`` and
        ``key_points`` are optional and printed only when present.
        """
        formatted = []

        for section in sections_info:
            lines = [f"**Section {section['id']}:**\n"]

            heading = section.get('heading')
            if heading:
                lines.append(f"Heading: {heading}\n")

            key_points = section.get('key_points')
            if key_points:
                lines.append("Key points: " + "; ".join(str(point) for point in key_points) + "\n")

            lines.append(f"Sources mapped: {len(section['sources'])}\n")
            for source in section['sources']:
                lines.append(f"- {source['title']} (Score: {source['algorithmic_score']:.2f})\n")

            formatted.append("".join(lines))

        return "\n".join(formatted)

    def _format_sources_for_prompt(self, sources: List[Dict]) -> str:
        """Format sources information for AI prompt as a numbered list."""
        formatted = []

        for i, source in enumerate(sources, 1):
            lines = [
                f"{i}. **{source['title']}**\n",
                f" URL: {source['url']}\n",
                f" Credibility: {source['credibility_score']}\n",
            ]
            if source['excerpt']:
                # Only the first 200 characters of the excerpt are sent.
                lines.append(f" Excerpt: {source['excerpt'][:200]}...\n")
            formatted.append("".join(lines))

        return "\n".join(formatted)