- Blog writer enhancements and bug fixes - Wix integration improvements - Frontend UI updates - GSC dashboard docs cleanup - Image studio assets - LinkedIn requirements file - Various dependency updates
812 lines
34 KiB
Python
812 lines
34 KiB
Python
"""
Source-to-Section Mapper - Intelligent mapping of research sources to outline sections.

This module provides algorithmic mapping of research sources to specific outline sections
based on semantic similarity, keyword relevance, and contextual matching. Uses a hybrid
approach of algorithmic scoring followed by AI validation for optimal results.
"""
|
|
|
|
from typing import Dict, Any, List, Tuple, Optional
|
|
import re
|
|
from collections import Counter
|
|
from loguru import logger
|
|
|
|
from models.blog_models import (
|
|
BlogOutlineSection,
|
|
ResearchSource,
|
|
BlogResearchResponse,
|
|
)
|
|
|
|
|
|
class SourceToSectionMapper:
|
|
"""Maps research sources to outline sections using intelligent algorithms."""
|
|
|
|
def __init__(self):
|
|
"""Initialize the source-to-section mapper."""
|
|
self.min_semantic_score = 0.3
|
|
self.min_keyword_score = 0.2
|
|
self.min_contextual_score = 0.2
|
|
self.max_sources_per_section = 3
|
|
self.min_total_score = 0.4
|
|
|
|
# Weight factors for different scoring methods
|
|
self.weights = {
|
|
'semantic': 0.4, # Semantic similarity weight
|
|
'keyword': 0.3, # Keyword matching weight
|
|
'contextual': 0.3 # Contextual relevance weight
|
|
}
|
|
|
|
# Common stop words for text processing
|
|
self.stop_words = {
|
|
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
|
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
|
|
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
|
|
'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
|
|
'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
|
|
'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
|
|
'over', 'under', 'again', 'further', 'then', 'once', 'also', 'into', 'about', 'between',
|
|
'through', 'during', 'before', 'after', 'above', 'below', 'from', 'since', 'until', 'while',
|
|
'because', 'however', 'therefore', 'thus', 'hence', 'yet', 'still', 'already', 'even'
|
|
}
|
|
|
|
# Common abbreviation/synonym pairs for fuzzy matching
|
|
self._synonym_map = {
|
|
'ai': ['artificial intelligence', 'machine intelligence'],
|
|
'ml': ['machine learning'],
|
|
'dl': ['deep learning'],
|
|
'nlp': ['natural language processing'],
|
|
'iot': ['internet of things'],
|
|
'saas': ['software as a service'],
|
|
'b2b': ['business to business'],
|
|
'b2c': ['business to consumer'],
|
|
'cx': ['customer experience'],
|
|
'ux': ['user experience'],
|
|
'roi': ['return on investment'],
|
|
'kpi': ['key performance indicator'],
|
|
'crm': ['customer relationship management'],
|
|
'erp': ['enterprise resource planning'],
|
|
'seo': ['search engine optimization'],
|
|
'cto': ['chief technology officer'],
|
|
'vp': ['vice president'],
|
|
}
|
|
|
|
logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")
|
|
|
|
def map_sources_to_sections(
    self,
    sections: List[BlogOutlineSection],
    research_data: BlogResearchResponse,
    user_id: str,
    competitive_advantage: str = ""
) -> List[BlogOutlineSection]:
    """
    Map research sources to outline sections using intelligent algorithms.

    Sections whose ``references`` are already populated are kept exactly as
    they are. Sections with no references go through algorithmic scoring,
    AI validation and are then rebuilt with the selected sources. The
    returned list follows the order of ``sections``.

    Args:
        sections: List of outline sections to map sources to
        research_data: Research data containing sources and metadata
        user_id: User ID (required for subscription checks and usage tracking)
        competitive_advantage: Selected competitive advantage to preferentially match

    Returns:
        List of outline sections with mapped sources. When there are no
        sections or no sources, ``sections`` is returned unchanged.

    Raises:
        ValueError: If user_id is not provided
    """
    if not user_id:
        raise ValueError("user_id is required for source mapping (subscription checks and usage tracking)")

    if not sections or not research_data.sources:
        logger.warning("No sections or sources to map")
        return sections

    # Separate sections that already carry references from those that do not.
    sections_with_refs = [s for s in sections if s.references]
    sections_without_refs = [s for s in sections if not s.references]

    logger.info(
        f"Mapping {len(research_data.sources)} sources to {len(sections)} sections "
        f"({len(sections_with_refs)} with LLM-assigned references, "
        f"{len(sections_without_refs)} need algorithmic mapping)"
    )

    newly_mapped = []
    if sections_without_refs:
        # Step 1: algorithmic scoring for sections lacking references
        mapping_results = self._algorithmic_source_mapping(sections_without_refs, research_data, competitive_advantage)

        # Step 2: AI validation (falls back to the algorithmic result on failure)
        validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)

        # Step 3: rebuild only the sections that needed mapping
        newly_mapped = self._apply_mapping_to_sections(sections_without_refs, validated_mapping)

    mapped_sections = list(sections_with_refs) + list(newly_mapped)

    # Restore the caller's ordering. The position table is built once so the
    # sort key is an O(1) lookup; setdefault keeps the first occurrence of a
    # repeated id, matching what list.index() would have returned.
    position = {}
    for index, section in enumerate(sections):
        position.setdefault(section.id, index)
    after_all_known = len(sections)
    mapped_sections.sort(key=lambda s: position.get(s.id, after_all_known))

    # Surface sections that ended up without any source.
    for s in mapped_sections:
        if not s.references:
            logger.warning(f"Section '{s.heading}' (id={s.id}) has ZERO sources — content generator will use keyword-based fallback")

    logger.info("✅ Source-to-section mapping completed successfully")
    return mapped_sections
|
|
|
|
def _algorithmic_source_mapping(
|
|
self,
|
|
sections: List[BlogOutlineSection],
|
|
research_data: BlogResearchResponse,
|
|
competitive_advantage: str = ""
|
|
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
|
|
"""
|
|
Perform algorithmic mapping of sources to sections.
|
|
|
|
Args:
|
|
sections: List of outline sections
|
|
research_data: Research data with sources
|
|
competitive_advantage: Selected competitive advantage to boost matching
|
|
|
|
Returns:
|
|
Dictionary mapping section IDs to list of (source, score) tuples
|
|
"""
|
|
mapping_results = {}
|
|
|
|
for section in sections:
|
|
section_scores = []
|
|
|
|
for source in research_data.sources:
|
|
# Calculate multi-dimensional relevance score
|
|
semantic_score = self._calculate_semantic_similarity(section, source)
|
|
keyword_score = self._calculate_keyword_relevance(section, source, research_data)
|
|
contextual_score = self._calculate_contextual_relevance(section, source, research_data, competitive_advantage)
|
|
|
|
# Weighted total score
|
|
total_score = (
|
|
semantic_score * self.weights['semantic'] +
|
|
keyword_score * self.weights['keyword'] +
|
|
contextual_score * self.weights['contextual']
|
|
)
|
|
|
|
# Only include sources that meet minimum threshold
|
|
if total_score >= self.min_total_score:
|
|
section_scores.append((source, total_score))
|
|
|
|
# Sort by score and limit to max sources per section
|
|
section_scores.sort(key=lambda x: x[1], reverse=True)
|
|
section_scores = section_scores[:self.max_sources_per_section]
|
|
|
|
mapping_results[section.id] = section_scores
|
|
|
|
logger.debug(f"Section '{section.heading}': {len(section_scores)} sources mapped")
|
|
|
|
return mapping_results
|
|
|
|
def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
    """
    Score the lexical closeness of a section and a source.

    The base is the larger of a raw-word Jaccard ratio and a stemmed-word
    Jaccard ratio. Three capped boosts are added on top: shared bigrams
    (max 0.3), heading/title overlap (max 0.3) and synonym matches (as
    returned by ``_calculate_synonym_overlap``). The sum is clamped to 1.0.
    Returns 0.0 when either side has no meaningful words.
    """
    sec_text = self._extract_section_text(section)
    src_text = self._extract_source_text(source)

    sec_tokens = self._extract_meaningful_words(sec_text)
    src_tokens = self._extract_meaningful_words(src_text)
    if not (sec_tokens and src_tokens):
        return 0.0

    def overlap_ratio(left, right):
        """Jaccard ratio of two sets; 0.0 when both are empty."""
        combined_size = len(left | right)
        return len(left & right) / combined_size if combined_size > 0 else 0.0

    # 1. Jaccard on the raw tokens
    jaccard = overlap_ratio(set(sec_tokens), set(src_tokens))

    # 2. Jaccard on stems, so inflections such as "learning" / "learns" meet
    stem_similarity = overlap_ratio(
        {self._stem_word(token) for token in sec_tokens},
        {self._stem_word(token) for token in src_tokens},
    )

    # 3. Shared bigrams reward multi-word concepts: 0.1 each, capped at 0.3
    sec_bigrams = set(self._extract_bigrams(sec_text))
    src_bigrams = set(self._extract_bigrams(src_text))
    if sec_bigrams or src_bigrams:
        bigram_score = min(0.3, len(sec_bigrams & src_bigrams) * 0.1)
    else:
        bigram_score = 0.0

    # 4. Heading-vs-title overlap: half the Jaccard ratio, capped at 0.3
    heading_tokens = set(self._extract_meaningful_words((section.heading or '').lower()))
    title_tokens = set(self._extract_meaningful_words((source.title or '').lower()))
    if heading_tokens or title_tokens:
        title_overlap = len(heading_tokens & title_tokens) / len(heading_tokens | title_tokens)
    else:
        title_overlap = 0.0
    title_boost = min(0.3, title_overlap * 0.5)

    # 5. Abbreviation/expansion matches across the two token lists
    synonym_score = self._calculate_synonym_overlap(sec_tokens, src_tokens)

    # Base similarity plus boosts, never above 1.0
    base_similarity = max(jaccard, stem_similarity)
    return min(1.0, base_similarity + bigram_score + title_boost + synonym_score)
|
|
|
|
def _calculate_keyword_relevance(
    self,
    section: BlogOutlineSection,
    source: ResearchSource,
    research_data: BlogResearchResponse
) -> float:
    """
    Calculate keyword-based relevance between section and source.

    All three keyword pools (section keywords, source title/excerpt and the
    research keyword categories) are reduced to the same token form with
    ``_extract_meaningful_words`` before they are intersected. Comparing raw
    keyword phrases against single lowercase source tokens would make any
    multi-word or capitalized keyword unmatchable.

    Args:
        section: Outline section
        source: Research source
        research_data: Research data with keyword analysis

    Returns:
        Keyword relevance score (0.0 to 1.0): 70% share of section terms
        found in the source plus 30% share of research terms found in it.
    """
    # Section terms: explicit keywords first, whole section text as fallback.
    keyword_text = " ".join(str(k) for k in (section.keywords or []) if k)
    section_terms = set(self._extract_meaningful_words(keyword_text))
    if not section_terms:
        section_terms = set(self._extract_meaningful_words(self._extract_section_text(section)))

    # Source terms come from title and excerpt; a missing title is treated
    # as empty rather than being rendered as the word "None".
    source_text = f"{source.title or ''} {source.excerpt or ''}"
    source_terms = set(self._extract_meaningful_words(source_text))

    # Research terms: flatten every keyword category into tokens.
    keyword_analysis = research_data.keyword_analysis or {}
    research_phrases = []
    for category in ('primary', 'secondary', 'long_tail', 'semantic_keywords'):
        values = keyword_analysis.get(category) or []
        if isinstance(values, str):
            # A bare string would otherwise be iterated character by character.
            values = [values]
        research_phrases.extend(str(value) for value in values if value)
    research_terms = set(self._extract_meaningful_words(" ".join(research_phrases)))

    section_overlap = len(section_terms & source_terms) / len(section_terms) if section_terms else 0.0
    research_overlap = len(research_terms & source_terms) / len(research_terms) if research_terms else 0.0

    # Weighted combination
    keyword_score = (section_overlap * 0.7) + (research_overlap * 0.3)
    return min(1.0, keyword_score)
|
|
|
|
def _calculate_contextual_relevance(
    self,
    section: BlogOutlineSection,
    source: ResearchSource,
    research_data: BlogResearchResponse,
    competitive_advantage: str = ""
) -> float:
    """
    Calculate contextual relevance based on section content and source context.

    Four additive signals, clamped to 1.0 at the end:
      1. suggested content angles found in the section and in the source,
      2. search-intent keywords present as whole words (max 0.3),
      3. industry words found in the source (max 0.2),
      4. competitive-advantage words, counted only when more than 30% of
         them occur in both the section and the source.

    Args:
        section: Outline section
        source: Research source
        research_data: Research data with context
        competitive_advantage: Selected competitive advantage to boost matching

    Returns:
        Contextual relevance score (0.0 to 1.0)
    """
    contextual_score = 0.0

    section_text = self._extract_section_text(section).lower()
    # A missing title is treated as empty instead of the literal "None".
    source_text = f"{source.title or ''} {source.excerpt or ''}".lower()

    # 1. Content angle matching. This uses substring containment, so an
    #    angle word such as "market" also matches "marketing".
    for angle in (research_data.suggested_angles or []):
        angle_words = self._extract_meaningful_words(angle.lower())
        if angle_words:
            section_angle_match = sum(1 for word in angle_words if word in section_text) / len(angle_words)
            source_angle_match = sum(1 for word in angle_words if word in source_text) / len(angle_words)
            contextual_score += (section_angle_match + source_angle_match) * 0.3

    # 2. Search intent alignment. Intent keywords are short ("how", "top",
    #    "vs"), so they are matched against whole words only; substring
    #    matching would fire on "show", "laptop" or "tvs".
    keyword_analysis = research_data.keyword_analysis or {}
    search_intent = keyword_analysis.get('search_intent', 'informational')
    intent_keywords = self._get_intent_keywords(search_intent)
    text_tokens = set(re.findall(r'[a-z0-9]+', f"{section_text} {source_text}"))
    intent_hits = sum(1 for keyword in intent_keywords if keyword in text_tokens)
    contextual_score += min(0.3, intent_hits * 0.1)

    # 3. Industry/domain relevance (only if research_data exposes an industry)
    industry = getattr(research_data, 'industry', None)
    if industry:
        industry_words = self._extract_meaningful_words(industry.lower())
        industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
        contextual_score += industry_score * 0.2

    # 4. Competitive advantage boost — both sides must mention enough of it
    if competitive_advantage:
        advantage_words = set(self._extract_meaningful_words(competitive_advantage.lower()))
        if advantage_words:
            advantage_in_section = sum(1 for w in advantage_words if w in section_text) / len(advantage_words)
            advantage_in_source = sum(1 for w in advantage_words if w in source_text) / len(advantage_words)
            if advantage_in_section > 0.3 and advantage_in_source > 0.3:
                contextual_score += 0.25 * (advantage_in_section + advantage_in_source)

    return min(1.0, contextual_score)
|
|
|
|
def _ai_validate_mapping(
|
|
self,
|
|
mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
|
|
research_data: BlogResearchResponse,
|
|
user_id: str
|
|
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
|
|
"""
|
|
Use AI to validate and improve the algorithmic mapping results.
|
|
|
|
Args:
|
|
mapping_results: Algorithmic mapping results
|
|
research_data: Research data for context
|
|
user_id: User ID (required for subscription checks and usage tracking)
|
|
|
|
Returns:
|
|
AI-validated and improved mapping results
|
|
|
|
Raises:
|
|
ValueError: If user_id is not provided
|
|
"""
|
|
if not user_id:
|
|
raise ValueError("user_id is required for AI validation (subscription checks and usage tracking)")
|
|
|
|
try:
|
|
logger.info("Starting AI validation of source-to-section mapping...")
|
|
|
|
# Build AI validation prompt
|
|
validation_prompt = self._build_validation_prompt(mapping_results, research_data)
|
|
|
|
# Get AI validation response (user_id required for subscription checks)
|
|
validation_response = self._get_ai_validation_response(validation_prompt, user_id)
|
|
|
|
# Parse and apply AI validation results
|
|
validated_mapping = self._parse_validation_response(validation_response, mapping_results, research_data)
|
|
|
|
logger.info("✅ AI validation completed successfully")
|
|
return validated_mapping
|
|
|
|
except Exception as e:
|
|
logger.warning(f"AI validation failed: {e}. Using algorithmic results as fallback.")
|
|
return mapping_results
|
|
|
|
def _apply_mapping_to_sections(
|
|
self,
|
|
sections: List[BlogOutlineSection],
|
|
mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]
|
|
) -> List[BlogOutlineSection]:
|
|
"""
|
|
Apply the mapping results to the outline sections.
|
|
|
|
Args:
|
|
sections: Original outline sections
|
|
mapping_results: Mapping results from algorithmic/AI processing
|
|
|
|
Returns:
|
|
Sections with mapped sources
|
|
"""
|
|
mapped_sections = []
|
|
|
|
for section in sections:
|
|
# Get mapped sources for this section
|
|
mapped_sources = mapping_results.get(section.id, [])
|
|
|
|
# Extract just the sources (without scores)
|
|
section_sources = [source for source, score in mapped_sources]
|
|
|
|
# Create new section with mapped sources
|
|
mapped_section = BlogOutlineSection(
|
|
id=section.id,
|
|
heading=section.heading,
|
|
subheadings=section.subheadings,
|
|
key_points=section.key_points,
|
|
references=section_sources,
|
|
target_words=section.target_words,
|
|
keywords=section.keywords
|
|
)
|
|
|
|
mapped_sections.append(mapped_section)
|
|
|
|
logger.debug(f"Applied {len(section_sources)} sources to section '{section.heading}'")
|
|
|
|
return mapped_sections
|
|
|
|
# Helper methods
|
|
|
|
def _extract_section_text(self, section: BlogOutlineSection) -> str:
|
|
"""Extract all text content from a section."""
|
|
text_parts = [section.heading]
|
|
text_parts.extend(section.subheadings)
|
|
text_parts.extend(section.key_points)
|
|
text_parts.extend(section.keywords)
|
|
return " ".join(text_parts)
|
|
|
|
def _extract_source_text(self, source: ResearchSource) -> str:
|
|
"""Extract all text content from a source, including full text for better matching."""
|
|
text_parts = [source.title]
|
|
if source.summary:
|
|
text_parts.append(source.summary)
|
|
if source.excerpt:
|
|
text_parts.append(source.excerpt)
|
|
content = getattr(source, 'content', '') or ''
|
|
if content:
|
|
text_parts.append(content[:500])
|
|
return " ".join(text_parts)
|
|
|
|
def _extract_meaningful_words(self, text: str) -> List[str]:
|
|
"""Extract meaningful words from text, removing stop words and cleaning."""
|
|
if not text:
|
|
return []
|
|
|
|
# Clean and tokenize
|
|
words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
|
|
|
|
# Remove stop words and short words
|
|
meaningful_words = [
|
|
word for word in words
|
|
if word not in self.stop_words and len(word) > 2
|
|
]
|
|
|
|
return meaningful_words
|
|
|
|
def _stem_word(self, word: str) -> str:
|
|
"""Rudimentary suffix-stripping stemmer for English words."""
|
|
if len(word) <= 3:
|
|
return word
|
|
for suffix in ['ization', 'ation', 'tion', 'sion', 'ment', 'ness', 'ity', 'ing', 'able', 'ible', 'ful', 'less', 'ous', 'ive', 'ally', 'ly', 'er', 'ed', 'es', 's']:
|
|
if word.endswith(suffix) and len(word) - len(suffix) >= 3:
|
|
return word[:-len(suffix)]
|
|
return word
|
|
|
|
def _extract_bigrams(self, text: str) -> List[str]:
    """Return every pair of adjacent meaningful words as a "first second" string.

    Adjacency is measured after stop-word removal, so the two words of a
    bigram may have been separated by a stop word in the original text.
    Fewer than two meaningful words gives an empty list.
    """
    tokens = self._extract_meaningful_words(text)
    # Pairing the list with itself shifted by one walks all neighbours.
    return [f"{first} {second}" for first, second in zip(tokens, tokens[1:])]
|
|
|
|
def _calculate_synonym_overlap(self, section_words: List[str], source_words: List[str]) -> float:
|
|
"""Score overlap via abbreviation/synonym expansion."""
|
|
section_set = set(section_words)
|
|
source_set = set(source_words)
|
|
extra_matches = 0
|
|
total_terms = len(section_set | source_set) or 1
|
|
|
|
for abbr, expansions in self._synonym_map.items():
|
|
abbr_in_section = abbr in section_set
|
|
abbr_in_source = abbr in source_set
|
|
for expansion in expansions:
|
|
exp_words = set(expansion.split())
|
|
exp_in_section = exp_words.issubset(section_set)
|
|
exp_in_source = exp_words.issubset(source_set)
|
|
if (abbr_in_section and exp_in_source) or (abbr_in_source and exp_in_section):
|
|
extra_matches += 1
|
|
|
|
return min(0.2, extra_matches * 0.05)
|
|
|
|
def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
|
|
"""Calculate phrase similarity boost score."""
|
|
if not text1 or not text2:
|
|
return 0.0
|
|
|
|
text1_lower = text1.lower()
|
|
text2_lower = text2.lower()
|
|
|
|
# Look for 2-3 word phrases
|
|
phrase_boost = 0.0
|
|
|
|
# Extract 2-word phrases
|
|
words1 = text1_lower.split()
|
|
words2 = text2_lower.split()
|
|
|
|
for i in range(len(words1) - 1):
|
|
phrase = f"{words1[i]} {words1[i+1]}"
|
|
if phrase in text2_lower:
|
|
phrase_boost += 0.1
|
|
|
|
# Extract 3-word phrases
|
|
for i in range(len(words1) - 2):
|
|
phrase = f"{words1[i]} {words1[i+1]} {words1[i+2]}"
|
|
if phrase in text2_lower:
|
|
phrase_boost += 0.15
|
|
|
|
return min(0.3, phrase_boost) # Cap at 0.3
|
|
|
|
def _get_intent_keywords(self, search_intent: str) -> List[str]:
|
|
"""Get keywords associated with search intent."""
|
|
intent_keywords = {
|
|
'informational': ['what', 'how', 'why', 'guide', 'tutorial', 'explain', 'learn', 'understand'],
|
|
'navigational': ['find', 'locate', 'search', 'where', 'site', 'website', 'page'],
|
|
'transactional': ['buy', 'purchase', 'order', 'price', 'cost', 'deal', 'offer', 'discount'],
|
|
'commercial': ['compare', 'review', 'best', 'top', 'vs', 'versus', 'alternative', 'option']
|
|
}
|
|
|
|
return intent_keywords.get(search_intent, [])
|
|
|
|
def get_mapping_statistics(self, mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]) -> Dict[str, Any]:
    """
    Summarize a mapping: counts, score spread and coverage.

    Args:
        mapping_results: Section id -> list of (source, score) tuples

    Returns:
        Dict with the keys total_sections, total_mappings,
        sections_with_sources, sections_without_sources, average_score,
        max_score, min_score and mapping_coverage. Score figures are 0.0
        when there are no mappings; coverage is 0.0 when there are no sections.
    """
    per_section = list(mapping_results.values())

    section_count = len(mapping_results)
    mapping_count = sum(len(pairs) for pairs in per_section)

    # Flatten every score into one list for the distribution figures.
    scores = [score for pairs in per_section for _source, score in pairs]
    if scores:
        average = sum(scores) / len(scores)
        highest = max(scores)
        lowest = min(scores)
    else:
        average = highest = lowest = 0.0

    # A section counts as covered as soon as it has one source.
    covered = sum(1 for pairs in per_section if pairs)
    coverage = covered / section_count if section_count > 0 else 0.0

    return {
        'total_sections': section_count,
        'total_mappings': mapping_count,
        'sections_with_sources': covered,
        'sections_without_sources': section_count - covered,
        'average_score': average,
        'max_score': highest,
        'min_score': lowest,
        'mapping_coverage': coverage
    }
|
|
|
|
def _build_validation_prompt(
    self,
    mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
    research_data: BlogResearchResponse
) -> str:
    """
    Assemble the LLM prompt that asks for a review of the algorithmic mapping.

    The prompt lists the research context (primary keywords, search intent,
    content angles), the current section-to-source mapping, every available
    source, and the JSON format the reply must follow.

    Args:
        mapping_results: Section id -> list of (source, score) tuples
        research_data: Research data supplying keywords, angles and sources

    Returns:
        The complete prompt text
    """
    # Per-section view of the current mapping.
    sections_info = [
        {
            'id': section_id,
            'sources': [
                {
                    'title': mapped.title,
                    'url': mapped.url,
                    'excerpt': mapped.excerpt,
                    'credibility_score': mapped.credibility_score,
                    'algorithmic_score': score
                }
                for mapped, score in scored_sources
            ]
        }
        for section_id, scored_sources in mapping_results.items()
    ]

    # Research context shown to the model.
    keyword_analysis = research_data.keyword_analysis
    primary_keywords = keyword_analysis.get('primary', [])
    content_angles = research_data.suggested_angles
    search_intent = keyword_analysis.get('search_intent', 'informational')
    all_sources = [
        {
            'title': candidate.title,
            'url': candidate.url,
            'excerpt': candidate.excerpt,
            'credibility_score': candidate.credibility_score
        }
        for candidate in research_data.sources
    ]

    topic_line = ', '.join(primary_keywords)
    angles_line = ', '.join(content_angles)
    mapped_block = self._format_sections_for_prompt(sections_info)
    sources_block = self._format_sources_for_prompt(all_sources)

    return f"""
You are an expert content strategist and SEO specialist. Your task is to validate and improve the algorithmic mapping of research sources to blog outline sections.

## CONTEXT
Research Topic: {topic_line}
Search Intent: {search_intent}
Content Angles: {angles_line}

## ALGORITHMIC MAPPING RESULTS
The following sections have been algorithmically mapped with research sources:

{mapped_block}

## AVAILABLE SOURCES
All available research sources:
{sources_block}

## VALIDATION TASK
Please analyze the algorithmic mapping and provide improvements:

1. **Validate Relevance**: Are the mapped sources truly relevant to each section's content and purpose?
2. **Identify Gaps**: Are there better sources available that weren't mapped?
3. **Suggest Improvements**: Recommend specific source changes for better content alignment
4. **Quality Assessment**: Rate the overall mapping quality (1-10)

## RESPONSE FORMAT
Provide your analysis in the following JSON format:

```json
{{
"overall_quality_score": 8,
"section_improvements": [
{{
"section_id": "s1",
"current_sources": ["source_title_1", "source_title_2"],
"recommended_sources": ["better_source_1", "better_source_2", "better_source_3"],
"reasoning": "Explanation of why these sources are better suited for this section",
"confidence": 0.9
}}
],
"summary": "Overall assessment of the mapping quality and key improvements made"
}}
```

## GUIDELINES
- Prioritize sources that directly support the section's key points and subheadings
- Consider source credibility, recency, and content depth
- Ensure sources provide actionable insights for content creation
- Maintain diversity in source types and perspectives
- Focus on sources that enhance the section's value proposition

Analyze the mapping and provide your recommendations.
"""
|
|
|
|
def _get_ai_validation_response(self, prompt: str, user_id: str) -> str:
|
|
"""
|
|
Get AI validation response using LLM provider.
|
|
|
|
Args:
|
|
prompt: Validation prompt
|
|
user_id: User ID (required for subscription checks and usage tracking)
|
|
|
|
Returns:
|
|
AI validation response
|
|
|
|
Raises:
|
|
ValueError: If user_id is not provided
|
|
"""
|
|
if not user_id:
|
|
raise ValueError("user_id is required for AI validation response (subscription checks and usage tracking)")
|
|
|
|
try:
|
|
from services.llm_providers.main_text_generation import llm_text_gen
|
|
|
|
response = llm_text_gen(
|
|
prompt=prompt,
|
|
json_struct=None,
|
|
system_prompt=None,
|
|
user_id=user_id
|
|
)
|
|
|
|
return response
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to get AI validation response: {e}")
|
|
raise
|
|
|
|
def _parse_validation_response(
|
|
self,
|
|
response: str,
|
|
original_mapping: Dict[str, List[Tuple[ResearchSource, float]]],
|
|
research_data: BlogResearchResponse
|
|
) -> Dict[str, List[Tuple[ResearchSource, float]]]:
|
|
"""
|
|
Parse AI validation response and apply improvements.
|
|
|
|
Args:
|
|
response: AI validation response
|
|
original_mapping: Original algorithmic mapping
|
|
research_data: Research data for context
|
|
|
|
Returns:
|
|
Improved mapping based on AI validation
|
|
"""
|
|
try:
|
|
import json
|
|
import re
|
|
|
|
# Extract JSON from response
|
|
json_match = re.search(r'```json\s*(\{.*?\})\s*```', response, re.DOTALL)
|
|
if not json_match:
|
|
# Try to find JSON without code blocks
|
|
json_match = re.search(r'(\{.*?\})', response, re.DOTALL)
|
|
|
|
if not json_match:
|
|
logger.warning("Could not extract JSON from AI response")
|
|
return original_mapping
|
|
|
|
validation_data = json.loads(json_match.group(1))
|
|
|
|
# Create source lookup for quick access
|
|
source_lookup = {source.title: source for source in research_data.sources}
|
|
|
|
# Apply AI improvements
|
|
improved_mapping = {}
|
|
|
|
for improvement in validation_data.get('section_improvements', []):
|
|
section_id = improvement['section_id']
|
|
recommended_titles = improvement['recommended_sources']
|
|
|
|
# Map recommended titles to actual sources
|
|
recommended_sources = []
|
|
for title in recommended_titles:
|
|
if title in source_lookup:
|
|
source = source_lookup[title]
|
|
# Use high confidence score for AI-recommended sources
|
|
recommended_sources.append((source, 0.9))
|
|
|
|
if recommended_sources:
|
|
improved_mapping[section_id] = recommended_sources
|
|
else:
|
|
# Fallback to original mapping if no valid sources found
|
|
improved_mapping[section_id] = original_mapping.get(section_id, [])
|
|
|
|
# Add sections not mentioned in AI response
|
|
for section_id, sources in original_mapping.items():
|
|
if section_id not in improved_mapping:
|
|
improved_mapping[section_id] = sources
|
|
|
|
logger.info(f"AI validation applied: {len(validation_data.get('section_improvements', []))} sections improved")
|
|
return improved_mapping
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Failed to parse AI validation response: {e}")
|
|
return original_mapping
|
|
|
|
def _format_sections_for_prompt(self, sections_info: List[Dict]) -> str:
|
|
"""Format sections information for AI prompt."""
|
|
formatted = []
|
|
for section in sections_info:
|
|
section_text = f"**Section {section['id']}:**\n"
|
|
section_text += f"Sources mapped: {len(section['sources'])}\n"
|
|
for source in section['sources']:
|
|
section_text += f"- {source['title']} (Score: {source['algorithmic_score']:.2f})\n"
|
|
formatted.append(section_text)
|
|
return "\n".join(formatted)
|
|
|
|
def _format_sources_for_prompt(self, sources: List[Dict]) -> str:
|
|
"""Format sources information for AI prompt."""
|
|
formatted = []
|
|
for i, source in enumerate(sources, 1):
|
|
source_text = f"{i}. **{source['title']}**\n"
|
|
source_text += f" URL: {source['url']}\n"
|
|
source_text += f" Credibility: {source['credibility_score']}\n"
|
|
if source['excerpt']:
|
|
source_text += f" Excerpt: {source['excerpt'][:200]}...\n"
|
|
formatted.append(source_text)
|
|
return "\n".join(formatted)
|