ALwrity AI Blog Writer - Added Google Grounding UI Implementation
This commit is contained in:
669
backend/services/blog_writer/outline/source_mapper.py
Normal file
669
backend/services/blog_writer/outline/source_mapper.py
Normal file
@@ -0,0 +1,669 @@
"""
Source-to-Section Mapper - Intelligent mapping of research sources to outline sections.

This module provides algorithmic mapping of research sources to specific outline sections
based on semantic similarity, keyword relevance, and contextual matching. It uses a hybrid
approach: algorithmic scoring first, followed by AI validation of the resulting mapping.
"""
|
||||
|
||||
from typing import Dict, Any, List, Tuple, Optional
|
||||
import re
|
||||
from collections import Counter
|
||||
from loguru import logger
|
||||
|
||||
from models.blog_models import (
|
||||
BlogOutlineSection,
|
||||
ResearchSource,
|
||||
BlogResearchResponse,
|
||||
)
|
||||
|
||||
|
||||
class SourceToSectionMapper:
    """Maps research sources to outline sections using intelligent algorithms.

    The mapping runs in three steps (see ``map_sources_to_sections``):

    1. Every (section, source) pair is scored with a weighted blend of a
       word-overlap "semantic" score, a keyword score and a contextual score.
    2. The resulting mapping is sent to an LLM for validation; any failure in
       that step falls back to the algorithmic mapping.
    3. The final mapping is written into new ``BlogOutlineSection`` objects.
    """

    # Tokenizer: runs of ASCII letters; digits and punctuation act as separators.
    _WORD_PATTERN = re.compile(r'\b[a-zA-Z]+\b')

    # Score attached to every source recommended by the AI validator.
    _AI_RECOMMENDED_SCORE = 0.9

    def __init__(self):
        """Initialize the source-to-section mapper."""
        self.min_semantic_score = 0.3
        self.min_keyword_score = 0.2
        self.min_contextual_score = 0.2
        self.max_sources_per_section = 3
        self.min_total_score = 0.4

        # Weight factors for different scoring methods
        self.weights = {
            'semantic': 0.4,    # Semantic similarity weight
            'keyword': 0.3,     # Keyword matching weight
            'contextual': 0.3,  # Contextual relevance weight
        }

        # Common stop words for text processing
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
            'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
            'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
            'over', 'under', 'again', 'further', 'then', 'once',
        }

        logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")

    def map_sources_to_sections(
        self,
        sections: List[BlogOutlineSection],
        research_data: BlogResearchResponse
    ) -> List[BlogOutlineSection]:
        """
        Map research sources to outline sections using intelligent algorithms.

        Args:
            sections: List of outline sections to map sources to
            research_data: Research data containing sources and metadata

        Returns:
            List of outline sections with intelligently mapped sources. When
            there are no sections or no sources, ``sections`` is returned
            unchanged.
        """
        if not sections or not research_data.sources:
            logger.warning("No sections or sources to map")
            return sections

        logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")

        # Step 1: Algorithmic mapping
        mapping_results = self._algorithmic_source_mapping(sections, research_data)

        # Step 2: AI validation and improvement (single prompt). The sections
        # are passed along so the prompt can describe what each section covers.
        validated_mapping = self._ai_validate_mapping(
            mapping_results, research_data, sections=sections
        )

        # Step 3: Apply validated mapping to sections
        mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)

        logger.info("✅ Source-to-section mapping completed successfully")
        return mapped_sections

    def _algorithmic_source_mapping(
        self,
        sections: List[BlogOutlineSection],
        research_data: BlogResearchResponse
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Perform algorithmic mapping of sources to sections.

        Args:
            sections: List of outline sections
            research_data: Research data with sources

        Returns:
            Dictionary mapping section IDs to a list of (source, score) tuples,
            sorted by descending score and truncated to
            ``max_sources_per_section``. Sources scoring below
            ``min_total_score`` are dropped.
        """
        mapping_results = {}

        for section in sections:
            section_scores = []

            for source in research_data.sources:
                # Calculate multi-dimensional relevance score
                semantic_score = self._calculate_semantic_similarity(section, source)
                keyword_score = self._calculate_keyword_relevance(section, source, research_data)
                contextual_score = self._calculate_contextual_relevance(section, source, research_data)

                # Weighted total score
                total_score = (
                    semantic_score * self.weights['semantic'] +
                    keyword_score * self.weights['keyword'] +
                    contextual_score * self.weights['contextual']
                )

                # Only include sources that meet minimum threshold
                if total_score >= self.min_total_score:
                    section_scores.append((source, total_score))

            # Sort by score and limit to max sources per section
            section_scores.sort(key=lambda pair: pair[1], reverse=True)
            section_scores = section_scores[:self.max_sources_per_section]

            mapping_results[section.id] = section_scores

            logger.debug(f"Section '{section.heading}': {len(section_scores)} sources mapped")

        return mapping_results

    def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
        """
        Calculate semantic similarity between section and source.

        The score is the Jaccard similarity of the two meaningful-word sets
        plus a phrase-match boost (at most 0.3), clamped to 1.0.

        Args:
            section: Outline section
            source: Research source

        Returns:
            Semantic similarity score (0.0 to 1.0)
        """
        section_text = self._extract_section_text(section)
        source_text = self._extract_source_text(source)

        section_words = set(self._extract_meaningful_words(section_text))
        source_words = set(self._extract_meaningful_words(source_text))

        if not section_words or not source_words:
            return 0.0

        # Jaccard similarity; the union is non-empty because both sets are.
        jaccard_similarity = len(section_words & source_words) / len(section_words | source_words)

        # Boost score for exact phrase matches
        phrase_boost = self._calculate_phrase_similarity(section_text, source_text)

        return min(1.0, jaccard_similarity + phrase_boost)

    def _calculate_keyword_relevance(
        self,
        section: BlogOutlineSection,
        source: ResearchSource,
        research_data: BlogResearchResponse
    ) -> float:
        """
        Calculate keyword-based relevance between section and source.

        Section and research keywords are tokenized with the same tokenizer as
        the source text, so multi-word or capitalized keywords (for example
        "Machine Learning") can match the lower-cased single-word source
        tokens.

        Args:
            section: Outline section
            source: Research source
            research_data: Research data with keyword analysis

        Returns:
            Keyword relevance score (0.0 to 1.0): 70% share of section keyword
            tokens found in the source, 30% share of research keyword tokens
            found in the source.
        """
        # Section keywords, falling back to all section text when none are usable
        section_keywords = self._keyword_tokens(section.keywords)
        if not section_keywords:
            section_text = self._extract_section_text(section)
            section_keywords = set(self._extract_meaningful_words(section_text))

        # Source keywords from title and excerpt
        source_keywords = set(self._extract_meaningful_words(self._extract_source_text(source)))

        # Research keywords for context
        keyword_analysis = self._get_keyword_analysis(research_data)
        research_keywords = set()
        for category in ('primary', 'secondary', 'long_tail', 'semantic_keywords'):
            research_keywords |= self._keyword_tokens(keyword_analysis.get(category))

        # Calculate keyword overlap scores
        section_overlap = (
            len(section_keywords & source_keywords) / len(section_keywords)
            if section_keywords else 0.0
        )
        research_overlap = (
            len(research_keywords & source_keywords) / len(research_keywords)
            if research_keywords else 0.0
        )

        # Weighted combination
        keyword_score = (section_overlap * 0.7) + (research_overlap * 0.3)

        return min(1.0, keyword_score)

    def _calculate_contextual_relevance(
        self,
        section: BlogOutlineSection,
        source: ResearchSource,
        research_data: BlogResearchResponse
    ) -> float:
        """
        Calculate contextual relevance based on section content and source context.

        Three signals are summed and clamped to 1.0: content-angle matches
        (accumulated over every suggested angle), search-intent keyword hits
        (at most 0.3) and industry word hits (at most 0.2). Word checks are
        substring checks against the lower-cased text.

        Args:
            section: Outline section
            source: Research source
            research_data: Research data with context

        Returns:
            Contextual relevance score (0.0 to 1.0)
        """
        contextual_score = 0.0

        section_text = self._extract_section_text(section).lower()
        source_text = self._extract_source_text(source).lower()

        # 1. Content angle matching
        for angle in getattr(research_data, 'suggested_angles', None) or []:
            angle_words = self._extract_meaningful_words(str(angle).lower())
            if not angle_words:
                continue
            section_angle_match = sum(1 for word in angle_words if word in section_text) / len(angle_words)
            source_angle_match = sum(1 for word in angle_words if word in source_text) / len(angle_words)
            contextual_score += (section_angle_match + source_angle_match) * 0.3

        # 2. Search intent alignment
        keyword_analysis = self._get_keyword_analysis(research_data)
        search_intent = keyword_analysis.get('search_intent', 'informational')

        intent_score = 0.0
        for keyword in self._get_intent_keywords(search_intent):
            if keyword in section_text or keyword in source_text:
                intent_score += 0.1

        contextual_score += min(0.3, intent_score)

        # 3. Industry/domain relevance (only when the research data carries one)
        industry = getattr(research_data, 'industry', None)
        if industry:
            industry_words = self._extract_meaningful_words(str(industry).lower())
            if industry_words:
                industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words)
                contextual_score += industry_score * 0.2

        return min(1.0, contextual_score)

    def _ai_validate_mapping(
        self,
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse,
        sections: Optional[List[BlogOutlineSection]] = None
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Use AI to validate and improve the algorithmic mapping results.

        This step is best-effort: any exception is logged and the algorithmic
        mapping is returned unchanged.

        Args:
            mapping_results: Algorithmic mapping results
            research_data: Research data for context
            sections: Optional outline sections; when given, their headings and
                key points are included in the validation prompt

        Returns:
            AI-validated and improved mapping results
        """
        try:
            logger.info("Starting AI validation of source-to-section mapping...")

            # Build AI validation prompt
            validation_prompt = self._build_validation_prompt(
                mapping_results, research_data, sections=sections
            )

            # Get AI validation response
            validation_response = self._get_ai_validation_response(validation_prompt)

            # Parse and apply AI validation results
            validated_mapping = self._parse_validation_response(
                validation_response, mapping_results, research_data
            )

            logger.info("✅ AI validation completed successfully")
            return validated_mapping

        except Exception as e:
            # Deliberate catch-all: AI validation must never break the mapping.
            logger.warning(f"AI validation failed: {e}. Using algorithmic results as fallback.")
            return mapping_results

    def _apply_mapping_to_sections(
        self,
        sections: List[BlogOutlineSection],
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]
    ) -> List[BlogOutlineSection]:
        """
        Apply the mapping results to the outline sections.

        A new ``BlogOutlineSection`` is built for every input section; its
        ``references`` are replaced by the mapped sources (an empty list when
        the section has no entry in ``mapping_results``).

        Args:
            sections: Original outline sections
            mapping_results: Mapping results from algorithmic/AI processing

        Returns:
            Sections with mapped sources
        """
        mapped_sections = []

        for section in sections:
            # Keep only the sources; scores are not stored on the section.
            section_sources = [source for source, _score in mapping_results.get(section.id, [])]

            mapped_sections.append(
                BlogOutlineSection(
                    id=section.id,
                    heading=section.heading,
                    subheadings=section.subheadings,
                    key_points=section.key_points,
                    references=section_sources,
                    target_words=section.target_words,
                    keywords=section.keywords
                )
            )

            logger.debug(f"Applied {len(section_sources)} sources to section '{section.heading}'")

        return mapped_sections

    # Helper methods

    @staticmethod
    def _get_keyword_analysis(research_data: BlogResearchResponse) -> Dict[str, Any]:
        """Return ``research_data.keyword_analysis`` or an empty dict when it is missing or not a dict."""
        analysis = getattr(research_data, 'keyword_analysis', None)
        return analysis if isinstance(analysis, dict) else {}

    def _keyword_tokens(self, keywords: Any) -> set:
        """Tokenize a keyword list (or single keyword string) into a set of meaningful words."""
        if not keywords:
            return set()
        if isinstance(keywords, str):
            keywords = [keywords]
        joined = " ".join(str(keyword) for keyword in keywords if keyword)
        return set(self._extract_meaningful_words(joined))

    def _extract_section_text(self, section: BlogOutlineSection) -> str:
        """Extract all text content (heading, subheadings, key points, keywords) from a section."""
        text_parts = [section.heading]
        text_parts.extend(section.subheadings or [])
        text_parts.extend(section.key_points or [])
        text_parts.extend(section.keywords or [])
        return " ".join(str(part) for part in text_parts if part)

    def _extract_source_text(self, source: ResearchSource) -> str:
        """Extract all text content (title and, when present, excerpt) from a source."""
        text_parts = [source.title, source.excerpt]
        return " ".join(str(part) for part in text_parts if part)

    def _extract_meaningful_words(self, text: str) -> List[str]:
        """Extract meaningful words from text, removing stop words and cleaning.

        Words are lower-cased runs of ASCII letters; stop words and words of
        one or two letters are dropped. Order and duplicates are preserved.
        """
        if not text:
            return []

        return [
            word for word in self._WORD_PATTERN.findall(text.lower())
            if word not in self.stop_words and len(word) > 2
        ]

    def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
        """Calculate phrase similarity boost score.

        Every whitespace-delimited 2-word phrase of ``text1`` found in
        ``text2`` adds 0.1 and every 3-word phrase adds 0.15 (case-insensitive
        substring match). The result is capped at 0.3.
        """
        if not text1 or not text2:
            return 0.0

        haystack = text2.lower()
        words = text1.lower().split()

        phrase_boost = 0.0
        for size, bonus in ((2, 0.1), (3, 0.15)):
            for start in range(len(words) - size + 1):
                if " ".join(words[start:start + size]) in haystack:
                    phrase_boost += bonus

        return min(0.3, phrase_boost)  # Cap at 0.3

    def _get_intent_keywords(self, search_intent: str) -> List[str]:
        """Get keywords associated with search intent (case-insensitive; unknown intents give [])."""
        intent_keywords = {
            'informational': ['what', 'how', 'why', 'guide', 'tutorial', 'explain', 'learn', 'understand'],
            'navigational': ['find', 'locate', 'search', 'where', 'site', 'website', 'page'],
            'transactional': ['buy', 'purchase', 'order', 'price', 'cost', 'deal', 'offer', 'discount'],
            'commercial': ['compare', 'review', 'best', 'top', 'vs', 'versus', 'alternative', 'option']
        }

        if not isinstance(search_intent, str):
            return []
        return intent_keywords.get(search_intent.strip().lower(), [])

    def get_mapping_statistics(self, mapping_results: Dict[str, List[Tuple[ResearchSource, float]]]) -> Dict[str, Any]:
        """
        Get statistics about the mapping results.

        Args:
            mapping_results: Mapping results to analyze

        Returns:
            Dictionary with section/mapping counts, average/max/min score
            (0.0 when there are no mappings) and ``mapping_coverage`` (share of
            sections with at least one source).
        """
        total_sections = len(mapping_results)
        all_scores = [score for sources in mapping_results.values() for _source, score in sources]
        sections_with_sources = sum(1 for sources in mapping_results.values() if sources)

        return {
            'total_sections': total_sections,
            'total_mappings': len(all_scores),
            'sections_with_sources': sections_with_sources,
            'sections_without_sources': total_sections - sections_with_sources,
            'average_score': sum(all_scores) / len(all_scores) if all_scores else 0.0,
            'max_score': max(all_scores) if all_scores else 0.0,
            'min_score': min(all_scores) if all_scores else 0.0,
            'mapping_coverage': sections_with_sources / total_sections if total_sections > 0 else 0.0
        }

    def _build_validation_prompt(
        self,
        mapping_results: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse,
        sections: Optional[List[BlogOutlineSection]] = None
    ) -> str:
        """
        Build comprehensive AI validation prompt for source-to-section mapping.

        Args:
            mapping_results: Algorithmic mapping results
            research_data: Research data for context
            sections: Optional outline sections. When provided, each section's
                heading, subheadings, key points and keywords are added to the
                prompt so the model can judge relevance; without them only the
                section IDs are shown.

        Returns:
            Formatted AI validation prompt
        """
        section_by_id = {section.id: section for section in (sections or [])}

        # Extract section information
        sections_info = []
        for section_id, sources in mapping_results.items():
            section = section_by_id.get(section_id)
            sections_info.append({
                'id': section_id,
                'heading': getattr(section, 'heading', None),
                'subheadings': getattr(section, 'subheadings', None),
                'key_points': getattr(section, 'key_points', None),
                'keywords': getattr(section, 'keywords', None),
                'sources': [
                    {
                        'title': source.title,
                        'url': source.url,
                        'excerpt': source.excerpt,
                        'credibility_score': source.credibility_score,
                        'algorithmic_score': score
                    }
                    for source, score in sources
                ]
            })

        # Extract research context
        keyword_analysis = self._get_keyword_analysis(research_data)
        all_sources = [
            {
                'title': source.title,
                'url': source.url,
                'excerpt': source.excerpt,
                'credibility_score': source.credibility_score
            }
            for source in research_data.sources
        ]

        topic = ', '.join(str(keyword) for keyword in keyword_analysis.get('primary') or [])
        search_intent = keyword_analysis.get('search_intent', 'informational')
        angles = ', '.join(str(angle) for angle in getattr(research_data, 'suggested_angles', None) or [])
        sections_block = self._format_sections_for_prompt(sections_info)
        sources_block = self._format_sources_for_prompt(all_sources)
        max_sources = self.max_sources_per_section

        prompt = f"""
You are an expert content strategist and SEO specialist. Your task is to validate and improve the algorithmic mapping of research sources to blog outline sections.

## CONTEXT
Research Topic: {topic}
Search Intent: {search_intent}
Content Angles: {angles}

## ALGORITHMIC MAPPING RESULTS
The following sections have been algorithmically mapped with research sources:

{sections_block}

## AVAILABLE SOURCES
All available research sources:
{sources_block}

## VALIDATION TASK
Please analyze the algorithmic mapping and provide improvements:

1. **Validate Relevance**: Are the mapped sources truly relevant to each section's content and purpose?
2. **Identify Gaps**: Are there better sources available that weren't mapped?
3. **Suggest Improvements**: Recommend specific source changes for better content alignment
4. **Quality Assessment**: Rate the overall mapping quality (1-10)

## RESPONSE FORMAT
Provide your analysis in the following JSON format:

```json
{{
    "overall_quality_score": 8,
    "section_improvements": [
        {{
            "section_id": "s1",
            "current_sources": ["source_title_1", "source_title_2"],
            "recommended_sources": ["better_source_1", "better_source_2", "better_source_3"],
            "reasoning": "Explanation of why these sources are better suited for this section",
            "confidence": 0.9
        }}
    ],
    "summary": "Overall assessment of the mapping quality and key improvements made"
}}
```

## GUIDELINES
- Prioritize sources that directly support the section's key points and subheadings
- Consider source credibility, recency, and content depth
- Ensure sources provide actionable insights for content creation
- Maintain diversity in source types and perspectives
- Focus on sources that enhance the section's value proposition
- Use only section IDs listed above and the exact source titles listed under AVAILABLE SOURCES
- Recommend at most {max_sources} sources per section

Analyze the mapping and provide your recommendations.
"""

        return prompt

    def _get_ai_validation_response(self, prompt: str) -> str:
        """
        Get AI validation response using LLM provider.

        Args:
            prompt: Validation prompt

        Returns:
            AI validation response

        Raises:
            Exception: Any error from importing or calling the provider is
                logged and re-raised for the caller to handle.
        """
        try:
            # Imported lazily so the module can be loaded without the provider.
            from services.llm_providers.gemini_provider import gemini_text_response

            response = gemini_text_response(
                prompt=prompt,
                temperature=0.3,
                top_p=0.9,
                n=1,
                max_tokens=2000,
                system_prompt=None
            )

            return response

        except Exception as e:
            logger.error(f"Failed to get AI validation response: {e}")
            raise

    @staticmethod
    def _normalize_title(title: Any) -> str:
        """Normalize a source title for lookup (trimmed, lower-cased; '' for None)."""
        return str(title).strip().lower() if title is not None else ''

    @staticmethod
    def _extract_json_object(response: str) -> Optional[Dict[str, Any]]:
        """Return the first JSON object found in ``response``, or None.

        Fenced code blocks are searched before the raw text. Decoding starts at
        each ``{`` and lets the JSON decoder find the matching end, so nested
        objects are handled (a non-greedy ``{.*?}`` regex would cut them off at
        the first ``}``). An object containing ``section_improvements`` is
        preferred over any other object.
        """
        import json

        if not response or not isinstance(response, str):
            return None

        candidates = [
            match.group(1)
            for match in re.finditer(r'```(?:json)?\s*(.*?)```', response, re.DOTALL)
        ]
        candidates.append(response)

        decoder = json.JSONDecoder()
        first_object = None
        for text in candidates:
            start = text.find('{')
            while start != -1:
                try:
                    parsed, _end = decoder.raw_decode(text[start:])
                except ValueError:
                    parsed = None
                if isinstance(parsed, dict):
                    if 'section_improvements' in parsed:
                        return parsed
                    if first_object is None:
                        first_object = parsed
                start = text.find('{', start + 1)

        return first_object

    def _parse_validation_response(
        self,
        response: str,
        original_mapping: Dict[str, List[Tuple[ResearchSource, float]]],
        research_data: BlogResearchResponse
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Parse AI validation response and apply improvements.

        Recommendations are matched to known sources by title (ignoring case
        and surrounding whitespace), de-duplicated and capped at
        ``max_sources_per_section``. Malformed entries and entries for unknown
        section IDs are skipped individually. Sections without any valid
        recommendation keep their algorithmic mapping.

        Args:
            response: AI validation response
            original_mapping: Original algorithmic mapping
            research_data: Research data for context

        Returns:
            Improved mapping based on AI validation; ``original_mapping`` when
            the response cannot be parsed.
        """
        try:
            validation_data = self._extract_json_object(response)
            if validation_data is None:
                logger.warning("Could not extract JSON from AI response")
                return original_mapping

            # Create source lookup for quick access
            source_lookup = {
                self._normalize_title(source.title): source
                for source in research_data.sources
            }

            improved_mapping = {}
            improved_count = 0

            improvements = validation_data.get('section_improvements') or []
            if not isinstance(improvements, list):
                improvements = []

            for improvement in improvements:
                if not isinstance(improvement, dict):
                    continue

                section_id = improvement.get('section_id')
                if section_id not in original_mapping:
                    # The model referenced a section that was never mapped.
                    logger.debug(f"Ignoring AI recommendation for unknown section '{section_id}'")
                    continue

                recommended_titles = improvement.get('recommended_sources') or []
                if isinstance(recommended_titles, str):
                    recommended_titles = [recommended_titles]

                # Map recommended titles to actual sources
                recommended_sources = []
                seen_titles = set()
                for title in recommended_titles:
                    key = self._normalize_title(title)
                    if not key or key in seen_titles or key not in source_lookup:
                        continue
                    seen_titles.add(key)
                    # Use high confidence score for AI-recommended sources
                    recommended_sources.append((source_lookup[key], self._AI_RECOMMENDED_SCORE))
                    if len(recommended_sources) >= self.max_sources_per_section:
                        break

                if recommended_sources:
                    improved_mapping[section_id] = recommended_sources
                    improved_count += 1
                else:
                    # Fallback to original mapping if no valid sources found
                    improved_mapping[section_id] = original_mapping.get(section_id, [])

            # Add sections not mentioned in AI response
            for section_id, sources in original_mapping.items():
                improved_mapping.setdefault(section_id, sources)

            logger.info(f"AI validation applied: {improved_count} sections improved")
            return improved_mapping

        except Exception as e:
            # Best-effort step: never let a bad AI response break the mapping.
            logger.warning(f"Failed to parse AI validation response: {e}")
            return original_mapping

    def _format_sections_for_prompt(self, sections_info: List[Dict]) -> str:
        """Format sections information for AI prompt.

        Each entry needs ``id`` and ``sources``; ``heading``, ``subheadings``,
        ``key_points`` and ``keywords`` are rendered when present and non-empty.
        """
        formatted = []
        for section in sections_info:
            heading = section.get('heading')
            if heading:
                lines = [f"**Section {section['id']}: {heading}**"]
            else:
                lines = [f"**Section {section['id']}:**"]

            for label, key in (
                ("Subheadings", 'subheadings'),
                ("Key points", 'key_points'),
                ("Keywords", 'keywords'),
            ):
                values = section.get(key)
                if values:
                    joined = "; ".join(str(value) for value in values)
                    lines.append(f"{label}: {joined}")

            sources = section.get('sources') or []
            lines.append(f"Sources mapped: {len(sources)}")
            lines.extend(
                f"- {source['title']} (Score: {source['algorithmic_score']:.2f})"
                for source in sources
            )
            formatted.append("\n".join(lines) + "\n")
        return "\n".join(formatted)

    def _format_sources_for_prompt(self, sources: List[Dict]) -> str:
        """Format sources information for AI prompt (excerpts are truncated to 200 characters)."""
        formatted = []
        for index, source in enumerate(sources, 1):
            source_text = f"{index}. **{source['title']}**\n"
            source_text += f"   URL: {source['url']}\n"
            source_text += f"   Credibility: {source['credibility_score']}\n"
            excerpt = source.get('excerpt')
            if excerpt:
                # Only mark the excerpt as truncated when it actually was.
                ellipsis = "..." if len(excerpt) > 200 else ""
                source_text += f"   Excerpt: {excerpt[:200]}{ellipsis}\n"
            formatted.append(source_text)
        return "\n".join(formatted)
|
||||
Reference in New Issue
Block a user