Files
moreminimore-marketing/backend/test/test_source_mapper.py
Kunthawat Greethong c35fa52117 Base code
2026-01-08 22:39:53 +07:00

516 lines
23 KiB
Python

"""
Unit tests for SourceToSectionMapper.
Tests the intelligent source-to-section mapping functionality.
"""
import pytest
from typing import List
from models.blog_models import (
BlogOutlineSection,
ResearchSource,
BlogResearchResponse,
GroundingMetadata,
)
from services.blog_writer.outline.source_mapper import SourceToSectionMapper
class TestSourceToSectionMapper:
    """Test cases for SourceToSectionMapper.

    ``setup_method`` runs before each test and builds a fresh mapper together
    with sample research sources, outline sections and a research response,
    so no state leaks from one test into the next.
    """

    def setup_method(self) -> None:
        """Build the mapper and the sample fixtures used by the tests."""
        self.mapper = SourceToSectionMapper()

        # Five AI-themed research sources; ``index`` mirrors the list position.
        self.sample_sources = [
            ResearchSource(
                title="AI Trends in 2025: Machine Learning Revolution",
                url="https://example.com/ai-trends-2025",
                excerpt="Comprehensive analysis of artificial intelligence trends in 2025, focusing on machine learning advancements, deep learning breakthroughs, and AI automation in enterprise environments.",
                credibility_score=0.95,
                published_at="2025-08-15",
                index=0,
                source_type="web",
            ),
            ResearchSource(
                title="Enterprise AI Implementation Guide",
                url="https://example.com/enterprise-ai-guide",
                excerpt="Step-by-step guide for implementing artificial intelligence solutions in enterprise environments, including best practices, challenges, and success stories from leading companies.",
                credibility_score=0.9,
                published_at="2025-08-01",
                index=1,
                source_type="web",
            ),
            ResearchSource(
                title="Machine Learning Algorithms Explained",
                url="https://example.com/ml-algorithms",
                excerpt="Detailed explanation of various machine learning algorithms including supervised learning, unsupervised learning, and reinforcement learning techniques with practical examples.",
                credibility_score=0.85,
                published_at="2025-07-20",
                index=2,
                source_type="web",
            ),
            ResearchSource(
                title="AI Ethics and Responsible Development",
                url="https://example.com/ai-ethics",
                excerpt="Discussion of ethical considerations in artificial intelligence development, including bias mitigation, transparency, and responsible AI practices for developers and organizations.",
                credibility_score=0.88,
                published_at="2025-07-10",
                index=3,
                source_type="web",
            ),
            ResearchSource(
                title="Deep Learning Neural Networks Tutorial",
                url="https://example.com/deep-learning-tutorial",
                excerpt="Comprehensive tutorial on deep learning neural networks, covering convolutional neural networks, recurrent neural networks, and transformer architectures with code examples.",
                credibility_score=0.92,
                published_at="2025-06-15",
                index=4,
                source_type="web",
            ),
        ]

        # Four outline sections (ids "s1".."s4"), all starting with no references.
        self.sample_sections = [
            BlogOutlineSection(
                id="s1",
                heading="Introduction to AI and Machine Learning",
                subheadings=["What is AI?", "Types of Machine Learning", "AI Applications"],
                key_points=["AI definition and scope", "ML vs traditional programming", "Real-world AI examples"],
                references=[],
                target_words=300,
                keywords=["artificial intelligence", "machine learning", "AI basics", "introduction"],
            ),
            BlogOutlineSection(
                id="s2",
                heading="Enterprise AI Implementation Strategies",
                subheadings=["Planning Phase", "Implementation Steps", "Best Practices"],
                key_points=["Strategic planning", "Technology selection", "Change management", "ROI measurement"],
                references=[],
                target_words=400,
                keywords=["enterprise AI", "implementation", "strategies", "business"],
            ),
            BlogOutlineSection(
                id="s3",
                heading="Machine Learning Algorithms Deep Dive",
                subheadings=["Supervised Learning", "Unsupervised Learning", "Deep Learning"],
                key_points=["Algorithm types", "Use cases", "Performance metrics", "Model selection"],
                references=[],
                target_words=500,
                keywords=["machine learning algorithms", "supervised learning", "deep learning", "neural networks"],
            ),
            BlogOutlineSection(
                id="s4",
                heading="AI Ethics and Responsible Development",
                subheadings=["Ethical Considerations", "Bias and Fairness", "Transparency"],
                key_points=["Ethical frameworks", "Bias detection", "Explainable AI", "Regulatory compliance"],
                references=[],
                target_words=350,
                keywords=["AI ethics", "responsible AI", "bias", "transparency"],
            ),
        ]

        # Research response that wraps the sample sources plus keyword,
        # competitor and grounding data.
        self.sample_research = BlogResearchResponse(
            success=True,
            sources=self.sample_sources,
            keyword_analysis={
                'primary': ['artificial intelligence', 'machine learning', 'AI implementation'],
                'secondary': ['enterprise AI', 'deep learning', 'AI ethics'],
                'long_tail': ['AI trends 2025', 'enterprise AI implementation guide', 'machine learning algorithms explained'],
                'semantic_keywords': ['AI', 'ML', 'neural networks', 'automation'],
                'trending_terms': ['AI 2025', 'generative AI', 'AI automation'],
                'search_intent': 'informational',
                'content_gaps': ['AI implementation challenges', 'ML algorithm comparison'],
            },
            competitor_analysis={
                'top_competitors': ['TechCorp AI', 'DataScience Inc', 'AI Solutions Ltd'],
                'opportunities': ['Enterprise market gap', 'SME AI adoption'],
                'competitive_advantages': ['Comprehensive coverage', 'Practical examples'],
            },
            suggested_angles=[
                'AI trends in 2025',
                'Enterprise AI implementation',
                'Machine learning fundamentals',
                'AI ethics and responsibility',
            ],
            search_widget="<div>Search widget HTML</div>",
            search_queries=["AI trends 2025", "enterprise AI implementation", "machine learning guide"],
            grounding_metadata=GroundingMetadata(
                grounding_chunks=[],
                grounding_supports=[],
                citations=[],
                search_entry_point="AI trends and implementation",
                web_search_queries=["AI trends 2025", "enterprise AI"],
            ),
        )

    def test_semantic_similarity_calculation(self) -> None:
        """Semantic similarity of a related section/source pair is in range and above 0.3."""
        section = self.sample_sections[0]  # "Introduction to AI and Machine Learning"
        source = self.sample_sources[0]  # "AI Trends in 2025: Machine Learning Revolution"

        similarity = self.mapper._calculate_semantic_similarity(section, source)

        assert 0.0 <= similarity <= 1.0
        # Both fixtures are AI-related, so the test expects a clearly non-trivial score.
        assert similarity > 0.3

    def test_keyword_relevance_calculation(self) -> None:
        """Keyword relevance of the enterprise section/source pair is in range and above 0.1."""
        section = self.sample_sections[1]  # "Enterprise AI Implementation Strategies"
        source = self.sample_sources[1]  # "Enterprise AI Implementation Guide"

        relevance = self.mapper._calculate_keyword_relevance(section, source, self.sample_research)

        assert 0.0 <= relevance <= 1.0
        # Section keywords and source text share the enterprise-AI vocabulary.
        assert relevance > 0.1

    def test_contextual_relevance_calculation(self) -> None:
        """Contextual relevance of the ML-algorithms section/source pair is in range and above 0.2."""
        section = self.sample_sections[2]  # "Machine Learning Algorithms Deep Dive"
        source = self.sample_sources[2]  # "Machine Learning Algorithms Explained"

        relevance = self.mapper._calculate_contextual_relevance(section, source, self.sample_research)

        assert 0.0 <= relevance <= 1.0
        assert relevance > 0.2

    def test_algorithmic_source_mapping(self) -> None:
        """The algorithmic mapping has one entry per section, each a list of (source, score) pairs."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        assert len(mapping_results) == len(self.sample_sections)

        # Only the mapped lists matter here, so the section ids are not iterated.
        for sources in mapping_results.values():
            assert isinstance(sources, list)
            for source, score in sources:
                assert isinstance(source, ResearchSource)
                assert isinstance(score, float)
                assert 0.0 <= score <= 1.0

    def test_source_mapping_quality(self) -> None:
        """Topic-specific sections receive at least one source whose title matches the topic."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        # Section "s2" (enterprise AI) should receive an enterprise source.
        enterprise_source_titles = [source.title for source, _score in mapping_results["s2"]]
        assert any("Enterprise" in title for title in enterprise_source_titles)

        # Section "s3" (ML algorithms) should receive a machine-learning/algorithms source.
        ml_source_titles = [source.title for source, _score in mapping_results["s3"]]
        assert any("Machine Learning" in title or "Algorithms" in title for title in ml_source_titles)

        # Section "s4" (AI ethics) should receive the ethics source.
        ethics_source_titles = [source.title for source, _score in mapping_results["s4"]]
        assert any("Ethics" in title for title in ethics_source_titles)

    def test_complete_mapping_pipeline(self) -> None:
        """``map_sources_to_sections`` keeps every section and respects the per-section cap."""
        mapped_sections = self.mapper.map_sources_to_sections(self.sample_sections, self.sample_research)

        assert len(mapped_sections) == len(self.sample_sections)

        for section in mapped_sections:
            assert isinstance(section.references, list)
            assert len(section.references) <= self.mapper.max_sources_per_section
            for source in section.references:
                assert isinstance(source, ResearchSource)

    def test_mapping_with_empty_sources(self) -> None:
        """With no research sources, every mapped section ends up with empty references."""
        empty_research = BlogResearchResponse(
            success=True,
            sources=[],
            keyword_analysis={},
            competitor_analysis={},
            suggested_angles=[],
            search_widget="",
            search_queries=[],
            grounding_metadata=None,
        )

        mapped_sections = self.mapper.map_sources_to_sections(self.sample_sections, empty_research)

        for section in mapped_sections:
            assert section.references == []

    def test_mapping_with_empty_sections(self) -> None:
        """With no sections to map, the pipeline returns an empty list."""
        mapped_sections = self.mapper.map_sources_to_sections([], self.sample_research)

        assert mapped_sections == []

    def test_meaningful_words_extraction(self) -> None:
        """Meaningful-word extraction keeps content words (lower-cased) and drops stop words."""
        text = "Artificial Intelligence and Machine Learning are transforming the world of technology and business applications."

        words = self.mapper._extract_meaningful_words(text)

        for expected_word in ("artificial", "intelligence", "machine", "learning"):
            assert expected_word in words
        # "the" and "and" are stop words and must be filtered out.
        for stop_word in ("the", "and"):
            assert stop_word not in words

    def test_phrase_similarity_calculation(self) -> None:
        """Phrase similarity is positive for a contained phrase and does not exceed 0.3."""
        text1 = "machine learning algorithms"
        text2 = "This article covers machine learning algorithms and their applications"

        similarity = self.mapper._calculate_phrase_similarity(text1, text2)

        assert similarity > 0.0
        # The test expects the phrase score to be capped at 0.3.
        assert similarity <= 0.3

    def test_intent_keywords_extraction(self) -> None:
        """Intent keyword lookup returns the expected terms for two search intents."""
        informational_keywords = self.mapper._get_intent_keywords("informational")
        transactional_keywords = self.mapper._get_intent_keywords("transactional")

        for keyword in ("what", "how", "guide"):
            assert keyword in informational_keywords
        for keyword in ("buy", "purchase", "price"):
            assert keyword in transactional_keywords

    def test_mapping_statistics(self) -> None:
        """Mapping statistics report counts and scores within their valid ranges."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        stats = self.mapper.get_mapping_statistics(mapping_results)

        assert stats['total_sections'] == len(self.sample_sections)
        assert stats['total_mappings'] > 0
        assert stats['sections_with_sources'] > 0
        assert 0.0 <= stats['average_score'] <= 1.0
        assert 0.0 <= stats['max_score'] <= 1.0
        assert 0.0 <= stats['min_score'] <= 1.0
        assert 0.0 <= stats['mapping_coverage'] <= 1.0

    def test_source_quality_filtering(self) -> None:
        """An unrelated, low-credibility source is not mapped to any section."""
        low_quality_source = ResearchSource(
            title="Random Article",
            url="https://example.com/random",
            excerpt="This is a completely unrelated article about cooking recipes and gardening tips.",
            credibility_score=0.3,
            published_at="2025-08-01",
            index=5,
            source_type="web",
        )

        # Same research data as the shared fixture, with the off-topic source appended.
        research_with_low_quality = BlogResearchResponse(
            success=True,
            sources=self.sample_sources + [low_quality_source],
            keyword_analysis=self.sample_research.keyword_analysis,
            competitor_analysis=self.sample_research.competitor_analysis,
            suggested_angles=self.sample_research.suggested_angles,
            search_widget=self.sample_research.search_widget,
            search_queries=self.sample_research.search_queries,
            grounding_metadata=self.sample_research.grounding_metadata,
        )

        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, research_with_low_quality)

        # Flatten every section's mapped sources into a single list.
        all_mapped_sources = []
        for sources in mapping_results.values():
            all_mapped_sources.extend(source for source, _score in sources)

        assert low_quality_source not in all_mapped_sources

    def test_max_sources_per_section_limit(self) -> None:
        """No section receives more than ``max_sources_per_section`` sources."""
        # Repeating the five sample sources three times yields 15 candidates.
        many_sources = self.sample_sources * 3
        research_with_many_sources = BlogResearchResponse(
            success=True,
            sources=many_sources,
            keyword_analysis=self.sample_research.keyword_analysis,
            competitor_analysis=self.sample_research.competitor_analysis,
            suggested_angles=self.sample_research.suggested_angles,
            search_widget=self.sample_research.search_widget,
            search_queries=self.sample_research.search_queries,
            grounding_metadata=self.sample_research.grounding_metadata,
        )

        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, research_with_many_sources)

        for sources in mapping_results.values():
            assert len(sources) <= self.mapper.max_sources_per_section

    def test_ai_validation_prompt_building(self) -> None:
        """The validation prompt contains every expected heading and field name."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        prompt = self.mapper._build_validation_prompt(mapping_results, self.sample_research)

        expected_fragments = (
            "expert content strategist",
            "Research Topic:",
            "ALGORITHMIC MAPPING RESULTS",
            "AVAILABLE SOURCES",
            "VALIDATION TASK",
            "RESPONSE FORMAT",
            "overall_quality_score",
            "section_improvements",
        )
        for fragment in expected_fragments:
            assert fragment in prompt

    def test_ai_validation_response_parsing(self) -> None:
        """A well-formed validation response is parsed without dropping any section."""
        # Payload lines are flush-left on purpose: indenting them would change
        # the string handed to the parser.
        mock_response = """
Here's my analysis of the source-to-section mapping:
```json
{
"overall_quality_score": 8,
"section_improvements": [
{
"section_id": "s1",
"current_sources": ["AI Trends in 2025: Machine Learning Revolution"],
"recommended_sources": ["AI Trends in 2025: Machine Learning Revolution", "Machine Learning Algorithms Explained"],
"reasoning": "Adding ML algorithms source provides more technical depth",
"confidence": 0.9
}
],
"summary": "Good mapping overall, minor improvements suggested"
}
```
"""
        original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        parsed_mapping = self.mapper._parse_validation_response(mock_response, original_mapping, self.sample_research)

        # The section named in the response must still carry sources.
        assert "s1" in parsed_mapping
        assert len(parsed_mapping["s1"]) > 0
        # Sections the response does not mention must be preserved.
        assert len(parsed_mapping) == len(original_mapping)

    def test_ai_validation_fallback_handling(self) -> None:
        """An unparseable validation response falls back to the original mapping."""
        invalid_response = "This is not a valid JSON response"
        original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        parsed_mapping = self.mapper._parse_validation_response(invalid_response, original_mapping, self.sample_research)

        assert parsed_mapping == original_mapping

    def test_ai_validation_with_missing_sources(self) -> None:
        """Recommendations naming unknown sources leave the section's mapping unchanged."""
        # Payload lines are flush-left on purpose: indenting them would change
        # the string handed to the parser.
        mock_response = """
```json
{
"overall_quality_score": 7,
"section_improvements": [
{
"section_id": "s1",
"current_sources": ["AI Trends in 2025: Machine Learning Revolution"],
"recommended_sources": ["Non-existent Source", "Another Fake Source"],
"reasoning": "These sources would be better",
"confidence": 0.8
}
],
"summary": "Suggested improvements"
}
```
"""
        original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        parsed_mapping = self.mapper._parse_validation_response(mock_response, original_mapping, self.sample_research)

        # Neither recommended title exists in the research data, so "s1" keeps its mapping.
        assert parsed_mapping["s1"] == original_mapping["s1"]

    def test_ai_validation_integration(self) -> None:
        """``_ai_validate_mapping`` either returns the input mapping or fails with a known message.

        No LLM provider is mocked here, so two outcomes are accepted: the call
        returns the algorithmic mapping unchanged, or it raises an exception
        whose message names the validation failure.
        """
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        try:
            validated_mapping = self.mapper._ai_validate_mapping(mapping_results, self.sample_research)
        except Exception as exc:
            # Raising is tolerated only when the error clearly comes from AI validation.
            message = str(exc)
            assert "AI validation failed" in message or "Failed to get AI validation response" in message
        else:
            # Checked outside the ``try`` so a failed comparison is reported as
            # itself instead of being caught by the handler above.
            assert validated_mapping == mapping_results

    def test_format_sections_for_prompt(self) -> None:
        """Formatted section info includes the section id, source title and score."""
        sections_info = [
            {
                'id': 's1',
                'sources': [
                    {
                        'title': 'Test Source 1',
                        'algorithmic_score': 0.85,
                    },
                ],
            },
        ]

        formatted = self.mapper._format_sections_for_prompt(sections_info)

        assert "Section s1:" in formatted
        assert "Test Source 1" in formatted
        assert "0.85" in formatted

    def test_format_sources_for_prompt(self) -> None:
        """Formatted source info includes the title, URL, credibility score and excerpt."""
        sources = [
            {
                'title': 'Test Source',
                'url': 'https://example.com',
                'credibility_score': 0.9,
                'excerpt': 'This is a test excerpt for the source.',
            },
        ]

        formatted = self.mapper._format_sources_for_prompt(sources)

        assert "Test Source" in formatted
        assert "https://example.com" in formatted
        assert "0.9" in formatted
        assert "This is a test excerpt" in formatted
# Allow running this test module directly as a script.
if __name__ == '__main__':
    # pytest.main() returns the exit code instead of exiting; re-raise it as
    # SystemExit so a failing run yields a non-zero process status.
    raise SystemExit(pytest.main([__file__]))