""" Unit tests for SourceToSectionMapper. Tests the intelligent source-to-section mapping functionality. """ import pytest from typing import List from models.blog_models import ( BlogOutlineSection, ResearchSource, BlogResearchResponse, GroundingMetadata, ) from services.blog_writer.outline.source_mapper import SourceToSectionMapper class TestSourceToSectionMapper: """Test cases for SourceToSectionMapper.""" def setup_method(self): """Set up test fixtures.""" self.mapper = SourceToSectionMapper() # Create sample research sources self.sample_sources = [ ResearchSource( title="AI Trends in 2025: Machine Learning Revolution", url="https://example.com/ai-trends-2025", excerpt="Comprehensive analysis of artificial intelligence trends in 2025, focusing on machine learning advancements, deep learning breakthroughs, and AI automation in enterprise environments.", credibility_score=0.95, published_at="2025-08-15", index=0, source_type="web" ), ResearchSource( title="Enterprise AI Implementation Guide", url="https://example.com/enterprise-ai-guide", excerpt="Step-by-step guide for implementing artificial intelligence solutions in enterprise environments, including best practices, challenges, and success stories from leading companies.", credibility_score=0.9, published_at="2025-08-01", index=1, source_type="web" ), ResearchSource( title="Machine Learning Algorithms Explained", url="https://example.com/ml-algorithms", excerpt="Detailed explanation of various machine learning algorithms including supervised learning, unsupervised learning, and reinforcement learning techniques with practical examples.", credibility_score=0.85, published_at="2025-07-20", index=2, source_type="web" ), ResearchSource( title="AI Ethics and Responsible Development", url="https://example.com/ai-ethics", excerpt="Discussion of ethical considerations in artificial intelligence development, including bias mitigation, transparency, and responsible AI practices for developers and organizations.", credibility_score=0.88, published_at="2025-07-10", index=3, source_type="web" ), ResearchSource( title="Deep Learning Neural Networks Tutorial", url="https://example.com/deep-learning-tutorial", excerpt="Comprehensive tutorial on deep learning neural networks, covering convolutional neural networks, recurrent neural networks, and transformer architectures with code examples.", credibility_score=0.92, published_at="2025-06-15", index=4, source_type="web" ) ] # Create sample outline sections self.sample_sections = [ BlogOutlineSection( id="s1", heading="Introduction to AI and Machine Learning", subheadings=["What is AI?", "Types of Machine Learning", "AI Applications"], key_points=["AI definition and scope", "ML vs traditional programming", "Real-world AI examples"], references=[], target_words=300, keywords=["artificial intelligence", "machine learning", "AI basics", "introduction"] ), BlogOutlineSection( id="s2", heading="Enterprise AI Implementation Strategies", subheadings=["Planning Phase", "Implementation Steps", "Best Practices"], key_points=["Strategic planning", "Technology selection", "Change management", "ROI measurement"], references=[], target_words=400, keywords=["enterprise AI", "implementation", "strategies", "business"] ), BlogOutlineSection( id="s3", heading="Machine Learning Algorithms Deep Dive", subheadings=["Supervised Learning", "Unsupervised Learning", "Deep Learning"], key_points=["Algorithm types", "Use cases", "Performance metrics", "Model selection"], references=[], target_words=500, keywords=["machine learning algorithms", "supervised learning", "deep learning", "neural networks"] ), BlogOutlineSection( id="s4", heading="AI Ethics and Responsible Development", subheadings=["Ethical Considerations", "Bias and Fairness", "Transparency"], key_points=["Ethical frameworks", "Bias detection", "Explainable AI", "Regulatory compliance"], references=[], target_words=350, keywords=["AI ethics", "responsible AI", "bias", "transparency"] ) ] # Create sample research response self.sample_research = BlogResearchResponse( success=True, sources=self.sample_sources, keyword_analysis={ 'primary': ['artificial intelligence', 'machine learning', 'AI implementation'], 'secondary': ['enterprise AI', 'deep learning', 'AI ethics'], 'long_tail': ['AI trends 2025', 'enterprise AI implementation guide', 'machine learning algorithms explained'], 'semantic_keywords': ['AI', 'ML', 'neural networks', 'automation'], 'trending_terms': ['AI 2025', 'generative AI', 'AI automation'], 'search_intent': 'informational', 'content_gaps': ['AI implementation challenges', 'ML algorithm comparison'] }, competitor_analysis={ 'top_competitors': ['TechCorp AI', 'DataScience Inc', 'AI Solutions Ltd'], 'opportunities': ['Enterprise market gap', 'SME AI adoption'], 'competitive_advantages': ['Comprehensive coverage', 'Practical examples'] }, suggested_angles=[ 'AI trends in 2025', 'Enterprise AI implementation', 'Machine learning fundamentals', 'AI ethics and responsibility' ], search_widget="
Search widget HTML
", search_queries=["AI trends 2025", "enterprise AI implementation", "machine learning guide"], grounding_metadata=GroundingMetadata( grounding_chunks=[], grounding_supports=[], citations=[], search_entry_point="AI trends and implementation", web_search_queries=["AI trends 2025", "enterprise AI"] ) ) def test_semantic_similarity_calculation(self): """Test semantic similarity calculation between sections and sources.""" section = self.sample_sections[0] # AI Introduction section source = self.sample_sources[0] # AI Trends source similarity = self.mapper._calculate_semantic_similarity(section, source) # Should have high similarity due to AI-related content assert 0.0 <= similarity <= 1.0 assert similarity > 0.3 # Should be reasonably high for AI-related content def test_keyword_relevance_calculation(self): """Test keyword-based relevance calculation.""" section = self.sample_sections[1] # Enterprise AI section source = self.sample_sources[1] # Enterprise AI Guide source relevance = self.mapper._calculate_keyword_relevance(section, source, self.sample_research) # Should have reasonable relevance due to enterprise AI keywords assert 0.0 <= relevance <= 1.0 assert relevance > 0.1 # Should be reasonable for matching enterprise AI content def test_contextual_relevance_calculation(self): """Test contextual relevance calculation.""" section = self.sample_sections[2] # ML Algorithms section source = self.sample_sources[2] # ML Algorithms source relevance = self.mapper._calculate_contextual_relevance(section, source, self.sample_research) # Should have high relevance due to matching content angles assert 0.0 <= relevance <= 1.0 assert relevance > 0.2 # Should be reasonable for matching content def test_algorithmic_source_mapping(self): """Test the complete algorithmic mapping process.""" mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) # Should have mapping results for all sections assert len(mapping_results) == len(self.sample_sections) # Each section should have some mapped sources for section_id, sources in mapping_results.items(): assert isinstance(sources, list) # Each source should be a tuple of (source, score) for source, score in sources: assert isinstance(source, ResearchSource) assert isinstance(score, float) assert 0.0 <= score <= 1.0 def test_source_mapping_quality(self): """Test that sources are mapped to relevant sections.""" mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) # Enterprise AI section should have enterprise AI source enterprise_section = mapping_results["s2"] enterprise_source_titles = [source.title for source, score in enterprise_section] assert any("Enterprise" in title for title in enterprise_source_titles) # ML Algorithms section should have ML algorithms source ml_section = mapping_results["s3"] ml_source_titles = [source.title for source, score in ml_section] assert any("Machine Learning" in title or "Algorithms" in title for title in ml_source_titles) # AI Ethics section should have AI ethics source ethics_section = mapping_results["s4"] ethics_source_titles = [source.title for source, score in ethics_section] assert any("Ethics" in title for title in ethics_source_titles) def test_complete_mapping_pipeline(self): """Test the complete mapping pipeline from sections to mapped sections.""" mapped_sections = self.mapper.map_sources_to_sections(self.sample_sections, self.sample_research) # Should return same number of sections assert len(mapped_sections) == len(self.sample_sections) # Each section should have mapped sources for section in mapped_sections: assert isinstance(section.references, list) assert len(section.references) <= self.mapper.max_sources_per_section # All references should be ResearchSource objects for source in section.references: assert isinstance(source, ResearchSource) def test_mapping_with_empty_sources(self): """Test mapping behavior with empty sources list.""" empty_research = BlogResearchResponse( success=True, sources=[], keyword_analysis={}, competitor_analysis={}, suggested_angles=[], search_widget="", search_queries=[], grounding_metadata=None ) mapped_sections = self.mapper.map_sources_to_sections(self.sample_sections, empty_research) # Should return sections with empty references for section in mapped_sections: assert section.references == [] def test_mapping_with_empty_sections(self): """Test mapping behavior with empty sections list.""" mapped_sections = self.mapper.map_sources_to_sections([], self.sample_research) # Should return empty list assert mapped_sections == [] def test_meaningful_words_extraction(self): """Test extraction of meaningful words from text.""" text = "Artificial Intelligence and Machine Learning are transforming the world of technology and business applications." words = self.mapper._extract_meaningful_words(text) # Should extract meaningful words and remove stop words assert "artificial" in words assert "intelligence" in words assert "machine" in words assert "learning" in words assert "the" not in words # Stop word should be removed assert "and" not in words # Stop word should be removed def test_phrase_similarity_calculation(self): """Test phrase similarity calculation.""" text1 = "machine learning algorithms" text2 = "This article covers machine learning algorithms and their applications" similarity = self.mapper._calculate_phrase_similarity(text1, text2) # Should find phrase matches assert similarity > 0.0 assert similarity <= 0.3 # Should be capped at 0.3 def test_intent_keywords_extraction(self): """Test extraction of intent-specific keywords.""" informational_keywords = self.mapper._get_intent_keywords("informational") transactional_keywords = self.mapper._get_intent_keywords("transactional") # Should return appropriate keywords for each intent assert "what" in informational_keywords assert "how" in informational_keywords assert "guide" in informational_keywords assert "buy" in transactional_keywords assert "purchase" in transactional_keywords assert "price" in transactional_keywords def test_mapping_statistics(self): """Test mapping statistics calculation.""" mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) stats = self.mapper.get_mapping_statistics(mapping_results) # Should have valid statistics assert stats['total_sections'] == len(self.sample_sections) assert stats['total_mappings'] > 0 assert stats['sections_with_sources'] > 0 assert 0.0 <= stats['average_score'] <= 1.0 assert 0.0 <= stats['max_score'] <= 1.0 assert 0.0 <= stats['min_score'] <= 1.0 assert 0.0 <= stats['mapping_coverage'] <= 1.0 def test_source_quality_filtering(self): """Test that low-quality sources are filtered out.""" # Create a low-quality source low_quality_source = ResearchSource( title="Random Article", url="https://example.com/random", excerpt="This is a completely unrelated article about cooking recipes and gardening tips.", credibility_score=0.3, published_at="2025-08-01", index=5, source_type="web" ) # Add to research data research_with_low_quality = BlogResearchResponse( success=True, sources=self.sample_sources + [low_quality_source], keyword_analysis=self.sample_research.keyword_analysis, competitor_analysis=self.sample_research.competitor_analysis, suggested_angles=self.sample_research.suggested_angles, search_widget=self.sample_research.search_widget, search_queries=self.sample_research.search_queries, grounding_metadata=self.sample_research.grounding_metadata ) mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, research_with_low_quality) # Low-quality source should not be mapped to any section all_mapped_sources = [] for sources in mapping_results.values(): all_mapped_sources.extend([source for source, score in sources]) assert low_quality_source not in all_mapped_sources def test_max_sources_per_section_limit(self): """Test that the maximum sources per section limit is enforced.""" # Create many sources many_sources = self.sample_sources * 3 # 15 sources research_with_many_sources = BlogResearchResponse( success=True, sources=many_sources, keyword_analysis=self.sample_research.keyword_analysis, competitor_analysis=self.sample_research.competitor_analysis, suggested_angles=self.sample_research.suggested_angles, search_widget=self.sample_research.search_widget, search_queries=self.sample_research.search_queries, grounding_metadata=self.sample_research.grounding_metadata ) mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, research_with_many_sources) # Each section should have at most max_sources_per_section sources for section_id, sources in mapping_results.items(): assert len(sources) <= self.mapper.max_sources_per_section def test_ai_validation_prompt_building(self): """Test AI validation prompt building.""" mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) prompt = self.mapper._build_validation_prompt(mapping_results, self.sample_research) # Should contain key elements assert "expert content strategist" in prompt assert "Research Topic:" in prompt assert "ALGORITHMIC MAPPING RESULTS" in prompt assert "AVAILABLE SOURCES" in prompt assert "VALIDATION TASK" in prompt assert "RESPONSE FORMAT" in prompt assert "overall_quality_score" in prompt assert "section_improvements" in prompt def test_ai_validation_response_parsing(self): """Test AI validation response parsing.""" # Mock AI response mock_response = """ Here's my analysis of the source-to-section mapping: ```json { "overall_quality_score": 8, "section_improvements": [ { "section_id": "s1", "current_sources": ["AI Trends in 2025: Machine Learning Revolution"], "recommended_sources": ["AI Trends in 2025: Machine Learning Revolution", "Machine Learning Algorithms Explained"], "reasoning": "Adding ML algorithms source provides more technical depth", "confidence": 0.9 } ], "summary": "Good mapping overall, minor improvements suggested" } ``` """ original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) parsed_mapping = self.mapper._parse_validation_response(mock_response, original_mapping, self.sample_research) # Should have improved mapping assert "s1" in parsed_mapping assert len(parsed_mapping["s1"]) > 0 # Should maintain other sections assert len(parsed_mapping) == len(original_mapping) def test_ai_validation_fallback_handling(self): """Test AI validation fallback when parsing fails.""" # Mock invalid AI response invalid_response = "This is not a valid JSON response" original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) parsed_mapping = self.mapper._parse_validation_response(invalid_response, original_mapping, self.sample_research) # Should fallback to original mapping assert parsed_mapping == original_mapping def test_ai_validation_with_missing_sources(self): """Test AI validation when recommended sources don't exist.""" # Mock AI response with non-existent source mock_response = """ ```json { "overall_quality_score": 7, "section_improvements": [ { "section_id": "s1", "current_sources": ["AI Trends in 2025: Machine Learning Revolution"], "recommended_sources": ["Non-existent Source", "Another Fake Source"], "reasoning": "These sources would be better", "confidence": 0.8 } ], "summary": "Suggested improvements" } ``` """ original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) parsed_mapping = self.mapper._parse_validation_response(mock_response, original_mapping, self.sample_research) # Should fallback to original mapping for s1 since no valid sources found assert parsed_mapping["s1"] == original_mapping["s1"] def test_ai_validation_integration(self): """Test complete AI validation integration (with mocked LLM).""" # This test would require mocking the LLM provider # For now, we'll test that the method doesn't crash mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research) # Test that AI validation method exists and can be called # (In real implementation, this would call the actual LLM) try: # This will fail in test environment due to no LLM, but should not crash validated_mapping = self.mapper._ai_validate_mapping(mapping_results, self.sample_research) # If it doesn't crash, it should return the original mapping as fallback assert validated_mapping == mapping_results except Exception as e: # Expected to fail in test environment, but should be handled gracefully assert "AI validation failed" in str(e) or "Failed to get AI validation response" in str(e) def test_format_sections_for_prompt(self): """Test formatting of sections for AI prompt.""" sections_info = [ { 'id': 's1', 'sources': [ { 'title': 'Test Source 1', 'algorithmic_score': 0.85 } ] } ] formatted = self.mapper._format_sections_for_prompt(sections_info) assert "Section s1:" in formatted assert "Test Source 1" in formatted assert "0.85" in formatted def test_format_sources_for_prompt(self): """Test formatting of sources for AI prompt.""" sources = [ { 'title': 'Test Source', 'url': 'https://example.com', 'credibility_score': 0.9, 'excerpt': 'This is a test excerpt for the source.' } ] formatted = self.mapper._format_sources_for_prompt(sources) assert "Test Source" in formatted assert "https://example.com" in formatted assert "0.9" in formatted assert "This is a test excerpt" in formatted if __name__ == '__main__': pytest.main([__file__])