# moreminimore-marketing/backend/test/test_source_mapper.py

"""
Unit tests for SourceToSectionMapper.

Tests the intelligent source-to-section mapping functionality.
"""

import pytest
from typing import List

from models.blog_models import (
    BlogOutlineSection,
    ResearchSource,
    BlogResearchResponse,
    GroundingMetadata,
)
from services.blog_writer.outline.source_mapper import SourceToSectionMapper


class TestSourceToSectionMapper:
    """Test cases for SourceToSectionMapper.

    Every test runs against the fixtures built in ``setup_method``: a fresh
    mapper, five AI-themed research sources, four outline sections and one
    research response that bundles the sources with keyword/competitor data.
    """

    def setup_method(self):
        """Build a fresh mapper and the shared sample fixtures before each test."""
        self.mapper = SourceToSectionMapper()

        # Create sample research sources
        self.sample_sources = [
            ResearchSource(
                title="AI Trends in 2025: Machine Learning Revolution",
                url="https://example.com/ai-trends-2025",
                excerpt="Comprehensive analysis of artificial intelligence trends in 2025, focusing on machine learning advancements, deep learning breakthroughs, and AI automation in enterprise environments.",
                credibility_score=0.95,
                published_at="2025-08-15",
                index=0,
                source_type="web"
            ),
            ResearchSource(
                title="Enterprise AI Implementation Guide",
                url="https://example.com/enterprise-ai-guide",
                excerpt="Step-by-step guide for implementing artificial intelligence solutions in enterprise environments, including best practices, challenges, and success stories from leading companies.",
                credibility_score=0.9,
                published_at="2025-08-01",
                index=1,
                source_type="web"
            ),
            ResearchSource(
                title="Machine Learning Algorithms Explained",
                url="https://example.com/ml-algorithms",
                excerpt="Detailed explanation of various machine learning algorithms including supervised learning, unsupervised learning, and reinforcement learning techniques with practical examples.",
                credibility_score=0.85,
                published_at="2025-07-20",
                index=2,
                source_type="web"
            ),
            ResearchSource(
                title="AI Ethics and Responsible Development",
                url="https://example.com/ai-ethics",
                excerpt="Discussion of ethical considerations in artificial intelligence development, including bias mitigation, transparency, and responsible AI practices for developers and organizations.",
                credibility_score=0.88,
                published_at="2025-07-10",
                index=3,
                source_type="web"
            ),
            ResearchSource(
                title="Deep Learning Neural Networks Tutorial",
                url="https://example.com/deep-learning-tutorial",
                excerpt="Comprehensive tutorial on deep learning neural networks, covering convolutional neural networks, recurrent neural networks, and transformer architectures with code examples.",
                credibility_score=0.92,
                published_at="2025-06-15",
                index=4,
                source_type="web"
            )
        ]

        # Create sample outline sections
        self.sample_sections = [
            BlogOutlineSection(
                id="s1",
                heading="Introduction to AI and Machine Learning",
                subheadings=["What is AI?", "Types of Machine Learning", "AI Applications"],
                key_points=["AI definition and scope", "ML vs traditional programming", "Real-world AI examples"],
                references=[],
                target_words=300,
                keywords=["artificial intelligence", "machine learning", "AI basics", "introduction"]
            ),
            BlogOutlineSection(
                id="s2",
                heading="Enterprise AI Implementation Strategies",
                subheadings=["Planning Phase", "Implementation Steps", "Best Practices"],
                key_points=["Strategic planning", "Technology selection", "Change management", "ROI measurement"],
                references=[],
                target_words=400,
                keywords=["enterprise AI", "implementation", "strategies", "business"]
            ),
            BlogOutlineSection(
                id="s3",
                heading="Machine Learning Algorithms Deep Dive",
                subheadings=["Supervised Learning", "Unsupervised Learning", "Deep Learning"],
                key_points=["Algorithm types", "Use cases", "Performance metrics", "Model selection"],
                references=[],
                target_words=500,
                keywords=["machine learning algorithms", "supervised learning", "deep learning", "neural networks"]
            ),
            BlogOutlineSection(
                id="s4",
                heading="AI Ethics and Responsible Development",
                subheadings=["Ethical Considerations", "Bias and Fairness", "Transparency"],
                key_points=["Ethical frameworks", "Bias detection", "Explainable AI", "Regulatory compliance"],
                references=[],
                target_words=350,
                keywords=["AI ethics", "responsible AI", "bias", "transparency"]
            )
        ]

        # Create sample research response
        self.sample_research = BlogResearchResponse(
            success=True,
            sources=self.sample_sources,
            keyword_analysis={
                'primary': ['artificial intelligence', 'machine learning', 'AI implementation'],
                'secondary': ['enterprise AI', 'deep learning', 'AI ethics'],
                'long_tail': ['AI trends 2025', 'enterprise AI implementation guide', 'machine learning algorithms explained'],
                'semantic_keywords': ['AI', 'ML', 'neural networks', 'automation'],
                'trending_terms': ['AI 2025', 'generative AI', 'AI automation'],
                'search_intent': 'informational',
                'content_gaps': ['AI implementation challenges', 'ML algorithm comparison']
            },
            competitor_analysis={
                'top_competitors': ['TechCorp AI', 'DataScience Inc', 'AI Solutions Ltd'],
                'opportunities': ['Enterprise market gap', 'SME AI adoption'],
                'competitive_advantages': ['Comprehensive coverage', 'Practical examples']
            },
            suggested_angles=[
                'AI trends in 2025',
                'Enterprise AI implementation',
                'Machine learning fundamentals',
                'AI ethics and responsibility'
            ],
            search_widget="<div>Search widget HTML</div>",
            search_queries=["AI trends 2025", "enterprise AI implementation", "machine learning guide"],
            grounding_metadata=GroundingMetadata(
                grounding_chunks=[],
                grounding_supports=[],
                citations=[],
                search_entry_point="AI trends and implementation",
                web_search_queries=["AI trends 2025", "enterprise AI"]
            )
        )

    def _research_with_sources(self, sources):
        """Return a research response like ``self.sample_research`` but with ``sources``.

        Every field other than ``sources`` is taken from ``self.sample_research``
        so tests can vary the source list while keeping the remaining research
        context identical.
        """
        return BlogResearchResponse(
            success=True,
            sources=sources,
            keyword_analysis=self.sample_research.keyword_analysis,
            competitor_analysis=self.sample_research.competitor_analysis,
            suggested_angles=self.sample_research.suggested_angles,
            search_widget=self.sample_research.search_widget,
            search_queries=self.sample_research.search_queries,
            grounding_metadata=self.sample_research.grounding_metadata
        )

    def test_semantic_similarity_calculation(self):
        """Semantic similarity for an AI section/source pair is in [0, 1] and above 0.3."""
        section = self.sample_sections[0]  # AI Introduction section
        source = self.sample_sources[0]    # AI Trends source

        similarity = self.mapper._calculate_semantic_similarity(section, source)

        assert 0.0 <= similarity <= 1.0
        assert similarity > 0.3  # Should be reasonably high for AI-related content

    def test_keyword_relevance_calculation(self):
        """Keyword relevance for the enterprise section/source pair is in [0, 1] and above 0.1."""
        section = self.sample_sections[1]  # Enterprise AI section
        source = self.sample_sources[1]    # Enterprise AI Guide source

        relevance = self.mapper._calculate_keyword_relevance(section, source, self.sample_research)

        assert 0.0 <= relevance <= 1.0
        assert relevance > 0.1  # Should be reasonable for matching enterprise AI content

    def test_contextual_relevance_calculation(self):
        """Contextual relevance for the ML section/source pair is in [0, 1] and above 0.2."""
        section = self.sample_sections[2]  # ML Algorithms section
        source = self.sample_sources[2]    # ML Algorithms source

        relevance = self.mapper._calculate_contextual_relevance(section, source, self.sample_research)

        assert 0.0 <= relevance <= 1.0
        assert relevance > 0.2  # Should be reasonable for matching content

    def test_algorithmic_source_mapping(self):
        """Algorithmic mapping yields one entry per section, each a list of (source, score)."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        # Should have mapping results for all sections
        assert len(mapping_results) == len(self.sample_sections)

        # Only the shape of each entry is checked here; a section may legitimately
        # end up with an empty list.
        for sources in mapping_results.values():
            assert isinstance(sources, list)
            for source, score in sources:
                assert isinstance(source, ResearchSource)
                assert isinstance(score, float)
                assert 0.0 <= score <= 1.0

    def test_source_mapping_quality(self):
        """Sections s2, s3 and s4 each receive at least one topically matching source."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        # Enterprise AI section should have enterprise AI source
        enterprise_titles = [source.title for source, _ in mapping_results["s2"]]
        assert any("Enterprise" in title for title in enterprise_titles)

        # ML Algorithms section should have ML algorithms source
        ml_titles = [source.title for source, _ in mapping_results["s3"]]
        assert any("Machine Learning" in title or "Algorithms" in title for title in ml_titles)

        # AI Ethics section should have AI ethics source
        ethics_titles = [source.title for source, _ in mapping_results["s4"]]
        assert any("Ethics" in title for title in ethics_titles)

    def test_complete_mapping_pipeline(self):
        """The public pipeline returns every section with a bounded list of ResearchSource refs."""
        mapped_sections = self.mapper.map_sources_to_sections(self.sample_sections, self.sample_research)

        # Should return same number of sections
        assert len(mapped_sections) == len(self.sample_sections)

        for section in mapped_sections:
            # References must be a list no longer than the mapper's configured cap.
            assert isinstance(section.references, list)
            assert len(section.references) <= self.mapper.max_sources_per_section

            # All references should be ResearchSource objects
            for source in section.references:
                assert isinstance(source, ResearchSource)

    def test_mapping_with_empty_sources(self):
        """With no research sources, every section comes back with empty references."""
        empty_research = BlogResearchResponse(
            success=True,
            sources=[],
            keyword_analysis={},
            competitor_analysis={},
            suggested_angles=[],
            search_widget="",
            search_queries=[],
            grounding_metadata=None
        )

        mapped_sections = self.mapper.map_sources_to_sections(self.sample_sections, empty_research)

        for section in mapped_sections:
            assert section.references == []

    def test_mapping_with_empty_sections(self):
        """With no sections to map, the pipeline returns an empty list."""
        mapped_sections = self.mapper.map_sources_to_sections([], self.sample_research)

        assert mapped_sections == []

    def test_meaningful_words_extraction(self):
        """Word extraction keeps lower-cased content words and drops stop words."""
        text = "Artificial Intelligence and Machine Learning are transforming the world of technology and business applications."
        words = self.mapper._extract_meaningful_words(text)

        for expected in ("artificial", "intelligence", "machine", "learning"):
            assert expected in words
        for stop_word in ("the", "and"):
            assert stop_word not in words

    def test_phrase_similarity_calculation(self):
        """A phrase contained in the longer text scores above 0 and at most 0.3."""
        text1 = "machine learning algorithms"
        text2 = "This article covers machine learning algorithms and their applications"

        similarity = self.mapper._calculate_phrase_similarity(text1, text2)

        assert similarity > 0.0
        assert similarity <= 0.3  # Should be capped at 0.3

    def test_intent_keywords_extraction(self):
        """Intent keyword lookup returns the expected terms for two search intents."""
        informational_keywords = self.mapper._get_intent_keywords("informational")
        transactional_keywords = self.mapper._get_intent_keywords("transactional")

        for keyword in ("what", "how", "guide"):
            assert keyword in informational_keywords

        for keyword in ("buy", "purchase", "price"):
            assert keyword in transactional_keywords

    def test_mapping_statistics(self):
        """Mapping statistics report plausible counts and scores within [0, 1]."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)
        stats = self.mapper.get_mapping_statistics(mapping_results)

        assert stats['total_sections'] == len(self.sample_sections)
        assert stats['total_mappings'] > 0
        assert stats['sections_with_sources'] > 0
        assert 0.0 <= stats['average_score'] <= 1.0
        assert 0.0 <= stats['max_score'] <= 1.0
        assert 0.0 <= stats['min_score'] <= 1.0
        assert 0.0 <= stats['mapping_coverage'] <= 1.0

    def test_source_quality_filtering(self):
        """An off-topic, low-credibility source is not mapped to any section."""
        low_quality_source = ResearchSource(
            title="Random Article",
            url="https://example.com/random",
            excerpt="This is a completely unrelated article about cooking recipes and gardening tips.",
            credibility_score=0.3,
            published_at="2025-08-01",
            index=5,
            source_type="web"
        )

        research_with_low_quality = self._research_with_sources(
            [*self.sample_sources, low_quality_source]
        )

        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, research_with_low_quality)

        # Flatten every section's (source, score) pairs down to the sources alone.
        all_mapped_sources = [
            source
            for sources in mapping_results.values()
            for source, _ in sources
        ]

        assert low_quality_source not in all_mapped_sources

    def test_max_sources_per_section_limit(self):
        """No section receives more than ``max_sources_per_section`` sources."""
        many_sources = self.sample_sources * 3  # 15 sources

        research_with_many_sources = self._research_with_sources(many_sources)

        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, research_with_many_sources)

        for sources in mapping_results.values():
            assert len(sources) <= self.mapper.max_sources_per_section

    def test_ai_validation_prompt_building(self):
        """The validation prompt contains every expected heading and JSON field name."""
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        prompt = self.mapper._build_validation_prompt(mapping_results, self.sample_research)

        expected_fragments = (
            "expert content strategist",
            "Research Topic:",
            "ALGORITHMIC MAPPING RESULTS",
            "AVAILABLE SOURCES",
            "VALIDATION TASK",
            "RESPONSE FORMAT",
            "overall_quality_score",
            "section_improvements",
        )
        for fragment in expected_fragments:
            assert fragment in prompt

    def test_ai_validation_response_parsing(self):
        """A well-formed fenced-JSON reply is parsed without losing any section."""
        # Mock AI response
        mock_response = """
        Here's my analysis of the source-to-section mapping:

        ```json
        {
            "overall_quality_score": 8,
            "section_improvements": [
                {
                    "section_id": "s1",
                    "current_sources": ["AI Trends in 2025: Machine Learning Revolution"],
                    "recommended_sources": ["AI Trends in 2025: Machine Learning Revolution", "Machine Learning Algorithms Explained"],
                    "reasoning": "Adding ML algorithms source provides more technical depth",
                    "confidence": 0.9
                }
            ],
            "summary": "Good mapping overall, minor improvements suggested"
        }
        ```
        """

        original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        parsed_mapping = self.mapper._parse_validation_response(mock_response, original_mapping, self.sample_research)

        # The improved section must still be present and non-empty
        assert "s1" in parsed_mapping
        assert len(parsed_mapping["s1"]) > 0

        # Should maintain other sections
        assert len(parsed_mapping) == len(original_mapping)

    def test_ai_validation_fallback_handling(self):
        """A reply with no JSON payload leaves the original mapping unchanged."""
        invalid_response = "This is not a valid JSON response"

        original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        parsed_mapping = self.mapper._parse_validation_response(invalid_response, original_mapping, self.sample_research)

        assert parsed_mapping == original_mapping

    def test_ai_validation_with_missing_sources(self):
        """Recommendations naming unknown sources leave that section's mapping unchanged."""
        # Mock AI response with non-existent source
        mock_response = """
        ```json
        {
            "overall_quality_score": 7,
            "section_improvements": [
                {
                    "section_id": "s1",
                    "current_sources": ["AI Trends in 2025: Machine Learning Revolution"],
                    "recommended_sources": ["Non-existent Source", "Another Fake Source"],
                    "reasoning": "These sources would be better",
                    "confidence": 0.8
                }
            ],
            "summary": "Suggested improvements"
        }
        ```
        """

        original_mapping = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        parsed_mapping = self.mapper._parse_validation_response(mock_response, original_mapping, self.sample_research)

        # Should fallback to original mapping for s1 since no valid sources found
        assert parsed_mapping["s1"] == original_mapping["s1"]

    def test_ai_validation_integration(self):
        """Calling AI validation without a live LLM degrades gracefully.

        No LLM provider is mocked here, so two outcomes are accepted: the call
        returns the original mapping as a fallback, or it raises an exception
        whose message is one of the two known failure messages.
        """
        mapping_results = self.mapper._algorithmic_source_mapping(self.sample_sections, self.sample_research)

        try:
            validated_mapping = self.mapper._ai_validate_mapping(mapping_results, self.sample_research)
        except Exception as e:
            # Expected to fail in test environment, but should be handled gracefully
            assert "AI validation failed" in str(e) or "Failed to get AI validation response" in str(e)
        else:
            # Checked outside the ``try`` so a mismatch surfaces as its own
            # AssertionError instead of being swallowed by the handler above.
            assert validated_mapping == mapping_results

    def test_format_sections_for_prompt(self):
        """Section formatting includes the section id, source title and score."""
        sections_info = [
            {
                'id': 's1',
                'sources': [
                    {
                        'title': 'Test Source 1',
                        'algorithmic_score': 0.85
                    }
                ]
            }
        ]

        formatted = self.mapper._format_sections_for_prompt(sections_info)

        assert "Section s1:" in formatted
        assert "Test Source 1" in formatted
        assert "0.85" in formatted

    def test_format_sources_for_prompt(self):
        """Source formatting includes the title, URL, credibility score and excerpt."""
        sources = [
            {
                'title': 'Test Source',
                'url': 'https://example.com',
                'credibility_score': 0.9,
                'excerpt': 'This is a test excerpt for the source.'
            }
        ]

        formatted = self.mapper._format_sources_for_prompt(sources)

        assert "Test Source" in formatted
        assert "https://example.com" in formatted
        assert "0.9" in formatted
        assert "This is a test excerpt" in formatted

if __name__ == '__main__':
    # pytest.main() returns the run's exit code; propagate it so running this
    # file as a script exits non-zero when any test fails.
    raise SystemExit(pytest.main([__file__]))