chore: push all remaining changes

- Blog writer enhancements and bug fixes - Wix integration improvements - Frontend UI updates - GSC dashboard docs cleanup - Image studio assets - LinkedIn requirements file - Various dependency updates
2026-06-12 20:32:03 +05:30
parent 63a0df2536
commit d90d441019
78 changed files with 3963 additions and 2899 deletions
--- a/backend/services/blog_writer/outline/source_mapper.py
+++ b/backend/services/blog_writer/outline/source_mapper.py
@@ -41,10 +41,33 @@ class SourceToSectionMapper:
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
-            'how', 'what', 'when', 'where', 'why', 'who', 'which', 'how', 'much', 'many', 'more', 'most',
+            'how', 'what', 'when', 'where', 'why', 'who', 'which', 'much', 'many', 'more', 'most',
            'some', 'any', 'all', 'each', 'every', 'other', 'another', 'such', 'no', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'up', 'down', 'out', 'off',
-            'over', 'under', 'again', 'further', 'then', 'once'
+            'over', 'under', 'again', 'further', 'then', 'once', 'also', 'into', 'about', 'between',
+            'through', 'during', 'before', 'after', 'above', 'below', 'from', 'since', 'until', 'while',
+            'because', 'however', 'therefore', 'thus', 'hence', 'yet', 'still', 'already', 'even'
+        }
+        
+        # Common abbreviation/synonym pairs for fuzzy matching
+        self._synonym_map = {
+            'ai': ['artificial intelligence', 'machine intelligence'],
+            'ml': ['machine learning'],
+            'dl': ['deep learning'],
+            'nlp': ['natural language processing'],
+            'iot': ['internet of things'],
+            'saas': ['software as a service'],
+            'b2b': ['business to business'],
+            'b2c': ['business to consumer'],
+            'cx': ['customer experience'],
+            'ux': ['user experience'],
+            'roi': ['return on investment'],
+            'kpi': ['key performance indicator'],
+            'crm': ['customer relationship management'],
+            'erp': ['enterprise resource planning'],
+            'seo': ['search engine optimization'],
+            'cto': ['chief technology officer'],
+            'vp': ['vice president'],
        }
        
        logger.info("✅ SourceToSectionMapper initialized with intelligent mapping algorithms")
@@ -53,15 +76,21 @@ class SourceToSectionMapper:
        self, 
        sections: List[BlogOutlineSection], 
        research_data: BlogResearchResponse,
-        user_id: str
+        user_id: str,
+        competitive_advantage: str = ""
    ) -> List[BlogOutlineSection]:
        """
        Map research sources to outline sections using intelligent algorithms.
        
+        Sections that already have LLM-assigned references (from source_indices
+        in the outline prompt) are preserved. Algorithmic mapping fills gaps
+        for sections without LLM-assigned sources.
+        
        Args:
            sections: List of outline sections to map sources to
            research_data: Research data containing sources and metadata
            user_id: User ID (required for subscription checks and usage tracking)
+            competitive_advantage: Selected competitive advantage to preferentially match
            
        Returns:
            List of outline sections with intelligently mapped sources
@@ -76,16 +105,39 @@ class SourceToSectionMapper:
            logger.warning("No sections or sources to map")
            return sections
        
-        logger.info(f"Mapping {len(research_data.sources)} sources to {len(sections)} sections")
+        # Separate sections with LLM-assigned references from those without
+        sections_with_refs = [s for s in sections if s.references]
+        sections_without_refs = [s for s in sections if not s.references]
        
-        # Step 1: Algorithmic mapping
-        mapping_results = self._algorithmic_source_mapping(sections, research_data)
+        logger.info(
+            f"Mapping {len(research_data.sources)} sources to {len(sections)} sections "
+            f"({len(sections_with_refs)} with LLM-assigned references, "
+            f"{len(sections_without_refs)} need algorithmic mapping)"
+        )
        
-        # Step 2: AI validation and improvement (single prompt, user_id required for subscription checks)
-        validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
+        if sections_without_refs:
+            # Step 1: Algorithmic mapping for sections without LLM-assigned references
+            mapping_results = self._algorithmic_source_mapping(sections_without_refs, research_data, competitive_advantage)
+            
+            # Step 2: AI validation and improvement
+            validated_mapping = self._ai_validate_mapping(mapping_results, research_data, user_id)
+            
+            # Step 3: Apply mapping only to sections that need it
+            mapped_sections_with = self._apply_mapping_to_sections(sections_without_refs, validated_mapping)
+        else:
+            mapped_sections_with = []
        
-        # Step 3: Apply validated mapping to sections
-        mapped_sections = self._apply_mapping_to_sections(sections, validated_mapping)
+        # Combine: keep LLM-assigned sections as-is, add algorithmically mapped ones
+        mapped_sections = list(sections_with_refs) + mapped_sections_with
+        
+        # Preserve original ordering
+        original_ids = [s.id for s in sections]
+        mapped_sections.sort(key=lambda s: original_ids.index(s.id) if s.id in original_ids else 999)
+        
+        # Warn if any section still has zero references
+        for s in mapped_sections:
+            if not s.references:
+                logger.warning(f"Section '{s.heading}' (id={s.id}) has ZERO sources — content generator will use keyword-based fallback")
        
        logger.info("✅ Source-to-section mapping completed successfully")
        return mapped_sections
@@ -93,7 +145,8 @@ class SourceToSectionMapper:
    def _algorithmic_source_mapping(
        self, 
        sections: List[BlogOutlineSection], 
-        research_data: BlogResearchResponse
+        research_data: BlogResearchResponse,
+        competitive_advantage: str = ""
    ) -> Dict[str, List[Tuple[ResearchSource, float]]]:
        """
        Perform algorithmic mapping of sources to sections.
@@ -101,6 +154,7 @@ class SourceToSectionMapper:
        Args:
            sections: List of outline sections
            research_data: Research data with sources
+            competitive_advantage: Selected competitive advantage to boost matching
            
        Returns:
            Dictionary mapping section IDs to list of (source, score) tuples
@@ -114,7 +168,7 @@ class SourceToSectionMapper:
                # Calculate multi-dimensional relevance score
                semantic_score = self._calculate_semantic_similarity(section, source)
                keyword_score = self._calculate_keyword_relevance(section, source, research_data)
-                contextual_score = self._calculate_contextual_relevance(section, source, research_data)
+                contextual_score = self._calculate_contextual_relevance(section, source, research_data, competitive_advantage)
                
                # Weighted total score
                total_score = (
@@ -140,38 +194,54 @@ class SourceToSectionMapper:
    def _calculate_semantic_similarity(self, section: BlogOutlineSection, source: ResearchSource) -> float:
        """
        Calculate semantic similarity between section and source.
-        
-        Args:
-            section: Outline section
-            source: Research source
-            
-        Returns:
-            Semantic similarity score (0.0 to 1.0)
+        Uses word overlap, stem matching, bigram overlap, title-boost, and synonym expansion.
        """
-        # Extract text content for comparison
        section_text = self._extract_section_text(section)
        source_text = self._extract_source_text(source)
        
-        # Calculate word overlap
        section_words = self._extract_meaningful_words(section_text)
        source_words = self._extract_meaningful_words(source_text)
        
        if not section_words or not source_words:
            return 0.0
        
-        # Calculate Jaccard similarity
-        intersection = len(set(section_words) & set(source_words))
-        union = len(set(section_words) | set(source_words))
+        section_set = set(section_words)
+        source_set = set(source_words)
        
-        jaccard_similarity = intersection / union if union > 0 else 0.0
+        # 1. Jaccard similarity on raw words
+        intersection = len(section_set & source_set)
+        union = len(section_set | source_set)
+        jaccard = intersection / union if union > 0 else 0.0
        
-        # Boost score for exact phrase matches
-        phrase_boost = self._calculate_phrase_similarity(section_text, source_text)
+        # 2. Stem matching — catches word variants (e.g., "running" vs "runs")
+        section_stems = set(self._stem_word(w) for w in section_words)
+        source_stems = set(self._stem_word(w) for w in source_words)
+        stem_intersection = len(section_stems & source_stems)
+        stem_union = len(section_stems | source_stems)
+        stem_similarity = stem_intersection / stem_union if stem_union > 0 else 0.0
        
-        # Combine Jaccard similarity with phrase boost
-        semantic_score = min(1.0, jaccard_similarity + phrase_boost)
+        # 3. Bigram overlap — catches multi-word concepts (e.g., "machine learning")
+        section_bigrams = set(self._extract_bigrams(section_text))
+        source_bigrams = set(self._extract_bigrams(source_text))
+        bigram_overlap = len(section_bigrams & source_bigrams)
+        bigram_score = min(0.3, bigram_overlap * 0.1) if (section_bigrams or source_bigrams) else 0.0
        
-        return semantic_score
+        # 4. Title-boost — section heading matching source title is a strong signal
+        heading = (section.heading or '').lower()
+        source_title = (source.title or '').lower()
+        heading_words = set(self._extract_meaningful_words(heading))
+        title_words = set(self._extract_meaningful_words(source_title))
+        title_overlap = len(heading_words & title_words) / len(heading_words | title_words) if (heading_words or title_words) else 0.0
+        title_boost = min(0.3, title_overlap * 0.5)
+        
+        # 5. Synonym expansion — expand abbreviations and match across synonym pairs
+        synonym_score = self._calculate_synonym_overlap(section_words, source_words)
+        
+        # Combine: Jaccard + stem give base, bigram + title + synonyms boost
+        base_similarity = max(jaccard, stem_similarity)
+        combined = min(1.0, base_similarity + bigram_score + title_boost + synonym_score + 0.0)
+        
+        return combined
    
    def _calculate_keyword_relevance(
        self, 
@@ -219,7 +289,8 @@ class SourceToSectionMapper:
        self, 
        section: BlogOutlineSection, 
        source: ResearchSource, 
-        research_data: BlogResearchResponse
+        research_data: BlogResearchResponse,
+        competitive_advantage: str = ""
    ) -> float:
        """
        Calculate contextual relevance based on section content and source context.
@@ -228,6 +299,7 @@ class SourceToSectionMapper:
            section: Outline section
            source: Research source
            research_data: Research data with context
+            competitive_advantage: Selected competitive advantage to boost matching
            
        Returns:
            Contextual relevance score (0.0 to 1.0)
@@ -264,6 +336,15 @@ class SourceToSectionMapper:
            industry_score = sum(1 for word in industry_words if word in source_text) / len(industry_words) if industry_words else 0.0
            contextual_score += industry_score * 0.2
        
+        # 4. Competitive advantage boost — sources that match the advantage get a score lift
+        if competitive_advantage:
+            advantage_words = set(self._extract_meaningful_words(competitive_advantage.lower()))
+            if advantage_words:
+                advantage_in_section = sum(1 for w in advantage_words if w in section_text) / len(advantage_words)
+                advantage_in_source = sum(1 for w in advantage_words if w in source_text) / len(advantage_words)
+                if advantage_in_section > 0.3 and advantage_in_source > 0.3:
+                    contextual_score += 0.25 * (advantage_in_section + advantage_in_source)
+        
        return min(1.0, contextual_score)
    
    def _ai_validate_mapping(
@@ -360,10 +441,15 @@ class SourceToSectionMapper:
        return " ".join(text_parts)
    
    def _extract_source_text(self, source: ResearchSource) -> str:
-        """Extract all text content from a source."""
+        """Extract all text content from a source, including full text for better matching."""
        text_parts = [source.title]
+        if source.summary:
+            text_parts.append(source.summary)
        if source.excerpt:
            text_parts.append(source.excerpt)
+        content = getattr(source, 'content', '') or ''
+        if content:
+            text_parts.append(content[:500])
        return " ".join(text_parts)
    
    def _extract_meaningful_words(self, text: str) -> List[str]:
@@ -382,6 +468,41 @@ class SourceToSectionMapper:
        
        return meaningful_words
    
+    def _stem_word(self, word: str) -> str:
+        """Rudimentary suffix-stripping stemmer for English words."""
+        if len(word) <= 3:
+            return word
+        for suffix in ['ization', 'ation', 'tion', 'sion', 'ment', 'ness', 'ity', 'ing', 'able', 'ible', 'ful', 'less', 'ous', 'ive', 'ally', 'ly', 'er', 'ed', 'es', 's']:
+            if word.endswith(suffix) and len(word) - len(suffix) >= 3:
+                return word[:-len(suffix)]
+        return word
+    
+    def _extract_bigrams(self, text: str) -> List[str]:
+        """Extract meaningful two-word phrases from text."""
+        words = self._extract_meaningful_words(text)
+        if len(words) < 2:
+            return []
+        return [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]
+    
+    def _calculate_synonym_overlap(self, section_words: List[str], source_words: List[str]) -> float:
+        """Score overlap via abbreviation/synonym expansion."""
+        section_set = set(section_words)
+        source_set = set(source_words)
+        extra_matches = 0
+        total_terms = len(section_set | source_set) or 1
+        
+        for abbr, expansions in self._synonym_map.items():
+            abbr_in_section = abbr in section_set
+            abbr_in_source = abbr in source_set
+            for expansion in expansions:
+                exp_words = set(expansion.split())
+                exp_in_section = exp_words.issubset(section_set)
+                exp_in_source = exp_words.issubset(source_set)
+                if (abbr_in_section and exp_in_source) or (abbr_in_source and exp_in_section):
+                    extra_matches += 1
+        
+        return min(0.2, extra_matches * 0.05)
+    
    def _calculate_phrase_similarity(self, text1: str, text2: str) -> float:
        """Calculate phrase similarity boost score."""
        if not text1 or not text2: