AI Analysis and Content Strategy fixes. Enhanced Strategy Routes refactoring.

2026-01-10 19:32:50 +05:30
parent 0b63ae7fc1
commit 8193cdba67
298 changed files with 45678 additions and 10952 deletions
--- a/backend/services/research/intent/query_deduplicator.py
+++ b/backend/services/research/intent/query_deduplicator.py
@@ -0,0 +1,121 @@
+"""
+Query deduplication logic for unified research analyzer.
+
+Removes redundant queries that would return similar results
+and ensures queries are linked to intent fields.
+"""
+
+from typing import List
+from loguru import logger
+
+from models.research_intent_models import ResearchIntent, ResearchQuery
+
+
+def deduplicate_queries(
+    queries: List[ResearchQuery],
+    intent: ResearchIntent,
+    *,
+    max_queries: int = 8,
+    similarity_threshold: float = 0.9,
+) -> List[ResearchQuery]:
+    """
+    Remove redundant queries that would return similar results.
+
+    Rules:
+    1. Exact duplicates (same normalized text and provider) are always dropped.
+    2. With at most ``max_queries`` inputs only rule 1 applies (order is kept).
+    3. Otherwise primary queries (addresses_primary_question=True) are always
+       kept, then the others are considered by descending priority until
+       ``max_queries`` results have been collected.
+    4. A non-primary query is merged into an already kept one only when their
+       word-level Jaccard similarity exceeds ``similarity_threshold`` AND they
+       share purpose, provider, focus areas and also_answering topics.
+
+    Args:
+        queries: Candidate queries. Never reordered, but a kept query object
+            is updated in place when another query is merged into it.
+        intent: Research intent the queries belong to (currently unused).
+        max_queries: Maximum number of results in the similarity pass; only
+            primary queries may push the result beyond it.
+        similarity_threshold: Jaccard score a pair must exceed to be merged.
+
+    Returns:
+        The deduplicated queries.
+    """
+    def _key(q: ResearchQuery) -> tuple:
+        # Exact-duplicate identity: normalized text plus provider.
+        return (q.query.lower().strip(), q.provider)
+
+    seen_keys = set()
+    deduplicated: List[ResearchQuery] = []
+
+    if len(queries) <= max_queries:
+        # Small batches only get the exact-duplicate check.
+        for query in queries:
+            key = _key(query)
+            if key not in seen_keys:
+                seen_keys.add(key)
+                deduplicated.append(query)
+        return deduplicated
+
+    # sorted() rather than list.sort(): the caller's list must not be reordered.
+    ranked = sorted(queries, key=lambda q: q.priority, reverse=True)
+
+    # Primary queries first (should be only one, but handle multiple).
+    for query in ranked:
+        if query.addresses_primary_question and _key(query) not in seen_keys:
+            seen_keys.add(_key(query))
+            deduplicated.append(query)
+
+    merge_fields = ("addresses_secondary_questions", "targets_focus_areas", "covers_also_answering")
+
+    for query in ranked:
+        if query.addresses_primary_question:
+            continue
+        # Check the cap BEFORE adding anything, so a result already filled by
+        # primary queries is not pushed past max_queries by one extra query.
+        if len(deduplicated) >= max_queries:
+            break
+        key = _key(query)
+        if key in seen_keys:
+            continue
+
+        query_words = set(query.query.lower().split())
+        query_focus = set(query.targets_focus_areas)
+        query_also = set(query.covers_also_answering)
+        is_duplicate = False
+
+        for existing in deduplicated:
+            # Queries aimed at different focus areas / also_answering topics,
+            # or with a different purpose or provider, always stay separate.
+            if (
+                query.purpose != existing.purpose
+                or query.provider != existing.provider
+                or query_focus != set(existing.targets_focus_areas)
+                or query_also != set(existing.covers_also_answering)
+            ):
+                continue
+
+            # Jaccard similarity: shared words over all distinct words.
+            existing_words = set(existing.query.lower().split())
+            union = query_words | existing_words
+            similarity = len(query_words & existing_words) / len(union) if union else 0
+            if similarity <= similarity_threshold:
+                continue
+
+            is_duplicate = True
+            # dict.fromkeys de-duplicates while keeping a stable order.
+            for field in merge_fields:
+                combined = getattr(existing, field) + getattr(query, field)
+                setattr(existing, field, list(dict.fromkeys(combined)))
+            # Update expected_results to reflect merged coverage
+            if query.expected_results and query.expected_results not in existing.expected_results:
+                existing.expected_results += f" Also covers: {query.expected_results}"
+            break
+
+        if not is_duplicate:
+            deduplicated.append(query)
+            seen_keys.add(key)
+
+    logger.info(f"Deduplicated queries: {len(queries)} -> {len(deduplicated)}")
+    return deduplicated