# ALwrity/backend/services/research/intent/query_deduplicator.py

"""
Query deduplication logic for unified research analyzer.

Removes redundant queries that would return similar results
and ensures queries are linked to intent fields.
"""

from typing import List
from loguru import logger

from models.research_intent_models import ResearchIntent, ResearchQuery


def deduplicate_queries(
    queries: List[ResearchQuery],
    intent: ResearchIntent,
    *,
    max_queries: int = 8,
    similarity_threshold: float = 0.9,
) -> List[ResearchQuery]:
    """
    Remove redundant queries that would return similar results.

    Behaviour:
    1. When there are at most ``max_queries`` queries, only exact duplicates
       (same lower-cased, stripped text and same provider) are dropped and
       the input order is kept.
    2. Otherwise queries are ranked by ``priority`` (highest first).
    3. Every primary query (``addresses_primary_question`` is truthy) is kept,
       minus exact duplicates. Primary queries are never dropped to satisfy
       the cap.
    4. Each remaining query is compared with the queries already kept. It is
       merged into a kept query when both have the same purpose, provider,
       focus areas and also-answering topics, and the Jaccard similarity of
       their lower-cased word sets is greater than ``similarity_threshold``.
       Merging extends the kept query's linking lists (first-seen order is
       preserved) and appends the merged query's ``expected_results``.
    5. No non-primary query is added once ``max_queries`` queries are kept.

    Args:
        queries: Candidate queries. The list itself is not reordered, but a
            kept query object is updated in place when another query is
            merged into it.
        intent: Research intent the queries belong to. Not read by this
            function; kept so the signature stays compatible with callers.
        max_queries: Cap on the number of returned queries (default 8).
        similarity_threshold: Jaccard similarity above which two otherwise
            equivalent queries are merged (default 0.9).

    Returns:
        A new list with the queries that survived deduplication.
    """
    if len(queries) <= max_queries:
        # Small batches: still check for exact duplicates, nothing else.
        seen_keys = set()
        unique = []
        for query in queries:
            key = _query_key(query)
            if key not in seen_keys:
                seen_keys.add(key)
                unique.append(query)
        return unique

    # sorted() instead of list.sort() so the caller's list keeps its order.
    ranked = sorted(queries, key=lambda q: q.priority, reverse=True)

    deduplicated = []
    seen_keys = set()

    # Primary queries first (should be only one, but handle multiple).
    for query in ranked:
        if not query.addresses_primary_question:
            continue
        key = _query_key(query)
        if key not in seen_keys:
            seen_keys.add(key)
            deduplicated.append(query)

    # Remaining queries, highest priority first, with similarity checking.
    for query in ranked:
        if query.addresses_primary_question:
            continue

        # Check the cap *before* considering the query, so that a result list
        # already filled by primary queries is never extended.
        if len(deduplicated) >= max_queries:
            break

        key = _query_key(query)
        if key in seen_keys:
            continue

        query_words = set(query.query.lower().split())
        match = next(
            (
                kept
                for kept in deduplicated
                if _is_near_duplicate(
                    query, query_words, kept, similarity_threshold
                )
            ),
            None,
        )

        if match is None:
            deduplicated.append(query)
            seen_keys.add(key)
        else:
            _merge_into(match, query)

    logger.info(f"Deduplicated queries: {len(queries)} -> {len(deduplicated)}")
    return deduplicated


def _query_key(query: ResearchQuery) -> tuple:
    """Return the exact-duplicate key: normalised query text plus provider."""
    return (query.query.lower().strip(), query.provider)


def _is_near_duplicate(
    query: ResearchQuery,
    query_words: set,
    existing: ResearchQuery,
    similarity_threshold: float,
) -> bool:
    """Tell whether ``query`` may be merged into the already kept ``existing``."""
    # Queries with a different purpose or provider are never merged.
    if query.purpose != existing.purpose or query.provider != existing.provider:
        return False

    # Queries that target different focus areas or also-answering topics must
    # stay separate even when their wording is nearly identical.
    if set(query.targets_focus_areas) != set(existing.targets_focus_areas):
        return False
    if set(query.covers_also_answering) != set(existing.covers_also_answering):
        return False

    # Jaccard similarity: shared words over all words used by either query.
    existing_words = set(existing.query.lower().split())
    union = query_words | existing_words
    similarity = len(query_words & existing_words) / len(union) if union else 0
    return similarity > similarity_threshold


def _merge_unique(first: list, second: list) -> list:
    """Concatenate two lists, dropping repeats while keeping first-seen order."""
    return list(dict.fromkeys([*first, *second]))


def _merge_into(existing: ResearchQuery, query: ResearchQuery) -> None:
    """Fold the linking information of ``query`` into the kept ``existing``."""
    existing.addresses_secondary_questions = _merge_unique(
        existing.addresses_secondary_questions,
        query.addresses_secondary_questions,
    )
    existing.targets_focus_areas = _merge_unique(
        existing.targets_focus_areas, query.targets_focus_areas
    )
    existing.covers_also_answering = _merge_unique(
        existing.covers_also_answering, query.covers_also_answering
    )

    # Update expected_results to reflect merged coverage. A missing (None)
    # value on the kept query is treated as empty instead of raising.
    current = existing.expected_results or ""
    if query.expected_results and query.expected_results not in current:
        existing.expected_results = (
            f"{current} Also covers: {query.expected_results}"
        )