# ALwrity/backend/services/research/intent/unified_result_parser.py

"""
Result parsing logic for unified research analyzer.

Parses LLM response into structured ResearchIntent, ResearchQuery,
and configuration dictionaries.
"""

from typing import Dict, Any, List
from loguru import logger

from models.research_intent_models import (
    ResearchIntent, ResearchQuery,
    ResearchPurpose, ContentOutput, ExpectedDeliverable,
    ResearchDepthLevel, InputType
)
from .query_deduplicator import deduplicate_queries


def _normalize_purpose(value: str) -> str:
    """Map a free-form purpose string onto a ResearchPurpose value.

    Resolution order:
      1. Empty or non-string input yields ``"learn"``.
      2. A case-insensitive exact match on an enum member's value or name
         yields that member's value.
      3. Otherwise the first keyword group (in the order listed below) with
         a substring hit decides the result.
      4. Anything else yields ``"learn"``.
    """
    if not (value and isinstance(value, str)):
        return "learn"

    lowered = value.lower()

    # Exact enum match takes priority over the keyword heuristics.
    for member in ResearchPurpose:
        if lowered in (member.value, member.name.lower()):
            return member.value

    # Ordered (keywords, result) pairs: the first group with any substring
    # hit wins, so the order here is significant.
    keyword_table = (
        (("content", "write", "create", "blog"), "create_content"),
        (("compare", "comparison"), "compare"),
        (("decision", "choose"), "make_decision"),
        (("problem", "solve"), "solve_problem"),
        (("data", "statistic", "fact"), "find_data"),
        (("trend",), "explore_trends"),
        (("validat", "verify"), "validate"),
        (("idea", "brainstorm"), "generate_ideas"),
    )
    for keywords, resolved in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return resolved

    return "learn"


def _normalize_content_output(value: str) -> str:
    """Map a free-form content-output string onto a ContentOutput value.

    Empty or non-string input yields ``"general"``. A case-insensitive exact
    match on an enum member's value or name wins next; after that the first
    keyword group (in the order listed below) with a substring hit decides.
    Unrecognised input yields ``"general"``.
    """
    if not (value and isinstance(value, str)):
        return "general"

    lowered = value.lower()

    # Exact enum match takes priority over the keyword heuristics.
    for member in ContentOutput:
        if lowered in (member.value, member.name.lower()):
            return member.value

    # Ordered (keywords, result) pairs: the first group with any substring
    # hit wins, so the order here is significant.
    keyword_table = (
        (("blog", "article"), "blog"),
        (("podcast",), "podcast"),
        (("video",), "video"),
        (("social", "post"), "social_post"),
        (("newsletter",), "newsletter"),
        (("presentation", "slide"), "presentation"),
        (("report",), "report"),
        (("whitepaper", "white paper"), "whitepaper"),
        (("email",), "email"),
    )
    for keywords, resolved in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return resolved

    return "general"


def _normalize_deliverable(value: str) -> str:
    """Map a free-form deliverable string onto an ExpectedDeliverable value.

    Empty or non-string input yields ``"key_statistics"``. The input is
    lower-cased and stripped, then matched exactly against each enum
    member's value or name. If that fails, an ordered list of keyword rules
    is scanned and the first rule with a hit decides. Unrecognised input
    yields ``"key_statistics"``.
    """
    if not (value and isinstance(value, str)):
        return "key_statistics"

    text = value.lower().strip()

    # Exact enum match takes priority over the keyword heuristics.
    for member in ExpectedDeliverable:
        if text in (member.value, member.name.lower()):
            return member.value

    # Ordered (result, terms) rules; the first rule with any matching term
    # wins. A plain string term is a substring test; a tuple term requires
    # every part to be present (used for the "pro" + "con" combination).
    rules = (
        ("key_statistics", ("statistic", "data", "number", "metric", "report")),
        ("expert_quotes", ("quote", "expert")),
        ("case_studies", ("case", "study")),
        # "compar" already covers both "compare" and "comparison".
        ("comparisons", ("compar", "landscape", "matrix")),
        ("trends", ("trend", "keyword", "seo")),
        ("best_practices", ("practice", "best", "guideline", "recommendation", "calendar")),
        ("step_by_step", ("step", "how", "process", "guide", "outline", "heading")),
        ("pros_cons", (("pro", "con"), "advantage", "disadvantage")),
        ("definitions", ("defin", "explain")),
        ("citations", ("citation", "source", "reference")),
        ("examples", ("example", "sample")),
        ("predictions", ("prediction", "future", "outlook")),
    )

    def term_hits(term) -> bool:
        if isinstance(term, tuple):
            return all(part in text for part in term)
        return term in text

    for resolved, terms in rules:
        if any(term_hits(term) for term in terms):
            return resolved

    # Default fallback
    return "key_statistics"


def parse_unified_result(result: Dict[str, Any], user_input: str) -> Dict[str, Any]:
    """
    Parse the unified LLM result into structured response.

    A ``null`` / non-dict ``intent`` and a ``null`` / non-sequence ``queries``
    value are treated exactly like a missing key, so a partially filled LLM
    response degrades to defaults instead of raising.

    Args:
        result: Raw LLM response dictionary. The keys read are ``intent``,
            ``queries``, ``enhanced_keywords``, ``research_angles``,
            ``recommended_provider``, ``provider_justification``,
            ``exa_config``, ``tavily_config`` and ``trends_config``.
        user_input: Original user input. Stored as ``original_input`` on the
            intent and used as ``primary_question`` when that key is missing.

    Returns:
        Structured response with ``success``, ``intent``, ``queries``,
        ``enhanced_keywords``, ``research_angles``, ``recommended_provider``,
        ``provider_justification``, ``exa_config``, ``tavily_config``,
        ``trends_config`` and ``analysis_summary``. If the ResearchIntent
        cannot be constructed, the value of
        ``create_fallback_response(user_input, [])`` is returned instead.
    """
    # dict.get() only uses its default when the key is absent. An explicit
    # null (or any other non-dict value) must be replaced here, otherwise the
    # first intent_data.get() below raises AttributeError.
    intent_data = result.get("intent")
    if not isinstance(intent_data, dict):
        intent_data = {}

    # Normalize enum values
    purpose_value = _normalize_purpose(intent_data.get("purpose", "learn"))
    content_output_value = _normalize_content_output(intent_data.get("content_output", "general"))

    # Normalize deliverables list
    deliverables_raw = intent_data.get("expected_deliverables", ["key_statistics"])
    if not isinstance(deliverables_raw, list):
        # A single scalar becomes a one-element list; a falsy value becomes the default.
        deliverables_raw = [deliverables_raw] if deliverables_raw else ["key_statistics"]
    normalized_deliverables = [_normalize_deliverable(d) for d in deliverables_raw if d]
    if not normalized_deliverables:
        normalized_deliverables = ["key_statistics"]

    # Build ResearchIntent
    try:
        intent = ResearchIntent(
            primary_question=intent_data.get("primary_question", user_input),
            secondary_questions=intent_data.get("secondary_questions", []),
            purpose=purpose_value,
            content_output=content_output_value,
            expected_deliverables=normalized_deliverables,
            depth=intent_data.get("depth", "detailed"),
            focus_areas=intent_data.get("focus_areas", []),
            also_answering=intent_data.get("also_answering", []),
            perspective=intent_data.get("perspective"),
            time_sensitivity=intent_data.get("time_sensitivity"),
            input_type=intent_data.get("input_type", "keywords"),
            original_input=user_input,
            confidence=float(intent_data.get("confidence", 0.7)),
            confidence_reason=intent_data.get("confidence_reason"),
            great_example=intent_data.get("great_example"),
            needs_clarification=intent_data.get("needs_clarification", False),
            clarifying_questions=intent_data.get("clarifying_questions", []),
        )
    except Exception as e:
        logger.error(f"Failed to parse intent: {e}, intent_data: {intent_data}")
        # Return fallback intent (imported at function scope, as in the original code)
        from .unified_analyzer_utils import create_fallback_response
        return create_fallback_response(user_input, [])

    # Same reasoning as for "intent": a null or wrong-typed "queries" value
    # would make the loop below raise TypeError outside any try block.
    raw_queries = result.get("queries")
    if not isinstance(raw_queries, (list, tuple)):
        raw_queries = []

    # Build queries; a malformed entry is logged and skipped, not fatal.
    queries = []
    for q in raw_queries:
        try:
            # Normalize query purpose
            query_purpose = _normalize_deliverable(q.get("purpose", "key_statistics"))
            queries.append(ResearchQuery(
                query=q.get("query", ""),
                purpose=query_purpose,
                provider=q.get("provider", "exa"),
                priority=int(q.get("priority", 3)),
                expected_results=q.get("expected_results", ""),
                addresses_primary_question=q.get("addresses_primary_question", False),
                addresses_secondary_questions=q.get("addresses_secondary_questions", []),
                targets_focus_areas=q.get("targets_focus_areas", []),
                covers_also_answering=q.get("covers_also_answering", []),
                justification=q.get("justification"),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse query: {e}, query: {q}")

    # Deduplicate queries to avoid redundant API calls
    queries = deduplicate_queries(queries, intent)

    # Log warning if no queries after parsing
    if not queries:
        logger.warning("No valid queries parsed from LLM response")

    return {
        "success": True,
        "intent": intent,
        "queries": queries,
        "enhanced_keywords": result.get("enhanced_keywords", []),
        "research_angles": result.get("research_angles", []),
        "recommended_provider": result.get("recommended_provider", "exa"),
        "provider_justification": result.get("provider_justification", ""),
        "exa_config": result.get("exa_config", {}),
        "tavily_config": result.get("tavily_config", {}),
        "trends_config": result.get("trends_config", {}),  # Google Trends configuration
        # Read from the intent section of the response, not the top level.
        "analysis_summary": intent_data.get("analysis_summary", ""),
    }