# ALwrity/backend/services/research/intent/unified_research_analyzer.py

"""
Unified Research Analyzer

Combines intent inference, query generation, and parameter optimization
into a single AI call with justifications for each decision.

This reduces 2 LLM calls to 1, improves coherence, and provides
user-friendly justifications for all settings.

Author: ALwrity Team
Version: 1.0
"""

import json
from typing import Dict, Any, List, Optional, Tuple
from loguru import logger

from models.research_intent_models import (
    ResearchIntent,
    ResearchQuery,
    IntentInferenceResponse,
    ResearchPurpose,
    ContentOutput,
    ExpectedDeliverable,
    ResearchDepthLevel,
    InputType,
)
from models.research_persona_models import ResearchPersona


class UnifiedResearchAnalyzer:
    """
    Unified AI-driven analyzer that performs:
    1. Intent inference (what the user wants)
    2. Query generation (how to search)
    3. Parameter optimization (Exa/Tavily settings)
    4. Google Trends keyword suggestions (when trends are relevant)

    All in a single LLM call, with a justification for each setting.
    """

    def __init__(self):
        """Initialize the unified analyzer."""
        logger.info("UnifiedResearchAnalyzer initialized")

    async def analyze(
        self,
        user_input: str,
        keywords: Optional[List[str]] = None,
        research_persona: Optional[ResearchPersona] = None,
        competitor_data: Optional[List[Dict]] = None,
        industry: Optional[str] = None,
        target_audience: Optional[str] = None,
        user_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Perform unified analysis of user research request.

        Returns:
            Dict containing:
            - intent: ResearchIntent
            - queries: List[ResearchQuery]
            - exa_config: Dict with settings and justifications
            - tavily_config: Dict with settings and justifications
            - recommended_provider: str
            - provider_justification: str
        """
        try:
            logger.info(f"Unified analysis for: {user_input[:100]}...")

            keywords = keywords or []

            # Build the unified prompt
            prompt = self._build_unified_prompt(
                user_input=user_input,
                keywords=keywords,
                research_persona=research_persona,
                competitor_data=competitor_data,
                industry=industry,
                target_audience=target_audience,
            )

            # Define the comprehensive JSON schema
            unified_schema = self._build_unified_schema()

            # Call LLM (single call for everything)
            from services.llm_providers.main_text_generation import llm_text_gen

            result = llm_text_gen(
                prompt=prompt,
                json_struct=unified_schema,
                user_id=user_id
            )

            if isinstance(result, dict) and "error" in result:
                logger.error(f"Unified analysis failed: {result.get('error')}")
                return self._create_fallback_response(user_input, keywords)

            # Parse the unified result
            return self._parse_unified_result(result, user_input)

        except Exception as e:
            logger.error(f"Error in unified analysis: {e}")
            return self._create_fallback_response(user_input, keywords or [])

    def _build_unified_prompt(
        self,
        user_input: str,
        keywords: List[str],
        research_persona: Optional[ResearchPersona] = None,
        competitor_data: Optional[List[Dict]] = None,
        industry: Optional[str] = None,
        target_audience: Optional[str] = None,
    ) -> str:
        """Build the unified prompt for intent + queries + parameters."""

        # Build persona context
        persona_context = self._build_persona_context(research_persona, industry, target_audience)

        # Build competitor context
        competitor_context = self._build_competitor_context(competitor_data)

        prompt = f'''You are an expert AI research strategist. Analyze the user's research request and provide a complete research plan including intent understanding, search queries, and optimal API settings.

## USER INPUT
"{user_input}"
{f"KEYWORDS: {', '.join(keywords)}" if keywords else ""}

## USER CONTEXT
{persona_context}
{competitor_context}

## YOUR TASK: Provide a Complete Research Plan

### PART 1: INTENT ANALYSIS
Understand what the user really wants from their research.

### PART 2: SEARCH QUERIES
Generate 4-8 targeted search queries optimized for semantic search.

### PART 3: PROVIDER SETTINGS
Configure Exa and Tavily API parameters with justifications.

### PART 4: GOOGLE TRENDS KEYWORDS (if trends in deliverables)
If "trends" is in expected_deliverables OR purpose is "explore_trends":
- Suggest 1-3 optimized keywords for Google Trends analysis
- These may differ from research queries (trends need broader, searchable terms)
- Consider: What keywords will show meaningful trends over time?
- Consider: What timeframe will show relevant trends? (1 year, 12 months, etc.)
- Consider: What geographic region is most relevant for the user?
- Explain what insights trends will uncover for content generation:
  * Search interest trends over time (optimal publication timing)
  * Regional interest distribution (audience targeting)
  * Related topics for content expansion
  * Related queries for FAQ sections
  * Rising topics for timely content opportunities

---

## AVAILABLE PROVIDER OPTIONS

### EXA API OPTIONS (Semantic Search Engine)
| Parameter | Options | Description |
|-----------|---------|-------------|
| type | "auto", "neural", "fast", "deep" | "neural" = semantic understanding, "deep" = comprehensive with query expansion |
| category | "company", "research paper", "news", "github", "tweet", "personal site", "pdf", "financial report", "people" | Focus on specific content types |
| numResults | 5-25 | Number of results (10 recommended) |
| includeDomains | string[] | Domains to include (e.g., ["arxiv.org", "nature.com"]) |
| excludeDomains | string[] | Domains to exclude |
| startPublishedDate | ISO date | Filter by publish date (e.g., "2024-01-01T00:00:00.000Z") |
| text | boolean | Include full text content |
| highlights | boolean | Extract key highlights |
| context | boolean | Return as single context string for RAG |

**WHEN TO USE EXA:**
- Semantic understanding needed (finding similar content)
- Academic/research papers
- Company/competitor research
- Deep, comprehensive results
- Historical content

### TAVILY API OPTIONS (AI-Powered Search)
| Parameter | Options | Description |
|-----------|---------|-------------|
| topic | "general", "news", "finance" | Search topic category |
| search_depth | "basic", "advanced" | "advanced" = multiple semantic snippets per URL |
| include_answer | false, true, "basic", "advanced" | AI-generated answer from results |
| include_raw_content | false, true, "markdown", "text" | Raw page content format |
| time_range | "day", "week", "month", "year" | Filter by recency |
| max_results | 5-20 | Number of results |
| include_domains | string[] | Domains to include |
| exclude_domains | string[] | Domains to exclude |

**WHEN TO USE TAVILY:**
- Real-time/current events
- News and trending topics
- Quick facts with AI answers
- Financial data
- Recent time-sensitive content

---

## OUTPUT FORMAT

Return a JSON object with this exact structure:

```json
{{
    "intent": {{
        "input_type": "keywords|question|goal|mixed",
        "primary_question": "The main question to answer",
        "secondary_questions": ["question 1", "question 2"],
        "purpose": "learn|create_content|make_decision|compare|solve_problem|find_data|explore_trends|validate|generate_ideas",
        "content_output": "blog|podcast|video|social_post|newsletter|presentation|report|whitepaper|email|general",
        "expected_deliverables": ["key_statistics", "expert_quotes", "case_studies", "trends", "best_practices"],
        "depth": "overview|detailed|expert",
        "focus_areas": ["area1", "area2"],
        "perspective": "target perspective or null",
        "time_sensitivity": "real_time|recent|historical|evergreen",
        "confidence": 0.85,
        "confidence_reason": "Why this confidence level",
        "great_example": "Example of better input if confidence < 0.8",
        "needs_clarification": false,
        "clarifying_questions": [],
        "analysis_summary": "Brief summary of research plan"
    }},
    "queries": [
        {{
            "query": "Optimized search query string",
            "purpose": "key_statistics|expert_quotes|case_studies|trends|etc",
            "provider": "exa|tavily",
            "priority": 5,
            "expected_results": "What we expect to find",
            "justification": "Why this query and provider"
        }}
    ],
    "enhanced_keywords": ["expanded", "related", "keywords"],
    "research_angles": ["Angle 1: ...", "Angle 2: ..."],
    "recommended_provider": "exa|tavily",
    "provider_justification": "Why this provider is best for this research",
    "exa_config": {{
        "enabled": true,
        "type": "auto|neural|fast|deep",
        "type_justification": "Why this search type",
        "category": "news|research paper|company|etc or null",
        "category_justification": "Why this category or null",
        "numResults": 10,
        "numResults_justification": "Why this number",
        "includeDomains": [],
        "includeDomains_justification": "Why these domains or empty",
        "startPublishedDate": "2024-01-01T00:00:00.000Z or null",
        "date_justification": "Why this date filter or null",
        "highlights": true,
        "highlights_justification": "Why enable/disable highlights",
        "context": true,
        "context_justification": "Why enable/disable context string"
    }},
    "tavily_config": {{
        "enabled": true,
        "topic": "general|news|finance",
        "topic_justification": "Why this topic",
        "search_depth": "basic|advanced",
        "search_depth_justification": "Why this depth",
        "include_answer": "true|false|basic|advanced",
        "include_answer_justification": "Why this answer mode",
        "time_range": "day|week|month|year|null",
        "time_range_justification": "Why this time range or null",
        "max_results": 10,
        "max_results_justification": "Why this number",
        "include_raw_content": "false|true|markdown|text",
        "include_raw_content_justification": "Why this content mode"
    }},
    "trends_config": {{
        "enabled": true|false,
        "keywords": ["keyword1", "keyword2"],
        "keywords_justification": "Why these keywords for trends analysis",
        "timeframe": "today 1-y|today 12-m|all",
        "timeframe_justification": "Why this timeframe",
        "geo": "US|GB|IN|etc",
        "geo_justification": "Why this geographic region",
        "expected_insights": [
            "Search interest trends over the past year",
            "Regional interest distribution",
            "Related topics for content expansion",
            "Related queries for FAQ sections",
            "Optimal publication timing based on interest peaks"
        ]
    }}
}}
```

## DECISION RULES

1. **Provider Selection:**
   - Use EXA for: academic research, competitor analysis, deep understanding, finding similar content
   - Use TAVILY for: news, current events, quick facts, financial data, real-time info

2. **Query Optimization:**
   - Include relevant keywords for semantic matching
   - Add context words based on deliverables (e.g., "statistics 2024" for key_statistics)
   - Match query style to provider (natural language for Exa, keyword-rich for Tavily)

3. **Parameter Selection:**
   - ALWAYS provide justification for each parameter choice
   - Consider time sensitivity when setting date filters
   - Match category/topic to content type
   - Use "advanced" depth when quality matters more than speed

4. **Google Trends Keywords (if trends enabled):**
   - Suggest 1-3 keywords optimized for trends analysis
   - Keywords should be broader than research queries (e.g., "AI marketing" vs "AI marketing tools for small businesses")
   - Consider what will show meaningful search interest trends
   - Choose timeframe based on content type (12 months for blogs, 1 year for comprehensive)
   - Select geo based on user's target audience or industry
   - List specific insights trends will uncover

5. **Justifications:**
   - Keep justifications concise (1 sentence)
   - Explain the "why" not the "what"
   - Reference user's intent when relevant
'''

        return prompt

    def _build_unified_schema(self) -> Dict[str, Any]:
        """Build the JSON schema for unified response."""
        return {
            "type": "object",
            "properties": {
                "intent": {
                    "type": "object",
                    "properties": {
                        "input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
                        "primary_question": {"type": "string"},
                        "secondary_questions": {"type": "array", "items": {"type": "string"}},
                        "purpose": {"type": "string"},
                        "content_output": {"type": "string"},
                        "expected_deliverables": {"type": "array", "items": {"type": "string"}},
                        "depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
                        "focus_areas": {"type": "array", "items": {"type": "string"}},
                        "perspective": {"type": "string"},
                        "time_sensitivity": {"type": "string"},
                        "confidence": {"type": "number"},
                        "confidence_reason": {"type": "string"},
                        "great_example": {"type": "string"},
                        "needs_clarification": {"type": "boolean"},
                        "clarifying_questions": {"type": "array", "items": {"type": "string"}},
                        "analysis_summary": {"type": "string"}
                    },
                    "required": ["primary_question", "purpose", "expected_deliverables", "confidence"]
                },
                "queries": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "query": {"type": "string"},
                            "purpose": {"type": "string"},
                            "provider": {"type": "string"},
                            "priority": {"type": "integer"},
                            "expected_results": {"type": "string"},
                            "justification": {"type": "string"}
                        },
                        "required": ["query", "purpose", "provider", "priority"]
                    }
                },
                "enhanced_keywords": {"type": "array", "items": {"type": "string"}},
                "research_angles": {"type": "array", "items": {"type": "string"}},
                "recommended_provider": {"type": "string"},
                "provider_justification": {"type": "string"},
                "exa_config": {
                    "type": "object",
                    "properties": {
                        "enabled": {"type": "boolean"},
                        "type": {"type": "string"},
                        "type_justification": {"type": "string"},
                        "category": {"type": "string"},
                        "category_justification": {"type": "string"},
                        "numResults": {"type": "integer"},
                        "numResults_justification": {"type": "string"},
                        "includeDomains": {"type": "array", "items": {"type": "string"}},
                        "includeDomains_justification": {"type": "string"},
                        "startPublishedDate": {"type": "string"},
                        "date_justification": {"type": "string"},
                        "highlights": {"type": "boolean"},
                        "highlights_justification": {"type": "string"},
                        "context": {"type": "boolean"},
                        "context_justification": {"type": "string"}
                    }
                },
                "tavily_config": {
                    "type": "object",
                    "properties": {
                        "enabled": {"type": "boolean"},
                        "topic": {"type": "string"},
                        "topic_justification": {"type": "string"},
                        "search_depth": {"type": "string"},
                        "search_depth_justification": {"type": "string"},
                        "include_answer": {"type": "string"},
                        "include_answer_justification": {"type": "string"},
                        "time_range": {"type": "string"},
                        "time_range_justification": {"type": "string"},
                        "max_results": {"type": "integer"},
                        "max_results_justification": {"type": "string"},
                        "include_raw_content": {"type": "string"},
                        "include_raw_content_justification": {"type": "string"}
                    }
                },
                "trends_config": {
                    "type": "object",
                    "properties": {
                        "enabled": {"type": "boolean"},
                        "keywords": {"type": "array", "items": {"type": "string"}},
                        "keywords_justification": {"type": "string"},
                        "timeframe": {"type": "string"},
                        "timeframe_justification": {"type": "string"},
                        "geo": {"type": "string"},
                        "geo_justification": {"type": "string"},
                        "expected_insights": {"type": "array", "items": {"type": "string"}}
                    }
                }
            },
            "required": ["intent", "queries", "recommended_provider", "exa_config", "tavily_config"]
        }

    def _build_persona_context(
        self,
        research_persona: Optional[ResearchPersona],
        industry: Optional[str],
        target_audience: Optional[str],
    ) -> str:
        """Build persona context section."""
        parts = []

        if research_persona:
            if research_persona.default_industry:
                parts.append(f"Industry: {research_persona.default_industry}")
            if research_persona.default_target_audience:
                parts.append(f"Target Audience: {research_persona.default_target_audience}")
            if research_persona.research_angles:
                parts.append(f"Preferred Research Angles: {', '.join(research_persona.research_angles[:3])}")
            if research_persona.suggested_keywords:
                parts.append(f"Relevant Keywords: {', '.join(research_persona.suggested_keywords[:5])}")
        else:
            if industry:
                parts.append(f"Industry: {industry}")
            if target_audience:
                parts.append(f"Target Audience: {target_audience}")

        if not parts:
            return "No specific user context available. Use general best practices."

        return "\n".join(parts)

    def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
        """Build competitor context section."""
        if not competitor_data:
            return ""

        competitor_names = [c.get("name", c.get("url", "")) for c in competitor_data[:5]]
        if competitor_names:
            return f"\nKnown Competitors: {', '.join(competitor_names)}"
        return ""

    def _parse_unified_result(self, result: Dict[str, Any], user_input: str) -> Dict[str, Any]:
        """Parse the unified LLM result into structured response."""

        intent_data = result.get("intent", {})

        # Build ResearchIntent
        intent = ResearchIntent(
            primary_question=intent_data.get("primary_question", user_input),
            secondary_questions=intent_data.get("secondary_questions", []),
            purpose=intent_data.get("purpose", "learn"),
            content_output=intent_data.get("content_output", "general"),
            expected_deliverables=intent_data.get("expected_deliverables", ["key_statistics"]),
            depth=intent_data.get("depth", "detailed"),
            focus_areas=intent_data.get("focus_areas", []),
            perspective=intent_data.get("perspective"),
            time_sensitivity=intent_data.get("time_sensitivity"),
            input_type=intent_data.get("input_type", "keywords"),
            original_input=user_input,
            confidence=float(intent_data.get("confidence", 0.7)),
            confidence_reason=intent_data.get("confidence_reason"),
            great_example=intent_data.get("great_example"),
            needs_clarification=intent_data.get("needs_clarification", False),
            clarifying_questions=intent_data.get("clarifying_questions", []),
        )

        # Build queries
        queries = []
        for q in result.get("queries", []):
            try:
                queries.append(ResearchQuery(
                    query=q.get("query", ""),
                    purpose=q.get("purpose", "key_statistics"),
                    provider=q.get("provider", "exa"),
                    priority=int(q.get("priority", 3)),
                    expected_results=q.get("expected_results", ""),
                ))
            except Exception as e:
                logger.warning(f"Failed to parse query: {e}")

        return {
            "success": True,
            "intent": intent,
            "queries": queries,
            "enhanced_keywords": result.get("enhanced_keywords", []),
            "research_angles": result.get("research_angles", []),
            "recommended_provider": result.get("recommended_provider", "exa"),
            "provider_justification": result.get("provider_justification", ""),
            "exa_config": result.get("exa_config", {}),
            "tavily_config": result.get("tavily_config", {}),
            "trends_config": result.get("trends_config", {}),  # NEW: Google Trends configuration
            "analysis_summary": intent_data.get("analysis_summary", ""),
        }

    def _create_fallback_response(self, user_input: str, keywords: List[str]) -> Dict[str, Any]:
        """Create fallback response when analysis fails."""
        return {
            "success": False,
            "intent": ResearchIntent(
                primary_question=f"What are the key insights about: {user_input}?",
                purpose="learn",
                content_output="general",
                expected_deliverables=["key_statistics", "best_practices"],
                depth="detailed",
                original_input=user_input,
                confidence=0.5,
            ),
            "queries": [
                ResearchQuery(
                    query=user_input,
                    purpose="key_statistics",
                    provider="exa",
                    priority=5,
                    expected_results="General research results",
                )
            ],
            "enhanced_keywords": keywords,
            "research_angles": [],
            "recommended_provider": "exa",
            "provider_justification": "Default fallback to Exa for semantic search",
            "exa_config": {
                "enabled": True,
                "type": "auto",
                "type_justification": "Auto mode for balanced results",
                "numResults": 10,
                "highlights": True,
            },
            "tavily_config": {
                "enabled": True,
                "topic": "general",
                "search_depth": "advanced",
                "include_answer": True,
            },
            "trends_config": {
                "enabled": False,  # Disabled in fallback
            },
        }