# ALwrity/backend/services/research/core/research_engine.py

"""
Research Engine - Core Orchestrator

The main entry point for AI research across all ALwrity tools.
This engine wraps existing providers (Exa, Tavily, Google) and provides
a unified interface for any content generation tool.

Usage:
    from services.research.core import (
        ResearchEngine,
        ResearchContext,
        ContentType,
        ResearchPersonalizationContext,
    )

    engine = ResearchEngine()
    result = await engine.research(ResearchContext(
        query="AI trends in healthcare 2025",
        content_type=ContentType.PODCAST,
        personalization=ResearchPersonalizationContext(
            industry="Healthcare",
            target_audience="Medical professionals"
        )
    ))

Author: ALwrity Team
Version: 2.0
"""

import os
import time
from typing import Dict, Any, Optional, Callable
from loguru import logger

from .research_context import (
    ResearchContext,
    ResearchResult,
    ResearchDepth,
    ContentType,
    ResearchPersonalizationContext,
)
from .parameter_optimizer import ParameterOptimizer

# Reuse existing blog writer models and services
from models.blog_models import (
    BlogResearchRequest,
    BlogResearchResponse,
    ResearchConfig,
    ResearchProvider,
    ResearchMode,
    PersonaInfo,
    ResearchSource,
)

# Research persona for personalization
from models.research_persona_models import ResearchPersona


class ResearchEngine:
    """
    AI Research Engine - Standalone module for content research.

    This engine:
    1. Accepts a ResearchContext from any tool
    2. Uses AI to optimize parameters for Exa/Tavily
    3. Integrates research persona for personalization
    4. Executes research using existing providers
    5. Returns standardized ResearchResult

    Can be imported by Blog Writer, Podcast Maker, YouTube Creator, etc.
    """

    def __init__(self, db_session=None):
        """
        Set up the engine's collaborators and detect which providers are configured.

        Args:
            db_session: Optional database session. When given, it is reused for
                persona lookups and subscription preflight checks; otherwise
                those code paths obtain their own session on demand.
        """
        # Chooses the provider and search configuration for each request.
        self.optimizer = ParameterOptimizer()

        # Caller-supplied session (may be None).
        self._db_session = db_session

        # Provider slots; nothing is populated at construction time.
        self._providers_initialized = False
        self._exa_provider = self._tavily_provider = self._google_provider = None

        # A provider counts as available when its API-key env var is non-empty.
        exa_key = os.getenv("EXA_API_KEY")
        tavily_key = os.getenv("TAVILY_API_KEY")
        self.exa_available = bool(exa_key)
        self.tavily_available = bool(tavily_key)

        logger.info(f"ResearchEngine initialized: exa={self.exa_available}, tavily={self.tavily_available}")

    def _get_research_persona(self, user_id: str, generate_if_missing: bool = True) -> Optional[ResearchPersona]:
        """
        Fetch research persona for user, generating if missing.

        Phase 2: Since onboarding is mandatory and always completes before accessing
        any tool, we can safely generate research persona on first use. This ensures
        hyper-personalization without requiring "General" fallbacks.

        Args:
            user_id: User ID (Clerk string)
            generate_if_missing: If True, generate persona if not cached (default: True)

        Returns:
            ResearchPersona if successful, None only if user has no core persona
        """
        if not user_id:
            return None

        try:
            from services.research.research_persona_service import ResearchPersonaService

            db = self._db_session
            if not db:
                from services.database import get_db_session
                db = get_db_session()

            persona_service = ResearchPersonaService(db_session=db)

            if generate_if_missing:
                # Phase 2: Use get_or_generate() to create persona on first visit
                # This triggers LLM call if not cached, but onboarding guarantees
                # core persona exists, so generation will succeed
                logger.info(f"🔄 Getting/generating research persona for user {user_id}...")
                persona = persona_service.get_or_generate(user_id, force_refresh=False)

                if persona:
                    logger.info(f"✅ Research persona ready for user {user_id}: industry={persona.default_industry}")
                else:
                    logger.warning(f"⚠️ Could not get/generate research persona for user {user_id} - using core persona fallback")
            else:
                # Fast path: only return cached (for config endpoints)
                persona = persona_service.get_cached_only(user_id)
                if persona:
                    logger.debug(f"Research persona loaded from cache for user {user_id}")

            return persona

        except Exception as e:
            logger.warning(f"Failed to load research persona for user {user_id}: {e}")
            return None

    def _enrich_context_with_persona(
        self,
        context: ResearchContext,
        persona: ResearchPersona
    ) -> ResearchContext:
        """
        Enrich the research context with persona data.

        Only applies persona defaults if the context doesn't already have values.
        User-provided values always take precedence.
        """
        # Create personalization context if not exists
        if not context.personalization:
            context.personalization = ResearchPersonalizationContext()

        # Apply persona defaults only if not already set
        if not context.personalization.industry or context.personalization.industry == "General":
            if persona.default_industry:
                context.personalization.industry = persona.default_industry
                logger.debug(f"Applied persona industry: {persona.default_industry}")

        if not context.personalization.target_audience or context.personalization.target_audience == "General":
            if persona.default_target_audience:
                context.personalization.target_audience = persona.default_target_audience
                logger.debug(f"Applied persona target_audience: {persona.default_target_audience}")

        # Apply suggested Exa domains if not already set
        if not context.include_domains and persona.suggested_exa_domains:
            context.include_domains = persona.suggested_exa_domains[:6]  # Limit to 6 domains
            logger.debug(f"Applied persona domains: {context.include_domains}")

        # Apply suggested Exa category if not already set
        if not context.exa_category and persona.suggested_exa_category:
            context.exa_category = persona.suggested_exa_category
            logger.debug(f"Applied persona exa_category: {persona.suggested_exa_category}")

        return context

    async def research(
        self,
        context: ResearchContext,
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> ResearchResult:
        """
        Run the full research pipeline for a context and return a unified result.

        Pipeline: load/apply the user's research persona (when the context has a
        user id), let the optimizer pick a provider and config, build the
        provider request, run the provider, and convert the response.

        Args:
            context: Research context with query, goals, and personalization.
            progress_callback: Optional callable receiving human-readable
                progress messages.

        Returns:
            ResearchResult built from the provider response. Any exception raised
            along the way is logged and converted into a failed ResearchResult
            with error_code "RESEARCH_FAILED" and retry_suggested=True.
        """
        started_at = time.time()

        try:
            self._progress(progress_callback, "🔍 Analyzing research query...")

            # Persona enrichment only applies when the context identifies a user.
            persona_owner = context.get_user_id()
            if persona_owner:
                self._progress(progress_callback, "👤 Loading personalized research profile...")
                profile = self._get_research_persona(persona_owner, generate_if_missing=True)
                if not profile:
                    logger.warning(f"No research persona available for user {persona_owner} - proceeding with provided context")
                else:
                    self._progress(progress_callback, "✨ Applying hyper-personalized settings...")
                    context = self._enrich_context_with_persona(context, profile)

            # Provider and config are chosen from the (possibly enriched) context.
            provider, config = self.optimizer.optimize(context)
            self._progress(progress_callback, f"🤖 Selected {provider.value.upper()} for research")

            # Reuse the existing blog request model as the provider input.
            request = self._build_request(context, config)
            effective_user_id = context.get_user_id() or ""

            self._progress(progress_callback, f"🌐 Connecting to {provider.value} search...")

            # Anything that is neither Exa nor Tavily is handled by Google.
            if provider == ResearchProvider.EXA:
                run_provider = self._execute_exa_research
            elif provider == ResearchProvider.TAVILY:
                run_provider = self._execute_tavily_research
            else:
                run_provider = self._execute_google_research
            response = await run_provider(request, config, effective_user_id, progress_callback)

            self._progress(progress_callback, "📊 Processing results...")
            result = self._transform_response(response, provider, context)

            duration_ms = (time.time() - started_at) * 1000
            logger.info(f"Research completed in {duration_ms:.0f}ms: {len(result.sources)} sources")
            self._progress(progress_callback, f"✅ Research complete: {len(result.sources)} sources found")

            return result

        except Exception as e:
            # Failures are reported through the result object instead of raising.
            logger.error(f"Research failed: {e}")
            failure = ResearchResult(
                success=False,
                error_message=str(e),
                error_code="RESEARCH_FAILED",
                retry_suggested=True,
                original_query=context.query
            )
            return failure

    def _progress(self, callback: Optional[Callable[[str], None]], message: str):
        """
        Report a progress message.

        The message is passed to ``callback`` when one is supplied, and is
        always written to the log afterwards with a "[Research]" prefix.
        """
        log_line = f"[Research] {message}"
        if callback:
            callback(message)
        logger.info(log_line)

    def _build_request(self, context: ResearchContext, config: ResearchConfig) -> BlogResearchRequest:
        """
        Translate a ResearchContext into the BlogResearchRequest used by providers.

        Keywords fall back to the raw query when the context has none. Persona
        info, tone and word-count target are taken from the context's
        personalization when present; without personalization the persona and
        tone are None and the word-count target is 1500.
        """
        # The query itself serves as the single keyword when none were supplied.
        keywords = context.keywords or [context.query]

        personalization = context.personalization
        if personalization:
            persona = PersonaInfo(
                persona_id=personalization.persona_id,
                tone=personalization.tone,
                audience=personalization.target_audience,
                industry=personalization.industry,
            )
            tone = personalization.tone
            word_count_target = personalization.word_count_target
        else:
            persona = None
            tone = None
            word_count_target = 1500

        request = BlogResearchRequest(
            keywords=keywords,
            topic=context.query,
            industry=context.get_industry(),
            target_audience=context.get_audience(),
            tone=tone,
            word_count_target=word_count_target,
            persona=persona,
            research_mode=config.mode,
            config=config,
        )
        return request

    async def _execute_exa_research(
        self,
        request: BlogResearchRequest,
        config: ResearchConfig,
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> BlogResearchResponse:
        """
        Execute research using the Exa provider.

        Builds the research prompt from the mode's strategy, runs a best-effort
        subscription preflight check, performs the Exa search, records usage and
        hands the raw result to the shared analysis step. When Exa reports that
        its API key is not configured, the request falls back to Tavily.

        Args:
            request: Provider request built from the research context.
            config: Research configuration; ``config.provider`` is switched to
                Tavily when the fallback is taken.
            user_id: ID of the requesting user ("" when unknown).
            progress_callback: Optional callable receiving progress messages.

        Returns:
            The analysed and filtered BlogResearchResponse.

        Raises:
            HTTPException: 429 when the preflight check reports that the user's
                usage limit does not allow the call.
            RuntimeError: Re-raised when it is not the "EXA_API_KEY not
                configured" case.
        """
        # HTTPException is not imported at module level in this file; without this
        # import both the ``raise`` and the ``except HTTPException`` below fail
        # with NameError.
        # NOTE(review): assumes FastAPI's HTTPException (status_code/detail) is the
        # intended class — confirm.
        from fastapi import HTTPException
        from services.blog_writer.research.exa_provider import ExaResearchProvider
        from services.blog_writer.research.research_strategies import get_strategy_for_mode

        self._progress(progress_callback, "🔍 Executing Exa neural search...")

        # Get strategy for building prompt
        strategy = get_strategy_for_mode(config.mode)
        topic = request.topic or ", ".join(request.keywords)
        industry = request.industry or "General"
        target_audience = request.target_audience or "General"

        research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)

        # Preflight subscription check. Only an explicit "limit reached" answer
        # blocks the request; any other failure of the check itself is logged and
        # the search proceeds.
        preflight_db = None
        owns_session = False  # True only for a session opened by this call
        try:
            preflight_db = self._db_session
            if not preflight_db:
                from services.database import get_db_session
                preflight_db = get_db_session()
                owns_session = True
            if preflight_db:
                from services.subscription import PricingService
                from models.subscription_models import APIProvider
                pricing_service = PricingService(preflight_db)
                can_proceed, message, usage_info = pricing_service.check_usage_limits(
                    user_id=user_id,
                    provider=APIProvider.EXA,
                    tokens_requested=0,
                    actual_provider_name="exa",
                )
                if not can_proceed:
                    raise HTTPException(status_code=429, detail={
                        'error': message, 'message': message,
                        'provider': 'exa', 'usage_info': usage_info or {}
                    })
                logger.info(f"[ResearchEngine] Exa preflight check passed for user {user_id}")
        except HTTPException:
            raise
        except Exception as e:
            logger.warning(f"[ResearchEngine] Exa preflight check failed: {e}")
        finally:
            # Release a session opened here; an injected session belongs to the caller.
            if owns_session and preflight_db is not None:
                try:
                    preflight_db.close()
                except Exception as close_error:
                    logger.debug(f"[ResearchEngine] Failed to close Exa preflight DB session: {close_error}")

        # Execute Exa search
        try:
            exa_provider = ExaResearchProvider()
            raw_result = await exa_provider.search(
                research_prompt, topic, industry, target_audience, config, user_id
            )

            # Track usage; 0.005 is the fallback cost when the provider reports none.
            cost_info = raw_result.get('cost')
            cost = cost_info.get('total', 0.005) if isinstance(cost_info, dict) else 0.005
            exa_provider.track_exa_usage(user_id, cost)

            # ``or []`` also covers a present-but-None "sources" value.
            source_count = len(raw_result.get('sources') or [])
            self._progress(progress_callback, f"📝 Found {source_count} sources")

            # Run common analysis
            return await self._run_analysis(request, raw_result, config, user_id, progress_callback)

        except RuntimeError as e:
            if "EXA_API_KEY not configured" in str(e):
                logger.warning("Exa not configured, falling back to Tavily")
                self._progress(progress_callback, "⚠️ Exa unavailable, trying Tavily...")
                config.provider = ResearchProvider.TAVILY
                return await self._execute_tavily_research(request, config, user_id, progress_callback)
            raise

    async def _execute_tavily_research(
        self,
        request: BlogResearchRequest,
        config: ResearchConfig,
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> BlogResearchResponse:
        """
        Execute research using the Tavily provider.

        Builds the research prompt from the mode's strategy, runs a best-effort
        subscription preflight check, performs the Tavily search, records usage
        and hands the raw result to the shared analysis step. When Tavily reports
        that its API key is not configured, the request falls back to Google.

        Args:
            request: Provider request built from the research context.
            config: Research configuration; ``config.provider`` is switched to
                Google when the fallback is taken.
            user_id: ID of the requesting user ("" when unknown).
            progress_callback: Optional callable receiving progress messages.

        Returns:
            The analysed and filtered BlogResearchResponse.

        Raises:
            HTTPException: 429 when the preflight check reports that the user's
                usage limit does not allow the call.
            RuntimeError: Re-raised when it is not the "TAVILY_API_KEY not
                configured" case.
        """
        # HTTPException is not imported at module level in this file; without this
        # import both the ``raise`` and the ``except HTTPException`` below fail
        # with NameError.
        # NOTE(review): assumes FastAPI's HTTPException (status_code/detail) is the
        # intended class — confirm.
        from fastapi import HTTPException
        from services.blog_writer.research.tavily_provider import TavilyResearchProvider
        from services.blog_writer.research.research_strategies import get_strategy_for_mode

        self._progress(progress_callback, "🔍 Executing Tavily AI search...")

        # Get strategy for building prompt
        strategy = get_strategy_for_mode(config.mode)
        topic = request.topic or ", ".join(request.keywords)
        industry = request.industry or "General"
        target_audience = request.target_audience or "General"

        research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)

        # Preflight subscription check. Only an explicit "limit reached" answer
        # blocks the request; any other failure of the check itself is logged and
        # the search proceeds.
        preflight_db = None
        owns_session = False  # True only for a session opened by this call
        try:
            preflight_db = self._db_session
            if not preflight_db:
                from services.database import get_db_session
                preflight_db = get_db_session()
                owns_session = True
            if preflight_db:
                from services.subscription import PricingService
                from models.subscription_models import APIProvider
                pricing_service = PricingService(preflight_db)
                can_proceed, message, usage_info = pricing_service.check_usage_limits(
                    user_id=user_id,
                    provider=APIProvider.TAVILY,
                    tokens_requested=0,
                    actual_provider_name="tavily",
                )
                if not can_proceed:
                    raise HTTPException(status_code=429, detail={
                        'error': message, 'message': message,
                        'provider': 'tavily', 'usage_info': usage_info or {}
                    })
                logger.info(f"[ResearchEngine] Tavily preflight check passed for user {user_id}")
        except HTTPException:
            raise
        except Exception as e:
            logger.warning(f"[ResearchEngine] Tavily preflight check failed: {e}")
        finally:
            # Release a session opened here; an injected session belongs to the caller.
            if owns_session and preflight_db is not None:
                try:
                    preflight_db.close()
                except Exception as close_error:
                    logger.debug(f"[ResearchEngine] Failed to close Tavily preflight DB session: {close_error}")

        # Execute Tavily search
        try:
            tavily_provider = TavilyResearchProvider()
            raw_result = await tavily_provider.search(
                research_prompt, topic, industry, target_audience, config, user_id
            )

            # Track usage; 0.001 is the fallback cost when the provider reports none.
            cost_info = raw_result.get('cost')
            cost = cost_info.get('total', 0.001) if isinstance(cost_info, dict) else 0.001
            search_depth = config.tavily_search_depth or "basic"
            tavily_provider.track_tavily_usage(user_id, cost, search_depth)

            # ``or []`` also covers a present-but-None "sources" value.
            source_count = len(raw_result.get('sources') or [])
            self._progress(progress_callback, f"📝 Found {source_count} sources")

            # Run common analysis
            return await self._run_analysis(request, raw_result, config, user_id, progress_callback)

        except RuntimeError as e:
            if "TAVILY_API_KEY not configured" in str(e):
                logger.warning("Tavily not configured, falling back to Google")
                self._progress(progress_callback, "⚠️ Tavily unavailable, using Google Search...")
                config.provider = ResearchProvider.GOOGLE
                return await self._execute_google_research(request, config, user_id, progress_callback)
            raise

    async def _execute_google_research(
        self,
        request: BlogResearchRequest,
        config: ResearchConfig,
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> BlogResearchResponse:
        """
        Run research through the Google provider and analyse the grounded result.

        The research prompt is built by the strategy for ``config.mode``; missing
        topic, industry or audience fall back to the joined keywords and
        "General" respectively. The raw provider result is passed to the shared
        analysis step with ``is_google=True``.
        """
        from services.blog_writer.research.google_provider import GoogleResearchProvider
        from services.blog_writer.research.research_strategies import get_strategy_for_mode

        self._progress(progress_callback, "🔍 Executing Google Search grounding...")

        # Resolve the prompt strategy, then the prompt inputs with their fallbacks.
        prompt_builder = get_strategy_for_mode(config.mode)
        subject = request.topic if request.topic else ", ".join(request.keywords)
        sector = request.industry if request.industry else "General"
        audience = request.target_audience if request.target_audience else "General"

        prompt = prompt_builder.build_research_prompt(subject, sector, audience, config)

        # Perform the grounded search.
        searcher = GoogleResearchProvider()
        grounded = await searcher.search(prompt, subject, sector, audience, config, user_id)

        self._progress(progress_callback, "📝 Processing grounded results...")

        # Shared analysis; the flag selects the grounding-aware extraction path.
        analysed = await self._run_analysis(
            request, grounded, config, user_id, progress_callback, is_google=True
        )
        return analysed

    async def _run_analysis(
        self,
        request: BlogResearchRequest,
        raw_result: Dict[str, Any],
        config: ResearchConfig,
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None,
        is_google: bool = False
    ) -> BlogResearchResponse:
        """
        Run the analysis steps shared by all providers on a raw search result.

        Extracts content, sources and search queries from ``raw_result``, runs the
        keyword, competitor and content-angle analyzers on the content, assembles
        a BlogResearchResponse and passes it through the research data filter.

        Args:
            request: The provider request (supplies keywords, topic, industry).
            raw_result: Raw provider output; read keys are "content", "sources",
                "search_queries" (plus grounding data on the Google path).
            config: Research configuration. Accepted for signature parity with
                the provider methods; not read in this method.
            user_id: ID of the requesting user, forwarded to the analyzers.
            progress_callback: Optional callable receiving progress messages.
            is_google: When True, sources and grounding metadata are extracted
                with the grounding helpers; otherwise source dicts are converted
                to ResearchSource and grounding metadata is None.

        Returns:
            The filtered BlogResearchResponse.
        """
        from services.blog_writer.research.keyword_analyzer import KeywordAnalyzer
        from services.blog_writer.research.competitor_analyzer import CompetitorAnalyzer
        from services.blog_writer.research.content_angle_generator import ContentAngleGenerator
        from services.blog_writer.research.data_filter import ResearchDataFilter

        self._progress(progress_callback, "🔍 Analyzing keywords and content angles...")

        # ``or`` (rather than a .get default) also normalises keys that are
        # present with a None value, for both provider paths alike.
        content = raw_result.get("content") or ""
        search_queries = raw_result.get("search_queries") or []

        if is_google:
            sources = self._extract_sources_from_grounding(raw_result)
            grounding_metadata = self._extract_grounding_metadata(raw_result)
        else:
            # Dict entries become ResearchSource objects; anything else is kept as-is.
            sources = [
                ResearchSource(**s) if isinstance(s, dict) else s
                for s in (raw_result.get("sources") or [])
            ]
            grounding_metadata = None

        topic = request.topic or ", ".join(request.keywords)
        industry = request.industry or "General"

        # Run analyzers
        keyword_analyzer = KeywordAnalyzer()
        competitor_analyzer = CompetitorAnalyzer()
        content_angle_generator = ContentAngleGenerator()
        data_filter = ResearchDataFilter()

        keyword_analysis = keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
        competitor_analysis = competitor_analyzer.analyze(content, user_id=user_id)
        suggested_angles = content_angle_generator.generate(content, topic, industry, user_id=user_id)

        # Build response
        response = BlogResearchResponse(
            success=True,
            sources=sources,
            keyword_analysis=keyword_analysis,
            competitor_analysis=competitor_analysis,
            suggested_angles=suggested_angles,
            search_widget="",
            search_queries=search_queries,
            grounding_metadata=grounding_metadata,
            original_keywords=request.keywords,
        )

        # Filter and clean research data
        self._progress(progress_callback, "✨ Filtering and optimizing results...")
        filtered_response = data_filter.filter_research_data(response)

        return filtered_response

    def _extract_sources_from_grounding(self, gemini_result: Dict[str, Any]) -> list:
        """Extract sources from Gemini grounding metadata."""
        from models.blog_models import ResearchSource

        sources = []
        if not gemini_result or not isinstance(gemini_result, dict):
            return sources

        raw_sources = gemini_result.get("sources", []) or []

        for src in raw_sources:
            source = ResearchSource(
                title=src.get("title", "Untitled"),
                url=src.get("url", ""),
                excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
                credibility_score=float(src.get("credibility_score", 0.8)),
                published_at=str(src.get("publication_date", "2024-01-01")),
                index=src.get("index"),
                source_type=src.get("type", "web")
            )
            sources.append(source)

        return sources

    def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract grounding metadata from Gemini result."""
        if not gemini_result or not isinstance(gemini_result, dict):
            return None

        return gemini_result.get("grounding_metadata")

    def _transform_response(
        self,
        response: BlogResearchResponse,
        provider: ResearchProvider,
        context: ResearchContext
    ) -> ResearchResult:
        """
        Convert a provider BlogResearchResponse into the engine's ResearchResult.

        Sources and grounding metadata are converted to plain dicts where the
        objects expose a ``dict()`` method; the provider's value and the original
        query are recorded on the result.
        """

        def _source_as_dict(src):
            """Return a dict view of one source entry."""
            if hasattr(src, 'dict'):
                return src.dict()
            if isinstance(src, dict):
                return src
            # Unknown object: keep only the basic fields, defaulting to "".
            return {
                'title': getattr(src, 'title', ''),
                'url': getattr(src, 'url', ''),
                'excerpt': getattr(src, 'excerpt', ''),
            }

        sources = [_source_as_dict(entry) for entry in response.sources]

        # Grounding metadata: absent/empty -> None, model -> dict, else passed through.
        raw_grounding = response.grounding_metadata
        if not raw_grounding:
            grounding = None
        elif hasattr(raw_grounding, 'dict'):
            grounding = raw_grounding.dict()
        else:
            grounding = raw_grounding

        # These two fields may not exist on the response; use defaults when missing.
        error_code = getattr(response, 'error_code', None)
        retry_suggested = getattr(response, 'retry_suggested', False)

        return ResearchResult(
            success=response.success,
            sources=sources,
            keyword_analysis=response.keyword_analysis,
            competitor_analysis=response.competitor_analysis,
            suggested_angles=response.suggested_angles,
            provider_used=provider.value,
            search_queries=response.search_queries,
            grounding_metadata=grounding,
            original_query=context.query,
            error_message=response.error_message,
            error_code=error_code,
            retry_suggested=retry_suggested,
        )

    def get_provider_status(self) -> Dict[str, Any]:
        """Get status of available providers."""
        return {
            "exa": {
                "available": self.exa_available,
                "priority": 1,
                "description": "Neural search for semantic understanding"
            },
            "tavily": {
                "available": self.tavily_available,
                "priority": 2,
                "description": "AI-powered web search"
            },
            "google": {
                "available": True,  # Always available via Gemini
                "priority": 3,
                "description": "Google Search grounding"
            }
        }