# ALwrity/backend/services/research/research_persona_service.py

"""
Research Persona Service

Handles generation, caching, and retrieval of AI-powered research personas.
"""

from typing import Dict, Any, Optional
from datetime import datetime, timedelta
from loguru import logger
from fastapi import HTTPException

from services.database import get_db_session
from models.onboarding import PersonaData, OnboardingSession
from models.research_persona_models import ResearchPersona
from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
from services.llm_providers.main_text_generation import llm_text_gen
from services.onboarding.database_service import OnboardingDatabaseService
from services.persona_data_service import PersonaDataService


class ResearchPersonaService:
    """Service for generating and managing research personas."""

    CACHE_TTL_DAYS = 7  # 7-day cache TTL

    def __init__(self, db_session=None):
        """
        Wire up the database session and the collaborating services.

        Args:
            db_session: Optional existing database session. When it is
                omitted (or falsy), a session is obtained from
                get_db_session() instead.
        """
        session = db_session if db_session else get_db_session()

        # The same session object is shared with every collaborator below.
        self.db = session
        self.prompt_builder = ResearchPersonaPromptBuilder()
        self.onboarding_service = OnboardingDatabaseService(db=session)
        self.persona_data_service = PersonaDataService(db_session=session)

    def get_cached_only(
        self,
        user_id: str
    ) -> Optional[ResearchPersona]:
        """
        Return the user's research persona strictly from cache.

        This method never triggers generation; it only hands back a persona
        that is already stored and still within the cache TTL. Intended for
        config endpoints so that they avoid triggering rate limit checks.

        Args:
            user_id: User ID (Clerk string)

        Returns:
            ResearchPersona when a cached, unexpired and parseable persona
            exists; None in every other case (including internal errors,
            which are logged rather than raised).
        """
        try:
            record = self._get_persona_data_record(user_id)
            if not record:
                logger.debug(f"No persona data found for user {user_id}")
                return None

            # TTL is checked first, then the presence of the stored payload.
            cache_usable = self.is_cache_valid(record) and record.research_persona
            if not cache_usable:
                # Expired or missing - report a miss, never generate here.
                logger.debug(f"No valid cached research persona for user {user_id}")
                return None

            try:
                logger.debug(f"Returning cached research persona for user {user_id}")
                return ResearchPersona(**record.research_persona)
            except Exception as parse_error:
                # Stored payload no longer fits the model; treat as a miss.
                logger.warning(f"Failed to parse cached research persona: {parse_error}")
                return None

        except Exception as err:
            logger.error(f"Error getting cached research persona for user {user_id}: {err}")
            return None

    def get_or_generate(
        self,
        user_id: str,
        force_refresh: bool = False
    ) -> Optional[ResearchPersona]:
        """
        Return the user's research persona, generating it when needed.

        A cached persona is returned when it is present, within the TTL and
        parseable, unless force_refresh is set. Otherwise a new persona is
        generated and a save to the database is attempted; the generated
        persona is returned even if the save fails.

        Args:
            user_id: User ID (Clerk string)
            force_refresh: If True, regenerate even if cache is valid

        Returns:
            ResearchPersona if successful, None otherwise

        Raises:
            HTTPException: Propagated unchanged from generation (e.g. 429
                subscription limit) so it reaches the API layer.
        """
        try:
            record = self._get_persona_data_record(user_id)
            if not record:
                logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
                return None

            # Decide whether the cache can be served; every branch that does
            # not return falls through to regeneration below.
            if force_refresh:
                logger.info(f"Forcing refresh of research persona for user {user_id}")
            elif not self.is_cache_valid(record):
                logger.info(f"Cache expired for user {user_id}, regenerating...")
            elif not record.research_persona:
                logger.info(f"Research persona missing for user {user_id}, generating...")
            else:
                logger.info(f"Using cached research persona for user {user_id}")
                try:
                    return ResearchPersona(**record.research_persona)
                except Exception as parse_error:
                    logger.warning(f"Failed to parse cached research persona: {parse_error}, regenerating...")

            # HTTPExceptions raised here are re-raised by the handler below.
            generated = self.generate_research_persona(user_id)

            if not generated:
                # Logged loudly because the LLM call may already have been spent.
                logger.error(
                    f"❌ Failed to generate research persona for user {user_id} - "
                    f"This is an expensive failure (API call consumed). Check logs above for details."
                )
                return None

            if self.save_research_persona(user_id, generated):
                logger.info(f"✅ Research persona generated and saved for user {user_id}")
            else:
                logger.warning(f"Failed to save research persona for user {user_id}")

            return generated

        except HTTPException:
            # Let API-level errors (e.g., 429 subscription limit) propagate.
            raise
        except Exception as err:
            logger.error(f"Error getting/generating research persona for user {user_id}: {err}")
            return None

    def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
        """
        Generate a new research persona for the user via the LLM.

        Collects onboarding data, builds the prompt and JSON schema, calls
        llm_text_gen, then normalizes the response (dict, plain JSON string,
        or markdown-fenced JSON string) and validates it as a ResearchPersona.
        A "generated_at" UTC ISO timestamp is added before validation.

        Args:
            user_id: User ID (Clerk string)

        Returns:
            ResearchPersona if successful. None when onboarding data is
            insufficient, the LLM response is empty, of an unexpected type,
            not parseable as JSON, not a JSON object, or fails validation.

        Raises:
            HTTPException: Propagated unchanged from the LLM call; a
                RuntimeError from the LLM call is converted to a 429.
        """
        import json

        try:
            logger.info(f"Generating research persona for user {user_id}")

            onboarding_data = self._collect_onboarding_data(user_id)
            if not onboarding_data:
                logger.warning(f"Insufficient onboarding data for user {user_id}")
                return None

            prompt = self.prompt_builder.build_research_persona_prompt(onboarding_data)
            json_schema = self.prompt_builder.get_json_schema()

            logger.info(f"Calling LLM for research persona generation (user: {user_id})")
            try:
                llm_response = llm_text_gen(
                    prompt=prompt,
                    json_struct=json_schema,
                    user_id=user_id
                )
            except HTTPException:
                # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
                logger.warning(f"HTTPException during LLM call for user {user_id} - re-raising")
                raise
            except RuntimeError as e:
                # RuntimeError (subscription limits) is surfaced as a 429;
                # chain the cause so the original traceback is not lost.
                logger.warning(f"RuntimeError during LLM call for user {user_id}: {e}")
                raise HTTPException(status_code=429, detail=str(e)) from e

            if not llm_response:
                logger.error("Empty response from LLM")
                return None

            # When json_struct is provided, llm_text_gen may return a dict directly.
            if isinstance(llm_response, dict):
                # Shallow copy so the object owned by the LLM layer is not
                # mutated when the timestamp is added below.
                persona_dict = dict(llm_response)
            elif isinstance(llm_response, str):
                # Strip an optional markdown code fence around the JSON.
                cleaned = llm_response.strip()
                if cleaned.startswith("```json"):
                    cleaned = cleaned[7:]
                if cleaned.startswith("```"):
                    cleaned = cleaned[3:]
                if cleaned.endswith("```"):
                    cleaned = cleaned[:-3]
                cleaned = cleaned.strip()

                try:
                    persona_dict = json.loads(cleaned)
                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse LLM response as JSON: {e}")
                    logger.debug(f"Response text: {cleaned[:500]}")
                    return None
            else:
                logger.error(f"Unexpected response type from LLM: {type(llm_response)}")
                return None

            # Valid JSON can still be a list or scalar; only an object can be
            # expanded into ResearchPersona keyword arguments.
            if not isinstance(persona_dict, dict):
                logger.error(f"LLM response JSON is not an object: {type(persona_dict)}")
                logger.debug(f"Response sample: {str(persona_dict)[:500]}")
                return None

            # Naive UTC timestamp, stored as an ISO-8601 string.
            persona_dict["generated_at"] = datetime.utcnow().isoformat()

            try:
                research_persona = ResearchPersona(**persona_dict)
            except Exception as validation_error:
                # Log the dict structure to help debug schema mismatches.
                logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
                logger.debug(f"Persona dict keys: {list(persona_dict.keys())}")
                logger.debug(f"Persona dict sample: {str(persona_dict)[:500]}")
                return None

            logger.info(f"✅ Research persona generated successfully for user {user_id}")
            return research_persona

        except HTTPException:
            # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
            raise
        except Exception as e:
            logger.error(f"Error generating research persona for user {user_id}: {e}")
            return None

    def is_cache_valid(self, persona_data: PersonaData) -> bool:
        """
        Check if cached research persona is still valid (within TTL).

        Args:
            persona_data: PersonaData database record

        Returns:
            True if cache is valid, False otherwise
        """
        if not persona_data.research_persona_generated_at:
            return False

        # Check if within TTL
        cache_age = datetime.utcnow() - persona_data.research_persona_generated_at
        is_valid = cache_age < timedelta(days=self.CACHE_TTL_DAYS)

        if not is_valid:
            logger.debug(f"Cache expired (age: {cache_age.days} days, TTL: {self.CACHE_TTL_DAYS} days)")

        return is_valid

    def save_research_persona(
        self,
        user_id: str,
        research_persona: ResearchPersona
    ) -> bool:
        """
        Save research persona to database.

        Stores the persona as a dict on the user's PersonaData record,
        stamps research_persona_generated_at with the current naive UTC
        time, and commits. On any error the transaction is rolled back.

        Args:
            user_id: User ID (Clerk string)
            research_persona: ResearchPersona to save

        Returns:
            True if successful, False otherwise (no record found, or an
            error occurred while saving; errors are logged, not raised).
        """
        try:
            persona_data = self._get_persona_data_record(user_id)

            if not persona_data:
                logger.error(f"No persona data record found for user {user_id}")
                return False

            # Pydantic v2 renamed .dict() to .model_dump() and deprecated the
            # old name; prefer the new API when the model provides it.
            dump = getattr(research_persona, "model_dump", None)
            persona_dict = dump() if callable(dump) else research_persona.dict()

            # Update database record
            persona_data.research_persona = persona_dict
            persona_data.research_persona_generated_at = datetime.utcnow()

            self.db.commit()

            logger.info(f"✅ Research persona saved for user {user_id}")
            return True

        except Exception as e:
            logger.error(f"Error saving research persona for user {user_id}: {e}")
            try:
                self.db.rollback()
            except Exception as rollback_error:
                # Keep the documented True/False contract even if the
                # rollback itself fails.
                logger.error(f"Rollback failed after save error for user {user_id}: {rollback_error}")
            return False

    def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
        """
        Look up the PersonaData row linked to the user's onboarding session.

        Args:
            user_id: User ID matched against OnboardingSession.user_id

        Returns:
            The first PersonaData row whose session_id matches the user's
            first onboarding session, or None when no session or row is
            found, or when the lookup raises (the error is logged).
        """
        try:
            # Ask the onboarding service to ensure the research_persona
            # columns exist before any query touches them.
            self.onboarding_service._ensure_research_persona_columns(self.db)

            session_query = self.db.query(OnboardingSession).filter(
                OnboardingSession.user_id == user_id
            )
            onboarding_session = session_query.first()
            if not onboarding_session:
                return None

            persona_query = self.db.query(PersonaData).filter(
                PersonaData.session_id == onboarding_session.id
            )
            return persona_query.first()

        except Exception as err:
            logger.error(f"Error getting persona data record for user {user_id}: {err}")
            return None

    def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
        """
        Collect all onboarding data needed for research persona generation.

        Returns:
            Dictionary with website_analysis, persona_data, research_preferences, business_info
        """
        try:
            # Get website analysis
            website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}

            # Get persona data
            persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}

            # Get research preferences
            research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}

            # Get business info - construct from persona data and website analysis
            business_info = {}

            # Try to extract from persona data
            if persona_data_dict:
                core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
                if core_persona:
                    if core_persona.get('industry'):
                        business_info['industry'] = core_persona['industry']
                    if core_persona.get('target_audience'):
                        business_info['target_audience'] = core_persona['target_audience']

            # Fallback to website analysis if not in persona
            if not business_info.get('industry') and website_analysis:
                target_audience_data = website_analysis.get('target_audience', {})
                if isinstance(target_audience_data, dict):
                    industry_focus = target_audience_data.get('industry_focus')
                    if industry_focus:
                        business_info['industry'] = industry_focus
                    demographics = target_audience_data.get('demographics')
                    if demographics:
                        business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)

            # Check if we have enough data - be more lenient since we can infer from minimal data
            # We need at least some basic information to generate a meaningful persona
            has_basic_data = bool(
                website_analysis or
                persona_data_dict or
                research_prefs.get('content_types') or
                business_info.get('industry')
            )

            if not has_basic_data:
                logger.warning(f"Insufficient onboarding data for user {user_id} - no basic data found")
                return None

            # If we have minimal data, add intelligent defaults to help the AI
            if not business_info.get('industry'):
                # Try to infer industry from research preferences or content types
                content_types = research_prefs.get('content_types', [])
                if 'blog' in content_types or 'article' in content_types:
                    business_info['industry'] = 'Content Marketing'
                    business_info['inferred'] = True
                elif 'social_media' in content_types:
                    business_info['industry'] = 'Social Media Marketing'
                    business_info['inferred'] = True
                elif 'video' in content_types:
                    business_info['industry'] = 'Video Content Creation'
                    business_info['inferred'] = True

            if not business_info.get('target_audience'):
                # Default to professionals for content creators
                business_info['target_audience'] = 'Professionals and content consumers'
                business_info['inferred'] = True

            # Get competitor analysis data (if available)
            competitor_analysis = None
            try:
                competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
                if competitor_analysis:
                    logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
            except Exception as e:
                logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")

            return {
                "website_analysis": website_analysis,
                "persona_data": persona_data_dict,
                "research_preferences": research_prefs,
                "business_info": business_info,
                "competitor_analysis": competitor_analysis  # Add competitor data for better preset generation
            }

        except Exception as e:
            logger.error(f"Error collecting onboarding data for user {user_id}: {e}")
            return None