Files
ALwrity/backend/services/research/research_persona_service.py

479 lines
22 KiB
Python

"""
Research Persona Service
Handles generation, caching, and retrieval of AI-powered research personas.
"""
from datetime import datetime, timedelta, timezone
from typing import Dict, Any, Optional

from fastapi import HTTPException
from loguru import logger

from models.onboarding import PersonaData, OnboardingSession
from models.research_persona_models import ResearchPersona
from services.database import get_db_session
from services.llm_providers.main_text_generation import llm_text_gen
from services.onboarding.database_service import OnboardingDatabaseService
from services.persona_data_service import PersonaDataService

from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
class ResearchPersonaService:
    """Generate, cache and look up AI-powered research personas.

    The service shares one database session between itself and the
    onboarding / persona-data helper services it builds in ``__init__``.
    """

    # Number of days a stored persona counts as fresh (see ``is_cache_valid``).
    CACHE_TTL_DAYS = 7

    def __init__(self, db_session=None):
        """Set up the database session and collaborating services.

        Args:
            db_session: Session to reuse. When omitted (or falsy) a new one
                is obtained from ``get_db_session()``.
        """
        if not db_session:
            db_session = get_db_session()
        self.db = db_session
        self.prompt_builder = ResearchPersonaPromptBuilder()
        self.onboarding_service = OnboardingDatabaseService(db=self.db)
        self.persona_data_service = PersonaDataService(db_session=self.db)
def get_cached_only(
    self,
    user_id: str
) -> Optional[ResearchPersona]:
    """Return the stored research persona for a user without ever generating one.

    The persona is returned even when its cache TTL has lapsed; cache
    validity is only computed for logging here and matters solely when
    deciding whether to regenerate. Intended for read-only/config
    endpoints that must not trigger generation (and its rate-limit checks).

    Args:
        user_id: User ID (Clerk string).

    Returns:
        The parsed ``ResearchPersona`` when a non-empty dict is stored and
        validates, otherwise ``None`` (no record, nothing stored, stored
        value has an unexpected type, or parsing failed). Never raises:
        every exception is logged and turned into ``None``.
    """
    try:
        persona_data = self._get_persona_data_record(user_id)
        if not persona_data:
            logger.debug(f"[get_cached_only] No persona data record found for user {user_id}")
            return None

        research_persona_raw = persona_data.research_persona
        # Only a non-empty dict is usable; None, {} and "" all mean "nothing stored".
        has_persona = isinstance(research_persona_raw, dict) and len(research_persona_raw) > 0

        logger.info(
            f"[get_cached_only] Checking research persona for user {user_id}: "
            f"persona_data exists=True, research_persona_raw={research_persona_raw is not None}, "
            f"research_persona type={type(research_persona_raw)}, "
            f"has_persona={has_persona}, "
            f"generated_at={persona_data.research_persona_generated_at}"
        )

        if not has_persona:
            if research_persona_raw:
                # Something is stored, but not in the dict shape ResearchPersona needs.
                logger.error(f"[get_cached_only] research_persona_raw is not a dict: {type(research_persona_raw)}")
            else:
                logger.info(f"[get_cached_only] ⚠️ No research persona found in database for user {user_id}")
            return None

        try:
            cache_valid = self.is_cache_valid(persona_data)
            cache_status = "valid" if cache_valid else "expired"
            logger.info(
                f"[get_cached_only] ✅ Returning research persona for user {user_id} "
                f"(cache: {cache_status}, generated_at: {persona_data.research_persona_generated_at})"
            )
            parsed_persona = ResearchPersona(**research_persona_raw)
            logger.info(
                f"[get_cached_only] ✅ Successfully parsed persona for user {user_id}: "
                f"industry={parsed_persona.default_industry}, "
                f"target_audience={parsed_persona.default_target_audience}"
            )
            return parsed_persona
        except Exception as e:
            # loguru attaches the traceback via opt(exception=True). Passing
            # stdlib-style exc_info=True would instead run str.format() on the
            # rendered message, which breaks when the error text contains braces.
            logger.opt(exception=True).error(
                f"[get_cached_only] ❌ Failed to parse research persona for user {user_id}: {e}"
            )
            logger.debug(
                f"[get_cached_only] Persona data details: "
                f"type={type(research_persona_raw)}, "
                f"is_dict={isinstance(research_persona_raw, dict)}, "
                f"value sample: {str(research_persona_raw)[:500] if research_persona_raw else 'None'}"
            )
            return None
    except Exception as e:
        logger.opt(exception=True).error(
            f"[get_cached_only] ❌ Error getting research persona for user {user_id}: {e}"
        )
        return None
def get_or_generate(
    self,
    user_id: str,
    force_refresh: bool = False
) -> Optional[ResearchPersona]:
    """Return the user's research persona, generating one only when required.

    A stored persona is returned as-is even when its cache TTL has lapsed.
    A new persona is generated (and saved) only when:

    1. no persona is stored, or
    2. ``force_refresh`` is True, or
    3. the stored persona fails to parse.

    Args:
        user_id: User ID (Clerk string).
        force_refresh: If True, regenerate even if a persona is stored.

    Returns:
        The stored or freshly generated ``ResearchPersona``. ``None`` when
        the user has no persona data record, generation fails, or an
        unexpected error occurs. A freshly generated persona is returned
        even if saving it failed (the failure is logged).

    Raises:
        HTTPException: Propagated unchanged (e.g. 429 subscription limit)
            so the API layer can respond with it.
    """
    try:
        persona_data = self._get_persona_data_record(user_id)
        if not persona_data:
            logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
            return None

        stored_persona = persona_data.research_persona
        if not stored_persona:
            logger.info(f"Research persona missing for user {user_id}, generating...")
        else:
            cache_valid = self.is_cache_valid(persona_data)
            if force_refresh:
                logger.info(f"Forcing refresh of research persona for user {user_id}")
            else:
                # Valid and expired caches are both served; only the log wording differs.
                if cache_valid:
                    logger.info(f"Using cached research persona for user {user_id}")
                    parse_failure = "Failed to parse cached research persona"
                else:
                    logger.info(f"Research persona exists for user {user_id} but cache expired - returning existing persona (use force_refresh=true to regenerate)")
                    parse_failure = "Failed to parse existing research persona"
                try:
                    return ResearchPersona(**stored_persona)
                except Exception as e:
                    # Unparseable stored data: fall through and regenerate.
                    logger.warning(f"{parse_failure}: {e}, regenerating...")

        logger.info(f"Generating research persona for user {user_id}")
        research_persona = self.generate_research_persona(user_id)
        if not research_persona:
            logger.error(
                f"❌ Failed to generate research persona for user {user_id} - "
                f"This is an expensive failure (API call consumed). Check logs above for details."
            )
            return None

        if self.save_research_persona(user_id, research_persona):
            logger.info(f"✅ Research persona generated and saved for user {user_id}")
        else:
            logger.warning(f"Failed to save research persona for user {user_id}")
        return research_persona
    except HTTPException:
        # Must be listed before the generic handler so it is not swallowed.
        raise
    except Exception as e:
        logger.error(f"Error getting/generating research persona for user {user_id}: {e}")
        return None
def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
    """Generate a new research persona for the user via the LLM.

    Collects onboarding data, builds the prompt and JSON schema, calls
    ``llm_text_gen`` and validates the response into a ``ResearchPersona``.
    The result is NOT saved here.

    Args:
        user_id: User ID (Clerk string).

    Returns:
        The validated ``ResearchPersona``. ``None`` when onboarding data is
        insufficient, the LLM response is empty, is not a JSON object, or
        fails validation.

    Raises:
        HTTPException: Re-raised from ``llm_text_gen``, or raised with
            status 429 when ``llm_text_gen`` raises ``RuntimeError``.
    """
    import json

    try:
        logger.info(f"Generating research persona for user {user_id}")

        onboarding_data = self._collect_onboarding_data(user_id)
        if not onboarding_data:
            logger.warning(f"Insufficient onboarding data for user {user_id}")
            return None

        prompt = self.prompt_builder.build_research_persona_prompt(onboarding_data)
        json_schema = self.prompt_builder.get_json_schema()

        logger.info(f"Calling LLM for research persona generation (user: {user_id})")
        try:
            response_text = llm_text_gen(
                prompt=prompt,
                json_struct=json_schema,
                user_id=user_id
            )
        except HTTPException:
            logger.warning(f"HTTPException during LLM call for user {user_id} - re-raising")
            raise
        except RuntimeError as e:
            # RuntimeError is reported to the API as 429; keep the cause chained.
            logger.warning(f"RuntimeError during LLM call for user {user_id}: {e}")
            raise HTTPException(status_code=429, detail=str(e)) from e

        if not response_text:
            logger.error("Empty response from LLM")
            return None

        try:
            if isinstance(response_text, dict):
                # With json_struct supplied the provider may return a parsed dict.
                persona_dict = response_text
            elif isinstance(response_text, str):
                # Strip an optional markdown code fence around the JSON.
                response_text = response_text.strip()
                if response_text.startswith("```json"):
                    response_text = response_text[7:]
                if response_text.startswith("```"):
                    response_text = response_text[3:]
                if response_text.endswith("```"):
                    response_text = response_text[:-3]
                response_text = response_text.strip()
                persona_dict = json.loads(response_text)
            else:
                logger.error(f"Unexpected response type from LLM: {type(response_text)}")
                return None

            # Valid JSON can still be a list or scalar; only an object can
            # carry persona fields.
            if not isinstance(persona_dict, dict):
                logger.error(f"LLM response JSON is not an object: {type(persona_dict)}")
                return None

            persona_dict["generated_at"] = datetime.utcnow().isoformat()

            try:
                research_persona = ResearchPersona(**persona_dict)
            except Exception as validation_error:
                logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
                logger.debug(f"Persona dict keys: {list(persona_dict.keys())}")
                logger.debug(f"Persona dict sample: {str(persona_dict)[:500]}")
                # Handled (logged, None returned) by the generic handler below.
                raise

            logger.info(f"✅ Research persona generated successfully for user {user_id}")
            return research_persona
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse LLM response as JSON: {e}")
            logger.debug(f"Response text: {response_text[:500] if isinstance(response_text, str) else str(response_text)[:500]}")
            return None
        except Exception as e:
            logger.error(f"Failed to create ResearchPersona from response: {e}")
            return None
    except HTTPException:
        # Let API-level errors (e.g. 429 subscription limit) reach the caller.
        raise
    except Exception as e:
        logger.error(f"Error generating research persona for user {user_id}: {e}")
        return None
def is_cache_valid(self, persona_data: PersonaData) -> bool:
    """Check whether the stored research persona is still within its TTL.

    Works for both naive and timezone-aware timestamps. Naive values are
    interpreted as UTC, matching what ``save_research_persona`` writes.

    Args:
        persona_data: Record exposing ``research_persona_generated_at``.

    Returns:
        True if a generation timestamp exists and is younger than
        ``CACHE_TTL_DAYS`` days, False otherwise.
    """
    generated_at = persona_data.research_persona_generated_at
    if not generated_at:
        return False

    # Naive and aware datetimes cannot be subtracted from each other
    # (TypeError), so make "now" match whatever the stored value is.
    now = datetime.now(timezone.utc)
    if generated_at.utcoffset() is None:
        now = now.replace(tzinfo=None)

    cache_age = now - generated_at
    is_valid = cache_age < timedelta(days=self.CACHE_TTL_DAYS)
    if not is_valid:
        logger.debug(f"Cache expired (age: {cache_age.days} days, TTL: {self.CACHE_TTL_DAYS} days)")
    return is_valid
def save_research_persona(
    self,
    user_id: str,
    research_persona: ResearchPersona
) -> bool:
    """Persist a research persona on the user's persona data record.

    Stores the persona as a plain dict together with the current UTC
    time as its generation timestamp, then commits.

    Args:
        user_id: User ID (Clerk string).
        research_persona: Persona to store.

    Returns:
        True once the commit succeeded. False when the user has no
        persona data record or when saving raised (the session is
        rolled back in that case).
    """
    try:
        record = self._get_persona_data_record(user_id)
        if record:
            # The column holds JSON, so store the model's dict form.
            record.research_persona = research_persona.dict()
            record.research_persona_generated_at = datetime.utcnow()
            self.db.commit()
            logger.info(f"✅ Research persona saved for user {user_id}")
            return True

        logger.error(f"No persona data record found for user {user_id}")
        return False
    except Exception as e:
        logger.error(f"Error saving research persona for user {user_id}: {e}")
        self.db.rollback()
        return False
def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
    """Look up the ``PersonaData`` row belonging to a user.

    The row is found through the user's onboarding session. Any error is
    logged and reported as ``None``.

    Args:
        user_id: User ID (Clerk string).

    Returns:
        The first matching ``PersonaData`` record, or ``None`` when the
        user has no onboarding session, no persona data, or the lookup
        failed.
    """
    try:
        # Make sure the research_persona columns exist before querying them.
        self.onboarding_service._ensure_research_persona_columns(self.db)

        onboarding_session = (
            self.db.query(OnboardingSession)
            .filter(OnboardingSession.user_id == user_id)
            .first()
        )
        if onboarding_session:
            return (
                self.db.query(PersonaData)
                .filter(PersonaData.session_id == onboarding_session.id)
                .first()
            )
        return None
    except Exception as e:
        logger.error(f"Error getting persona data record for user {user_id}: {e}")
        return None
def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
"""
Collect all onboarding data needed for research persona generation.
Returns:
Dictionary with website_analysis, persona_data, research_preferences, business_info
"""
try:
# Get website analysis
website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}
# Get persona data
persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}
# Get research preferences
research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}
# Get business info - construct from persona data and website analysis
business_info = {}
# Try to extract from persona data
if persona_data_dict:
core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
if core_persona:
if core_persona.get('industry'):
business_info['industry'] = core_persona['industry']
if core_persona.get('target_audience'):
business_info['target_audience'] = core_persona['target_audience']
# Fallback to website analysis if not in persona
if not business_info.get('industry') and website_analysis:
target_audience_data = website_analysis.get('target_audience', {})
if isinstance(target_audience_data, dict):
industry_focus = target_audience_data.get('industry_focus')
if industry_focus:
business_info['industry'] = industry_focus
demographics = target_audience_data.get('demographics')
if demographics:
business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)
# Check if we have enough data - be more lenient since we can infer from minimal data
# We need at least some basic information to generate a meaningful persona
has_basic_data = bool(
website_analysis or
persona_data_dict or
research_prefs.get('content_types') or
business_info.get('industry')
)
if not has_basic_data:
logger.warning(f"Insufficient onboarding data for user {user_id} - no basic data found")
return None
# If we have minimal data, add intelligent defaults to help the AI
if not business_info.get('industry'):
# Try to infer industry from research preferences or content types
content_types = research_prefs.get('content_types', [])
if 'blog' in content_types or 'article' in content_types:
business_info['industry'] = 'Content Marketing'
business_info['inferred'] = True
elif 'social_media' in content_types:
business_info['industry'] = 'Social Media Marketing'
business_info['inferred'] = True
elif 'video' in content_types:
business_info['industry'] = 'Video Content Creation'
business_info['inferred'] = True
if not business_info.get('target_audience'):
# Default to professionals for content creators
business_info['target_audience'] = 'Professionals and content consumers'
business_info['inferred'] = True
# Get competitor analysis data (if available)
competitor_analysis = None
try:
competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
if competitor_analysis:
logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
except Exception as e:
logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")
return {
"website_analysis": website_analysis,
"persona_data": persona_data_dict,
"research_preferences": research_prefs,
"business_info": business_info,
"competitor_analysis": competitor_analysis # Add competitor data for better preset generation
}
except Exception as e:
logger.error(f"Error collecting onboarding data for user {user_id}: {e}")
return None