Scheduled research persona generation

This commit is contained in:
ajaysi
2025-11-05 08:51:00 +05:30
parent 55087c4f37
commit d99c7c83a7
98 changed files with 14518 additions and 828 deletions

View File

@@ -0,0 +1,171 @@
"""
Research Persona Prompt Builder
Handles building comprehensive prompts for research persona generation.
Generates personalized research defaults, suggestions, and configurations.
"""
from typing import Dict, Any
import json
from loguru import logger
class ResearchPersonaPromptBuilder:
    """Builds comprehensive prompts for research persona generation.

    The builder is stateless: it turns the onboarding data dictionary into a
    single prompt string and exposes the JSON schema that the LLM response is
    expected to follow.
    """

    @staticmethod
    def _to_pretty_json(value: Any) -> str:
        """Serialize ``value`` as indented JSON for embedding in the prompt.

        ``default=str`` is deliberate: the output is only read by the LLM, so
        values that are not JSON-native (for example ``datetime`` objects)
        are rendered as text instead of aborting prompt construction with a
        ``TypeError``. JSON-native input serializes exactly as before.
        """
        return json.dumps(value, indent=2, default=str)

    def build_research_persona_prompt(self, onboarding_data: Dict[str, Any]) -> str:
        """Build the research persona generation prompt with comprehensive data.

        Args:
            onboarding_data: Dictionary that may contain the keys
                ``website_analysis``, ``persona_data``,
                ``research_preferences`` and ``business_info``. Missing or
                falsy entries are treated as empty dictionaries.

        Returns:
            The full prompt text, including the serialized user context and
            the description of the expected JSON output.
        """
        onboarding_data = onboarding_data or {}

        # Extract data from onboarding_data
        website_analysis = onboarding_data.get("website_analysis", {}) or {}
        persona_data = onboarding_data.get("persona_data", {}) or {}
        research_prefs = onboarding_data.get("research_preferences", {}) or {}
        business_info = onboarding_data.get("business_info", {}) or {}

        # Extract core persona. Both spellings are accepted because the
        # persona service also reads the camelCase "corePersona" key when it
        # derives business info from the same payload.
        core_persona = (
            persona_data.get("core_persona")
            or persona_data.get("corePersona")
            or {}
        )

        # NOTE: the prompt text below is a runtime string; doubled braces are
        # f-string escapes for literal braces in the JSON examples.
        prompt = f"""
COMPREHENSIVE RESEARCH PERSONA GENERATION TASK: Create a highly detailed, personalized research persona based on the user's business, writing style, and content strategy. This persona will provide intelligent defaults and suggestions for research inputs.
=== USER CONTEXT ===
BUSINESS INFORMATION:
{self._to_pretty_json(business_info)}
WEBSITE ANALYSIS:
{self._to_pretty_json(website_analysis)}
CORE PERSONA:
{self._to_pretty_json(core_persona)}
RESEARCH PREFERENCES:
{self._to_pretty_json(research_prefs)}
=== RESEARCH PERSONA GENERATION REQUIREMENTS ===
Generate a comprehensive research persona in JSON format with the following structure:
1. DEFAULT VALUES:
- "default_industry": Extract from core_persona.industry, business_info.industry, or website_analysis target_audience. Use "General" only if none available.
- "default_target_audience": Extract from core_persona.target_audience, website_analysis.target_audience, or business_info.target_audience. Be specific and descriptive.
- "default_research_mode": Suggest "basic", "comprehensive", or "targeted" based on research_preferences.research_depth and content_type preferences.
- "default_provider": Suggest "google" for news/trends, "exa" for academic/technical deep-dives, or "google" as default.
2. KEYWORD INTELLIGENCE:
- "suggested_keywords": Generate 8-12 keywords relevant to the user's industry, interests (from core_persona), and content goals.
- "keyword_expansion_patterns": Create a dictionary mapping common keywords to expanded, industry-specific terms. Include 10-15 patterns like:
{{"AI": ["healthcare AI", "medical AI", "clinical AI", "diagnostic AI"], "tools": ["medical devices", "clinical tools"], ...}}
Focus on industry-specific terminology from the user's domain.
3. DOMAIN EXPERTISE:
- "suggested_exa_domains": List 4-6 authoritative domains for the user's industry (e.g., Healthcare: ["pubmed.gov", "nejm.org", "thelancet.com"]).
- "suggested_exa_category": Suggest appropriate Exa category based on industry:
- Healthcare/Science: "research paper"
- Finance: "financial report"
- Technology/Business: "company" or "news"
- Default: null (empty string for all categories)
4. RESEARCH ANGLES:
- "research_angles": Generate 5-8 alternative research angles/focuses based on:
- User's pain points and challenges (from core_persona)
- Industry trends and opportunities
- Content goals (from research_preferences)
- Audience interests (from core_persona.interests)
Examples: "Compare {{topic}} tools", "{{topic}} ROI analysis", "Latest {{topic}} trends", etc.
5. QUERY ENHANCEMENT:
- "query_enhancement_rules": Create templates for improving vague user queries:
{{"vague_ai": "Research: AI applications in {{industry}} for {{audience}}", "vague_tools": "Compare top {{industry}} tools", ...}}
Include 5-8 enhancement patterns.
6. RECOMMENDED PRESETS:
- "recommended_presets": Generate 3-5 personalized research preset templates. Each preset should include:
- name: Descriptive name (e.g., "{{Industry}} Trends", "{{Audience}} Insights")
- keywords: Research query string
- industry: User's industry
- target_audience: User's target audience
- research_mode: "basic", "comprehensive", or "targeted"
- config: Complete ResearchConfig object with appropriate settings
- description: Brief explanation of what this preset researches
Make presets relevant to the user's specific industry, audience, and content goals.
7. RESEARCH PREFERENCES:
- "research_preferences": Extract and structure research preferences from onboarding:
- research_depth: From research_preferences.research_depth
- content_types: From research_preferences.content_types
- auto_research: From research_preferences.auto_research
- factual_content: From research_preferences.factual_content
=== OUTPUT REQUIREMENTS ===
Return a valid JSON object matching this exact structure:
{{
"default_industry": "string",
"default_target_audience": "string",
"default_research_mode": "basic" | "comprehensive" | "targeted",
"default_provider": "google" | "exa",
"suggested_keywords": ["keyword1", "keyword2", ...],
"keyword_expansion_patterns": {{
"keyword": ["expansion1", "expansion2", ...]
}},
"suggested_exa_domains": ["domain1.com", "domain2.com", ...],
"suggested_exa_category": "string or null",
"research_angles": ["angle1", "angle2", ...],
"query_enhancement_rules": {{
"pattern": "template"
}},
"recommended_presets": [
{{
"name": "string",
"keywords": "string",
"industry": "string",
"target_audience": "string",
"research_mode": "basic" | "comprehensive" | "targeted",
"config": {{
"mode": "basic" | "comprehensive" | "targeted",
"provider": "google" | "exa",
"max_sources": 10 | 15 | 12,
"include_statistics": true | false,
"include_expert_quotes": true | false,
"include_competitors": true | false,
"include_trends": true | false,
"exa_category": "string or null",
"exa_include_domains": ["domain1.com", ...],
"exa_search_type": "auto" | "keyword" | "neural"
}},
"description": "string"
}}
],
"research_preferences": {{
"research_depth": "string",
"content_types": ["type1", "type2", ...],
"auto_research": true | false,
"factual_content": true | false
}},
"version": "1.0",
"confidence_score": 85.0
}}
=== IMPORTANT INSTRUCTIONS ===
1. Be highly specific and personalized - use actual data from the user's business, persona, and preferences.
2. Avoid generic suggestions - every field should reflect the user's unique context.
3. For industries not clearly identified, infer from website_analysis.content_characteristics or writing_style.
4. Ensure all suggested keywords, domains, and angles are relevant to the user's industry and audience.
5. Generate realistic, actionable presets that the user would actually want to use.
6. Confidence score should reflect data richness (0-100): higher if rich onboarding data, lower if minimal data.
7. Return ONLY valid JSON - no markdown formatting, no explanatory text.
Generate the research persona now:
"""
        return prompt

    def get_json_schema(self) -> Dict[str, Any]:
        """Return JSON schema for structured LLM response.

        The schema is derived from the ``ResearchPersona`` model and is meant
        to be passed to ``llm_text_gen(json_struct=...)``.
        """
        # Imported lazily so the model module is only loaded when a schema is
        # actually requested.
        from models.research_persona_models import ResearchPersona

        # Convert Pydantic model to JSON schema
        return ResearchPersona.schema()

View File

@@ -0,0 +1,194 @@
"""
Research Persona Scheduler
Handles scheduled generation of research personas after onboarding.
"""
from datetime import datetime, timedelta, timezone
from typing import Dict, Any
from loguru import logger
from services.database import get_db_session
from services.research.research_persona_service import ResearchPersonaService
from models.scheduler_models import SchedulerEventLog
def _utc_now_naive() -> datetime:
    """Return the current UTC time as a naive datetime.

    Same value as the deprecated ``datetime.utcnow()``, so stored event dates
    and elapsed-time arithmetic keep their previous (naive UTC) semantics.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _log_persona_job_event(
    db,
    user_id: str,
    event_type: str,
    outcome: str,
    started_at: datetime,
    event_data: Dict[str, Any],
    error_message: str = "",
) -> None:
    """Best-effort write of one scheduler event-log row for the persona job.

    Any failure while adding or committing the row is logged as a warning and
    swallowed (after attempting a rollback) so that dashboard logging can
    never change the outcome of the job itself.

    Args:
        db: Open database session used for the insert.
        user_id: User ID (Clerk string); also used to build the job ID.
        event_type: Value stored in ``event_type`` (e.g. ``'job_completed'``).
        outcome: Word used in the warning message if logging fails
            (``success``, ``failure`` or ``exception``).
        started_at: Job start time, stored as ``event_date``.
        event_data: Extra key/values stored after ``job_function``.
        error_message: Stored as ``error_message`` when non-empty.
    """
    try:
        log_fields = {
            'event_type': event_type,
            'event_date': started_at,
            'job_id': f"research_persona_{user_id}",  # Match scheduled job ID format
            'job_type': 'one_time',
            'user_id': user_id,
            'event_data': {
                'job_function': 'generate_research_persona_task',
                **event_data,
            },
        }
        if error_message:
            log_fields['error_message'] = error_message
        db.add(SchedulerEventLog(**log_fields))
        db.commit()
    except Exception as log_error:
        logger.warning(f"Failed to log persona generation {outcome} to scheduler event log: {log_error}")
        try:
            db.rollback()
        except Exception as rollback_error:
            # A failed rollback must not be mistaken for a failed generation.
            logger.warning(f"Rollback after scheduler event log failure also failed: {rollback_error}")


async def generate_research_persona_task(user_id: str) -> None:
    """
    Async task function to generate research persona for a user.

    This function is called by the scheduler 20 minutes after onboarding
    completion. It never reschedules itself: a failed generation is recorded
    in the scheduler event log and left for the user to retry manually, which
    prevents retry loops around an expensive API call.

    Args:
        user_id: User ID (Clerk string)
    """
    db = None
    try:
        logger.info(f"Scheduled research persona generation started for user {user_id}")

        # Get database session
        db = get_db_session()
        if not db:
            logger.error(f"Failed to get database session for research persona generation (user: {user_id})")
            return

        persona_service = ResearchPersonaService(db_session=db)

        # Check if persona already exists to avoid unnecessary API calls
        persona_data = persona_service._get_persona_data_record(user_id)
        if persona_data and persona_data.research_persona:
            logger.info(f"Research persona already exists for user {user_id}, skipping generation")
            return

        start_time = _utc_now_naive()
        try:
            # NOTE(review): get_or_generate is a synchronous call made from a
            # coroutine; it presumably blocks the event loop for the duration
            # of the LLM request - confirm this is acceptable for the scheduler.
            research_persona = persona_service.get_or_generate(user_id, force_refresh=False)
        except Exception as gen_error:
            execution_time = (_utc_now_naive() - start_time).total_seconds()
            error_msg = (
                f"Exception during scheduled research persona generation for user {user_id}: {str(gen_error)}. "
                f"Expensive API call may have been made. Will NOT automatically retry."
            )
            logger.error(f"{error_msg}")
            # Log exception to scheduler event log for dashboard visibility
            _log_persona_job_event(
                db,
                user_id,
                event_type='job_failed',
                outcome='exception',
                started_at=start_time,
                error_message=error_msg,
                event_data={
                    'execution_time_seconds': execution_time,
                    'status': 'failed',
                    'failure_reason': 'exception',
                    'exception_type': type(gen_error).__name__,
                    'exception_message': str(gen_error),
                    'expensive_api_call': True,
                },
            )
            # DO NOT reschedule - prevent infinite retry loops
            return

        execution_time = (_utc_now_naive() - start_time).total_seconds()

        if research_persona:
            logger.info(f"✅ Scheduled research persona generation completed for user {user_id}")
            # Log success to scheduler event log for dashboard
            _log_persona_job_event(
                db,
                user_id,
                event_type='job_completed',
                outcome='success',
                started_at=start_time,
                event_data={
                    'execution_time_seconds': execution_time,
                    'status': 'success',
                },
            )
        else:
            error_msg = (
                f"Scheduled research persona generation FAILED for user {user_id}. "
                f"Expensive API call was made but generation failed. "
                f"Will NOT automatically retry to prevent wasteful API calls."
            )
            logger.error(f"{error_msg}")
            # Log failure to scheduler event log for dashboard visibility
            _log_persona_job_event(
                db,
                user_id,
                event_type='job_failed',
                outcome='failure',
                started_at=start_time,
                error_message=error_msg,
                event_data={
                    'execution_time_seconds': execution_time,
                    'status': 'failed',
                    'failure_reason': 'generation_returned_none',
                    'expensive_api_call': True,
                },
            )
            # DO NOT reschedule - this prevents infinite retry loops
            # User can manually trigger generation from frontend if needed
    except Exception as e:
        logger.error(f"Error in scheduled research persona generation for user {user_id}: {e}")
    finally:
        if db:
            try:
                db.close()
            except Exception as e:
                logger.error(f"Error closing database session: {e}")
def schedule_research_persona_generation(user_id: str, delay_minutes: int = 20) -> str:
    """
    Queue a one-time research persona generation job for a user.

    The job runs ``generate_research_persona_task`` for ``user_id`` once the
    delay has elapsed. Scheduling again for the same user replaces the job
    that is already queued, because the job ID is derived from the user ID.

    Args:
        user_id: User ID (Clerk string)
        delay_minutes: Minutes to wait before the job fires (default: 20)

    Returns:
        The job ID reported by the scheduler.

    Raises:
        Exception: Any error raised while obtaining the scheduler or
            scheduling the job is logged and re-raised unchanged.
    """
    try:
        from services.scheduler import get_scheduler

        task_scheduler = get_scheduler()

        # Timestamp-free ID: restoration can locate the job and keep its
        # original scheduled time. The Clerk user_id already carries its own
        # "user_" prefix, so nothing is prepended here.
        persona_job_id = f"research_persona_{user_id}"

        # Timezone-aware UTC fire time: now + delay.
        fire_at = datetime.now(timezone.utc) + timedelta(minutes=delay_minutes)

        confirmed_job_id = task_scheduler.schedule_one_time_task(
            func=generate_research_persona_task,
            run_date=fire_at,
            job_id=persona_job_id,
            kwargs={"user_id": user_id},
            replace_existing=True,
        )
        logger.info(
            f"Scheduled research persona generation for user {user_id} at {fire_at} (job_id: {confirmed_job_id})"
        )
        return confirmed_job_id
    except Exception as e:
        logger.error(f"Failed to schedule research persona generation for user {user_id}: {e}")
        raise

View File

@@ -0,0 +1,384 @@
"""
Research Persona Service
Handles generation, caching, and retrieval of AI-powered research personas.
"""
from typing import Dict, Any, Optional
from datetime import datetime, timedelta
from loguru import logger
from fastapi import HTTPException
from services.database import get_db_session
from models.onboarding import PersonaData, OnboardingSession
from models.research_persona_models import ResearchPersona
from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
from services.llm_providers.main_text_generation import llm_text_gen
from services.onboarding.database_service import OnboardingDatabaseService
from services.persona_data_service import PersonaDataService
class ResearchPersonaService:
    """Service for generating and managing research personas.

    A research persona is generated by an LLM from the user's onboarding data,
    stored on the user's ``PersonaData`` record and treated as a cache entry
    that is valid for ``CACHE_TTL_DAYS`` days.
    """

    CACHE_TTL_DAYS = 7  # 7-day cache TTL

    def __init__(self, db_session=None):
        """Create the service.

        Args:
            db_session: Database session to use. When omitted, a session is
                obtained from ``get_db_session()``.
        """
        # NOTE(review): a session obtained here is never closed by this class;
        # presumably the caller or session factory owns its lifetime - confirm.
        self.db = db_session or get_db_session()
        self.prompt_builder = ResearchPersonaPromptBuilder()
        self.onboarding_service = OnboardingDatabaseService(db=self.db)
        self.persona_data_service = PersonaDataService(db_session=self.db)

    @staticmethod
    def _utcnow() -> datetime:
        """Return the current UTC time as a naive datetime.

        Same value as the deprecated ``datetime.utcnow()``; kept naive so it
        stays comparable with the naive timestamps this service writes.
        """
        from datetime import timezone

        return datetime.now(timezone.utc).replace(tzinfo=None)

    @staticmethod
    def _parse_llm_response(response: Any) -> Dict[str, Any]:
        """Turn an LLM response into a persona dictionary.

        Args:
            response: Either an already-parsed dict or a JSON string, which
                may be wrapped in a Markdown code fence.

        Returns:
            The parsed JSON object.

        Raises:
            json.JSONDecodeError: If a string response is not valid JSON.
            TypeError: If the response is neither a dict nor a string, or if
                the JSON document is not an object.
        """
        import json

        # When json_struct is provided, llm_text_gen may return a dict directly
        if isinstance(response, dict):
            return response
        if not isinstance(response, str):
            raise TypeError(f"Unexpected response type from LLM: {type(response)}")

        # Handle markdown-wrapped JSON as well as a plain JSON string
        text = response.strip()
        if text.startswith("```json"):
            text = text[7:]
        if text.startswith("```"):
            text = text[3:]
        if text.endswith("```"):
            text = text[:-3]

        parsed = json.loads(text.strip())
        if not isinstance(parsed, dict):
            # A JSON list/scalar cannot be expanded into ResearchPersona(**...)
            raise TypeError(f"LLM response JSON is not an object: {type(parsed).__name__}")
        return parsed

    def get_cached_only(
        self,
        user_id: str
    ) -> Optional[ResearchPersona]:
        """
        Get research persona for user ONLY if it exists in cache.

        This method NEVER generates - it only returns cached personas.
        Use this for config endpoints to avoid triggering rate limit checks.

        Args:
            user_id: User ID (Clerk string)

        Returns:
            ResearchPersona if cached and valid, None otherwise (including
            when the lookup or parsing fails).
        """
        try:
            # Get persona data record
            persona_data = self._get_persona_data_record(user_id)
            if not persona_data:
                logger.debug(f"No persona data found for user {user_id}")
                return None

            # Only return if cache is valid and persona exists
            if self.is_cache_valid(persona_data) and persona_data.research_persona:
                try:
                    logger.debug(f"Returning cached research persona for user {user_id}")
                    return ResearchPersona(**persona_data.research_persona)
                except Exception as e:
                    logger.warning(f"Failed to parse cached research persona: {e}")
                    return None

            # Cache invalid or persona missing - return None (don't generate)
            logger.debug(f"No valid cached research persona for user {user_id}")
            return None
        except Exception as e:
            logger.error(f"Error getting cached research persona for user {user_id}: {e}")
            return None

    def get_or_generate(
        self,
        user_id: str,
        force_refresh: bool = False
    ) -> Optional[ResearchPersona]:
        """
        Get research persona for user, generating if missing or expired.

        Args:
            user_id: User ID (Clerk string)
            force_refresh: If True, regenerate even if cache is valid

        Returns:
            ResearchPersona if successful, None otherwise. A freshly generated
            persona is returned even when saving it to the database fails.

        Raises:
            HTTPException: Propagated unchanged from generation (e.g. 429
                subscription limit) so it reaches the API layer.
        """
        try:
            # Get persona data record
            persona_data = self._get_persona_data_record(user_id)
            if not persona_data:
                logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
                return None

            # Decide whether the cached persona can be served
            if force_refresh:
                logger.info(f"Forcing refresh of research persona for user {user_id}")
            elif not self.is_cache_valid(persona_data):
                logger.info(f"Cache expired for user {user_id}, regenerating...")
            elif persona_data.research_persona:
                logger.info(f"Using cached research persona for user {user_id}")
                try:
                    return ResearchPersona(**persona_data.research_persona)
                except Exception as e:
                    # Fall through to regeneration
                    logger.warning(f"Failed to parse cached research persona: {e}, regenerating...")
            else:
                logger.info(f"Research persona missing for user {user_id}, generating...")

            # Generate new research persona (HTTPException propagates via the
            # handler below)
            research_persona = self.generate_research_persona(user_id)
            if not research_persona:
                # Log detailed error for debugging expensive failures
                logger.error(
                    f"❌ Failed to generate research persona for user {user_id} - "
                    f"This is an expensive failure (API call consumed). Check logs above for details."
                )
                return None

            # Save to database; a failed save is logged but the persona is
            # still handed back to the caller.
            if self.save_research_persona(user_id, research_persona):
                logger.info(f"✅ Research persona generated and saved for user {user_id}")
            else:
                logger.warning(f"Failed to save research persona for user {user_id}")
            return research_persona
        except HTTPException:
            # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
            raise
        except Exception as e:
            logger.error(f"Error getting/generating research persona for user {user_id}: {e}")
            return None

    def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
        """
        Generate a new research persona for the user.

        Args:
            user_id: User ID (Clerk string)

        Returns:
            ResearchPersona if successful, None otherwise (insufficient
            onboarding data, empty/unparseable LLM response, or a response
            that fails model validation).

        Raises:
            HTTPException: Re-raised from the LLM call; a ``RuntimeError``
                from the LLM call is converted to a 429 HTTPException.
        """
        import json

        try:
            logger.info(f"Generating research persona for user {user_id}")

            # Collect onboarding data
            onboarding_data = self._collect_onboarding_data(user_id)
            if not onboarding_data:
                logger.warning(f"Insufficient onboarding data for user {user_id}")
                return None

            # Build prompt and the JSON schema for the structured response
            prompt = self.prompt_builder.build_research_persona_prompt(onboarding_data)
            json_schema = self.prompt_builder.get_json_schema()

            # Call LLM with structured JSON response
            logger.info(f"Calling LLM for research persona generation (user: {user_id})")
            try:
                response_text = llm_text_gen(
                    prompt=prompt,
                    json_struct=json_schema,
                    user_id=user_id
                )
            except HTTPException:
                # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
                logger.warning(f"HTTPException during LLM call for user {user_id} - re-raising")
                raise
            except RuntimeError as e:
                # Re-raise RuntimeError (subscription limits) as HTTPException
                logger.warning(f"RuntimeError during LLM call for user {user_id}: {e}")
                raise HTTPException(status_code=429, detail=str(e))

            if not response_text:
                logger.error("Empty response from LLM")
                return None

            # Parse JSON response
            try:
                persona_dict = self._parse_llm_response(response_text)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse LLM response as JSON: {e}")
                logger.debug(f"Response text: {response_text[:500] if isinstance(response_text, str) else str(response_text)[:500]}")
                return None
            except TypeError as e:
                logger.error(f"{e}")
                return None

            # Add generated_at timestamp
            persona_dict["generated_at"] = self._utcnow().isoformat()

            # Validate and create ResearchPersona; log the dict structure for
            # debugging if validation fails
            try:
                research_persona = ResearchPersona(**persona_dict)
            except Exception as validation_error:
                logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
                logger.debug(f"Persona dict keys: {list(persona_dict.keys())}")
                logger.debug(f"Persona dict sample: {str(persona_dict)[:500]}")
                return None

            logger.info(f"✅ Research persona generated successfully for user {user_id}")
            return research_persona
        except HTTPException:
            # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
            raise
        except Exception as e:
            logger.error(f"Error generating research persona for user {user_id}: {e}")
            return None

    def is_cache_valid(self, persona_data: PersonaData) -> bool:
        """
        Check if cached research persona is still valid (within TTL).

        Args:
            persona_data: PersonaData database record

        Returns:
            True if ``research_persona_generated_at`` is set and younger than
            ``CACHE_TTL_DAYS`` days, False otherwise.
        """
        generated_at = persona_data.research_persona_generated_at
        if not generated_at:
            return False

        # A timezone-aware timestamp cannot be subtracted from a naive "now";
        # shift it to UTC and drop the tzinfo so both operands are naive UTC.
        offset = generated_at.utcoffset()
        if offset is not None:
            generated_at = (generated_at - offset).replace(tzinfo=None)

        # Check if within TTL
        cache_age = self._utcnow() - generated_at
        is_valid = cache_age < timedelta(days=self.CACHE_TTL_DAYS)
        if not is_valid:
            logger.debug(f"Cache expired (age: {cache_age.days} days, TTL: {self.CACHE_TTL_DAYS} days)")
        return is_valid

    def save_research_persona(
        self,
        user_id: str,
        research_persona: ResearchPersona
    ) -> bool:
        """
        Save research persona to database.

        Args:
            user_id: User ID (Clerk string)
            research_persona: ResearchPersona to save

        Returns:
            True if successful, False otherwise
        """
        try:
            persona_data = self._get_persona_data_record(user_id)
            if not persona_data:
                logger.error(f"No persona data record found for user {user_id}")
                return False

            # Convert ResearchPersona to dict for JSON storage and stamp the
            # record so the TTL check can age it.
            persona_data.research_persona = research_persona.dict()
            persona_data.research_persona_generated_at = self._utcnow()
            self.db.commit()
            logger.info(f"✅ Research persona saved for user {user_id}")
            return True
        except Exception as e:
            logger.error(f"Error saving research persona for user {user_id}: {e}")
            try:
                self.db.rollback()
            except Exception as rollback_error:
                # Keep the documented contract (return False) even if the
                # rollback itself fails, so callers still get their persona.
                logger.error(f"Rollback failed after save error for user {user_id}: {rollback_error}")
            return False

    def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
        """Get PersonaData database record for user.

        Looks up the user's onboarding session first and then the persona
        data row attached to that session. Returns None when either is
        missing or when the query fails.
        """
        try:
            # Ensure research_persona columns exist before querying
            self.onboarding_service._ensure_research_persona_columns(self.db)

            # Get onboarding session
            session = self.db.query(OnboardingSession).filter(
                OnboardingSession.user_id == user_id
            ).first()
            if not session:
                return None

            # Get persona data
            return self.db.query(PersonaData).filter(
                PersonaData.session_id == session.id
            ).first()
        except Exception as e:
            logger.error(f"Error getting persona data record for user {user_id}: {e}")
            return None

    def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
        """
        Collect all onboarding data needed for research persona generation.

        ``business_info`` is derived rather than stored: ``industry`` and
        ``target_audience`` come from the core persona when present, and each
        field independently falls back to the website analysis
        (``target_audience.industry_focus`` / ``target_audience.demographics``).

        Returns:
            Dictionary with website_analysis, persona_data,
            research_preferences, business_info; None when neither website
            analysis nor persona data is available or collection fails.
        """
        try:
            website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}
            persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}
            research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}

            # Check if we have enough data
            if not website_analysis and not persona_data_dict:
                logger.warning(f"Insufficient onboarding data for user {user_id}")
                return None

            # Get business info - construct from persona data and website analysis
            business_info: Dict[str, Any] = {}

            # Try to extract from persona data (either key spelling)
            core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
            if isinstance(core_persona, dict):
                for field in ('industry', 'target_audience'):
                    if core_persona.get(field):
                        business_info[field] = core_persona[field]

            # Fallback to website analysis, field by field, so persona values
            # are never overwritten and a missing field is still filled in.
            target_audience_data = website_analysis.get('target_audience', {})
            if isinstance(target_audience_data, dict):
                if not business_info.get('industry'):
                    industry_focus = target_audience_data.get('industry_focus')
                    if industry_focus:
                        business_info['industry'] = industry_focus
                if not business_info.get('target_audience'):
                    demographics = target_audience_data.get('demographics')
                    if demographics:
                        business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)

            return {
                "website_analysis": website_analysis,
                "persona_data": persona_data_dict,
                "research_preferences": research_prefs,
                "business_info": business_info
            }
        except Exception as e:
            logger.error(f"Error collecting onboarding data for user {user_id}: {e}")
            return None