Base code

This commit is contained in:
Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions

View File

@@ -0,0 +1,55 @@
"""
Research Services Module for ALwrity
This module provides research and grounding capabilities for content generation,
replacing mock research with real-time industry information.
Available Services:
- GoogleSearchService: Real-time industry research using Google Custom Search API
- ExaService: Competitor discovery and analysis using Exa API
- TavilyService: AI-powered web search with real-time information
- Source ranking and credibility assessment
- Content extraction and insight generation
Core Module (v2.0):
- ResearchEngine: Standalone AI research engine for any content tool
- ResearchContext: Unified input schema for research requests
- ParameterOptimizer: AI-driven parameter optimization
Author: ALwrity Team
Version: 2.0
Last Updated: December 2025
"""
from .google_search_service import GoogleSearchService
from .exa_service import ExaService
from .tavily_service import TavilyService
# Core Research Engine (v2.0)
from .core import (
ResearchEngine,
ResearchContext,
ResearchPersonalizationContext,
ContentType,
ResearchGoal,
ResearchDepth,
ProviderPreference,
ParameterOptimizer,
)
__all__ = [
# Legacy services (still used by blog writer)
"GoogleSearchService",
"ExaService",
"TavilyService",
# Core Research Engine (v2.0)
"ResearchEngine",
"ResearchContext",
"ResearchPersonalizationContext",
"ContentType",
"ResearchGoal",
"ResearchDepth",
"ProviderPreference",
"ParameterOptimizer",
]

View File

@@ -0,0 +1,270 @@
"""
AI Prompts for Competitor Analysis
This module contains prompts for analyzing competitor data from Exa API
to generate actionable insights for content strategy and competitive positioning.
"""
COMPETITOR_ANALYSIS_PROMPT = """
You are a competitive intelligence analyst specializing in content strategy and market positioning.
**TASK**: Analyze competitor data to provide actionable insights for content strategy and competitive positioning.
**COMPETITOR DATA**:
{competitor_context}
**USER'S WEBSITE**: {user_url}
**INDUSTRY CONTEXT**: {industry_context}
**ANALYSIS REQUIREMENTS**:
1. **Market Position Analysis**
- Identify the competitive landscape structure
- Determine market leaders vs. challengers
- Assess market saturation and opportunities
2. **Content Strategy Insights**
- Analyze competitor content themes and topics
- Identify content gaps and opportunities
- Suggest unique content angles for differentiation
3. **Competitive Advantages**
- Highlight what makes each competitor unique
- Identify areas where the user can differentiate
- Suggest positioning strategies
4. **SEO and Marketing Insights**
- Analyze competitor positioning and messaging
- Identify keyword and content opportunities
- Suggest marketing strategies
**OUTPUT FORMAT** (JSON):
{{
"market_analysis": {{
"competitive_landscape": "Description of market structure",
"market_leaders": ["List of top 3 competitors"],
"market_opportunities": ["List of 3-5 opportunities"],
"saturation_level": "high/medium/low"
}},
"content_strategy": {{
"common_themes": ["List of common content themes"],
"content_gaps": ["List of 5 content opportunities"],
"unique_angles": ["List of 3 unique content angles"],
"content_frequency_insights": "Analysis of publishing patterns"
}},
"competitive_positioning": {{
"differentiation_opportunities": ["List of 5 ways to differentiate"],
"unique_value_propositions": ["List of 3 unique positioning ideas"],
"target_audience_insights": "Analysis of competitor audience targeting"
}},
"seo_opportunities": {{
"keyword_gaps": ["List of 5 keyword opportunities"],
"content_topics": ["List of 5 high-value content topics"],
"marketing_channels": ["List of competitor marketing strategies"]
}},
"actionable_recommendations": [
"List of 5 specific, actionable recommendations"
],
"risk_assessment": {{
"competitive_threats": ["List of 3 main threats"],
"market_barriers": ["List of 2-3 barriers to entry"],
"success_factors": ["List of 3 key success factors"]
}}
}}
**INSTRUCTIONS**:
- Be specific and actionable in your recommendations
- Focus on opportunities for differentiation
- Consider the user's industry context
- Prioritize recommendations by impact and feasibility
- Use data from the competitor analysis to support insights
- Keep recommendations practical and implementable
**QUALITY STANDARDS**:
- Each recommendation should be specific and actionable
- Insights should be based on actual competitor data
- Focus on differentiation and competitive advantage
- Consider both short-term and long-term strategies
- Ensure recommendations are relevant to the user's industry
"""
CONTENT_GAP_ANALYSIS_PROMPT = """
You are a content strategist analyzing competitor content to identify gaps and opportunities.
**TASK**: Analyze competitor content patterns to identify content gaps and opportunities.
**COMPETITOR CONTENT DATA**:
{competitor_context}
**USER'S INDUSTRY**: {industry_context}
**TARGET AUDIENCE**: {target_audience}
**ANALYSIS FOCUS**:
1. **Content Topic Analysis**
- Identify most common content topics across competitors
- Find underserved or missing topics
- Analyze content depth and quality patterns
2. **Content Format Opportunities**
- Identify popular content formats among competitors
- Find format gaps and opportunities
- Suggest innovative content approaches
3. **Audience Targeting Gaps**
- Analyze competitor audience targeting
- Identify underserved audience segments
- Suggest audience expansion opportunities
4. **SEO Content Opportunities**
- Identify high-value keywords competitors are missing
- Find long-tail keyword opportunities
- Suggest content clusters for SEO
**OUTPUT FORMAT** (JSON):
{{
"content_gaps": [
{{
"topic": "Specific content topic",
"opportunity_level": "high/medium/low",
"reasoning": "Why this is an opportunity",
"content_angle": "Unique angle for this topic",
"estimated_difficulty": "easy/medium/hard"
}}
],
"format_opportunities": [
{{
"format": "Content format type",
"gap_reason": "Why competitors aren't using this",
"potential_impact": "Expected impact level",
"implementation_tips": "How to implement"
}}
],
"audience_gaps": [
{{
"audience_segment": "Underserved audience",
"opportunity_size": "large/medium/small",
"content_needs": "What content this audience needs",
"engagement_strategy": "How to engage this audience"
}}
],
"seo_opportunities": [
{{
"keyword_theme": "Keyword cluster theme",
"search_volume": "estimated_high/medium/low",
"competition_level": "low/medium/high",
"content_ideas": ["3-5 content ideas for this theme"]
}}
],
"priority_recommendations": [
"Top 5 prioritized content opportunities with implementation order"
]
}}
"""
COMPETITIVE_INTELLIGENCE_PROMPT = """
You are a competitive intelligence expert providing strategic insights for market positioning.
**TASK**: Generate comprehensive competitive intelligence insights for strategic decision-making.
**COMPETITOR INTELLIGENCE DATA**:
{competitor_context}
**BUSINESS CONTEXT**:
- User Website: {user_url}
- Industry: {industry_context}
- Business Model: {business_model}
- Target Market: {target_market}
**INTELLIGENCE AREAS**:
1. **Competitive Landscape Mapping**
- Market positioning analysis
- Competitive strength assessment
- Market share estimation
2. **Strategic Positioning Opportunities**
- Blue ocean opportunities
- Differentiation strategies
- Competitive moats
3. **Threat Assessment**
- Competitive threats
- Market disruption risks
- Barrier to entry analysis
4. **Growth Strategy Insights**
- Market expansion opportunities
- Partnership possibilities
- Acquisition targets
**OUTPUT FORMAT** (JSON):
{{
"competitive_landscape": {{
"market_structure": "Description of market structure",
"key_players": [
{{
"name": "Competitor name",
"position": "market_leader/challenger/niche",
"strengths": ["List of key strengths"],
"weaknesses": ["List of key weaknesses"],
"market_share": "estimated_percentage"
}}
],
"market_dynamics": "Analysis of market trends and forces"
}},
"positioning_opportunities": {{
"blue_ocean_opportunities": ["List of uncontested market spaces"],
"differentiation_strategies": ["List of positioning strategies"],
"competitive_advantages": ["List of potential advantages to build"]
}},
"threat_analysis": {{
"immediate_threats": ["List of current competitive threats"],
"future_risks": ["List of potential future risks"],
"market_barriers": ["List of barriers to success"]
}},
"strategic_recommendations": {{
"short_term_actions": ["List of 3-5 immediate actions"],
"medium_term_strategy": ["List of 3-5 strategic initiatives"],
"long_term_vision": ["List of 2-3 long-term strategic goals"]
}},
"success_metrics": {{
"kpis_to_track": ["List of key performance indicators"],
"competitive_benchmarks": ["List of metrics to benchmark against"],
"success_thresholds": ["List of success criteria"]
}}
}}
"""
# Utility function to format prompts with data
def format_competitor_analysis_prompt(competitor_context: str, user_url: str, industry_context: str = None) -> str:
    """Render COMPETITOR_ANALYSIS_PROMPT with the supplied data.

    An empty or missing industry_context is rendered as "Not specified".
    """
    substitutions = {
        "competitor_context": competitor_context,
        "user_url": user_url,
        "industry_context": industry_context or "Not specified",
    }
    return COMPETITOR_ANALYSIS_PROMPT.format(**substitutions)
def format_content_gap_prompt(competitor_context: str, industry_context: str = None, target_audience: str = None) -> str:
    """Render CONTENT_GAP_ANALYSIS_PROMPT with the supplied data.

    Empty or missing optional fields are rendered as "Not specified".
    """
    substitutions = {
        "competitor_context": competitor_context,
        "industry_context": industry_context or "Not specified",
        "target_audience": target_audience or "Not specified",
    }
    return CONTENT_GAP_ANALYSIS_PROMPT.format(**substitutions)
def format_competitive_intelligence_prompt(
    competitor_context: str,
    user_url: str,
    industry_context: str = None,
    business_model: str = None,
    target_market: str = None
) -> str:
    """Render COMPETITIVE_INTELLIGENCE_PROMPT with the supplied business data.

    Every empty or missing optional field is rendered as "Not specified".
    """
    substitutions = {
        "competitor_context": competitor_context,
        "user_url": user_url,
        "industry_context": industry_context or "Not specified",
        "business_model": business_model or "Not specified",
        "target_market": target_market or "Not specified",
    }
    return COMPETITIVE_INTELLIGENCE_PROMPT.format(**substitutions)

View File

@@ -0,0 +1,51 @@
"""
Research Engine Core Module
This is the standalone AI Research Engine that can be imported by
Blog Writer, Podcast Maker, YouTube Creator, and other ALwrity tools.
Design Goals:
- Tool-agnostic: Any content tool can import and use this
- AI-driven parameter optimization: Users don't need to understand Exa/Tavily internals
- Provider priority: Exa → Tavily → Google (fallback)
- Personalization-aware: Accepts context from calling tools
- Advanced by default: Prioritizes quality over speed
Usage:
from services.research.core import ResearchEngine, ResearchContext
engine = ResearchEngine()
result = await engine.research(ResearchContext(
query="AI trends in healthcare 2025",
content_type=ContentType.BLOG,
persona_context={"industry": "Healthcare", "audience": "Medical professionals"}
))
Author: ALwrity Team
Version: 2.0
Last Updated: December 2025
"""
from .research_context import (
ResearchContext,
ResearchPersonalizationContext,
ContentType,
ResearchGoal,
ResearchDepth,
ProviderPreference,
)
from .parameter_optimizer import ParameterOptimizer
from .research_engine import ResearchEngine
__all__ = [
# Context schemas
"ResearchContext",
"ResearchPersonalizationContext",
"ContentType",
"ResearchGoal",
"ResearchDepth",
"ProviderPreference",
# Core classes
"ParameterOptimizer",
"ResearchEngine",
]

View File

@@ -0,0 +1,384 @@
"""
AI Parameter Optimizer for Research Engine
Uses AI to analyze the research query and context to select optimal
parameters for Exa and Tavily APIs. This abstracts the complexity
from non-technical users.
Key Decisions:
- Provider selection (Exa vs Tavily vs Google)
- Search type (neural vs keyword)
- Category/topic selection
- Depth and result limits
- Domain filtering
Author: ALwrity Team
Version: 2.0
"""
import os
import re
from typing import Dict, Any, Optional, Tuple
from loguru import logger
from .research_context import (
ResearchContext,
ResearchGoal,
ResearchDepth,
ProviderPreference,
ContentType,
)
from models.blog_models import ResearchConfig, ResearchProvider, ResearchMode
class ParameterOptimizer:
    """
    AI-driven parameter optimization for research providers.

    Analyzes the research context and selects optimal parameters
    for Exa, Tavily, or Google without requiring user expertise.

    Entry point is :meth:`optimize`, which returns a
    ``(ResearchProvider, ResearchConfig)`` pair ready for execution.
    """

    # Query patterns for intelligent routing.
    # NOTE(review): the year tokens (2024|2025) are hard-coded — refresh them
    # when a new year becomes "current", or trending detection will quietly
    # degrade.
    TRENDING_PATTERNS = [
        r'\b(latest|recent|new|2024|2025|current|trending|news)\b',
        r'\b(update|announcement|launch|release)\b',
    ]
    TECHNICAL_PATTERNS = [
        r'\b(api|sdk|framework|library|implementation|architecture)\b',
        r'\b(code|programming|developer|technical|engineering)\b',
    ]
    COMPETITIVE_PATTERNS = [
        r'\b(competitor|alternative|vs|versus|compare|comparison)\b',
        r'\b(market|industry|landscape|players)\b',
    ]
    FACTUAL_PATTERNS = [
        r'\b(statistics|data|research|study|report|survey)\b',
        r'\b(percent|percentage|number|figure|metric)\b',
    ]

    # Exa category mapping based on query analysis
    EXA_CATEGORY_MAP = {
        'research': 'research paper',
        'news': 'news',
        'company': 'company',
        'personal': 'personal site',
        'github': 'github',
        'linkedin': 'linkedin profile',
        'finance': 'financial report',
    }

    # Tavily topic mapping
    TAVILY_TOPIC_MAP = {
        ResearchGoal.TRENDING: 'news',
        ResearchGoal.FACTUAL: 'general',
        ResearchGoal.COMPETITIVE: 'general',
        ResearchGoal.TECHNICAL: 'general',
        ResearchGoal.EDUCATIONAL: 'general',
        ResearchGoal.INSPIRATIONAL: 'general',
    }

    # Shared ResearchDepth -> ResearchMode mapping, used by both the optimized
    # and the advanced (raw) config builders so the two paths cannot drift.
    # QUICK/STANDARD collapse to BASIC; COMPREHENSIVE/EXPERT to COMPREHENSIVE.
    _DEPTH_TO_MODE = {
        ResearchDepth.QUICK: ResearchMode.BASIC,
        ResearchDepth.STANDARD: ResearchMode.BASIC,
        ResearchDepth.COMPREHENSIVE: ResearchMode.COMPREHENSIVE,
        ResearchDepth.EXPERT: ResearchMode.COMPREHENSIVE,
    }

    def __init__(self):
        """Initialize the optimizer, detecting provider availability from env keys."""
        self.exa_available = bool(os.getenv("EXA_API_KEY"))
        self.tavily_available = bool(os.getenv("TAVILY_API_KEY"))
        logger.info(f"ParameterOptimizer initialized: exa={self.exa_available}, tavily={self.tavily_available}")

    def optimize(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Analyze research context and return optimized provider and config.

        Args:
            context: The research context from the calling tool

        Returns:
            Tuple of (selected_provider, optimized_config)
        """
        # Advanced mode bypasses analysis and honors raw user parameters.
        if context.advanced_mode:
            return self._build_advanced_config(context)
        # Analyze query to determine optimal approach
        query_analysis = self._analyze_query(context.query)
        # Select provider based on analysis and preferences
        provider = self._select_provider(context, query_analysis)
        # Build optimized config for selected provider
        config = self._build_config(context, provider, query_analysis)
        logger.info(f"Optimized research: provider={provider.value}, mode={config.mode.value}")
        return provider, config

    def _analyze_query(self, query: str) -> Dict[str, Any]:
        """
        Analyze the query to understand intent and optimal approach.

        Returns dict with:
        - is_trending: Query is about recent/current events
        - is_technical: Query is technical in nature
        - is_competitive: Query is about competition/comparison
        - is_factual: Query needs data/statistics
        - suggested_category: Exa category if applicable (may stay None)
        - suggested_topic: Tavily topic ('general', 'news', or 'finance')
        - suggested_search_type: 'auto', 'neural', or 'keyword'
        """
        query_lower = query.lower()
        analysis = {
            'is_trending': self._matches_patterns(query_lower, self.TRENDING_PATTERNS),
            'is_technical': self._matches_patterns(query_lower, self.TECHNICAL_PATTERNS),
            'is_competitive': self._matches_patterns(query_lower, self.COMPETITIVE_PATTERNS),
            'is_factual': self._matches_patterns(query_lower, self.FACTUAL_PATTERNS),
            'suggested_category': None,
            'suggested_topic': 'general',
            'suggested_search_type': 'auto',
        }
        # Determine Exa category — first matching rule wins.
        if 'research' in query_lower or 'study' in query_lower or 'paper' in query_lower:
            analysis['suggested_category'] = 'research paper'
        elif 'github' in query_lower or 'repository' in query_lower:
            analysis['suggested_category'] = 'github'
        elif 'linkedin' in query_lower or 'professional' in query_lower:
            analysis['suggested_category'] = 'linkedin profile'
        elif analysis['is_trending']:
            analysis['suggested_category'] = 'news'
        elif 'company' in query_lower or 'startup' in query_lower:
            analysis['suggested_category'] = 'company'
        # Determine Tavily topic
        if analysis['is_trending']:
            analysis['suggested_topic'] = 'news'
        elif 'finance' in query_lower or 'stock' in query_lower or 'investment' in query_lower:
            analysis['suggested_topic'] = 'finance'
        else:
            analysis['suggested_topic'] = 'general'
        # Determine search type
        if analysis['is_technical'] or analysis['is_factual']:
            analysis['suggested_search_type'] = 'neural'  # Better for semantic understanding
        elif analysis['is_trending']:
            analysis['suggested_search_type'] = 'keyword'  # Better for current events
        return analysis

    def _matches_patterns(self, text: str, patterns: list) -> bool:
        """Return True if *text* matches any regex in *patterns* (case-insensitive)."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def _select_provider(self, context: ResearchContext, analysis: Dict[str, Any]) -> ResearchProvider:
        """
        Select the optimal provider based on context and query analysis.

        Priority: Exa → Tavily → Google for ALL modes (including basic).
        This provides better semantic search results for content creators.
        Exa's neural search excels at understanding context and meaning,
        which is valuable for all research types, not just technical queries.
        """
        preference = context.provider_preference
        # If user explicitly requested a provider, respect that (falling
        # through to AUTO selection when the requested one is unavailable).
        if preference == ProviderPreference.EXA:
            if self.exa_available:
                return ResearchProvider.EXA
            logger.warning("Exa requested but not available, falling back")
        if preference == ProviderPreference.TAVILY:
            if self.tavily_available:
                return ResearchProvider.TAVILY
            logger.warning("Tavily requested but not available, falling back")
        if preference == ProviderPreference.GOOGLE:
            return ResearchProvider.GOOGLE
        # AUTO mode: Always prefer Exa → Tavily → Google
        # Exa provides superior semantic search for all content types
        if self.exa_available:
            logger.info(
                f"Selected Exa (primary provider): query analysis shows "
                f"technical={analysis.get('is_technical', False)}, "
                f"trending={analysis.get('is_trending', False)}"
            )
            return ResearchProvider.EXA
        # Tavily as secondary option - good for real-time and news
        if self.tavily_available:
            logger.info(
                f"Selected Tavily (secondary): Exa unavailable, "
                f"trending={analysis.get('is_trending', False)}"
            )
            return ResearchProvider.TAVILY
        # Google grounding as fallback
        logger.info("Selected Google (fallback): Exa and Tavily unavailable")
        return ResearchProvider.GOOGLE

    def _build_config(
        self,
        context: ResearchContext,
        provider: ResearchProvider,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Build optimized ResearchConfig for the selected provider."""
        # Map ResearchDepth to ResearchMode via the shared class-level table.
        mode = self._DEPTH_TO_MODE.get(context.depth, ResearchMode.BASIC)
        # Base config
        config = ResearchConfig(
            mode=mode,
            provider=provider,
            max_sources=context.max_sources,
            include_statistics=context.personalization.include_statistics if context.personalization else True,
            include_expert_quotes=context.personalization.include_expert_quotes if context.personalization else True,
            include_competitors=analysis['is_competitive'],
            include_trends=analysis['is_trending'],
        )
        # Provider-specific optimizations
        if provider == ResearchProvider.EXA:
            config = self._optimize_exa_config(config, context, analysis)
        elif provider == ResearchProvider.TAVILY:
            config = self._optimize_tavily_config(config, context, analysis)
        # Apply domain filters (truncated to Tavily's documented limits).
        if context.include_domains:
            if provider == ResearchProvider.EXA:
                config.exa_include_domains = context.include_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_include_domains = context.include_domains[:300]  # Tavily limit
        if context.exclude_domains:
            if provider == ResearchProvider.EXA:
                config.exa_exclude_domains = context.exclude_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_exclude_domains = context.exclude_domains[:150]  # Tavily limit
        return config

    def _optimize_exa_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Add Exa-specific optimizations (category and search type)."""
        # Set category based on analysis
        if analysis['suggested_category']:
            config.exa_category = analysis['suggested_category']
        # Set search type
        config.exa_search_type = analysis.get('suggested_search_type', 'auto')
        # For comprehensive research, use neural search
        if context.depth in [ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT]:
            config.exa_search_type = 'neural'
        return config

    def _optimize_tavily_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Add Tavily-specific optimizations (topic, depth, recency, answer)."""
        # Set topic based on analysis
        config.tavily_topic = analysis.get('suggested_topic', 'general')
        # Set search depth based on research depth
        if context.depth in [ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT]:
            config.tavily_search_depth = 'advanced'  # 2 credits, but better results
            config.tavily_chunks_per_source = 3
        else:
            config.tavily_search_depth = 'basic'  # 1 credit
        # Set time range based on recency; unknown values pass through as-is.
        if context.recency:
            recency_map = {
                'day': 'd',
                'week': 'w',
                'month': 'm',
                'year': 'y',
            }
            config.tavily_time_range = recency_map.get(context.recency, context.recency)
        elif analysis['is_trending']:
            config.tavily_time_range = 'w'  # Last week for trending topics
        # Include answer for comprehensive research
        if context.depth in [ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT]:
            config.tavily_include_answer = 'advanced'
        # Include raw content for expert depth
        if context.depth == ResearchDepth.EXPERT:
            config.tavily_include_raw_content = 'markdown'
        return config

    def _build_advanced_config(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Build config from raw advanced parameters.

        Used when advanced_mode=True and user wants full control.
        Provider is inferred from which raw parameters were supplied, then an
        explicit provider_preference (if any) overrides that inference.
        """
        # Determine provider from explicit parameters
        provider = ResearchProvider.GOOGLE
        if context.exa_category or context.exa_search_type:
            provider = ResearchProvider.EXA if self.exa_available else ResearchProvider.GOOGLE
        elif context.tavily_topic or context.tavily_search_depth:
            provider = ResearchProvider.TAVILY if self.tavily_available else ResearchProvider.GOOGLE
        # Check preference override
        if context.provider_preference == ProviderPreference.EXA and self.exa_available:
            provider = ResearchProvider.EXA
        elif context.provider_preference == ProviderPreference.TAVILY and self.tavily_available:
            provider = ResearchProvider.TAVILY
        elif context.provider_preference == ProviderPreference.GOOGLE:
            provider = ResearchProvider.GOOGLE
        # Map depth to mode via the shared class-level table.
        mode = self._DEPTH_TO_MODE.get(context.depth, ResearchMode.BASIC)
        # Build config with raw parameters
        config = ResearchConfig(
            mode=mode,
            provider=provider,
            max_sources=context.max_sources,
            # Exa
            exa_category=context.exa_category,
            exa_search_type=context.exa_search_type,
            exa_include_domains=context.include_domains,
            exa_exclude_domains=context.exclude_domains,
            # Tavily
            tavily_topic=context.tavily_topic,
            tavily_search_depth=context.tavily_search_depth,
            tavily_include_domains=context.include_domains[:300] if context.include_domains else [],
            tavily_exclude_domains=context.exclude_domains[:150] if context.exclude_domains else [],
            tavily_include_answer=context.tavily_include_answer,
            tavily_include_raw_content=context.tavily_include_raw_content,
            tavily_time_range=context.tavily_time_range,
            tavily_country=context.tavily_country,
        )
        logger.info(f"Advanced config: provider={provider.value}, mode={mode.value}")
        return provider, config

View File

@@ -0,0 +1,198 @@
"""
Research Context Schema
Defines the unified input schema for the Research Engine.
Any tool (Blog Writer, Podcast Maker, YouTube Creator) can create a ResearchContext
and pass it to the Research Engine.
Author: ALwrity Team
Version: 2.0
"""
from enum import Enum
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
class ContentType(str, Enum):
    """Type of content being created - affects research focus."""
    BLOG = "blog"
    PODCAST = "podcast"
    VIDEO = "video"
    SOCIAL = "social"
    EMAIL = "email"
    NEWSLETTER = "newsletter"
    WHITEPAPER = "whitepaper"
    GENERAL = "general"  # default when the calling tool doesn't specify a type
class ResearchGoal(str, Enum):
    """Primary goal of the research - affects provider selection and depth."""
    FACTUAL = "factual"  # Stats, data, citations
    TRENDING = "trending"  # Current trends, news
    COMPETITIVE = "competitive"  # Competitor analysis
    EDUCATIONAL = "educational"  # How-to, explanations
    INSPIRATIONAL = "inspirational"  # Stories, quotes
    TECHNICAL = "technical"  # Deep technical content
class ResearchDepth(str, Enum):
    """Depth of research - maps to existing ResearchMode."""
    QUICK = "quick"  # Fast, surface-level (maps to BASIC)
    STANDARD = "standard"  # Balanced depth (maps to BASIC with more sources)
    COMPREHENSIVE = "comprehensive"  # Deep research (maps to COMPREHENSIVE)
    EXPERT = "expert"  # Maximum depth with expert sources
class ProviderPreference(str, Enum):
    """Provider preference - AUTO lets the engine decide."""
    AUTO = "auto"  # AI decides based on query (default)
    EXA = "exa"  # Force Exa neural search
    TAVILY = "tavily"  # Force Tavily AI search
    GOOGLE = "google"  # Force Google grounding
    HYBRID = "hybrid"  # Use multiple providers
class ResearchPersonalizationContext(BaseModel):
    """
    Context from the calling tool (Blog Writer, Podcast Maker, etc.)

    This personalizes the research without the Research Engine knowing
    the specific tool implementation. Every field has a default, so a tool
    may supply as much or as little context as it actually has.
    """
    # Who is creating the content
    creator_id: Optional[str] = None  # Clerk user ID
    # Content context
    content_type: ContentType = ContentType.GENERAL
    industry: Optional[str] = None
    target_audience: Optional[str] = None
    tone: Optional[str] = None  # professional, casual, technical, etc.
    # Persona data (from onboarding)
    persona_id: Optional[str] = None
    brand_voice: Optional[str] = None
    competitor_urls: List[str] = Field(default_factory=list)
    # Content requirements
    word_count_target: Optional[int] = None
    include_statistics: bool = True
    include_expert_quotes: bool = True
    include_case_studies: bool = False
    include_visuals: bool = False
    # Platform-specific hints
    platform: Optional[str] = None  # medium, wordpress, youtube, spotify, etc.

    class Config:
        # Serialize enum-typed fields by their string values.
        use_enum_values = True
class ResearchContext(BaseModel):
    """
    Main input schema for the Research Engine.

    Any tool hands one of these to the Research Engine to obtain research
    results; the engine uses AI to optimize parameters from this context.
    """
    # --- Primary research input ---
    query: str = Field(..., description="Main research query or topic")
    keywords: List[str] = Field(default_factory=list, description="Additional keywords")
    # --- Research configuration ---
    goal: ResearchGoal = ResearchGoal.FACTUAL
    depth: ResearchDepth = ResearchDepth.STANDARD
    provider_preference: ProviderPreference = ProviderPreference.AUTO
    # --- Personalization supplied by the calling tool ---
    personalization: Optional[ResearchPersonalizationContext] = None
    # --- Constraints ---
    max_sources: int = Field(default=10, ge=1, le=25)
    recency: Optional[str] = None  # "day", "week", "month", "year", None for all-time
    # --- Domain filtering ---
    include_domains: List[str] = Field(default_factory=list)
    exclude_domains: List[str] = Field(default_factory=list)
    # --- Advanced mode (exposes raw provider parameters) ---
    advanced_mode: bool = False
    # Raw provider parameters (honored only when advanced_mode=True)
    # Exa-specific
    exa_category: Optional[str] = None
    exa_search_type: Optional[str] = None  # auto, keyword, neural
    # Tavily-specific
    tavily_topic: Optional[str] = None  # general, news, finance
    tavily_search_depth: Optional[str] = None  # basic, advanced
    tavily_include_answer: bool = False
    tavily_include_raw_content: bool = False
    tavily_time_range: Optional[str] = None
    tavily_country: Optional[str] = None

    class Config:
        use_enum_values = True

    def get_effective_query(self) -> str:
        """Combine the base query with any extra keywords into one search string."""
        if not self.keywords:
            return self.query
        return " ".join([self.query, *self.keywords])

    def get_industry(self) -> str:
        """Industry from personalization; "General" when absent or blank."""
        persona = self.personalization
        return persona.industry if persona and persona.industry else "General"

    def get_audience(self) -> str:
        """Target audience from personalization; "General" when absent or blank."""
        persona = self.personalization
        return persona.target_audience if persona and persona.target_audience else "General"

    def get_user_id(self) -> Optional[str]:
        """Creator's user ID from personalization, or None when no context given."""
        persona = self.personalization
        return persona.creator_id if persona else None
class ResearchResult(BaseModel):
    """
    Output schema from the Research Engine.

    Standardized format that any tool can consume. Defaults describe a
    successful-but-empty result; error fields are populated on failure.
    """
    success: bool = True
    # Content
    summary: Optional[str] = None  # AI-generated summary of findings
    raw_content: Optional[str] = None  # Raw aggregated content for LLM processing
    # Sources
    sources: List[Dict[str, Any]] = Field(default_factory=list)
    # Analysis (reuses existing blog writer analysis)
    keyword_analysis: Dict[str, Any] = Field(default_factory=dict)
    competitor_analysis: Dict[str, Any] = Field(default_factory=dict)
    suggested_angles: List[str] = Field(default_factory=list)
    # Metadata
    provider_used: str = "google"  # Which provider was actually used
    search_queries: List[str] = Field(default_factory=list)
    grounding_metadata: Optional[Dict[str, Any]] = None
    # Cost tracking
    estimated_cost: float = 0.0
    # Error handling
    error_message: Optional[str] = None
    error_code: Optional[str] = None
    retry_suggested: bool = False  # hint to callers that a retry may succeed
    # Original context for reference
    original_query: Optional[str] = None

    class Config:
        # Serialize enum-typed fields by their string values.
        use_enum_values = True

View File

@@ -0,0 +1,558 @@
"""
Research Engine - Core Orchestrator
The main entry point for AI research across all ALwrity tools.
This engine wraps existing providers (Exa, Tavily, Google) and provides
a unified interface for any content generation tool.
Usage:
from services.research.core import ResearchEngine, ResearchContext, ContentType
engine = ResearchEngine()
result = await engine.research(ResearchContext(
query="AI trends in healthcare 2025",
content_type=ContentType.PODCAST,
personalization=ResearchPersonalizationContext(
industry="Healthcare",
target_audience="Medical professionals"
)
))
Author: ALwrity Team
Version: 2.0
"""
import os
import time
from typing import Dict, Any, Optional, Callable
from loguru import logger
from .research_context import (
ResearchContext,
ResearchResult,
ResearchDepth,
ContentType,
ResearchPersonalizationContext,
)
from .parameter_optimizer import ParameterOptimizer
# Reuse existing blog writer models and services
from models.blog_models import (
BlogResearchRequest,
BlogResearchResponse,
ResearchConfig,
ResearchProvider,
ResearchMode,
PersonaInfo,
ResearchSource,
)
# Research persona for personalization
from models.research_persona_models import ResearchPersona
class ResearchEngine:
"""
AI Research Engine - Standalone module for content research.
This engine:
1. Accepts a ResearchContext from any tool
2. Uses AI to optimize parameters for Exa/Tavily
3. Integrates research persona for personalization
4. Executes research using existing providers
5. Returns standardized ResearchResult
Can be imported by Blog Writer, Podcast Maker, YouTube Creator, etc.
"""
def __init__(self, db_session=None):
"""Initialize the Research Engine."""
self.optimizer = ParameterOptimizer()
self._providers_initialized = False
self._exa_provider = None
self._tavily_provider = None
self._google_provider = None
self._db_session = db_session
# Check provider availability
self.exa_available = bool(os.getenv("EXA_API_KEY"))
self.tavily_available = bool(os.getenv("TAVILY_API_KEY"))
logger.info(f"ResearchEngine initialized: exa={self.exa_available}, tavily={self.tavily_available}")
def _get_research_persona(self, user_id: str, generate_if_missing: bool = True) -> Optional[ResearchPersona]:
    """Fetch the user's research persona, optionally generating it on first use.

    Onboarding is mandatory and completes before any tool is reachable, so a
    core persona always exists and generation on first visit is expected to
    succeed (Phase 2 behavior).

    Args:
        user_id: Clerk user ID string.
        generate_if_missing: When True (default) trigger persona generation if
            nothing is cached; when False only serve the cache (fast path).

    Returns:
        ResearchPersona on success; None when user_id is empty, the user has
        no core persona, or the lookup fails.
    """
    if not user_id:
        return None
    try:
        from services.research.research_persona_service import ResearchPersonaService

        session = self._db_session
        if not session:
            from services.database import get_db_session
            session = get_db_session()
        service = ResearchPersonaService(db_session=session)

        if not generate_if_missing:
            # Fast path for config endpoints: cache only, never generate.
            cached = service.get_cached_only(user_id)
            if cached:
                logger.debug(f"Research persona loaded from cache for user {user_id}")
            return cached

        # Full path: may issue an LLM call when the persona is not cached.
        logger.info(f"🔄 Getting/generating research persona for user {user_id}...")
        persona = service.get_or_generate(user_id, force_refresh=False)
        if persona:
            logger.info(f"✅ Research persona ready for user {user_id}: industry={persona.default_industry}")
        else:
            logger.warning(f"⚠️ Could not get/generate research persona for user {user_id} - using core persona fallback")
        return persona
    except Exception as e:
        logger.warning(f"Failed to load research persona for user {user_id}: {e}")
        return None
def _enrich_context_with_persona(
    self,
    context: ResearchContext,
    persona: ResearchPersona
) -> ResearchContext:
    """Fill persona-derived defaults into a research context, in place.

    User-provided values always win: persona data is applied only where the
    context has no value (or the placeholder "General").
    """
    if not context.personalization:
        context.personalization = ResearchPersonalizationContext()
    p = context.personalization

    # Industry: persona default when unset or placeholder.
    if (not p.industry or p.industry == "General") and persona.default_industry:
        p.industry = persona.default_industry
        logger.debug(f"Applied persona industry: {persona.default_industry}")

    # Audience: same precedence rule as industry.
    if (not p.target_audience or p.target_audience == "General") and persona.default_target_audience:
        p.target_audience = persona.default_target_audience
        logger.debug(f"Applied persona target_audience: {persona.default_target_audience}")

    # Exa domain filter: persona suggestions, capped at 6 domains.
    if not context.include_domains and persona.suggested_exa_domains:
        context.include_domains = persona.suggested_exa_domains[:6]
        logger.debug(f"Applied persona domains: {context.include_domains}")

    # Exa category: persona suggestion when the caller gave none.
    if not context.exa_category and persona.suggested_exa_category:
        context.exa_category = persona.suggested_exa_category
        logger.debug(f"Applied persona exa_category: {persona.suggested_exa_category}")

    return context
async def research(
    self,
    context: ResearchContext,
    progress_callback: Optional[Callable[[str], None]] = None
) -> ResearchResult:
    """
    Execute research based on the given context.

    Pipeline: persona enrichment -> parameter optimization (provider choice)
    -> provider execution (Exa / Tavily / Google, with internal fallbacks)
    -> transform into a standardized ResearchResult. Never raises: failures
    are returned as a ResearchResult with success=False.

    Args:
        context: Research context with query, goals, and personalization
        progress_callback: Optional callback for progress updates

    Returns:
        ResearchResult with sources, analysis, and content
    """
    start_time = time.time()
    try:
        # Progress update
        self._progress(progress_callback, "🔍 Analyzing research query...")
        # Enrich context with research persona (Phase 2: generate if missing)
        user_id = context.get_user_id()
        if user_id:
            self._progress(progress_callback, "👤 Loading personalized research profile...")
            persona = self._get_research_persona(user_id, generate_if_missing=True)
            if persona:
                self._progress(progress_callback, "✨ Applying hyper-personalized settings...")
                context = self._enrich_context_with_persona(context, persona)
            else:
                logger.warning(f"No research persona available for user {user_id} - proceeding with provided context")
        # Optimize parameters based on enriched context (picks provider + config)
        provider, config = self.optimizer.optimize(context)
        self._progress(progress_callback, f"🤖 Selected {provider.value.upper()} for research")
        # Build the request using existing blog models
        request = self._build_request(context, config)
        # Normalize to "" so provider usage-tracking always gets a string.
        user_id = context.get_user_id() or ""
        # Execute research using appropriate provider
        self._progress(progress_callback, f"🌐 Connecting to {provider.value} search...")
        if provider == ResearchProvider.EXA:
            response = await self._execute_exa_research(request, config, user_id, progress_callback)
        elif provider == ResearchProvider.TAVILY:
            response = await self._execute_tavily_research(request, config, user_id, progress_callback)
        else:
            response = await self._execute_google_research(request, config, user_id, progress_callback)
        # Transform response to ResearchResult
        self._progress(progress_callback, "📊 Processing results...")
        result = self._transform_response(response, provider, context)
        duration_ms = (time.time() - start_time) * 1000
        logger.info(f"Research completed in {duration_ms:.0f}ms: {len(result.sources)} sources")
        self._progress(progress_callback, f"✅ Research complete: {len(result.sources)} sources found")
        return result
    except Exception as e:
        # Convert any failure into an error result so callers never have to
        # wrap this call in try/except.
        logger.error(f"Research failed: {e}")
        return ResearchResult(
            success=False,
            error_message=str(e),
            error_code="RESEARCH_FAILED",
            retry_suggested=True,
            original_query=context.query
        )
def _progress(self, callback: Optional[Callable[[str], None]], message: str) -> None:
    """Relay a progress message to the optional callback and the server log.

    NOTE(review): source indentation was ambiguous on whether the log line is
    guarded by the callback check; logging unconditionally is assumed here —
    confirm against the original file.
    """
    if callback is not None:
        callback(message)
    logger.info(f"[Research] {message}")
def _build_request(self, context: ResearchContext, config: ResearchConfig) -> BlogResearchRequest:
    """Translate a generic ResearchContext into the blog-writer request model.

    The engine reuses the blog writer's provider stack, so every tool's
    context is funneled through BlogResearchRequest.
    """
    # Fall back to the raw query when no explicit keywords were supplied.
    keywords = context.keywords or [context.query]

    p = context.personalization
    # Mirror personalization into the blog PersonaInfo model when present.
    persona_info = None
    if p:
        persona_info = PersonaInfo(
            persona_id=p.persona_id,
            tone=p.tone,
            audience=p.target_audience,
            industry=p.industry,
        )
    tone = p.tone if p else None
    word_count = p.word_count_target if p else 1500

    return BlogResearchRequest(
        keywords=keywords,
        topic=context.query,
        industry=context.get_industry(),
        target_audience=context.get_audience(),
        tone=tone,
        word_count_target=word_count,
        persona=persona_info,
        research_mode=config.mode,
        config=config,
    )
async def _execute_exa_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """Execute research using Exa provider.

    Builds a mode-specific prompt, runs the Exa neural search, records usage
    cost, then delegates to _run_analysis. Falls back to Tavily when the Exa
    API key is not configured; any other error propagates to research().
    """
    from services.blog_writer.research.exa_provider import ExaResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode
    self._progress(progress_callback, "🔍 Executing Exa neural search...")
    # Get strategy for building prompt
    strategy = get_strategy_for_mode(config.mode)
    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"
    target_audience = request.target_audience or "General"
    research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
    # Execute Exa search
    try:
        exa_provider = ExaResearchProvider()
        raw_result = await exa_provider.search(
            research_prompt, topic, industry, target_audience, config, user_id
        )
        # Track usage; default to $0.005 when the provider omits cost details.
        cost = raw_result.get('cost', {}).get('total', 0.005) if isinstance(raw_result.get('cost'), dict) else 0.005
        exa_provider.track_exa_usage(user_id, cost)
        self._progress(progress_callback, f"📝 Found {len(raw_result.get('sources', []))} sources")
        # Run common analysis
        return await self._run_analysis(request, raw_result, config, user_id, progress_callback)
    except RuntimeError as e:
        # Only the missing-key error falls back; other RuntimeErrors re-raise.
        if "EXA_API_KEY not configured" in str(e):
            logger.warning("Exa not configured, falling back to Tavily")
            self._progress(progress_callback, "⚠️ Exa unavailable, trying Tavily...")
            config.provider = ResearchProvider.TAVILY
            return await self._execute_tavily_research(request, config, user_id, progress_callback)
        raise
async def _execute_tavily_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """Execute research using Tavily provider.

    Mirrors _execute_exa_research: prompt build, search, usage tracking, then
    common analysis. Falls back to Google grounding when the Tavily API key
    is not configured; other errors propagate.
    """
    from services.blog_writer.research.tavily_provider import TavilyResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode
    self._progress(progress_callback, "🔍 Executing Tavily AI search...")
    # Get strategy for building prompt
    strategy = get_strategy_for_mode(config.mode)
    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"
    target_audience = request.target_audience or "General"
    research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
    # Execute Tavily search
    try:
        tavily_provider = TavilyResearchProvider()
        raw_result = await tavily_provider.search(
            research_prompt, topic, industry, target_audience, config, user_id
        )
        # Track usage; default to $0.001 when the provider omits cost details.
        cost = raw_result.get('cost', {}).get('total', 0.001) if isinstance(raw_result.get('cost'), dict) else 0.001
        search_depth = config.tavily_search_depth or "basic"
        tavily_provider.track_tavily_usage(user_id, cost, search_depth)
        self._progress(progress_callback, f"📝 Found {len(raw_result.get('sources', []))} sources")
        # Run common analysis
        return await self._run_analysis(request, raw_result, config, user_id, progress_callback)
    except RuntimeError as e:
        # Only the missing-key error falls back; other RuntimeErrors re-raise.
        if "TAVILY_API_KEY not configured" in str(e):
            logger.warning("Tavily not configured, falling back to Google")
            self._progress(progress_callback, "⚠️ Tavily unavailable, using Google Search...")
            config.provider = ResearchProvider.GOOGLE
            return await self._execute_google_research(request, config, user_id, progress_callback)
        raise
async def _execute_google_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """Execute research using Google/Gemini grounding.

    Last link in the provider fallback chain: no API-key fallback of its own,
    so errors propagate to research(). Results carry grounding metadata,
    hence the is_google=True flag passed to _run_analysis.
    """
    from services.blog_writer.research.google_provider import GoogleResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode
    self._progress(progress_callback, "🔍 Executing Google Search grounding...")
    # Get strategy for building prompt
    strategy = get_strategy_for_mode(config.mode)
    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"
    target_audience = request.target_audience or "General"
    research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
    # Execute Google search
    google_provider = GoogleResearchProvider()
    raw_result = await google_provider.search(
        research_prompt, topic, industry, target_audience, config, user_id
    )
    self._progress(progress_callback, "📝 Processing grounded results...")
    # Run common analysis (Google results need grounding-specific extraction)
    return await self._run_analysis(request, raw_result, config, user_id, progress_callback, is_google=True)
async def _run_analysis(
    self,
    request: BlogResearchRequest,
    raw_result: Dict[str, Any],
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None,
    is_google: bool = False
) -> BlogResearchResponse:
    """Run common analysis on raw results.

    Normalizes provider output (Google grounding vs Exa/Tavily dicts), runs
    keyword/competitor/angle analyzers, assembles a BlogResearchResponse,
    and filters it before returning.

    Args:
        request: The normalized research request (topic, keywords, industry).
        raw_result: Provider-specific raw payload.
        config: Active research configuration.
        user_id: User ID forwarded to analyzers for usage attribution.
        progress_callback: Optional progress reporter.
        is_google: True when raw_result came from Gemini grounding, which
            needs dedicated source/metadata extraction.
    """
    from services.blog_writer.research.keyword_analyzer import KeywordAnalyzer
    from services.blog_writer.research.competitor_analyzer import CompetitorAnalyzer
    from services.blog_writer.research.content_angle_generator import ContentAngleGenerator
    from services.blog_writer.research.data_filter import ResearchDataFilter
    self._progress(progress_callback, "🔍 Analyzing keywords and content angles...")
    # Extract content for analysis (shape differs per provider family)
    if is_google:
        content = raw_result.get("content", "")
        sources = self._extract_sources_from_grounding(raw_result)
        search_queries = raw_result.get("search_queries", []) or []
        grounding_metadata = self._extract_grounding_metadata(raw_result)
    else:
        content = raw_result.get('content', '')
        # Provider may return plain dicts or ResearchSource models; coerce dicts.
        sources = [ResearchSource(**s) if isinstance(s, dict) else s for s in raw_result.get('sources', [])]
        search_queries = raw_result.get('search_queries', [])
        grounding_metadata = None
    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"
    # Run analyzers
    keyword_analyzer = KeywordAnalyzer()
    competitor_analyzer = CompetitorAnalyzer()
    content_angle_generator = ContentAngleGenerator()
    data_filter = ResearchDataFilter()
    keyword_analysis = keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
    competitor_analysis = competitor_analyzer.analyze(content, user_id=user_id)
    suggested_angles = content_angle_generator.generate(content, topic, industry, user_id=user_id)
    # Build response
    response = BlogResearchResponse(
        success=True,
        sources=sources,
        keyword_analysis=keyword_analysis,
        competitor_analysis=competitor_analysis,
        suggested_angles=suggested_angles,
        search_widget="",
        search_queries=search_queries,
        grounding_metadata=grounding_metadata,
        original_keywords=request.keywords,
    )
    # Filter and clean research data before handing back to the caller
    self._progress(progress_callback, "✨ Filtering and optimizing results...")
    filtered_response = data_filter.filter_research_data(response)
    return filtered_response
def _extract_sources_from_grounding(self, gemini_result: Dict[str, Any]) -> list:
    """Convert Gemini grounding source dicts into ResearchSource models.

    Returns an empty list for missing/malformed input. Missing fields get
    conservative defaults (score 0.8, placeholder date, "web" type).
    """
    from models.blog_models import ResearchSource

    if not isinstance(gemini_result, dict) or not gemini_result:
        return []

    converted = []
    for entry in gemini_result.get("sources", []) or []:
        body = entry.get("content", "")
        # Prefer a truncated content excerpt; otherwise synthesize one from the title.
        excerpt = body[:500] if body else f"Source from {entry.get('title', 'web')}"
        converted.append(ResearchSource(
            title=entry.get("title", "Untitled"),
            url=entry.get("url", ""),
            excerpt=excerpt,
            credibility_score=float(entry.get("credibility_score", 0.8)),
            published_at=str(entry.get("publication_date", "2024-01-01")),
            index=entry.get("index"),
            source_type=entry.get("type", "web"),
        ))
    return converted
def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract grounding metadata from Gemini result."""
if not gemini_result or not isinstance(gemini_result, dict):
return None
return gemini_result.get("grounding_metadata")
def _transform_response(
    self,
    response: BlogResearchResponse,
    provider: ResearchProvider,
    context: ResearchContext
) -> ResearchResult:
    """Transform BlogResearchResponse into the engine's ResearchResult.

    Sources and grounding metadata are normalized to plain dicts so callers
    outside the blog writer never depend on its pydantic models.
    """
    # Convert sources to dicts (pydantic model, dict, or arbitrary object)
    sources = []
    for s in response.sources:
        if hasattr(s, 'dict'):
            sources.append(s.dict())
        elif isinstance(s, dict):
            sources.append(s)
        else:
            # Last resort: pull the three core fields off the object.
            sources.append({
                'title': getattr(s, 'title', ''),
                'url': getattr(s, 'url', ''),
                'excerpt': getattr(s, 'excerpt', ''),
            })
    # Extract grounding metadata (model -> dict, dict passed through)
    grounding = None
    if response.grounding_metadata:
        if hasattr(response.grounding_metadata, 'dict'):
            grounding = response.grounding_metadata.dict()
        else:
            grounding = response.grounding_metadata
    return ResearchResult(
        success=response.success,
        sources=sources,
        keyword_analysis=response.keyword_analysis,
        competitor_analysis=response.competitor_analysis,
        suggested_angles=response.suggested_angles,
        provider_used=provider.value,
        search_queries=response.search_queries,
        grounding_metadata=grounding,
        original_query=context.query,
        error_message=response.error_message,
        # These fields are optional on older response models; guard with hasattr.
        error_code=response.error_code if hasattr(response, 'error_code') else None,
        retry_suggested=response.retry_suggested if hasattr(response, 'retry_suggested') else False,
    )
def get_provider_status(self) -> Dict[str, Any]:
"""Get status of available providers."""
return {
"exa": {
"available": self.exa_available,
"priority": 1,
"description": "Neural search for semantic understanding"
},
"tavily": {
"available": self.tavily_available,
"priority": 2,
"description": "AI-powered web search"
},
"google": {
"available": True, # Always available via Gemini
"priority": 3,
"description": "Google Search grounding"
}
}

View File

@@ -0,0 +1,794 @@
"""
Exa API Service for ALwrity
This service provides competitor discovery and analysis using the Exa API,
which uses neural search to find semantically similar websites and content.
Key Features:
- Competitor discovery using neural search
- Content analysis and summarization
- Competitive intelligence gathering
- Cost-effective API usage with caching
- Integration with onboarding Step 3
Dependencies:
- aiohttp (for async HTTP requests)
- os (for environment variables)
- logging (for debugging)
Author: ALwrity Team
Version: 1.0
Last Updated: January 2025
"""
import os
import json
import asyncio
from typing import Dict, List, Optional, Any, Union
from datetime import datetime, timedelta
from loguru import logger
from urllib.parse import urlparse
from exa_py import Exa
class ExaService:
    """
    Service for competitor discovery and analysis using the Exa API.

    This service provides neural search capabilities to find semantically similar
    websites and analyze their content for competitive intelligence.
    """

    def __init__(self):
        """Initialize the Exa Service with API credentials."""
        # self.api_key: raw EXA_API_KEY value (may be None until injected).
        # self.exa: Exa SDK client, created lazily by _try_initialize().
        # self.enabled: True only when a client was built successfully.
        self.api_key = os.getenv("EXA_API_KEY")
        self.exa = None
        self.enabled = False
        # Don't assume key is available at import time in production.
        # Keys may be injected per-request via middleware, so defer init.
        self._try_initialize()

    def _try_initialize(self) -> None:
        """Attempt to (re)initialize the Exa SDK from current environment.

        Idempotent: safe to call before every request so per-request key
        injection is picked up. Leaves the service disabled on any failure.
        """
        if self.enabled and self.exa:
            return
        try:
            # Re-read the key each attempt; middleware may have set it since.
            self.api_key = os.getenv("EXA_API_KEY")
            if not self.api_key:
                # Leave disabled; caller may try again after middleware injection
                logger.warning("EXA_API_KEY not configured; Exa service will be disabled")
                self.enabled = False
                self.exa = None
                return
            self.exa = Exa(api_key=self.api_key)
            self.enabled = True
            logger.info("Exa Service initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Exa service: {e}")
            self.enabled = False
            self.exa = None
async def discover_competitors(
    self,
    user_url: str,
    num_results: int = 10,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    industry_context: Optional[str] = None,
    website_analysis_data: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Discover competitors for a given website using Exa's neural search.

    Args:
        user_url: The website URL to find competitors for
        num_results: Number of competitor results to return (capped at 10,
            the Exa API limit applied below)
        include_domains: List of domains to include in search
        exclude_domains: List of domains to exclude from search (never mutated)
        industry_context: Industry context for better competitor discovery
        website_analysis_data: Optional website analysis used to sharpen
            the include_text and summary queries

    Returns:
        Dictionary containing competitor analysis results on success, or a
        {"success": False, "error", "details"} payload on failure.
    """
    try:
        # Ensure we pick up any per-request injected key
        self._try_initialize()
        if not self.enabled:
            raise ValueError("Exa Service is not enabled - API key missing")
        logger.info(f"Starting competitor discovery for: {user_url}")
        # Extract user domain so the user's own site is never returned.
        user_domain = urlparse(user_url).netloc
        # BUGFIX: copy the caller's list instead of appending in place —
        # the previous code mutated the `exclude_domains` argument, growing
        # it by one domain on every call.
        exclude_domains_list = list(exclude_domains) if exclude_domains else []
        exclude_domains_list.append(user_domain)
        logger.info(f"Excluding domains: {exclude_domains_list}")
        # Extract insights from website analysis for better targeting
        include_text_queries = []
        summary_query = f"Business model, target audience, content strategy{f' in {industry_context}' if industry_context else ''}"
        if website_analysis_data:
            analysis = website_analysis_data.get('analysis', {})
            # Extract key business terms from the analysis
            if 'target_audience' in analysis:
                audience = analysis['target_audience']
                if isinstance(audience, dict) and 'primary_audience' in audience:
                    primary_audience = audience['primary_audience']
                    if len(primary_audience.split()) <= 5:  # Exa limit: max 5 words per query
                        include_text_queries.append(primary_audience)
            # Use industry context from analysis
            if 'industry' in analysis and analysis['industry']:
                industry = analysis['industry']
                if len(industry.split()) <= 5:
                    include_text_queries.append(industry)
            # Enhance summary query with analysis insights
            if 'content_type' in analysis:
                content_type = analysis['content_type']
                summary_query += f", {content_type} content strategy"
            logger.info(f"Enhanced targeting with analysis data: {include_text_queries}")
        # Use the Exa SDK to find similar links with content and context
        search_result = self.exa.find_similar_and_contents(
            url=user_url,
            num_results=min(num_results, 10),  # Exa API limit
            include_domains=include_domains,
            exclude_domains=exclude_domains_list,
            include_text=include_text_queries if include_text_queries else None,
            text=True,
            highlights={
                "numSentences": 2,
                "highlightsPerUrl": 3,
                "query": "Unique value proposition, competitive advantages, market position"
            },
            summary={
                "query": summary_query
            }
        )
        # TODO: Add context generation once the SDK supports it.
        # Log the raw Exa API response summary (avoiding verbose markdown content)
        logger.info(f"📊 Exa API response for {user_url}:")
        logger.info(f" ├─ Request ID: {getattr(search_result, 'request_id', 'N/A')}")
        logger.info(f" ├─ Results count: {len(getattr(search_result, 'results', []))}")
        logger.info(f" └─ Cost: ${getattr(getattr(search_result, 'cost_dollars', None), 'total', 0)}")
        # Extract results from search
        results = getattr(search_result, 'results', [])
        logger.info(f" - Found {len(results)} competitors")
        # Process and structure the results
        competitors = self._process_competitor_results(search_result, user_url)
        logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")
        return {
            "success": True,
            "user_url": user_url,
            "competitors": competitors,
            "total_competitors": len(competitors),
            "analysis_timestamp": datetime.utcnow().isoformat(),
            "industry_context": industry_context,
            "api_cost": getattr(getattr(search_result, 'cost_dollars', None), 'total', 0) if hasattr(search_result, 'cost_dollars') and getattr(search_result, 'cost_dollars', None) else 0,
            "request_id": getattr(search_result, 'request_id', None) if hasattr(search_result, 'request_id') else None
        }
    except asyncio.TimeoutError:
        logger.error("Exa API request timed out")
        return {
            "success": False,
            "error": "Request timed out",
            "details": "The competitor discovery request took too long to complete"
        }
    except Exception as e:
        logger.error(f"Error in competitor discovery: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "details": "An unexpected error occurred during competitor discovery"
        }
def _process_competitor_results(self, search_result, user_url: str) -> List[Dict[str, Any]]:
    """
    Process and structure the Exa SDK response into competitor data.

    Skips results on the user's own domain, enriches each with a relevance
    score, extracted insights, and quality metrics, and returns the list
    sorted by relevance (highest first). Per-result failures are logged and
    skipped rather than aborting the whole batch.

    Args:
        search_result: Response from Exa SDK
        user_url: Original user URL for reference

    Returns:
        List of processed competitor data dicts
    """
    competitors = []
    user_domain = urlparse(user_url).netloc
    # Extract results from the SDK response
    results = getattr(search_result, 'results', [])
    for result in results:
        try:
            # Extract basic information from the result object
            competitor_url = getattr(result, 'url', '')
            competitor_domain = urlparse(competitor_url).netloc
            # Skip if it's the same domain as the user
            if competitor_domain == user_domain:
                continue
            # Extract content insights
            summary = getattr(result, 'summary', '')
            highlights = getattr(result, 'highlights', [])
            highlight_scores = getattr(result, 'highlight_scores', [])
            # Calculate competitive relevance score
            relevance_score = self._calculate_relevance_score(result, user_url)
            competitor_data = {
                "url": competitor_url,
                "domain": competitor_domain,
                "title": getattr(result, 'title', ''),
                "published_date": getattr(result, 'published_date', None),
                "author": getattr(result, 'author', None),
                "favicon": getattr(result, 'favicon', None),
                "image": getattr(result, 'image', None),
                "summary": summary,
                "highlights": highlights,
                "highlight_scores": highlight_scores,
                "relevance_score": relevance_score,
                "competitive_insights": self._extract_competitive_insights(summary, highlights),
                "content_analysis": self._analyze_content_quality(result)
            }
            competitors.append(competitor_data)
        except Exception as e:
            # Best-effort: one malformed result must not drop the others.
            logger.warning(f"Error processing competitor result: {str(e)}")
            continue
    # Sort by relevance score (highest first)
    competitors.sort(key=lambda x: x["relevance_score"], reverse=True)
    return competitors
def _calculate_relevance_score(self, result, user_url: str) -> float:
"""
Calculate a relevance score for competitor ranking.
Args:
result: Competitor result from Exa SDK
user_url: Original user URL
Returns:
Relevance score between 0 and 1
"""
score = 0.0
# Base score from highlight scores
highlight_scores = getattr(result, 'highlight_scores', [])
if highlight_scores:
score += sum(highlight_scores) / len(highlight_scores) * 0.4
# Score from summary quality
summary = getattr(result, 'summary', '')
if summary and len(summary) > 100:
score += 0.3
# Score from title relevance
title = getattr(result, 'title', '').lower()
if any(keyword in title for keyword in ["business", "company", "service", "solution", "platform"]):
score += 0.2
# Score from URL structure similarity
competitor_url = getattr(result, 'url', '')
if self._url_structure_similarity(user_url, competitor_url) > 0.5:
score += 0.1
return min(score, 1.0)
def _url_structure_similarity(self, url1: str, url2: str) -> float:
"""
Calculate URL structure similarity.
Args:
url1: First URL
url2: Second URL
Returns:
Similarity score between 0 and 1
"""
try:
parsed1 = urlparse(url1)
parsed2 = urlparse(url2)
# Compare path structure
path1_parts = [part for part in parsed1.path.split('/') if part]
path2_parts = [part for part in parsed2.path.split('/') if part]
if not path1_parts or not path2_parts:
return 0.0
# Calculate similarity based on path length and structure
max_parts = max(len(path1_parts), len(path2_parts))
common_parts = sum(1 for p1, p2 in zip(path1_parts, path2_parts) if p1 == p2)
return common_parts / max_parts
except Exception:
return 0.0
def _extract_competitive_insights(self, summary: str, highlights: List[str]) -> Dict[str, Any]:
"""
Extract competitive insights from summary and highlights.
Args:
summary: Content summary
highlights: Content highlights
Returns:
Dictionary of competitive insights
"""
insights = {
"business_model": "",
"target_audience": "",
"value_proposition": "",
"competitive_advantages": [],
"content_strategy": ""
}
# Combine summary and highlights for analysis
content = f"{summary} {' '.join(highlights)}".lower()
# Extract business model indicators
business_models = ["saas", "platform", "service", "product", "consulting", "agency", "marketplace"]
for model in business_models:
if model in content:
insights["business_model"] = model.title()
break
# Extract target audience indicators
audiences = ["enterprise", "small business", "startups", "developers", "marketers", "consumers"]
for audience in audiences:
if audience in content:
insights["target_audience"] = audience.title()
break
# Extract value proposition from highlights
if highlights:
insights["value_proposition"] = highlights[0][:100] + "..." if len(highlights[0]) > 100 else highlights[0]
return insights
def _analyze_content_quality(self, result) -> Dict[str, Any]:
"""
Analyze the content quality of a competitor.
Args:
result: Competitor result from Exa SDK
Returns:
Dictionary of content quality metrics
"""
quality_metrics = {
"content_depth": "medium",
"technical_sophistication": "medium",
"content_freshness": "unknown",
"engagement_potential": "medium"
}
# Analyze content depth from summary length
summary = getattr(result, 'summary', '')
if len(summary) > 300:
quality_metrics["content_depth"] = "high"
elif len(summary) < 100:
quality_metrics["content_depth"] = "low"
# Analyze technical sophistication
technical_keywords = ["api", "integration", "automation", "analytics", "data", "platform"]
highlights = getattr(result, 'highlights', [])
content_text = f"{summary} {' '.join(highlights)}".lower()
technical_count = sum(1 for keyword in technical_keywords if keyword in content_text)
if technical_count >= 3:
quality_metrics["technical_sophistication"] = "high"
elif technical_count == 0:
quality_metrics["technical_sophistication"] = "low"
return quality_metrics
async def discover_social_media_accounts(self, user_url: str) -> Dict[str, Any]:
"""
Discover social media accounts for a given website using Exa's answer API.
Args:
user_url: The website URL to find social media accounts for
Returns:
Dictionary containing social media discovery results
"""
try:
# Ensure we pick up any per-request injected key
self._try_initialize()
if not self.enabled:
raise ValueError("Exa Service is not enabled - API key missing")
logger.info(f"Starting social media discovery for: {user_url}")
# Extract domain from URL for better targeting
domain = urlparse(user_url).netloc.replace('www.', '')
# Use Exa's answer API to find social media accounts
result = self.exa.answer(
f"Find all social media accounts of the url: {domain}. Return a JSON object with facebook, twitter, instagram, linkedin, youtube, and tiktok fields containing the URLs or empty strings if not found.",
model="exa-pro",
text=True
)
# Log the raw Exa API response for debugging
logger.info(f"Raw Exa social media response for {user_url}:")
logger.info(f" - Request ID: {getattr(result, 'request_id', 'N/A')}")
logger.info(f" └─ Cost: ${getattr(getattr(result, 'cost_dollars', None), 'total', 0)}")
# Note: Full raw response contains verbose content - logging only summary
# To see full response, set EXA_DEBUG=true in environment
# Extract social media data
answer_text = getattr(result, 'answer', '')
citations = getattr(result, 'citations', [])
# Convert AnswerResult objects to dictionaries for JSON serialization
citations_dicts = []
for citation in citations:
if hasattr(citation, '__dict__'):
# Convert object to dictionary
citation_dict = {
'id': getattr(citation, 'id', ''),
'title': getattr(citation, 'title', ''),
'url': getattr(citation, 'url', ''),
'text': getattr(citation, 'text', ''),
'snippet': getattr(citation, 'snippet', ''),
'published_date': getattr(citation, 'published_date', None),
'author': getattr(citation, 'author', None),
'image': getattr(citation, 'image', None),
'favicon': getattr(citation, 'favicon', None)
}
citations_dicts.append(citation_dict)
else:
# If it's already a dict, use as is
citations_dicts.append(citation)
logger.info(f" - Raw answer text: {answer_text}")
logger.info(f" - Citations count: {len(citations_dicts)}")
# Parse the response from the answer (could be JSON or markdown format)
try:
import json
import re
if answer_text.strip().startswith('{'):
# Direct JSON format
answer_data = json.loads(answer_text.strip())
else:
# Parse markdown format with URLs
answer_data = {
"facebook": "",
"twitter": "",
"instagram": "",
"linkedin": "",
"youtube": "",
"tiktok": ""
}
# Extract URLs using regex patterns
facebook_match = re.search(r'Facebook.*?\[([^\]]+)\]', answer_text)
if facebook_match:
answer_data["facebook"] = facebook_match.group(1)
twitter_match = re.search(r'Twitter.*?\[([^\]]+)\]', answer_text)
if twitter_match:
answer_data["twitter"] = twitter_match.group(1)
instagram_match = re.search(r'Instagram.*?\[([^\]]+)\]', answer_text)
if instagram_match:
answer_data["instagram"] = instagram_match.group(1)
linkedin_match = re.search(r'LinkedIn.*?\[([^\]]+)\]', answer_text)
if linkedin_match:
answer_data["linkedin"] = linkedin_match.group(1)
youtube_match = re.search(r'YouTube.*?\[([^\]]+)\]', answer_text)
if youtube_match:
answer_data["youtube"] = youtube_match.group(1)
tiktok_match = re.search(r'TikTok.*?\[([^\]]+)\]', answer_text)
if tiktok_match:
answer_data["tiktok"] = tiktok_match.group(1)
except (json.JSONDecodeError, AttributeError, KeyError):
# If parsing fails, create empty structure
answer_data = {
"facebook": "",
"twitter": "",
"instagram": "",
"linkedin": "",
"youtube": "",
"tiktok": ""
}
logger.info(f" - Parsed social media accounts:")
for platform, url in answer_data.items():
if url:
logger.info(f" {platform}: {url}")
return {
"success": True,
"user_url": user_url,
"social_media_accounts": answer_data,
"citations": citations_dicts,
"analysis_timestamp": datetime.utcnow().isoformat(),
"api_cost": getattr(getattr(result, 'cost_dollars', None), 'total', 0) if hasattr(result, 'cost_dollars') and getattr(result, 'cost_dollars', None) else 0,
"request_id": getattr(result, 'request_id', None) if hasattr(result, 'request_id') else None
}
except Exception as e:
logger.error(f"Error in social media discovery: {str(e)}")
return {
"success": False,
"error": str(e),
"details": "An unexpected error occurred during social media discovery"
}
def _generate_basic_context(self, results: List[Any], user_url: str) -> str:
"""
Generate a basic context string from competitor results for LLM consumption.
Args:
results: List of competitor results from Exa API
user_url: Original user URL for reference
Returns:
Formatted context string
"""
context_parts = [
f"Competitive Analysis for: {user_url}",
f"Found {len(results)} similar websites/competitors:",
""
]
for i, result in enumerate(results[:5], 1): # Limit to top 5 for context
url = getattr(result, 'url', 'Unknown URL')
title = getattr(result, 'title', 'Unknown Title')
summary = getattr(result, 'summary', 'No summary available')
context_parts.extend([
f"{i}. {title}",
f" URL: {url}",
f" Summary: {summary[:200]}{'...' if len(summary) > 200 else ''}",
""
])
context_parts.append("Key insights:")
context_parts.append("- These competitors offer similar services or content")
context_parts.append("- Analyze their content strategy and positioning")
context_parts.append("- Identify opportunities for differentiation")
return "\n".join(context_parts)
async def analyze_competitor_content(
    self,
    competitor_url: str,
    analysis_depth: str = "standard"
) -> Dict[str, Any]:
    """Run a deeper, single-competitor analysis built on top of discovery.

    Args:
        competitor_url: URL of the competitor to analyze.
        analysis_depth: Depth label ("quick", "standard", "deep"); echoed in
            the result, not used to change behavior here.

    Returns:
        Dict with content patterns and competitive insights on success, or an
        error payload with "success": False.
    """
    try:
        logger.info(f"Starting detailed analysis for competitor: {competitor_url}")
        # Restrict discovery to the competitor's own domain so the sampled
        # pages come from that competitor rather than the wider web.
        discovery = await self.discover_competitors(
            competitor_url,
            num_results=10,
            include_domains=[urlparse(competitor_url).netloc]
        )
        if not discovery["success"]:
            return discovery
        patterns = self._analyze_content_patterns(discovery["competitors"])
        insights = self._generate_competitive_insights(
            competitor_url,
            discovery["competitors"],
            patterns
        )
        return {
            "success": True,
            "competitor_url": competitor_url,
            "content_patterns": patterns,
            "competitive_insights": insights,
            "analysis_timestamp": datetime.utcnow().isoformat(),
            "analysis_depth": analysis_depth
        }
    except Exception as e:
        logger.error(f"Error in competitor content analysis: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "details": "An unexpected error occurred during competitor analysis"
        }
def _analyze_content_patterns(self, competitors: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Analyze content patterns across competitors.
Args:
competitors: List of competitor data
Returns:
Dictionary of content patterns
"""
patterns = {
"common_themes": [],
"content_types": [],
"publishing_patterns": {},
"target_keywords": [],
"content_strategies": []
}
# Analyze common themes
all_summaries = [comp.get("summary", "") for comp in competitors]
# This would be enhanced with NLP analysis in a full implementation
# Analyze content types from URLs
content_types = set()
for comp in competitors:
url = comp.get("url", "")
if "/blog/" in url:
content_types.add("blog")
elif "/product/" in url or "/service/" in url:
content_types.add("product")
elif "/about/" in url:
content_types.add("about")
elif "/contact/" in url:
content_types.add("contact")
patterns["content_types"] = list(content_types)
return patterns
def _generate_competitive_insights(
self,
competitor_url: str,
competitors: List[Dict[str, Any]],
content_patterns: Dict[str, Any]
) -> Dict[str, Any]:
"""
Generate competitive insights from analysis data.
Args:
competitor_url: URL of the competitor
competitors: List of competitor data
content_patterns: Content pattern analysis
Returns:
Dictionary of competitive insights
"""
insights = {
"competitive_strengths": [],
"content_opportunities": [],
"market_positioning": "unknown",
"strategic_recommendations": []
}
# Analyze competitive strengths
for comp in competitors:
if comp.get("relevance_score", 0) > 0.7:
insights["competitive_strengths"].append({
"strength": comp.get("summary", "")[:100],
"relevance": comp.get("relevance_score", 0)
})
# Generate content opportunities
if content_patterns.get("content_types"):
insights["content_opportunities"] = [
f"Develop {content_type} content"
for content_type in content_patterns["content_types"]
]
return insights
def health_check(self) -> Dict[str, Any]:
    """Report the operational status of the Exa service.

    Returns:
        Dict with "status" ("disabled" / "healthy" / "error"), a message and
        an ISO timestamp.
    """
    try:
        # Re-read environment first so a freshly configured key is picked up.
        self._try_initialize()
        if not self.enabled:
            return {
                "status": "disabled",
                "message": "Exa API key not configured",
                "timestamp": datetime.utcnow().isoformat()
            }
        # Any exception from this minimal SDK call means the API is unusable;
        # reaching the return below means it is operational.
        self.exa.find_similar(
            url="https://example.com",
            num_results=1
        )
        return {
            "status": "healthy",
            "message": "Exa API is operational",
            "timestamp": datetime.utcnow().isoformat(),
            "test_successful": True
        }
    except Exception as e:
        return {
            "status": "error",
            "message": f"Health check failed: {str(e)}",
            "timestamp": datetime.utcnow().isoformat()
        }
def get_cost_estimate(self, num_results: int, include_content: bool = True) -> Dict[str, Any]:
    """Estimate Exa API cost (USD) for a request.

    Args:
        num_results: Number of results requested.
        include_content: Whether content analysis is included.

    Returns:
        Dict breaking down search, content and total estimated cost.
    """
    # Tiered search pricing per Exa's published rates.
    if num_results <= 25:
        search_fee = 0.005
    elif num_results <= 100:
        search_fee = 0.025
    else:
        search_fee = 1.0
    # Rough per-result estimate for content analysis, when requested.
    content_fee = num_results * 0.001 if include_content else 0.0
    return {
        "search_cost": search_fee,
        "content_cost": content_fee,
        "total_estimated_cost": search_fee + content_fee,
        "num_results": num_results,
        "include_content": include_content,
    }

View File

@@ -0,0 +1,497 @@
"""
Google Search Service for ALwrity
This service provides real-time industry research using Google Custom Search API,
replacing the mock research system with actual web search capabilities.
Key Features:
- Industry-specific search queries
- Source credibility scoring and ranking
- Content extraction and insight generation
- Real-time information from the last month
- Fallback mechanisms for API failures
Dependencies:
- google-api-python-client
- aiohttp (for async HTTP requests)
- os (for environment variables)
- logging (for debugging)
Author: ALwrity Team
Version: 1.0
Last Updated: January 2025
"""
import os
import json
import asyncio
import aiohttp
from typing import Dict, List, Optional, Any
from datetime import datetime, timedelta
from loguru import logger
class GoogleSearchService:
"""
Service for conducting real industry research using Google Custom Search API.
This service replaces the mock research system with actual web search capabilities,
providing current, relevant industry information for content grounding.
"""
def __init__(self):
"""Initialize the Google Search Service with API credentials."""
self.api_key = os.getenv("GOOGLE_SEARCH_API_KEY")
self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
self.base_url = "https://www.googleapis.com/customsearch/v1"
if not self.api_key or not self.search_engine_id:
raise ValueError("Google Search API credentials not configured. Please set GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables.")
else:
self.enabled = True
logger.info("Google Search Service initialized successfully")
async def search_industry_trends(
    self,
    topic: str,
    industry: str,
    max_results: int = 10
) -> Dict[str, Any]:
    """
    Search for current industry trends and insights.

    Args:
        topic: The specific topic to research.
        industry: The industry context for the search.
        max_results: Maximum number of search results to return.

    Returns:
        Dict with ranked "sources", "key_insights", "statistics", the query
        used and a timestamp. (The previous ``List[...]`` return annotation
        was wrong — this method has always returned a dict.)

    Raises:
        RuntimeError: If the service is not enabled, or if the search fails.
    """
    if not self.enabled:
        raise RuntimeError("Google Search Service is not enabled. Please configure API credentials.")
    try:
        # Construct industry-specific search query
        search_query = self._build_search_query(topic, industry)
        logger.info(f"Searching for: {search_query}")
        # Perform the search, then rank results and distill insights
        search_results = await self._perform_search(search_query, max_results)
        processed_results = await self._process_search_results(search_results, topic, industry)
        insights = await self._extract_insights(processed_results, topic, industry)
        logger.info(f"Search completed successfully. Found {len(processed_results)} relevant sources.")
        return {
            "sources": processed_results,
            "key_insights": insights["insights"],
            "statistics": insights["statistics"],
            "grounding_enabled": True,
            "search_query": search_query,
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Google search failed: {str(e)}")
        # Chain the original exception so callers can see the root cause.
        raise RuntimeError(f"Google search failed: {str(e)}") from e
def _build_search_query(self, topic: str, industry: str) -> str:
"""
Build an optimized search query for industry research.
Args:
topic: The specific topic to research
industry: The industry context
Returns:
Optimized search query string
"""
# Add industry-specific terms and current year for relevance
current_year = datetime.now().year
# Industry-specific search patterns
industry_patterns = {
"Technology": ["trends", "innovations", "developments", "insights"],
"Healthcare": ["advances", "research", "treatments", "studies"],
"Finance": ["market analysis", "trends", "reports", "insights"],
"Marketing": ["strategies", "trends", "best practices", "case studies"],
"Education": ["innovations", "trends", "research", "best practices"]
}
# Get industry-specific terms
industry_terms = industry_patterns.get(industry, ["trends", "insights", "developments"])
# Build the query
query_components = [
topic,
industry,
f"{current_year}",
"latest",
"trends",
"insights"
]
# Add industry-specific terms
query_components.extend(industry_terms[:2])
return " ".join(query_components)
async def _perform_search(self, query: str, max_results: int) -> List[Dict[str, Any]]:
    """Execute one Google Custom Search API request.

    Args:
        query: The search query to execute.
        max_results: Maximum number of results (capped at Google's 10 per call).

    Returns:
        The raw "items" list from the API response (empty if none).

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    request_params = {
        "key": self.api_key,
        "cx": self.search_engine_id,
        "q": query,
        "num": min(max_results, 10),  # Google CSE hard limit per request
        "dateRestrict": "m1",         # only results from the last month
        "sort": "date",               # newest first, for current information
        "safe": "active"              # safe search for professional content
    }
    async with aiohttp.ClientSession() as session:
        async with session.get(self.base_url, params=request_params) as response:
            if response.status != 200:
                error_text = await response.text()
                logger.error(f"Google Search API error: {response.status} - {error_text}")
                raise Exception(f"Search API returned status {response.status}")
            payload = await response.json()
            return payload.get("items", [])
async def _process_search_results(
self,
raw_results: List[Dict[str, Any]],
topic: str,
industry: str
) -> List[Dict[str, Any]]:
"""
Process and rank search results by relevance and credibility.
Args:
raw_results: Raw search results from Google API
topic: The research topic for relevance scoring
industry: The industry context for relevance scoring
Returns:
Processed and ranked search results
"""
processed_results = []
for result in raw_results:
try:
# Extract basic information
title = result.get("title", "")
url = result.get("link", "")
snippet = result.get("snippet", "")
# Calculate relevance score
relevance_score = self._calculate_relevance_score(title, snippet, topic, industry)
# Calculate credibility score
credibility_score = self._calculate_credibility_score(url, title)
# Extract publication date if available
publication_date = self._extract_publication_date(result)
# Calculate domain authority
domain_authority = self._calculate_domain_authority(url)
processed_result = {
"title": title,
"url": url,
"content": snippet,
"relevance_score": relevance_score,
"credibility_score": credibility_score,
"domain_authority": domain_authority,
"publication_date": publication_date,
"source_type": self._categorize_source(url, title),
"raw_result": result
}
processed_results.append(processed_result)
except Exception as e:
logger.warning(f"Failed to process search result: {str(e)}")
continue
# Sort by combined score (relevance + credibility)
processed_results.sort(
key=lambda x: (x["relevance_score"] + x["credibility_score"]) / 2,
reverse=True
)
return processed_results
def _calculate_relevance_score(self, title: str, snippet: str, topic: str, industry: str) -> float:
"""
Calculate relevance score based on topic and industry alignment.
Args:
title: The title of the search result
snippet: The snippet/description of the result
topic: The research topic
industry: The industry context
Returns:
Relevance score between 0.0 and 1.0
"""
score = 0.0
text = f"{title} {snippet}".lower()
# Topic relevance (40% of score)
topic_words = topic.lower().split()
topic_matches = sum(1 for word in topic_words if word in text)
topic_score = min(topic_matches / len(topic_words), 1.0) * 0.4
# Industry relevance (30% of score)
industry_words = industry.lower().split()
industry_matches = sum(1 for word in industry_words if word in text)
industry_score = min(industry_matches / len(industry_words), 1.0) * 0.3
# Content quality indicators (30% of score)
quality_indicators = [
"research", "study", "analysis", "report", "insights",
"trends", "data", "statistics", "findings", "expert"
]
quality_matches = sum(1 for indicator in quality_indicators if indicator in text)
quality_score = min(quality_matches / len(quality_indicators), 1.0) * 0.3
score = topic_score + industry_score + quality_score
return round(score, 3)
def _calculate_credibility_score(self, url: str, title: str) -> float:
"""
Calculate credibility score based on URL and title analysis.
Args:
url: The URL of the source
title: The title of the content
Returns:
Credibility score between 0.0 and 1.0
"""
score = 0.5 # Base score
# Domain credibility indicators
credible_domains = [
"harvard.edu", "stanford.edu", "mit.edu", "berkeley.edu", # Academic
"forbes.com", "bloomberg.com", "reuters.com", "wsj.com", # Business
"nature.com", "science.org", "ieee.org", "acm.org", # Scientific
"linkedin.com", "medium.com", "substack.com" # Professional
]
# Check if domain is in credible list
domain = self._extract_domain(url)
if any(credible_domain in domain for credible_domain in credible_domains):
score += 0.3
# Title credibility indicators
credible_indicators = [
"research", "study", "analysis", "report", "insights",
"expert", "professional", "industry", "trends"
]
title_lower = title.lower()
credible_matches = sum(1 for indicator in credible_indicators if indicator in title_lower)
score += min(credible_matches * 0.1, 0.2)
return round(min(score, 1.0), 3)
def _calculate_domain_authority(self, url: str) -> float:
"""
Calculate domain authority based on URL analysis.
Args:
url: The URL to analyze
Returns:
Domain authority score between 0.0 and 1.0
"""
domain = self._extract_domain(url)
# High authority domains
high_authority = [
"harvard.edu", "stanford.edu", "mit.edu", "berkeley.edu",
"forbes.com", "bloomberg.com", "reuters.com", "wsj.com",
"nature.com", "science.org", "ieee.org", "acm.org"
]
# Medium authority domains
medium_authority = [
"linkedin.com", "medium.com", "substack.com", "techcrunch.com",
"venturebeat.com", "wired.com", "theverge.com"
]
if any(auth_domain in domain for auth_domain in high_authority):
return 0.9
elif any(auth_domain in domain for auth_domain in medium_authority):
return 0.7
else:
# Basic scoring for other domains
return 0.5
def _extract_domain(self, url: str) -> str:
"""Extract domain from URL."""
try:
from urllib.parse import urlparse
parsed = urlparse(url)
return parsed.netloc.lower()
except:
return url.lower()
def _extract_publication_date(self, result: Dict[str, Any]) -> Optional[str]:
"""Extract publication date from search result if available."""
# Check for various date fields
date_fields = ["pagemap", "metatags", "date"]
for field in date_fields:
if field in result:
date_value = result[field]
if isinstance(date_value, dict):
# Look for common date keys
for date_key in ["date", "pubdate", "article:published_time"]:
if date_key in date_value:
return date_value[date_key]
elif isinstance(date_value, str):
return date_value
return None
def _categorize_source(self, url: str, title: str) -> str:
"""Categorize the source type based on URL and title."""
domain = self._extract_domain(url)
title_lower = title.lower()
# Academic sources
if any(edu in domain for edu in [".edu", "harvard", "stanford", "mit"]):
return "academic"
# Business/News sources
if any(biz in domain for biz in ["forbes", "bloomberg", "reuters", "wsj"]):
return "business_news"
# Professional platforms
if any(prof in domain for prof in ["linkedin", "medium", "substack"]):
return "professional_platform"
# Research/Scientific
if any(research in domain for research in ["nature", "science", "ieee", "acm"]):
return "research_scientific"
# Industry reports
if any(report in title_lower for report in ["report", "study", "analysis", "research"]):
return "industry_report"
return "general"
async def _extract_insights(
self,
sources: List[Dict[str, Any]],
topic: str,
industry: str
) -> Dict[str, List[str]]:
"""
Extract key insights and statistics from search results.
Args:
sources: Processed search results
topic: The research topic
industry: The industry context
Returns:
Dictionary containing insights and statistics
"""
insights = []
statistics = []
# Extract insights from top sources
top_sources = sources[:5] # Top 5 most relevant sources
for source in top_sources:
content = source.get("content", "")
# Look for insight patterns
insight_patterns = [
"shows", "indicates", "suggests", "reveals", "demonstrates",
"highlights", "emphasizes", "points to", "suggests that"
]
for pattern in insight_patterns:
if pattern in content.lower():
# Extract the sentence containing the insight
sentences = content.split(". ")
for sentence in sentences:
if pattern in sentence.lower():
insights.append(sentence.strip())
break
# Look for statistical patterns
stat_patterns = [
r'\d+%', # Percentages
r'\d+ percent', # Written percentages
r'\$\d+', # Dollar amounts
r'\d+ million', # Millions
r'\d+ billion', # Billions
r'\d+ out of \d+', # Ratios
]
import re
for pattern in stat_patterns:
matches = re.findall(pattern, content, re.IGNORECASE)
for match in matches:
statistics.append(f"{match}")
# Limit the number of insights and statistics
insights = insights[:10] # Top 10 insights
statistics = statistics[:10] # Top 10 statistics
return {
"insights": insights,
"statistics": statistics
}
async def test_api_connection(self) -> Dict[str, Any]:
    """Probe the Google Custom Search API with a one-result query.

    Returns:
        Status payload describing whether the API is reachable and configured.

    Raises:
        RuntimeError: If the service is not enabled.
    """
    if not self.enabled:
        raise RuntimeError("Google Search Service is not enabled. Please configure API credentials.")
    try:
        # A single-result probe is the cheapest way to confirm connectivity.
        probe_results = await self._perform_search("AI technology trends 2024", 1)
        return {
            "status": "success",
            "message": "Google Search API connection successful",
            "enabled": True,
            "test_results_count": len(probe_results),
            "api_key_configured": bool(self.api_key),
            "search_engine_configured": bool(self.search_engine_id)
        }
    except Exception as e:
        return {
            "status": "error",
            "message": f"Google Search API connection failed: {str(e)}",
            "enabled": False,
            "error": str(e)
        }

View File

@@ -0,0 +1,23 @@
"""
Research Intent Package
This package provides intent-driven research capabilities:
- Intent inference from user input
- Targeted query generation
- Intent-aware result analysis
Author: ALwrity Team
Version: 1.0
"""
from .research_intent_inference import ResearchIntentInference
from .intent_query_generator import IntentQueryGenerator
from .intent_aware_analyzer import IntentAwareAnalyzer
from .intent_prompt_builder import IntentPromptBuilder
__all__ = [
"ResearchIntentInference",
"IntentQueryGenerator",
"IntentAwareAnalyzer",
"IntentPromptBuilder",
]

View File

@@ -0,0 +1,547 @@
"""
Intent-Aware Result Analyzer
Analyzes research results based on user intent.
Extracts exactly what the user needs from raw research data.
This is the key innovation - instead of generic analysis,
we analyze results through the lens of what the user wants to accomplish.
Author: ALwrity Team
Version: 1.0
"""
import json
from typing import Dict, Any, List, Optional
from loguru import logger
from models.research_intent_models import (
ResearchIntent,
IntentDrivenResearchResult,
ExpectedDeliverable,
StatisticWithCitation,
ExpertQuote,
CaseStudySummary,
TrendAnalysis,
ComparisonTable,
ComparisonItem,
ProsCons,
SourceWithRelevance,
)
from models.research_persona_models import ResearchPersona
from .intent_prompt_builder import IntentPromptBuilder
class IntentAwareAnalyzer:
"""
Analyzes research results based on user intent.
Instead of generic summaries, this extracts exactly what the user
needs: statistics, quotes, case studies, trends, etc.
"""
def __init__(self):
    """Initialize the analyzer with its intent-aware prompt builder."""
    # The builder assembles analysis prompts from intent + persona context.
    self.prompt_builder = IntentPromptBuilder()
    logger.info("IntentAwareAnalyzer initialized")
async def analyze(
    self,
    raw_results: Dict[str, Any],
    intent: ResearchIntent,
    research_persona: Optional[ResearchPersona] = None,
) -> IntentDrivenResearchResult:
    """
    Analyze raw research results based on user intent.

    Args:
        raw_results: Raw results from Exa/Tavily/Google.
        intent: The user's research intent.
        research_persona: Optional persona for context.

    Returns:
        IntentDrivenResearchResult with extracted deliverables; a fallback
        result is returned on any failure.
    """
    try:
        logger.info(f"Analyzing results for intent: {intent.primary_question[:50]}...")
        formatted = self._format_raw_results(raw_results)
        analysis_prompt = self.prompt_builder.build_intent_aware_analysis_prompt(
            raw_results=formatted,
            intent=intent,
            research_persona=research_persona,
        )
        # The schema shapes the LLM's structured output by deliverable type.
        schema = self._build_analysis_schema(intent.expected_deliverables)
        # Local import: resolved only when an analysis actually runs.
        from services.llm_providers.main_text_generation import llm_text_gen
        llm_response = llm_text_gen(
            prompt=analysis_prompt,
            json_struct=schema,
            user_id=None
        )
        if isinstance(llm_response, dict) and "error" in llm_response:
            logger.error(f"Intent-aware analysis failed: {llm_response.get('error')}")
            return self._create_fallback_result(raw_results, intent)
        parsed = self._parse_analysis_result(llm_response, intent, raw_results)
        logger.info(
            f"Analysis complete: {len(parsed.key_takeaways)} takeaways, "
            f"{len(parsed.statistics)} stats, "
            f"{len(parsed.sources)} sources"
        )
        return parsed
    except Exception as e:
        logger.error(f"Error in intent-aware analysis: {e}")
        return self._create_fallback_result(raw_results, intent)
def _format_raw_results(self, raw_results: Dict[str, Any]) -> str:
"""Format raw research results for LLM analysis."""
formatted_parts = []
# Extract content
content = raw_results.get("content", "")
if content:
formatted_parts.append(f"=== MAIN CONTENT ===\n{content[:8000]}")
# Extract sources with their content
sources = raw_results.get("sources", [])
if sources:
formatted_parts.append("\n=== SOURCES ===")
for i, source in enumerate(sources[:15], 1): # Limit to 15 sources
title = source.get("title", "Untitled")
url = source.get("url", "")
excerpt = source.get("excerpt", source.get("text", source.get("content", "")))
formatted_parts.append(f"\nSource {i}: {title}")
formatted_parts.append(f"URL: {url}")
if excerpt:
formatted_parts.append(f"Content: {excerpt[:500]}")
# Extract grounding metadata if available (from Google)
grounding = raw_results.get("grounding_metadata", {})
if grounding:
formatted_parts.append("\n=== GROUNDING DATA ===")
formatted_parts.append(json.dumps(grounding, indent=2)[:2000])
# Extract any AI answers (from Tavily)
answer = raw_results.get("answer", "")
if answer:
formatted_parts.append(f"\n=== AI-GENERATED ANSWER ===\n{answer}")
return "\n".join(formatted_parts)
def _build_analysis_schema(self, expected_deliverables: List[str]) -> Dict[str, Any]:
    """Build the JSON schema handed to the LLM, shaped by expected deliverables.

    A base schema (primary answer, summary, takeaways, confidence, gaps,
    follow-ups) is always present; deliverable-specific properties
    (statistics, expert quotes, case studies, trends, comparisons, pros/cons,
    best practices, steps, definitions, examples, predictions) are added only
    when the corresponding ExpectedDeliverable value is requested. Sources
    and a suggested outline are always appended.

    Args:
        expected_deliverables: ExpectedDeliverable enum *values* (strings)
            requested by the inferred intent.

    Returns:
        A JSON-schema dict suitable for structured LLM output.
    """
    # Base schema
    schema = {
        "type": "object",
        "properties": {
            "primary_answer": {"type": "string"},
            "secondary_answers": {
                "type": "object",
                "additionalProperties": {"type": "string"}
            },
            "executive_summary": {"type": "string"},
            "key_takeaways": {
                "type": "array",
                "items": {"type": "string"},
                "maxItems": 7
            },
            "confidence": {"type": "number"},
            "gaps_identified": {
                "type": "array",
                "items": {"type": "string"}
            },
            "follow_up_queries": {
                "type": "array",
                "items": {"type": "string"}
            },
        },
        "required": ["primary_answer", "executive_summary", "key_takeaways", "confidence"]
    }
    # Add deliverable-specific properties
    if ExpectedDeliverable.KEY_STATISTICS.value in expected_deliverables:
        schema["properties"]["statistics"] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "statistic": {"type": "string"},
                    "value": {"type": "string"},
                    "context": {"type": "string"},
                    "source": {"type": "string"},
                    "url": {"type": "string"},
                    "credibility": {"type": "number"},
                    "recency": {"type": "string"}
                },
                "required": ["statistic", "context", "source", "url"]
            }
        }
    if ExpectedDeliverable.EXPERT_QUOTES.value in expected_deliverables:
        schema["properties"]["expert_quotes"] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "quote": {"type": "string"},
                    "speaker": {"type": "string"},
                    "title": {"type": "string"},
                    "organization": {"type": "string"},
                    "source": {"type": "string"},
                    "url": {"type": "string"}
                },
                "required": ["quote", "speaker", "source", "url"]
            }
        }
    if ExpectedDeliverable.CASE_STUDIES.value in expected_deliverables:
        schema["properties"]["case_studies"] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "organization": {"type": "string"},
                    "challenge": {"type": "string"},
                    "solution": {"type": "string"},
                    "outcome": {"type": "string"},
                    "key_metrics": {"type": "array", "items": {"type": "string"}},
                    "source": {"type": "string"},
                    "url": {"type": "string"}
                },
                "required": ["title", "organization", "challenge", "solution", "outcome"]
            }
        }
    if ExpectedDeliverable.TRENDS.value in expected_deliverables:
        schema["properties"]["trends"] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "trend": {"type": "string"},
                    "direction": {"type": "string"},
                    "evidence": {"type": "array", "items": {"type": "string"}},
                    "impact": {"type": "string"},
                    "timeline": {"type": "string"},
                    "sources": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["trend", "direction", "evidence"]
            }
        }
    if ExpectedDeliverable.COMPARISONS.value in expected_deliverables:
        schema["properties"]["comparisons"] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "criteria": {"type": "array", "items": {"type": "string"}},
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "pros": {"type": "array", "items": {"type": "string"}},
                                "cons": {"type": "array", "items": {"type": "string"}},
                                "features": {"type": "object"}
                            }
                        }
                    },
                    "verdict": {"type": "string"}
                }
            }
        }
    if ExpectedDeliverable.PROS_CONS.value in expected_deliverables:
        schema["properties"]["pros_cons"] = {
            "type": "object",
            "properties": {
                "subject": {"type": "string"},
                "pros": {"type": "array", "items": {"type": "string"}},
                "cons": {"type": "array", "items": {"type": "string"}},
                "balanced_verdict": {"type": "string"}
            }
        }
    if ExpectedDeliverable.BEST_PRACTICES.value in expected_deliverables:
        schema["properties"]["best_practices"] = {
            "type": "array",
            "items": {"type": "string"}
        }
    if ExpectedDeliverable.STEP_BY_STEP.value in expected_deliverables:
        schema["properties"]["step_by_step"] = {
            "type": "array",
            "items": {"type": "string"}
        }
    if ExpectedDeliverable.DEFINITIONS.value in expected_deliverables:
        schema["properties"]["definitions"] = {
            "type": "object",
            "additionalProperties": {"type": "string"}
        }
    if ExpectedDeliverable.EXAMPLES.value in expected_deliverables:
        schema["properties"]["examples"] = {
            "type": "array",
            "items": {"type": "string"}
        }
    if ExpectedDeliverable.PREDICTIONS.value in expected_deliverables:
        schema["properties"]["predictions"] = {
            "type": "array",
            "items": {"type": "string"}
        }
    # Always include sources and suggested outline
    schema["properties"]["sources"] = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "url": {"type": "string"},
                "relevance_score": {"type": "number"},
                "relevance_reason": {"type": "string"},
                "content_type": {"type": "string"},
                "credibility_score": {"type": "number"}
            },
            "required": ["title", "url"]
        }
    }
    schema["properties"]["suggested_outline"] = {
        "type": "array",
        "items": {"type": "string"}
    }
    return schema
def _parse_analysis_result(
    self,
    result: Dict[str, Any],
    intent: ResearchIntent,
    raw_results: Dict[str, Any],
) -> IntentDrivenResearchResult:
    """Parse LLM analysis result into structured format.

    Converts the raw JSON dict returned by the analysis LLM into a fully
    typed IntentDrivenResearchResult. Each deliverable section
    (statistics, expert quotes, case studies, trends, comparisons,
    pros/cons, sources) is parsed inside its own try/except so that one
    malformed item is logged and skipped without discarding the rest of
    the result.

    Args:
        result: Parsed JSON object produced by the analysis LLM call.
        intent: The original research intent; attached to the returned
            result and used as a fallback subject for pros/cons.
        raw_results: Aggregated raw provider results; used to back-fill
            sources when the LLM returned none, and for raw_content.

    Returns:
        IntentDrivenResearchResult with success=True and every parseable
        section populated (unparseable items are dropped with a warning).
    """
    # Parse statistics
    statistics = []
    for stat in result.get("statistics", []):
        try:
            statistics.append(StatisticWithCitation(
                statistic=stat.get("statistic", ""),
                value=stat.get("value"),
                context=stat.get("context", ""),
                source=stat.get("source", ""),
                url=stat.get("url", ""),
                credibility=float(stat.get("credibility", 0.8)),  # default: fairly credible
                recency=stat.get("recency"),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse statistic: {e}")

    # Parse expert quotes
    expert_quotes = []
    for quote in result.get("expert_quotes", []):
        try:
            expert_quotes.append(ExpertQuote(
                quote=quote.get("quote", ""),
                speaker=quote.get("speaker", ""),
                title=quote.get("title"),
                organization=quote.get("organization"),
                context=quote.get("context"),
                source=quote.get("source", ""),
                url=quote.get("url", ""),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse expert quote: {e}")

    # Parse case studies
    case_studies = []
    for cs in result.get("case_studies", []):
        try:
            case_studies.append(CaseStudySummary(
                title=cs.get("title", ""),
                organization=cs.get("organization", ""),
                challenge=cs.get("challenge", ""),
                solution=cs.get("solution", ""),
                outcome=cs.get("outcome", ""),
                key_metrics=cs.get("key_metrics", []),
                source=cs.get("source", ""),
                url=cs.get("url", ""),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse case study: {e}")

    # Parse trends
    trends = []
    for trend in result.get("trends", []):
        try:
            trends.append(TrendAnalysis(
                trend=trend.get("trend", ""),
                direction=trend.get("direction", "growing"),  # schema allows growing/declining/emerging/stable
                evidence=trend.get("evidence", []),
                impact=trend.get("impact"),
                timeline=trend.get("timeline"),
                sources=trend.get("sources", []),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse trend: {e}")

    # Parse comparisons
    # NOTE: items are parsed inside the same try as the table, so one bad
    # item drops the whole comparison table (logged below).
    comparisons = []
    for comp in result.get("comparisons", []):
        try:
            items = []
            for item in comp.get("items", []):
                items.append(ComparisonItem(
                    name=item.get("name", ""),
                    description=item.get("description"),
                    pros=item.get("pros", []),
                    cons=item.get("cons", []),
                    features=item.get("features", {}),
                    rating=item.get("rating"),
                    source=item.get("source"),
                ))
            comparisons.append(ComparisonTable(
                title=comp.get("title", ""),
                criteria=comp.get("criteria", []),
                items=items,
                winner=comp.get("winner"),
                verdict=comp.get("verdict"),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse comparison: {e}")

    # Parse pros/cons (single optional object, not a list)
    pros_cons = None
    pc_data = result.get("pros_cons")
    if pc_data:
        try:
            pros_cons = ProsCons(
                subject=pc_data.get("subject", intent.original_input),
                pros=pc_data.get("pros", []),
                cons=pc_data.get("cons", []),
                balanced_verdict=pc_data.get("balanced_verdict", ""),
            )
        except Exception as e:
            logger.warning(f"Failed to parse pros/cons: {e}")

    # Parse sources
    sources = []
    for src in result.get("sources", []):
        try:
            sources.append(SourceWithRelevance(
                title=src.get("title", ""),
                url=src.get("url", ""),
                excerpt=src.get("excerpt"),
                relevance_score=float(src.get("relevance_score", 0.8)),
                relevance_reason=src.get("relevance_reason"),
                content_type=src.get("content_type"),
                published_date=src.get("published_date"),
                credibility_score=float(src.get("credibility_score", 0.8)),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse source: {e}")

    # If no sources from analysis, extract from raw results
    if not sources:
        sources = self._extract_sources_from_raw(raw_results)

    return IntentDrivenResearchResult(
        success=True,
        primary_answer=result.get("primary_answer", ""),
        secondary_answers=result.get("secondary_answers", {}),
        statistics=statistics,
        expert_quotes=expert_quotes,
        case_studies=case_studies,
        comparisons=comparisons,
        trends=trends,
        best_practices=result.get("best_practices", []),
        step_by_step=result.get("step_by_step", []),
        pros_cons=pros_cons,
        definitions=result.get("definitions", {}),
        examples=result.get("examples", []),
        predictions=result.get("predictions", []),
        executive_summary=result.get("executive_summary", ""),
        key_takeaways=result.get("key_takeaways", []),
        suggested_outline=result.get("suggested_outline", []),
        sources=sources,
        raw_content=self._format_raw_results(raw_results)[:5000],  # cap stored raw text
        confidence=float(result.get("confidence", 0.7)),
        gaps_identified=result.get("gaps_identified", []),
        follow_up_queries=result.get("follow_up_queries", []),
        original_intent=intent,
    )
def _extract_sources_from_raw(self, raw_results: Dict[str, Any]) -> List[SourceWithRelevance]:
    """Extract sources from raw results when analysis doesn't provide them.

    Fallback used when the AI analysis step returns no ``sources`` list.
    Only the first 10 raw sources are considered; entries that still fail
    model validation are skipped with a warning.

    Args:
        raw_results: Aggregated provider output; expected to contain a
            ``sources`` list of dicts with title/url/excerpt-like keys.

    Returns:
        List of SourceWithRelevance with a neutral default relevance
        (0.8), since no AI scoring was performed.
    """
    sources = []
    for src in raw_results.get("sources", [])[:10]:
        try:
            # Use `or` chains instead of dict.get defaults: providers may
            # set keys to an explicit None, and `None[:200]` / float(None)
            # would raise and silently drop the source.
            excerpt = (src.get("excerpt") or src.get("text") or "")[:200]
            sources.append(SourceWithRelevance(
                title=src.get("title") or "Untitled",
                url=src.get("url") or "",
                excerpt=excerpt,
                relevance_score=0.8,  # neutral default; normally set by analysis
                credibility_score=float(src.get("credibility_score") or 0.8),
            ))
        except Exception as e:
            logger.warning(f"Failed to extract source: {e}")
    return sources
def _create_fallback_result(
    self,
    raw_results: Dict[str, Any],
    intent: ResearchIntent,
) -> IntentDrivenResearchResult:
    """Create a fallback result when AI analysis fails.

    Produces a minimal but valid IntentDrivenResearchResult from the raw
    provider output so the caller always gets a usable object: sources are
    extracted directly, takeaways are derived from the first sentences of
    the raw content, and confidence is lowered to 0.5 with explicit gap
    notes flagging that manual review is needed.

    Args:
        raw_results: Aggregated raw provider results.
        intent: The original research intent (echoed in the result).

    Returns:
        A low-confidence IntentDrivenResearchResult built without AI analysis.
    """
    # Extract basic information from raw results
    content = raw_results.get("content", "")
    sources = self._extract_sources_from_raw(raw_results)

    # Create basic takeaways from the first few sentences of the content.
    key_takeaways = []
    if content:
        sentences = content.split(". ")[:5]
        # rstrip(".") before re-appending the period: splitting on ". "
        # leaves the trailing "." on the final fragment, which previously
        # produced a double period ("..") in the last takeaway.
        key_takeaways = [
            s.strip().rstrip(".") + "."
            for s in sentences
            if len(s.strip()) > 20  # skip fragments too short to be meaningful
        ]

    return IntentDrivenResearchResult(
        success=True,
        primary_answer=f"Research findings for: {intent.primary_question}",
        secondary_answers={},
        executive_summary=content[:300] if content else "Research completed",
        key_takeaways=key_takeaways,
        sources=sources,
        raw_content=self._format_raw_results(raw_results)[:5000],
        confidence=0.5,  # low: results were not AI-analyzed
        gaps_identified=[
            "AI analysis failed - showing raw results",
            "Manual review recommended"
        ],
        follow_up_queries=[],
        original_intent=intent,
    )

View File

@@ -0,0 +1,627 @@
"""
Intent Prompt Builder
Builds comprehensive AI prompts for:
1. Intent inference from user input
2. Targeted query generation
3. Intent-aware result analysis
Author: ALwrity Team
Version: 1.0
"""
import json
from typing import Dict, Any, List, Optional
from loguru import logger
from models.research_intent_models import (
ResearchIntent,
ResearchPurpose,
ContentOutput,
ExpectedDeliverable,
ResearchDepthLevel,
)
from models.research_persona_models import ResearchPersona
class IntentPromptBuilder:
    """Builds prompts for intent-driven research.

    Produces three families of prompts:

    1. Intent inference  -- turn raw user input into a structured research intent.
    2. Query generation  -- targeted search queries, one per expected deliverable.
    3. Result analysis   -- intent-aware extraction from raw search results.
    """

    # Purpose explanations for the AI
    PURPOSE_EXPLANATIONS = {
        ResearchPurpose.LEARN: "User wants to understand a topic for personal knowledge",
        ResearchPurpose.CREATE_CONTENT: "User will create content (blog, video, podcast) from this research",
        ResearchPurpose.MAKE_DECISION: "User needs to make a choice/decision based on research",
        ResearchPurpose.COMPARE: "User wants to compare alternatives or competitors",
        ResearchPurpose.SOLVE_PROBLEM: "User is looking for a solution to a specific problem",
        ResearchPurpose.FIND_DATA: "User needs specific statistics, facts, or citations",
        ResearchPurpose.EXPLORE_TRENDS: "User wants to understand current/future trends",
        ResearchPurpose.VALIDATE: "User wants to verify or fact-check information",
        ResearchPurpose.GENERATE_IDEAS: "User wants to brainstorm content ideas",
    }

    # Deliverable descriptions
    DELIVERABLE_DESCRIPTIONS = {
        ExpectedDeliverable.KEY_STATISTICS: "Numbers, percentages, data points with citations",
        ExpectedDeliverable.EXPERT_QUOTES: "Authoritative quotes from industry experts",
        ExpectedDeliverable.CASE_STUDIES: "Real examples and success stories",
        ExpectedDeliverable.COMPARISONS: "Side-by-side analysis tables",
        ExpectedDeliverable.TRENDS: "Current and emerging industry trends",
        ExpectedDeliverable.BEST_PRACTICES: "Recommended approaches and guidelines",
        ExpectedDeliverable.STEP_BY_STEP: "Process guides and how-to instructions",
        ExpectedDeliverable.PROS_CONS: "Advantages and disadvantages analysis",
        ExpectedDeliverable.DEFINITIONS: "Clear explanations of concepts and terms",
        ExpectedDeliverable.CITATIONS: "Authoritative sources for reference",
        ExpectedDeliverable.EXAMPLES: "Concrete examples to illustrate points",
        ExpectedDeliverable.PREDICTIONS: "Future outlook and predictions",
    }

    def _purpose_explanation(self, purpose: str) -> str:
        """Return the human-readable explanation for a purpose value.

        ``ResearchPurpose(purpose)`` raises ValueError for unknown values
        *before* a dict ``.get`` fallback can apply, so the conversion is
        guarded here (mirrors the guard in _build_deliverables_instructions).
        Unknown purposes fall back to the raw string.
        """
        try:
            return self.PURPOSE_EXPLANATIONS.get(ResearchPurpose(purpose), purpose)
        except ValueError:
            return purpose

    def _deliverable_description(self, deliverable: str) -> str:
        """Return the description for a deliverable value, tolerating unknown values."""
        try:
            return self.DELIVERABLE_DESCRIPTIONS.get(ExpectedDeliverable(deliverable), deliverable)
        except ValueError:
            return deliverable

    def build_intent_inference_prompt(
        self,
        user_input: str,
        keywords: List[str],
        research_persona: Optional[ResearchPersona] = None,
        competitor_data: Optional[List[Dict]] = None,
        industry: Optional[str] = None,
        target_audience: Optional[str] = None,
    ) -> str:
        """
        Build prompt for inferring user's research intent.

        This prompt analyzes the user's input and determines:
        - What they want to accomplish
        - What questions they need answered
        - What specific deliverables they need

        Args:
            user_input: Raw user text (keywords, question, or goal statement).
            keywords: Optional explicit keywords supplied alongside the input.
            research_persona: Optional persona providing industry/audience context.
            competitor_data: Optional list of competitor dicts (name/domain/url keys).
            industry: Fallback industry string when no persona is available.
            target_audience: Fallback audience string when no persona is available.

        Returns:
            The complete prompt string for the intent-inference LLM call.
        """
        # Build persona context
        persona_context = self._build_persona_context(research_persona, industry, target_audience)

        # Build competitor context
        competitor_context = self._build_competitor_context(competitor_data)

        prompt = f"""You are an expert research intent analyzer. Your job is to understand what a content creator REALLY needs from their research.

## USER INPUT

"{user_input}"

{f"KEYWORDS: {', '.join(keywords)}" if keywords else ""}

## USER CONTEXT

{persona_context}
{competitor_context}

## YOUR TASK

Analyze the user's input and infer their research intent. Determine:

1. **INPUT TYPE**: Is this:
   - "keywords": Simple topic keywords (e.g., "AI healthcare 2025")
   - "question": A specific question (e.g., "What are the best AI tools for healthcare?")
   - "goal": A goal statement (e.g., "I need to write a blog about AI in healthcare")
   - "mixed": Combination of above

2. **PRIMARY QUESTION**: What is the main question to answer? Convert their input into a clear question.

3. **SECONDARY QUESTIONS**: What related questions should also be answered? (3-5 questions)

4. **PURPOSE**: Why are they researching? Choose ONE:
   - "learn": Understand a topic for personal knowledge
   - "create_content": Create content (blog, video, podcast)
   - "make_decision": Make a choice between options
   - "compare": Compare alternatives/competitors
   - "solve_problem": Find a solution
   - "find_data": Get specific statistics/facts
   - "explore_trends": Understand industry trends
   - "validate": Verify claims/information
   - "generate_ideas": Brainstorm ideas

5. **CONTENT OUTPUT**: What will they create? Choose ONE:
   - "blog", "podcast", "video", "social_post", "newsletter", "presentation", "report", "whitepaper", "email", "general"

6. **EXPECTED DELIVERABLES**: What specific outputs do they need? Choose ALL that apply:
   - "key_statistics": Numbers, data points
   - "expert_quotes": Authoritative quotes
   - "case_studies": Real examples
   - "comparisons": Side-by-side analysis
   - "trends": Industry trends
   - "best_practices": Recommendations
   - "step_by_step": How-to guides
   - "pros_cons": Advantages/disadvantages
   - "definitions": Concept explanations
   - "citations": Source references
   - "examples": Concrete examples
   - "predictions": Future outlook

7. **DEPTH**: How deep should the research go?
   - "overview": Quick summary
   - "detailed": In-depth analysis
   - "expert": Comprehensive expert-level

8. **FOCUS AREAS**: What specific aspects should be researched? (2-4 areas)

9. **PERSPECTIVE**: From whose viewpoint? (e.g., "marketing manager", "small business owner")

10. **TIME SENSITIVITY**: Is recency important?
    - "real_time": Latest only (past 24-48 hours)
    - "recent": Past week/month
    - "historical": Include older content
    - "evergreen": Timeless content

11. **CONFIDENCE**: How confident are you in this inference? (0.0-1.0)
    - If < 0.7, set needs_clarification to true and provide clarifying_questions

## OUTPUT FORMAT

Return a JSON object:
```json
{{
    "input_type": "keywords|question|goal|mixed",
    "primary_question": "The main question to answer",
    "secondary_questions": ["question 1", "question 2", "question 3"],
    "purpose": "one of the purpose options",
    "content_output": "one of the content options",
    "expected_deliverables": ["deliverable1", "deliverable2"],
    "depth": "overview|detailed|expert",
    "focus_areas": ["area1", "area2"],
    "perspective": "target perspective or null",
    "time_sensitivity": "real_time|recent|historical|evergreen",
    "confidence": 0.85,
    "needs_clarification": false,
    "clarifying_questions": [],
    "analysis_summary": "Brief summary of what the user wants"
}}
```

## IMPORTANT RULES

1. Always convert vague input into a specific primary question
2. Infer deliverables based on purpose (e.g., create_content → statistics + examples)
3. Use persona context to refine perspective and focus areas
4. If input is ambiguous, provide clarifying questions
5. Default to "detailed" depth unless input suggests otherwise
6. For content creation, include relevant deliverables automatically
"""
        return prompt

    def build_query_generation_prompt(
        self,
        intent: ResearchIntent,
        research_persona: Optional[ResearchPersona] = None,
    ) -> str:
        """
        Build prompt for generating targeted research queries.

        Generates multiple queries, each targeting a specific deliverable.

        Args:
            intent: The inferred research intent to generate queries for.
            research_persona: Optional persona whose suggested keywords are
                injected as additional context.

        Returns:
            The complete prompt string for the query-generation LLM call.
        """
        # Guarded lookups: a malformed deliverable/purpose value must not
        # crash prompt building (Enum(value) raises ValueError on unknowns).
        deliverables_list = "\n".join([
            f"- {d}: {self._deliverable_description(d)}"
            for d in intent.expected_deliverables
        ])

        persona_keywords = ""
        if research_persona and research_persona.suggested_keywords:
            persona_keywords = f"\nSUGGESTED KEYWORDS FROM PERSONA: {', '.join(research_persona.suggested_keywords[:10])}"

        prompt = f"""You are a research query optimizer. Generate multiple targeted search queries based on the user's research intent.

## RESEARCH INTENT

PRIMARY QUESTION: {intent.primary_question}

SECONDARY QUESTIONS:
{chr(10).join(f'- {q}' for q in intent.secondary_questions) if intent.secondary_questions else 'None'}

PURPOSE: {intent.purpose} - {self._purpose_explanation(intent.purpose)}

CONTENT OUTPUT: {intent.content_output}

EXPECTED DELIVERABLES:
{deliverables_list}

DEPTH: {intent.depth}
FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'}
PERSPECTIVE: {intent.perspective or 'General audience'}
TIME SENSITIVITY: {intent.time_sensitivity or 'No specific requirement'}
{persona_keywords}

## YOUR TASK

Generate 4-8 targeted research queries. Each query should:
1. Target a specific deliverable or question
2. Be optimized for semantic search (Exa/Tavily)
3. Include relevant context for better results

For each query, specify:
- The query string
- What deliverable it targets
- Best provider (exa for semantic/deep, tavily for news/real-time, google for factual)
- Priority (1-5, higher = more important)
- What we expect to find

## OUTPUT FORMAT

Return a JSON object:
```json
{{
    "queries": [
        {{
            "query": "Healthcare AI adoption statistics 2025 hospitals implementation data",
            "purpose": "key_statistics",
            "provider": "exa",
            "priority": 5,
            "expected_results": "Statistics on hospital AI adoption rates"
        }},
        {{
            "query": "AI healthcare trends predictions future outlook 2025 2026",
            "purpose": "trends",
            "provider": "tavily",
            "priority": 4,
            "expected_results": "Current trends and future predictions in healthcare AI"
        }}
    ],
    "enhanced_keywords": ["keyword1", "keyword2", "keyword3"],
    "research_angles": [
        "Angle 1: Focus on adoption challenges",
        "Angle 2: Focus on ROI and outcomes"
    ]
}}
```

## QUERY OPTIMIZATION RULES

1. For STATISTICS: Include words like "statistics", "data", "percentage", "report", "study"
2. For CASE STUDIES: Include "case study", "success story", "implementation", "example"
3. For TRENDS: Include "trends", "future", "predictions", "emerging", year numbers
4. For EXPERT QUOTES: Include expert names if known, or "expert opinion", "interview"
5. For COMPARISONS: Include "vs", "compare", "comparison", "alternative"
6. For NEWS/REAL-TIME: Use Tavily, include recent year/month
7. For ACADEMIC/DEEP: Use Exa with neural search
"""
        return prompt

    def build_intent_aware_analysis_prompt(
        self,
        raw_results: str,
        intent: ResearchIntent,
        research_persona: Optional[ResearchPersona] = None,
    ) -> str:
        """
        Build prompt for analyzing research results based on user intent.

        This is the key prompt that extracts exactly what the user needs.

        Args:
            raw_results: Concatenated raw search results (truncated to
                ~15k characters to stay within token limits).
            intent: The inferred research intent driving the extraction.
            research_persona: Accepted for interface symmetry with the other
                builders; not currently used in this prompt.

        Returns:
            The complete prompt string for the analysis LLM call.
        """
        # Guarded lookup: unknown purpose values fall back to the raw string
        # instead of raising ValueError from the enum constructor.
        purpose_explanation = self._purpose_explanation(intent.purpose)

        deliverables_instructions = self._build_deliverables_instructions(intent.expected_deliverables)

        perspective_instruction = ""
        if intent.perspective:
            perspective_instruction = f"\n**PERSPECTIVE**: Analyze results from the viewpoint of: {intent.perspective}"

        # Truncate outside the f-string so the token-limit note stays a code
        # comment instead of leaking into the prompt sent to the LLM.
        truncated_results = raw_results[:15000]

        prompt = f"""You are a research analyst helping a content creator find exactly what they need. Your job is to analyze raw research results and extract precisely what the user is looking for.

## USER'S RESEARCH INTENT

PRIMARY QUESTION: {intent.primary_question}

SECONDARY QUESTIONS:
{chr(10).join(f'- {q}' for q in intent.secondary_questions) if intent.secondary_questions else 'None specified'}

PURPOSE: {intent.purpose}
{purpose_explanation}

CONTENT OUTPUT: {intent.content_output}
EXPECTED DELIVERABLES: {', '.join(intent.expected_deliverables)}
FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'}
{perspective_instruction}

## RAW RESEARCH RESULTS

{truncated_results}

## YOUR TASK

Analyze the raw research results and extract EXACTLY what the user needs.

{deliverables_instructions}

## OUTPUT REQUIREMENTS

Provide results in this JSON structure:
```json
{{
    "primary_answer": "Direct 2-3 sentence answer to the primary question",
    "secondary_answers": {{
        "Question 1?": "Answer to question 1",
        "Question 2?": "Answer to question 2"
    }},
    "executive_summary": "2-3 sentence executive summary of all findings",
    "key_takeaways": [
        "Key takeaway 1 - most important finding",
        "Key takeaway 2",
        "Key takeaway 3",
        "Key takeaway 4",
        "Key takeaway 5"
    ],
    "statistics": [
        {{
            "statistic": "72% of hospitals plan to adopt AI by 2025",
            "value": "72%",
            "context": "Survey of 500 US hospitals in 2024",
            "source": "Healthcare AI Report 2024",
            "url": "https://example.com/report",
            "credibility": 0.9,
            "recency": "2024"
        }}
    ],
    "expert_quotes": [
        {{
            "quote": "AI will revolutionize patient care within 5 years",
            "speaker": "Dr. Jane Smith",
            "title": "Chief Medical Officer",
            "organization": "HealthTech Inc",
            "source": "TechCrunch",
            "url": "https://example.com/article"
        }}
    ],
    "case_studies": [
        {{
            "title": "Mayo Clinic AI Implementation",
            "organization": "Mayo Clinic",
            "challenge": "High patient wait times",
            "solution": "AI-powered triage system",
            "outcome": "40% reduction in wait times",
            "key_metrics": ["40% faster triage", "95% patient satisfaction"],
            "source": "Healthcare IT News",
            "url": "https://example.com"
        }}
    ],
    "trends": [
        {{
            "trend": "AI-assisted diagnostics adoption",
            "direction": "growing",
            "evidence": ["25% YoY growth", "Major hospital chains investing"],
            "impact": "Could reduce misdiagnosis by 30%",
            "timeline": "Expected mainstream by 2027",
            "sources": ["url1", "url2"]
        }}
    ],
    "comparisons": [
        {{
            "title": "Top AI Healthcare Platforms",
            "criteria": ["Cost", "Features", "Support"],
            "items": [
                {{
                    "name": "Platform A",
                    "pros": ["Easy integration", "Good support"],
                    "cons": ["Higher cost"],
                    "features": {{"Cost": "$500/month", "Support": "24/7"}}
                }}
            ],
            "verdict": "Platform A best for large hospitals"
        }}
    ],
    "best_practices": [
        "Start with a pilot program before full deployment",
        "Ensure staff training is comprehensive"
    ],
    "step_by_step": [
        "Step 1: Assess current infrastructure",
        "Step 2: Define use cases",
        "Step 3: Select vendor"
    ],
    "pros_cons": {{
        "subject": "AI in Healthcare",
        "pros": ["Improved accuracy", "Cost savings"],
        "cons": ["Initial investment", "Training required"],
        "balanced_verdict": "Benefits outweigh costs for most hospitals"
    }},
    "definitions": {{
        "Clinical AI": "AI systems designed for medical diagnosis and treatment recommendations"
    }},
    "examples": [
        "Example: Hospital X reduced readmissions by 25% using predictive AI"
    ],
    "predictions": [
        "By 2030, AI will assist in 80% of initial diagnoses"
    ],
    "suggested_outline": [
        "1. Introduction: The AI Healthcare Revolution",
        "2. Current State: Where We Are Today",
        "3. Key Statistics and Trends",
        "4. Case Studies: Success Stories",
        "5. Implementation Guide",
        "6. Future Outlook"
    ],
    "sources": [
        {{
            "title": "Healthcare AI Report 2024",
            "url": "https://example.com",
            "relevance_score": 0.95,
            "relevance_reason": "Directly addresses adoption statistics",
            "content_type": "research report",
            "credibility_score": 0.9
        }}
    ],
    "confidence": 0.85,
    "gaps_identified": [
        "Specific cost data for small clinics not found",
        "Limited information on regulatory challenges"
    ],
    "follow_up_queries": [
        "AI healthcare regulations FDA 2025",
        "Small clinic AI implementation costs"
    ]
}}
```

## CRITICAL RULES

1. **ONLY include information directly from the raw results** - do not make up data
2. **ALWAYS include source URLs** for every statistic, quote, and case study
3. **If a deliverable type has no relevant data**, return an empty array for it
4. **Prioritize recency and credibility** when multiple sources conflict
5. **Answer the PRIMARY QUESTION directly** in 2-3 clear sentences
6. **Keep KEY TAKEAWAYS to 5-7 points** - the most important findings
7. **Add to gaps_identified** if expected information is missing
8. **Suggest follow_up_queries** for gaps or incomplete areas
9. **Rate confidence** based on how well results match the user's intent
10. **Include deliverables ONLY if they are in expected_deliverables** or critical to the question
"""
        return prompt

    def _build_persona_context(
        self,
        research_persona: Optional[ResearchPersona],
        industry: Optional[str],
        target_audience: Optional[str],
    ) -> str:
        """Build persona context section for prompts.

        Prefers the full research persona; falls back to bare industry /
        audience strings, then to an explicit "no context" line.
        """
        if not research_persona and not industry:
            return "No specific persona context available."

        context_parts = []
        if research_persona:
            context_parts.append(f"INDUSTRY: {research_persona.default_industry}")
            context_parts.append(f"TARGET AUDIENCE: {research_persona.default_target_audience}")
            if research_persona.suggested_keywords:
                context_parts.append(f"TYPICAL TOPICS: {', '.join(research_persona.suggested_keywords[:5])}")
            if research_persona.research_angles:
                context_parts.append(f"RESEARCH ANGLES: {', '.join(research_persona.research_angles[:3])}")
        else:
            if industry:
                context_parts.append(f"INDUSTRY: {industry}")
            if target_audience:
                context_parts.append(f"TARGET AUDIENCE: {target_audience}")
        return "\n".join(context_parts)

    def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
        """Build competitor context section for prompts.

        Returns an empty string when no usable competitor data is present
        so the prompt omits the section entirely.
        """
        if not competitor_data:
            return ""

        competitor_names = []
        for comp in competitor_data[:5]:  # Limit to 5
            name = comp.get("name") or comp.get("domain") or comp.get("url", "Unknown")
            competitor_names.append(name)

        if competitor_names:
            return f"\nKNOWN COMPETITORS: {', '.join(competitor_names)}"
        return ""

    def _build_deliverables_instructions(self, expected_deliverables: List[str]) -> str:
        """Build specific extraction instructions for each expected deliverable.

        Unknown deliverable values are silently skipped (ValueError guard),
        so a malformed intent cannot break prompt construction.
        """
        instructions = ["### EXTRACTION INSTRUCTIONS\n"]
        instructions.append("For each requested deliverable, extract the following:\n")

        deliverable_instructions = {
            ExpectedDeliverable.KEY_STATISTICS: """
**STATISTICS**:
- Extract ALL relevant statistics with exact numbers
- Include source attribution (publication name, URL)
- Note the recency of the data
- Rate credibility based on source authority
- Format: statistic statement, value, context, source, URL, credibility score
""",
            ExpectedDeliverable.EXPERT_QUOTES: """
**EXPERT QUOTES**:
- Extract authoritative quotes from named experts
- Include speaker name, title, and organization
- Provide context for the quote
- Include source URL
""",
            ExpectedDeliverable.CASE_STUDIES: """
**CASE STUDIES**:
- Summarize each case study: challenge → solution → outcome
- Include key metrics and results
- Name the organization involved
- Provide source URL
""",
            ExpectedDeliverable.TRENDS: """
**TRENDS**:
- Identify current and emerging trends
- Note direction: growing, declining, emerging, or stable
- List supporting evidence
- Include timeline predictions if available
- Cite sources
""",
            ExpectedDeliverable.COMPARISONS: """
**COMPARISONS**:
- Build comparison tables where applicable
- Define clear comparison criteria
- List pros and cons for each option
- Provide a verdict/recommendation if data supports it
""",
            ExpectedDeliverable.BEST_PRACTICES: """
**BEST PRACTICES**:
- Extract recommended approaches
- Provide actionable guidelines
- Order by importance or sequence
""",
            ExpectedDeliverable.STEP_BY_STEP: """
**STEP BY STEP**:
- Extract process/how-to instructions
- Number steps clearly
- Include any prerequisites or requirements
""",
            ExpectedDeliverable.PROS_CONS: """
**PROS AND CONS**:
- List advantages (pros)
- List disadvantages (cons)
- Provide a balanced verdict
""",
            ExpectedDeliverable.DEFINITIONS: """
**DEFINITIONS**:
- Extract clear explanations of key terms and concepts
- Keep definitions concise but comprehensive
""",
            ExpectedDeliverable.EXAMPLES: """
**EXAMPLES**:
- Extract concrete examples that illustrate key points
- Include real-world applications
""",
            ExpectedDeliverable.PREDICTIONS: """
**PREDICTIONS**:
- Extract future outlook and predictions
- Note the source and their track record if known
- Include timeframes where mentioned
""",
            ExpectedDeliverable.CITATIONS: """
**CITATIONS**:
- List all authoritative sources with URLs
- Rate credibility and relevance
- Note content type (research, news, opinion, etc.)
""",
        }

        for deliverable in expected_deliverables:
            try:
                d_enum = ExpectedDeliverable(deliverable)
                if d_enum in deliverable_instructions:
                    instructions.append(deliverable_instructions[d_enum])
            except ValueError:
                # Unknown deliverable string - skip rather than crash.
                pass
        return "\n".join(instructions)

View File

@@ -0,0 +1,387 @@
"""
Intent Query Generator
Generates multiple targeted research queries based on user intent.
Each query targets a specific deliverable or question.
Author: ALwrity Team
Version: 1.0
"""
import json
from typing import Dict, Any, List, Optional
from loguru import logger
from models.research_intent_models import (
ResearchIntent,
ResearchQuery,
ExpectedDeliverable,
ResearchPurpose,
)
from models.research_persona_models import ResearchPersona
from .intent_prompt_builder import IntentPromptBuilder
class IntentQueryGenerator:
    """
    Generates targeted research queries based on user intent.

    Instead of a single generic search, generates multiple queries
    each targeting a specific deliverable or question. Falls back to
    template-based queries when AI generation fails.
    """

    def __init__(self):
        """Initialize the query generator."""
        self.prompt_builder = IntentPromptBuilder()
        logger.info("IntentQueryGenerator initialized")

    async def generate_queries(
        self,
        intent: ResearchIntent,
        research_persona: Optional[ResearchPersona] = None,
    ) -> Dict[str, Any]:
        """
        Generate targeted research queries based on intent.

        Args:
            intent: The inferred research intent
            research_persona: Optional persona for context

        Returns:
            Dict with queries, enhanced_keywords, and research_angles.
            On any failure, template-based fallback queries are returned
            instead of raising.
        """
        try:
            logger.info(f"Generating queries for: {intent.primary_question[:50]}...")

            # Build the query generation prompt
            prompt = self.prompt_builder.build_query_generation_prompt(
                intent=intent,
                research_persona=research_persona,
            )

            # Define the expected JSON schema for structured LLM output
            query_schema = {
                "type": "object",
                "properties": {
                    "queries": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "query": {"type": "string"},
                                "purpose": {"type": "string"},
                                "provider": {"type": "string"},
                                "priority": {"type": "integer"},
                                "expected_results": {"type": "string"}
                            },
                            "required": ["query", "purpose", "provider", "priority", "expected_results"]
                        }
                    },
                    "enhanced_keywords": {"type": "array", "items": {"type": "string"}},
                    "research_angles": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["queries", "enhanced_keywords", "research_angles"]
            }

            # Call LLM for query generation (imported lazily to avoid a
            # circular import at module load time)
            from services.llm_providers.main_text_generation import llm_text_gen
            result = llm_text_gen(
                prompt=prompt,
                json_struct=query_schema,
                user_id=None
            )

            if isinstance(result, dict) and "error" in result:
                logger.error(f"Query generation failed: {result.get('error')}")
                return self._create_fallback_queries(intent)

            # Parse queries
            queries = self._parse_queries(result.get("queries", []))

            # Ensure we have queries for all expected deliverables
            queries = self._ensure_deliverable_coverage(queries, intent)

            # Sort by priority (highest first)
            queries.sort(key=lambda q: q.priority, reverse=True)

            logger.info(f"Generated {len(queries)} targeted queries")
            return {
                "queries": queries,
                "enhanced_keywords": result.get("enhanced_keywords", []),
                "research_angles": result.get("research_angles", []),
            }
        except Exception as e:
            logger.error(f"Error generating queries: {e}")
            return self._create_fallback_queries(intent)

    def _parse_queries(self, raw_queries: List[Dict]) -> List[ResearchQuery]:
        """Parse raw query data into ResearchQuery objects.

        Malformed entries are logged and skipped; blank query strings are
        rejected so we never send an empty search to a provider.
        """
        queries = []
        for q in raw_queries:
            try:
                # Skip queries with no usable text: an empty query would
                # waste a provider call and return noise.
                query_text = (q.get("query") or "").strip()
                if not query_text:
                    logger.warning("Skipping generated query with empty query text")
                    continue

                # Validate purpose; unknown values default to statistics
                purpose_str = q.get("purpose", "key_statistics")
                try:
                    purpose = ExpectedDeliverable(purpose_str)
                except ValueError:
                    purpose = ExpectedDeliverable.KEY_STATISTICS

                query = ResearchQuery(
                    query=query_text,
                    purpose=purpose,
                    provider=q.get("provider", "exa"),
                    priority=min(max(int(q.get("priority", 3)), 1), 5),  # Clamp 1-5
                    expected_results=q.get("expected_results", ""),
                )
                queries.append(query)
            except Exception as e:
                logger.warning(f"Failed to parse query: {e}")
                continue
        return queries

    def _ensure_deliverable_coverage(
        self,
        queries: List[ResearchQuery],
        intent: ResearchIntent,
    ) -> List[ResearchQuery]:
        """Ensure we have queries for all expected deliverables.

        Any deliverable the LLM did not target gets a template-based
        query appended.
        """
        # Get deliverables already covered
        covered = set(q.purpose.value for q in queries)

        # Check for missing deliverables
        for deliverable in intent.expected_deliverables:
            if deliverable not in covered:
                # Generate a query for this deliverable
                query = self._generate_query_for_deliverable(
                    deliverable=deliverable,
                    intent=intent,
                )
                queries.append(query)
        return queries

    def _generate_query_for_deliverable(
        self,
        deliverable: str,
        intent: ResearchIntent,
    ) -> ResearchQuery:
        """Generate a template-based query targeting a specific deliverable."""
        # Extract topic from primary question
        topic = intent.original_input

        # Query templates by deliverable type
        templates = {
            ExpectedDeliverable.KEY_STATISTICS.value: {
                "query": f"{topic} statistics data report study",
                "provider": "exa",
                "priority": 5,
                "expected": "Statistical data and research findings",
            },
            ExpectedDeliverable.EXPERT_QUOTES.value: {
                "query": f"{topic} expert opinion interview insights",
                "provider": "exa",
                "priority": 4,
                "expected": "Expert opinions and authoritative quotes",
            },
            ExpectedDeliverable.CASE_STUDIES.value: {
                "query": f"{topic} case study success story implementation example",
                "provider": "exa",
                "priority": 4,
                "expected": "Real-world case studies and examples",
            },
            ExpectedDeliverable.TRENDS.value: {
                "query": f"{topic} trends 2025 future predictions emerging",
                "provider": "tavily",
                "priority": 4,
                "expected": "Current trends and future predictions",
            },
            ExpectedDeliverable.COMPARISONS.value: {
                "query": f"{topic} comparison vs versus alternatives",
                "provider": "exa",
                "priority": 4,
                "expected": "Comparison and alternative options",
            },
            ExpectedDeliverable.BEST_PRACTICES.value: {
                "query": f"{topic} best practices recommendations guidelines",
                "provider": "exa",
                "priority": 3,
                "expected": "Best practices and recommendations",
            },
            ExpectedDeliverable.STEP_BY_STEP.value: {
                "query": f"{topic} how to guide tutorial steps",
                "provider": "exa",
                "priority": 3,
                "expected": "Step-by-step guides and tutorials",
            },
            ExpectedDeliverable.PROS_CONS.value: {
                "query": f"{topic} advantages disadvantages pros cons benefits",
                "provider": "exa",
                "priority": 3,
                "expected": "Pros, cons, and trade-offs",
            },
            ExpectedDeliverable.DEFINITIONS.value: {
                "query": f"what is {topic} definition explained",
                "provider": "exa",
                "priority": 3,
                "expected": "Clear definitions and explanations",
            },
            ExpectedDeliverable.EXAMPLES.value: {
                "query": f"{topic} examples real world applications",
                "provider": "exa",
                "priority": 3,
                "expected": "Real-world examples and applications",
            },
            ExpectedDeliverable.PREDICTIONS.value: {
                "query": f"{topic} future outlook predictions 2025 2030",
                "provider": "tavily",
                "priority": 4,
                "expected": "Future predictions and outlook",
            },
            ExpectedDeliverable.CITATIONS.value: {
                "query": f"{topic} research paper study academic",
                "provider": "exa",
                "priority": 4,
                "expected": "Authoritative academic sources",
            },
        }

        template = templates.get(deliverable, {
            "query": f"{topic}",
            "provider": "exa",
            "priority": 3,
            "expected": "General information",
        })

        # Guarded enum conversion (same idiom as _parse_queries): unknown
        # deliverable strings fall back to KEY_STATISTICS.
        try:
            purpose = ExpectedDeliverable(deliverable)
        except ValueError:
            purpose = ExpectedDeliverable.KEY_STATISTICS

        return ResearchQuery(
            query=template["query"],
            purpose=purpose,
            provider=template["provider"],
            priority=template["priority"],
            expected_results=template["expected"],
        )

    def _create_fallback_queries(self, intent: ResearchIntent) -> Dict[str, Any]:
        """Create template-based fallback queries when AI generation fails."""
        topic = intent.original_input

        # Generate basic queries for each expected deliverable
        queries = []
        for deliverable in intent.expected_deliverables[:5]:  # Limit to 5
            query = self._generate_query_for_deliverable(deliverable, intent)
            queries.append(query)

        # Add a general query if we have none
        if not queries:
            queries.append(ResearchQuery(
                query=topic,
                purpose=ExpectedDeliverable.KEY_STATISTICS,
                provider="exa",
                priority=5,
                expected_results="General information and insights",
            ))

        return {
            "queries": queries,
            "enhanced_keywords": topic.split()[:10],
            "research_angles": [
                f"Overview of {topic}",
                f"Latest trends in {topic}",
            ],
        }
class QueryOptimizer:
    """
    Optimizes queries for different research providers.

    Different providers have different strengths:
    - Exa: Semantic search, good for deep research
    - Tavily: Real-time search, good for news/trends
    - Google: Factual search, good for basic info
    """

    @staticmethod
    def optimize_for_exa(query: str, intent: ResearchIntent) -> Dict[str, Any]:
        """Build an Exa request payload tuned to the inferred intent."""
        wanted = intent.expected_deliverables
        wants_trends = ExpectedDeliverable.TRENDS.value in wanted

        # Category precedence: academic sources, then news, then companies.
        if ExpectedDeliverable.CITATIONS.value in wanted:
            category = "research paper"
        elif wants_trends:
            category = "news"
        elif intent.purpose == ResearchPurpose.COMPARE.value:
            category = "company"
        else:
            category = None

        # Neural search gives semantic understanding by default; "auto"
        # copes better with time-sensitive trend queries.
        search_type = "auto" if wants_trends else "neural"

        # Scale result count with the requested research depth.
        depth_to_count = {"expert": 20, "overview": 5}
        num_results = depth_to_count.get(intent.depth, 10)

        return {
            "query": query,
            "type": search_type,
            "category": category,
            "num_results": num_results,
            "text": True,
            "highlights": True,
        }

    @staticmethod
    def optimize_for_tavily(query: str, intent: ResearchIntent) -> Dict[str, Any]:
        """Build a Tavily request payload tuned to the inferred intent."""
        wanted = intent.expected_deliverables
        wants_trends = ExpectedDeliverable.TRENDS.value in wanted

        # Trend-focused research maps to Tavily's "news" topic.
        topic = "news" if wants_trends else "general"

        # Deeper research warrants the (more expensive) advanced search.
        search_depth = "advanced" if intent.depth in ("detailed", "expert") else "basic"

        # AI-generated answers only help for definition/statistic lookups.
        if ExpectedDeliverable.DEFINITIONS.value in wanted:
            include_answer = "advanced"
        elif ExpectedDeliverable.KEY_STATISTICS.value in wanted:
            include_answer = "basic"
        else:
            include_answer = False

        # Narrow the time window for time-sensitive or trend research.
        if intent.time_sensitivity == "real_time":
            time_range = "day"
        elif intent.time_sensitivity == "recent":
            time_range = "week"
        elif wants_trends:
            time_range = "month"
        else:
            time_range = None

        return {
            "query": query,
            "topic": topic,
            "search_depth": search_depth,
            "include_answer": include_answer,
            "time_range": time_range,
            "max_results": 10,
        }

View File

@@ -0,0 +1,378 @@
"""
Research Intent Inference Service
Analyzes user input to understand their research intent.
Uses AI to infer:
- What the user wants to accomplish
- What questions need answering
- What deliverables they expect
Author: ALwrity Team
Version: 1.0
"""
import json
from typing import Dict, Any, List, Optional
from loguru import logger
from models.research_intent_models import (
ResearchIntent,
ResearchPurpose,
ContentOutput,
ExpectedDeliverable,
ResearchDepthLevel,
InputType,
IntentInferenceRequest,
IntentInferenceResponse,
ResearchQuery,
)
from models.research_persona_models import ResearchPersona
from .intent_prompt_builder import IntentPromptBuilder
class ResearchIntentInference:
    """
    Infers user research intent from minimal input.

    Instead of asking a formal questionnaire, this service
    uses AI to understand what the user really wants.
    """

    def __init__(self):
        """Initialize the intent inference service."""
        # Builds the LLM prompt that drives intent inference.
        self.prompt_builder = IntentPromptBuilder()
        logger.info("ResearchIntentInference initialized")

    async def infer_intent(
        self,
        user_input: str,
        keywords: Optional[List[str]] = None,
        research_persona: Optional[ResearchPersona] = None,
        competitor_data: Optional[List[Dict]] = None,
        industry: Optional[str] = None,
        target_audience: Optional[str] = None,
    ) -> IntentInferenceResponse:
        """
        Analyze user input and infer their research intent.

        Args:
            user_input: User's keywords, question, or goal
            keywords: Extracted keywords (optional)
            research_persona: User's research persona (optional)
            competitor_data: Competitor analysis data (optional)
            industry: Industry context (optional)
            target_audience: Target audience context (optional)

        Returns:
            IntentInferenceResponse with inferred intent and suggested queries.
            On any failure this degrades to a low-confidence fallback response
            rather than raising.
        """
        try:
            logger.info(f"Inferring intent for: {user_input[:100]}...")
            keywords = keywords or []
            # Build the inference prompt
            prompt = self.prompt_builder.build_intent_inference_prompt(
                user_input=user_input,
                keywords=keywords,
                research_persona=research_persona,
                competitor_data=competitor_data,
                industry=industry,
                target_audience=target_audience,
            )
            # Define the expected JSON schema the LLM must return. Enum-like
            # string fields are validated later in _parse_intent_result, so
            # only input_type/depth are constrained here.
            intent_schema = {
                "type": "object",
                "properties": {
                    "input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
                    "primary_question": {"type": "string"},
                    "secondary_questions": {"type": "array", "items": {"type": "string"}},
                    "purpose": {"type": "string"},
                    "content_output": {"type": "string"},
                    "expected_deliverables": {"type": "array", "items": {"type": "string"}},
                    "depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
                    "focus_areas": {"type": "array", "items": {"type": "string"}},
                    "perspective": {"type": "string"},
                    "time_sensitivity": {"type": "string"},
                    "confidence": {"type": "number"},
                    "needs_clarification": {"type": "boolean"},
                    "clarifying_questions": {"type": "array", "items": {"type": "string"}},
                    "analysis_summary": {"type": "string"}
                },
                "required": [
                    "input_type", "primary_question", "purpose", "content_output",
                    "expected_deliverables", "depth", "confidence", "analysis_summary"
                ]
            }
            # Call LLM for intent inference.
            # Imported locally, presumably to avoid a circular import — TODO confirm.
            from services.llm_providers.main_text_generation import llm_text_gen
            # NOTE(review): llm_text_gen is called synchronously inside an
            # async method — confirm it is non-blocking or offload to a thread.
            result = llm_text_gen(
                prompt=prompt,
                json_struct=intent_schema,
                user_id=None
            )
            # llm_text_gen signals failure via an "error" key in its dict result.
            if isinstance(result, dict) and "error" in result:
                logger.error(f"Intent inference failed: {result.get('error')}")
                return self._create_fallback_response(user_input, keywords)
            # Parse and validate the result
            intent = self._parse_intent_result(result, user_input)
            # Generate quick options for UI
            quick_options = self._generate_quick_options(intent, result)
            # Create response
            response = IntentInferenceResponse(
                success=True,
                intent=intent,
                analysis_summary=result.get("analysis_summary", "Research intent analyzed"),
                suggested_queries=[],  # Will be populated by query generator
                suggested_keywords=self._extract_keywords_from_input(user_input, keywords),
                suggested_angles=result.get("focus_areas", []),
                quick_options=quick_options,
            )
            logger.info(f"Intent inferred: purpose={intent.purpose}, confidence={intent.confidence}")
            return response
        except Exception as e:
            # Never propagate: degrade to the low-confidence fallback intent.
            logger.error(f"Error inferring intent: {e}")
            return self._create_fallback_response(user_input, keywords or [])

    def _parse_intent_result(self, result: Dict[str, Any], user_input: str) -> ResearchIntent:
        """Parse LLM result into ResearchIntent model.

        Invalid enum strings are coerced to safe defaults instead of raising,
        and an empty deliverables list is back-filled from the purpose.
        """
        # Map string values to enums safely
        input_type = self._safe_enum(InputType, result.get("input_type", "keywords"), InputType.KEYWORDS)
        purpose = self._safe_enum(ResearchPurpose, result.get("purpose", "learn"), ResearchPurpose.LEARN)
        content_output = self._safe_enum(ContentOutput, result.get("content_output", "general"), ContentOutput.GENERAL)
        depth = self._safe_enum(ResearchDepthLevel, result.get("depth", "detailed"), ResearchDepthLevel.DETAILED)
        # Parse expected deliverables
        raw_deliverables = result.get("expected_deliverables", [])
        expected_deliverables = []
        for d in raw_deliverables:
            try:
                expected_deliverables.append(ExpectedDeliverable(d).value)
            except ValueError:
                # Skip invalid deliverables
                pass
        # Ensure we have at least some deliverables
        if not expected_deliverables:
            expected_deliverables = self._infer_deliverables_from_purpose(purpose)
        return ResearchIntent(
            primary_question=result.get("primary_question", user_input),
            secondary_questions=result.get("secondary_questions", []),
            purpose=purpose.value,
            content_output=content_output.value,
            expected_deliverables=expected_deliverables,
            depth=depth.value,
            focus_areas=result.get("focus_areas", []),
            perspective=result.get("perspective"),
            time_sensitivity=result.get("time_sensitivity"),
            input_type=input_type.value,
            original_input=user_input,
            confidence=float(result.get("confidence", 0.7)),
            needs_clarification=result.get("needs_clarification", False),
            clarifying_questions=result.get("clarifying_questions", []),
        )

    def _safe_enum(self, enum_class, value: str, default):
        """Safely convert string to enum, returning default if invalid."""
        try:
            return enum_class(value)
        except ValueError:
            return default

    def _infer_deliverables_from_purpose(self, purpose: ResearchPurpose) -> List[str]:
        """Infer expected deliverables based on research purpose.

        Static mapping used when the LLM returns no (valid) deliverables;
        unknown purposes fall back to key statistics only.
        """
        purpose_deliverables = {
            ResearchPurpose.LEARN: [
                ExpectedDeliverable.DEFINITIONS.value,
                ExpectedDeliverable.EXAMPLES.value,
                ExpectedDeliverable.KEY_STATISTICS.value,
            ],
            ResearchPurpose.CREATE_CONTENT: [
                ExpectedDeliverable.KEY_STATISTICS.value,
                ExpectedDeliverable.EXPERT_QUOTES.value,
                ExpectedDeliverable.EXAMPLES.value,
                ExpectedDeliverable.CASE_STUDIES.value,
            ],
            ResearchPurpose.MAKE_DECISION: [
                ExpectedDeliverable.PROS_CONS.value,
                ExpectedDeliverable.COMPARISONS.value,
                ExpectedDeliverable.BEST_PRACTICES.value,
            ],
            ResearchPurpose.COMPARE: [
                ExpectedDeliverable.COMPARISONS.value,
                ExpectedDeliverable.PROS_CONS.value,
                ExpectedDeliverable.KEY_STATISTICS.value,
            ],
            ResearchPurpose.SOLVE_PROBLEM: [
                ExpectedDeliverable.STEP_BY_STEP.value,
                ExpectedDeliverable.BEST_PRACTICES.value,
                ExpectedDeliverable.CASE_STUDIES.value,
            ],
            ResearchPurpose.FIND_DATA: [
                ExpectedDeliverable.KEY_STATISTICS.value,
                ExpectedDeliverable.CITATIONS.value,
            ],
            ResearchPurpose.EXPLORE_TRENDS: [
                ExpectedDeliverable.TRENDS.value,
                ExpectedDeliverable.PREDICTIONS.value,
                ExpectedDeliverable.KEY_STATISTICS.value,
            ],
            ResearchPurpose.VALIDATE: [
                ExpectedDeliverable.CITATIONS.value,
                ExpectedDeliverable.KEY_STATISTICS.value,
                ExpectedDeliverable.EXPERT_QUOTES.value,
            ],
            ResearchPurpose.GENERATE_IDEAS: [
                ExpectedDeliverable.EXAMPLES.value,
                ExpectedDeliverable.TRENDS.value,
                ExpectedDeliverable.CASE_STUDIES.value,
            ],
        }
        return purpose_deliverables.get(purpose, [ExpectedDeliverable.KEY_STATISTICS.value])

    def _generate_quick_options(self, intent: ResearchIntent, result: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate quick options for UI confirmation.

        Each option is a dict with id/label/value/display/alternatives/
        confidence keys consumed by the frontend.
        """
        options = []
        # Purpose option
        options.append({
            "id": "purpose",
            "label": "Research Purpose",
            "value": intent.purpose,
            "display": self._purpose_display(intent.purpose),
            "alternatives": [p.value for p in ResearchPurpose],
            "confidence": result.get("confidence", 0.7),
        })
        # Content output option — only shown when a specific type was inferred.
        if intent.content_output != ContentOutput.GENERAL.value:
            options.append({
                "id": "content_output",
                "label": "Content Type",
                "value": intent.content_output,
                "display": intent.content_output.replace("_", " ").title(),
                "alternatives": [c.value for c in ContentOutput],
                "confidence": result.get("confidence", 0.7),
            })
        # Deliverables option (multi-select; display capped at four items).
        options.append({
            "id": "deliverables",
            "label": "What I'll Find",
            "value": intent.expected_deliverables,
            "display": [d.replace("_", " ").title() for d in intent.expected_deliverables[:4]],
            "alternatives": [d.value for d in ExpectedDeliverable],
            "confidence": result.get("confidence", 0.7),
            "multi_select": True,
        })
        # Depth option
        options.append({
            "id": "depth",
            "label": "Research Depth",
            "value": intent.depth,
            "display": intent.depth.title(),
            "alternatives": [d.value for d in ResearchDepthLevel],
            "confidence": result.get("confidence", 0.7),
        })
        return options

    def _purpose_display(self, purpose: str) -> str:
        """Get display-friendly purpose text."""
        display_map = {
            "learn": "Understand this topic",
            "create_content": "Create content about this",
            "make_decision": "Make a decision",
            "compare": "Compare options",
            "solve_problem": "Solve a problem",
            "find_data": "Find specific data",
            "explore_trends": "Explore trends",
            "validate": "Validate information",
            "generate_ideas": "Generate ideas",
        }
        # Unknown purposes get a generic "Snake Case -> Title Case" rendering.
        return display_map.get(purpose, purpose.replace("_", " ").title())

    def _extract_keywords_from_input(self, user_input: str, keywords: List[str]) -> List[str]:
        """Extract and enhance keywords from user input.

        Keeps the caller-provided keywords first, then appends non-stop-word
        tokens (length > 2) from the input, capped at 15 total.
        """
        # Start with provided keywords
        extracted = list(keywords) if keywords else []
        # Simple extraction from input (split on common delimiters)
        words = user_input.lower().replace(",", " ").replace(";", " ").split()
        # Filter out common words
        stop_words = {
            "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "do", "does", "did", "will", "would", "could",
            "should", "may", "might", "must", "shall", "can", "need", "dare",
            "to", "of", "in", "for", "on", "with", "at", "by", "from", "up",
            "about", "into", "through", "during", "before", "after", "above",
            "below", "between", "under", "again", "further", "then", "once",
            "here", "there", "when", "where", "why", "how", "all", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not",
            "only", "own", "same", "so", "than", "too", "very", "just", "and",
            "but", "if", "or", "because", "as", "until", "while", "i", "we",
            "you", "they", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "want", "write", "blog", "post", "article",
        }
        for word in words:
            if word not in stop_words and len(word) > 2 and word not in extracted:
                extracted.append(word)
        return extracted[:15]  # Limit to 15 keywords

    def _create_fallback_response(self, user_input: str, keywords: List[str]) -> IntentInferenceResponse:
        """Create a fallback response when AI inference fails.

        Produces a generic "learn" intent at 0.5 confidence and flags that
        clarification is needed, so the UI can ask follow-up questions.
        """
        # Create a basic intent from the input
        fallback_intent = ResearchIntent(
            primary_question=f"What are the key insights about: {user_input}?",
            secondary_questions=[
                f"What are the latest trends in {user_input}?",
                f"What are best practices for {user_input}?",
            ],
            purpose=ResearchPurpose.LEARN.value,
            content_output=ContentOutput.GENERAL.value,
            expected_deliverables=[
                ExpectedDeliverable.KEY_STATISTICS.value,
                ExpectedDeliverable.EXAMPLES.value,
                ExpectedDeliverable.BEST_PRACTICES.value,
            ],
            depth=ResearchDepthLevel.DETAILED.value,
            focus_areas=[],
            input_type=InputType.KEYWORDS.value,
            original_input=user_input,
            confidence=0.5,
            needs_clarification=True,
            clarifying_questions=[
                "What type of content are you creating?",
                "What specific aspects are you most interested in?",
            ],
        )
        return IntentInferenceResponse(
            success=True,  # Still return success, just with lower confidence
            intent=fallback_intent,
            analysis_summary=f"Basic research analysis for: {user_input}",
            suggested_queries=[],
            suggested_keywords=keywords,
            suggested_angles=[],
            quick_options=[],
        )

View File

@@ -0,0 +1,660 @@
"""
Research Persona Prompt Builder
Handles building comprehensive prompts for research persona generation.
Generates personalized research defaults, suggestions, and configurations.
"""
from typing import Dict, Any, List
import json
from loguru import logger
class ResearchPersonaPromptBuilder:
"""Builds comprehensive prompts for research persona generation."""
def build_research_persona_prompt(self, onboarding_data: Dict[str, Any]) -> str:
"""Build the research persona generation prompt with comprehensive data."""
# Extract data from onboarding_data
website_analysis = onboarding_data.get("website_analysis", {}) or {}
persona_data = onboarding_data.get("persona_data", {}) or {}
research_prefs = onboarding_data.get("research_preferences", {}) or {}
business_info = onboarding_data.get("business_info", {}) or {}
competitor_analysis = onboarding_data.get("competitor_analysis", []) or []
# Extract core persona - handle both camelCase and snake_case
core_persona = persona_data.get("corePersona") or persona_data.get("core_persona") or {}
# Phase 1: Extract key website analysis fields for enhanced personalization
writing_style = website_analysis.get("writing_style", {}) or {}
content_type = website_analysis.get("content_type", {}) or {}
crawl_result = website_analysis.get("crawl_result", {}) or {}
# Phase 2: Extract additional fields for pattern-based personalization
style_patterns = website_analysis.get("style_patterns", {}) or {}
content_characteristics = website_analysis.get("content_characteristics", {}) or {}
style_guidelines = website_analysis.get("style_guidelines", {}) or {}
# Extract topics/keywords from crawl_result (if available)
extracted_topics = self._extract_topics_from_crawl(crawl_result)
extracted_keywords = self._extract_keywords_from_crawl(crawl_result)
# Phase 2: Extract patterns and vocabulary level
extracted_patterns = self._extract_writing_patterns(style_patterns)
vocabulary_level = content_characteristics.get("vocabulary_level", "medium") if content_characteristics else "medium"
extracted_guidelines = self._extract_style_guidelines(style_guidelines)
# Phase 3: Full crawl analysis and comprehensive mapping
crawl_analysis = self._analyze_crawl_result_comprehensive(crawl_result)
writing_style_mapping = self._map_writing_style_comprehensive(writing_style, content_characteristics)
content_themes = self._extract_content_themes(crawl_result, extracted_topics)
prompt = f"""
COMPREHENSIVE RESEARCH PERSONA GENERATION TASK: Create a highly detailed, personalized research persona based on the user's business, writing style, and content strategy. This persona will provide intelligent defaults and suggestions for research inputs.
=== USER CONTEXT ===
BUSINESS INFORMATION:
{json.dumps(business_info, indent=2)}
WEBSITE ANALYSIS:
{json.dumps(website_analysis, indent=2)}
CORE PERSONA:
{json.dumps(core_persona, indent=2)}
RESEARCH PREFERENCES:
{json.dumps(research_prefs, indent=2)}
COMPETITOR ANALYSIS:
{json.dumps(competitor_analysis, indent=2) if competitor_analysis else "No competitor data available"}
=== PHASE 1: WEBSITE ANALYSIS INTELLIGENCE ===
WRITING STYLE (for research depth mapping):
{json.dumps(writing_style, indent=2) if writing_style else "Not available"}
CONTENT TYPE (for preset generation):
{json.dumps(content_type, indent=2) if content_type else "Not available"}
EXTRACTED TOPICS FROM WEBSITE CONTENT:
{json.dumps(extracted_topics, indent=2) if extracted_topics else "No topics extracted"}
EXTRACTED KEYWORDS FROM WEBSITE CONTENT:
{json.dumps(extracted_keywords[:20], indent=2) if extracted_keywords else "No keywords extracted"}
=== PHASE 2: WRITING PATTERNS & STYLE INTELLIGENCE ===
STYLE PATTERNS (for research angles):
{json.dumps(style_patterns, indent=2) if style_patterns else "Not available"}
EXTRACTED WRITING PATTERNS:
{json.dumps(extracted_patterns, indent=2) if extracted_patterns else "No patterns extracted"}
CONTENT CHARACTERISTICS (for keyword sophistication):
{json.dumps(content_characteristics, indent=2) if content_characteristics else "Not available"}
VOCABULARY LEVEL:
{vocabulary_level}
STYLE GUIDELINES (for query enhancement):
{json.dumps(style_guidelines, indent=2) if style_guidelines else "Not available"}
EXTRACTED GUIDELINES:
{json.dumps(extracted_guidelines, indent=2) if extracted_guidelines else "No guidelines extracted"}
=== PHASE 3: COMPREHENSIVE ANALYSIS & MAPPING ===
CRAWL ANALYSIS (Full Content Intelligence):
{json.dumps(crawl_analysis, indent=2) if crawl_analysis else "No crawl analysis available"}
WRITING STYLE COMPREHENSIVE MAPPING:
{json.dumps(writing_style_mapping, indent=2) if writing_style_mapping else "No style mapping available"}
CONTENT THEMES (Extracted from Website):
{json.dumps(content_themes, indent=2) if content_themes else "No themes extracted"}
=== RESEARCH PERSONA GENERATION REQUIREMENTS ===
Generate a comprehensive research persona in JSON format with the following structure:
1. DEFAULT VALUES:
- "default_industry": Extract from core_persona.industry, business_info.industry, or website_analysis target_audience. If none available, infer from content patterns in website_analysis or research_preferences. Never use "General" - always provide a specific industry based on context.
- "default_target_audience": Extract from core_persona.target_audience, website_analysis.target_audience, or business_info.target_audience. Be specific and descriptive.
- "default_research_mode": **PHASE 3 ENHANCEMENT** - Use comprehensive writing_style_mapping:
* **PRIMARY**: Use writing_style_mapping.research_depth_preference (from comprehensive analysis)
* **SECONDARY**: Map from writing_style.complexity:
- If writing_style.complexity == "high": Use "comprehensive" (deep research needed)
- If writing_style.complexity == "medium": Use "targeted" (balanced research)
- If writing_style.complexity == "low": Use "basic" (quick research)
* **FALLBACK**: Use research_preferences.research_depth if complexity not available
* This ensures research depth matches the user's writing sophistication level and comprehensive style analysis
- "default_provider": **PHASE 3 ENHANCEMENT** - Use writing_style_mapping.provider_preference:
* **PRIMARY**: Use writing_style_mapping.provider_preference (from comprehensive style analysis)
* **SECONDARY**: Suggest based on user's typical research needs:
- Academic/research users: "exa" (semantic search, papers)
- News/current events users: "tavily" (real-time, AI answers)
- General business users: "exa" (better for content creation)
* **DEFAULT**: "exa" (generally better for content creators)
2. KEYWORD INTELLIGENCE:
- "suggested_keywords": **PHASE 1 ENHANCEMENT** - Prioritize extracted keywords from crawl_result:
* First, use extracted_keywords from website content (top 8-10 most relevant)
* Then, supplement with keywords from user's industry, interests (from core_persona), and content goals
* Total: 8-12 keywords, with at least 50% from extracted_keywords if available
* This ensures keywords reflect the user's actual content topics
- "keyword_expansion_patterns": **PHASE 2 ENHANCEMENT** - Create a dictionary mapping common keywords to expanded, industry-specific terms based on vocabulary_level:
* If vocabulary_level == "advanced": Use sophisticated, technical, industry-specific terminology
Example: {{"AI": ["machine learning algorithms", "neural network architectures", "deep learning frameworks", "algorithmic intelligence systems"], "tools": ["enterprise software platforms", "integrated development environments", "cloud-native solutions"]}}
* If vocabulary_level == "medium": Use balanced, professional terminology
Example: {{"AI": ["artificial intelligence", "automated systems", "smart technology", "intelligent automation"], "tools": ["software solutions", "digital platforms", "business applications"]}}
* If vocabulary_level == "simple": Use accessible, beginner-friendly terminology
Example: {{"AI": ["smart technology", "automated tools", "helpful software", "intelligent helpers"], "tools": ["apps", "software", "platforms", "online services"]}}
* Include 10-15 patterns, matching the user's vocabulary sophistication level
* Focus on industry-specific terminology from the user's domain, but at the appropriate complexity level
3. PROVIDER-SPECIFIC OPTIMIZATION:
- "suggested_exa_domains": List 4-6 authoritative domains for the user's industry (e.g., Healthcare: ["pubmed.gov", "nejm.org", "thelancet.com"]).
- "suggested_exa_category": Suggest appropriate Exa category based on industry:
- Healthcare/Science: "research paper"
- Finance: "financial report"
- Technology/Business: "company" or "news"
- Social Media/Marketing: "tweet" or "linkedin profile"
- Default: null (empty string for all categories)
- "suggested_exa_search_type": Suggest Exa search algorithm:
- Academic/research content: "neural" (semantic understanding)
- Current news/trends: "fast" (speed optimized)
- General research: "auto" (balanced)
- Code/technical: "neural"
- "suggested_tavily_topic": Choose based on content type:
- Financial content: "finance"
- News/current events: "news"
- General research: "general"
- "suggested_tavily_search_depth": Choose based on research needs:
- Quick overview: "basic" (1 credit, faster)
- In-depth analysis: "advanced" (2 credits, more comprehensive)
- Breaking news: "fast" (speed optimized)
- "suggested_tavily_include_answer": AI-generated answers:
- For factual queries needing quick answers: "advanced"
- For research summaries: "basic"
- When building custom content: "false" (use raw results)
- "suggested_tavily_time_range": Time filtering:
- Breaking news: "day"
- Recent developments: "week"
- Industry analysis: "month"
- Historical research: null (no time limit)
- "suggested_tavily_raw_content_format": Raw content for LLM processing:
- For blog content creation: "markdown" (structured)
- For simple text extraction: "text"
- No raw content needed: "false"
- "provider_recommendations": Map use cases to best providers:
{{"trends": "tavily", "deep_research": "exa", "factual": "google", "news": "tavily", "academic": "exa"}}
4. RESEARCH ANGLES:
- "research_angles": **PHASE 2 ENHANCEMENT** - Generate 5-8 alternative research angles/focuses based on:
* **PRIMARY SOURCE**: Extract from extracted_patterns (writing patterns from style_patterns):
- If "comparison" in patterns: "Compare {{topic}} solutions and alternatives"
- If "how-to" or "tutorial" in patterns: "Step-by-step guide to {{topic}} implementation"
- If "case-study" or "case_study" in patterns: "Real-world {{topic}} case studies and success stories"
- If "trend-analysis" or "trends" in patterns: "Latest {{topic}} trends and future predictions"
- If "best-practices" or "best_practices" in patterns: "{{topic}} best practices and industry standards"
- If "review" or "evaluation" in patterns: "{{topic}} review and evaluation criteria"
- If "problem-solving" in patterns: "{{topic}} problem-solving strategies and solutions"
* **SECONDARY SOURCES** (if patterns not available):
- User's pain points and challenges (from core_persona.identity or core_persona)
- Industry trends and opportunities (from website_analysis or business_info)
- Content goals (from research_preferences.content_types)
- Audience interests (from core_persona or website_analysis.target_audience)
- Competitive landscape (if competitor_analysis exists, include competitive angles)
* Make angles specific to the user's industry and actionable for content creation
* Use the same language style and structure as the user's writing patterns
5. QUERY ENHANCEMENT:
- "query_enhancement_rules": **PHASE 2 ENHANCEMENT** - Create templates for improving vague user queries based on extracted_guidelines:
* **PRIMARY SOURCE**: Use extracted_guidelines (from style_guidelines) to create enhancement rules:
- If guidelines include "Use specific examples": {{"vague_query": "Research: {{query}} with specific examples and case studies"}}
- If guidelines include "Include data points" or "statistics": {{"general_query": "Research: {{query}} including statistics, metrics, and data analysis"}}
- If guidelines include "Reference industry standards": {{"basic_query": "Research: {{query}} with industry benchmarks and best practices"}}
- If guidelines include "Cite authoritative sources": {{"factual_query": "Research: {{query}} from authoritative sources and expert opinions"}}
- If guidelines include "Provide actionable insights": {{"theoretical_query": "Research: {{query}} with actionable strategies and implementation steps"}}
- If guidelines include "Compare alternatives": {{"single_item_query": "Research: Compare {{query}} alternatives and evaluate options"}}
* **FALLBACK PATTERNS** (if guidelines not available):
{{"vague_ai": "Research: AI applications in {{industry}} for {{audience}}", "vague_tools": "Compare top {{industry}} tools", "vague_trends": "Research latest {{industry}} trends and developments", ...}}
* Include 5-8 enhancement patterns
* Match the enhancement style to the user's writing guidelines and preferences
6. RECOMMENDED PRESETS:
- "recommended_presets": **PHASE 3 ENHANCEMENT** - Generate 3-5 personalized research preset templates using comprehensive analysis:
* **USE CONTENT THEMES**: If content_themes available, create at least one preset per major theme (up to 3 themes)
- Example: If themes include ["AI automation", "content marketing", "SEO strategies"], create presets for each
- Use theme names in preset keywords: "Research latest {theme} trends and best practices"
* **USE CRAWL ANALYSIS**: Leverage crawl_analysis.content_categories and crawl_analysis.main_topics for preset generation
- Create presets that match the user's actual website content categories
- Use main_topics for preset keywords and descriptions
* **CONTENT TYPE BASED**: Generate presets based on content_type (from Phase 1):
* **Content-Type-Specific Presets**: Use content_type.primary_type and content_type.secondary_types to create presets:
- If primary_type == "blog": Create "Blog Topic Research" preset with trending topics
- If primary_type == "article": Create "Article Research" preset with in-depth analysis
- If primary_type == "case_study": Create "Case Study Research" preset with real-world examples
- If primary_type == "tutorial": Create "Tutorial Research" preset with step-by-step guides
- If "tutorial" in secondary_types: Add "How-To Guide Research" preset
- If "comparison" in secondary_types or style_patterns: Add "Comparison Research" preset
- If content_type.purpose == "thought_leadership": Create "Thought Leadership Research" with expert insights
- If content_type.purpose == "education": Create "Educational Content Research" preset
* **Use Extracted Topics**: If extracted_topics available, create at least one preset using actual website topics:
- "Latest {extracted_topic} Trends" preset
- "{extracted_topic} Best Practices" preset
* Each preset should include:
- name: Descriptive, action-oriented name that clearly indicates what research will be done
* Use research_angles as inspiration for preset names (e.g., "Compare {Industry} Tools", "{Industry} ROI Analysis")
* If competitor_analysis exists, create at least one competitive analysis preset (e.g., "Competitive Landscape Analysis")
* Make names specific and actionable, not generic
* **NEW**: Include content type in name when relevant (e.g., "Blog: {Industry} Trends", "Tutorial: {Topic} Guide")
- keywords: Research query string that is:
* **NEW**: Use extracted_topics and extracted_keywords when available for more relevant queries
* Specific and detailed (not vague like "AI tools")
* Industry-focused (includes industry context)
* Audience-aware (considers target audience needs)
* Actionable (user can immediately understand what research will provide)
* Examples: "Research latest AI-powered marketing automation platforms for B2B SaaS companies" (GOOD)
* Avoid: "AI tools" or "marketing research" (TOO VAGUE)
- industry: User's industry (from business_info or inferred)
- target_audience: User's target audience (from business_info or inferred)
- research_mode: "basic", "comprehensive", or "targeted" based on:
* **NEW**: Also consider content_type.purpose:
- "thought_leadership""comprehensive" (needs deep research)
- "education""comprehensive" (needs thorough coverage)
- "marketing""targeted" (needs specific insights)
- "entertainment""basic" (needs quick facts)
* "comprehensive" for deep analysis, trends, competitive research
* "targeted" for specific questions, quick insights
* "basic" for simple fact-finding
- config: Complete ResearchConfig object with:
* provider: Use suggested_exa_category to determine if "exa" or "tavily" is better
* exa_category: Use suggested_exa_category if available
* exa_include_domains: Use suggested_exa_domains if available (limit to 3-5 most relevant)
* exa_search_type: Use suggested_exa_search_type if available
* max_sources: 15-25 for comprehensive, 10-15 for targeted, 8-12 for basic
* include_competitors: true if competitor_analysis exists and preset is about competitive research
* include_trends: true for trend-focused presets
* include_statistics: true for data-driven research
* include_expert_quotes: true for comprehensive research or thought_leadership content
- description: Brief (1-2 sentences) explaining what this preset researches and why it's valuable
- icon: Optional emoji that represents the preset (e.g., "📊" for trends, "🎯" for targeted, "🔍" for analysis, "📝" for blog, "📚" for tutorial)
- gradient: Optional CSS gradient for visual appeal
PRESET GENERATION GUIDELINES:
- **PHASE 1 PRIORITY**: Create presets that match the user's actual content types (from content_type)
- Use extracted_topics to create presets based on actual website content
- Create presets that the user would actually want to use for their content creation
- Use research_angles to inspire preset names and keywords
- If competitor_analysis has data, create at least one competitive analysis preset
- Make each preset unique with different research focus (trends, tools, best practices, competitive, etc.)
- Ensure keywords are detailed enough to generate meaningful research
- Vary research_mode across presets to offer different depth levels
- Use industry-specific terminology in preset names and keywords
7. RESEARCH PREFERENCES:
- "research_preferences": Extract and structure research preferences from onboarding:
- research_depth: From research_preferences.research_depth
- content_types: From research_preferences.content_types
- auto_research: From research_preferences.auto_research
- factual_content: From research_preferences.factual_content
=== OUTPUT REQUIREMENTS ===
Return a valid JSON object matching this exact structure:
{{
"default_industry": "string",
"default_target_audience": "string",
"default_research_mode": "basic" | "comprehensive" | "targeted",
"default_provider": "google" | "exa",
"suggested_keywords": ["keyword1", "keyword2", ...],
"keyword_expansion_patterns": {{
"keyword": ["expansion1", "expansion2", ...]
}},
"suggested_exa_domains": ["domain1.com", "domain2.com", ...],
"suggested_exa_category": "string or null",
"suggested_exa_search_type": "auto | neural | keyword | fast | deep",
"suggested_tavily_topic": "general | news | finance",
"suggested_tavily_search_depth": "basic | advanced | fast | ultra-fast",
"suggested_tavily_include_answer": "false | basic | advanced",
"suggested_tavily_time_range": "day | week | month | year or null",
"suggested_tavily_raw_content_format": "false | markdown | text",
"provider_recommendations": {{
"trends": "tavily",
"deep_research": "exa",
"factual": "google"
}},
"research_angles": ["angle1", "angle2", ...],
"query_enhancement_rules": {{
"pattern": "template"
}},
"recommended_presets": [
{{
"name": "string",
"keywords": "string",
"industry": "string",
"target_audience": "string",
"research_mode": "basic" | "comprehensive" | "targeted",
"config": {{
"mode": "basic" | "comprehensive" | "targeted",
"provider": "google" | "exa",
"max_sources": 10 | 15 | 12,
"include_statistics": true | false,
"include_expert_quotes": true | false,
"include_competitors": true | false,
"include_trends": true | false,
"exa_category": "string or null",
"exa_include_domains": ["domain1.com", ...],
"exa_search_type": "auto" | "keyword" | "neural"
}},
"description": "string"
}}
],
"research_preferences": {{
"research_depth": "string",
"content_types": ["type1", "type2", ...],
"auto_research": true | false,
"factual_content": true | false
}},
"version": "1.0",
"confidence_score": 85.0
}}
=== IMPORTANT INSTRUCTIONS ===
1. Be highly specific and personalized - use actual data from the user's business, persona, and preferences.
2. NEVER use "General" for industry or target_audience - always infer or create specific categories based on available context.
3. For minimal data scenarios:
- If industry is unclear, infer from research_preferences.content_types or website_analysis.content_characteristics
- If target_audience is unclear, infer from writing_style patterns or content goals
- Use business_info to fill gaps when persona_data is incomplete
4. Generate industry-specific intelligence even with limited data:
- For content creators: assume "Content Marketing" or "Digital Publishing"
- For business users: assume "Business Consulting" or "Professional Services"
- For technical users: assume "Technology" or "Software Development"
5. Ensure all suggested keywords, domains, and angles are relevant to the user's industry and audience.
6. Generate realistic, actionable presets that the user would actually want to use.
7. Confidence score should reflect data richness (0-100): higher if rich onboarding data, lower if minimal data.
8. Return ONLY valid JSON - no markdown formatting, no explanatory text.
Generate the research persona now:
"""
return prompt
def _extract_topics_from_crawl(self, crawl_result: Dict[str, Any]) -> List[str]:
"""
Extract topics from crawl_result JSON data.
Args:
crawl_result: Dictionary containing crawled website data
Returns:
List of extracted topics (max 15)
"""
topics = []
if not crawl_result:
return topics
try:
# Try to extract from common crawl result structures
# Method 1: Direct topics field
if isinstance(crawl_result.get('topics'), list):
topics.extend(crawl_result['topics'][:10])
# Method 2: Extract from headings
if isinstance(crawl_result.get('headings'), list):
headings = crawl_result['headings']
# Filter out common non-topic headings
filtered_headings = [
h for h in headings[:15]
if h and len(h.strip()) > 3
and h.lower() not in ['home', 'about', 'contact', 'menu', 'navigation', 'footer', 'header']
]
topics.extend(filtered_headings)
# Method 3: Extract from page titles
if isinstance(crawl_result.get('titles'), list):
titles = crawl_result['titles']
topics.extend([t for t in titles[:10] if t and len(t.strip()) > 3])
# Method 4: Extract from content sections
if isinstance(crawl_result.get('sections'), list):
sections = crawl_result['sections']
for section in sections[:10]:
if isinstance(section, dict):
section_title = section.get('title') or section.get('heading')
if section_title and len(section_title.strip()) > 3:
topics.append(section_title)
# Method 5: Extract from metadata
if isinstance(crawl_result.get('metadata'), dict):
meta = crawl_result['metadata']
if meta.get('title'):
topics.append(meta['title'])
if isinstance(meta.get('keywords'), list):
topics.extend(meta['keywords'][:5])
# Remove duplicates and clean
unique_topics = []
seen = set()
for topic in topics:
if topic and isinstance(topic, str):
cleaned = topic.strip()
if cleaned and cleaned.lower() not in seen:
seen.add(cleaned.lower())
unique_topics.append(cleaned)
return unique_topics[:15] # Limit to 15 topics
except Exception as e:
logger.debug(f"Error extracting topics from crawl_result: {e}")
return []
def _extract_keywords_from_crawl(self, crawl_result: Dict[str, Any]) -> List[str]:
"""
Extract keywords from crawl_result JSON data.
Args:
crawl_result: Dictionary containing crawled website data
Returns:
List of extracted keywords (max 20)
"""
keywords = []
if not crawl_result:
return keywords
try:
# Method 1: Direct keywords field
if isinstance(crawl_result.get('keywords'), list):
keywords.extend(crawl_result['keywords'][:15])
# Method 2: Extract from metadata keywords
if isinstance(crawl_result.get('metadata'), dict):
meta = crawl_result['metadata']
if isinstance(meta.get('keywords'), list):
keywords.extend(meta['keywords'][:10])
if meta.get('description'):
# Extract potential keywords from description (simple word extraction)
desc = meta['description']
words = [w.strip() for w in desc.split() if len(w.strip()) > 4]
keywords.extend(words[:5])
# Method 3: Extract from tags
if isinstance(crawl_result.get('tags'), list):
keywords.extend(crawl_result['tags'][:10])
# Method 4: Extract from content (simple frequency-based, if available)
if isinstance(crawl_result.get('content'), str):
content = crawl_result['content']
# Simple extraction: words that appear multiple times and are > 4 chars
words = content.lower().split()
word_freq = {}
for word in words:
cleaned = ''.join(c for c in word if c.isalnum())
if len(cleaned) > 4:
word_freq[cleaned] = word_freq.get(cleaned, 0) + 1
# Get top keywords by frequency
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
keywords.extend([word for word, freq in sorted_words[:10] if freq > 1])
# Remove duplicates and clean
unique_keywords = []
seen = set()
for keyword in keywords:
if keyword and isinstance(keyword, str):
cleaned = keyword.strip().lower()
if cleaned and len(cleaned) > 2 and cleaned not in seen:
seen.add(cleaned)
unique_keywords.append(keyword.strip())
return unique_keywords[:20] # Limit to 20 keywords
except Exception as e:
logger.debug(f"Error extracting keywords from crawl_result: {e}")
return []
def _extract_writing_patterns(self, style_patterns: Dict[str, Any]) -> List[str]:
"""
Extract writing patterns from style_patterns JSON data.
Args:
style_patterns: Dictionary containing writing patterns analysis
Returns:
List of extracted patterns (max 10)
"""
patterns = []
if not style_patterns:
return patterns
try:
# Method 1: Direct patterns field
if isinstance(style_patterns.get('patterns'), list):
patterns.extend(style_patterns['patterns'][:10])
# Method 2: Common patterns field
if isinstance(style_patterns.get('common_patterns'), list):
patterns.extend(style_patterns['common_patterns'][:10])
# Method 3: Writing patterns field
if isinstance(style_patterns.get('writing_patterns'), list):
patterns.extend(style_patterns['writing_patterns'][:10])
# Method 4: Content structure patterns
if isinstance(style_patterns.get('content_structure'), dict):
structure = style_patterns['content_structure']
if isinstance(structure.get('patterns'), list):
patterns.extend(structure['patterns'][:5])
# Method 5: Extract from analysis field
if isinstance(style_patterns.get('analysis'), dict):
analysis = style_patterns['analysis']
if isinstance(analysis.get('identified_patterns'), list):
patterns.extend(analysis['identified_patterns'][:10])
# Normalize patterns (lowercase, remove duplicates)
normalized_patterns = []
seen = set()
for pattern in patterns:
if pattern and isinstance(pattern, str):
cleaned = pattern.strip().lower().replace('_', '-').replace(' ', '-')
if cleaned and cleaned not in seen:
seen.add(cleaned)
normalized_patterns.append(cleaned)
return normalized_patterns[:10] # Limit to 10 patterns
except Exception as e:
logger.debug(f"Error extracting writing patterns: {e}")
return []
def _extract_style_guidelines(self, style_guidelines: Dict[str, Any]) -> List[str]:
"""
Extract style guidelines from style_guidelines JSON data.
Args:
style_guidelines: Dictionary containing generated style guidelines
Returns:
List of extracted guidelines (max 15)
"""
guidelines = []
if not style_guidelines:
return guidelines
try:
# Method 1: Direct guidelines field
if isinstance(style_guidelines.get('guidelines'), list):
guidelines.extend(style_guidelines['guidelines'][:15])
# Method 2: Recommendations field
if isinstance(style_guidelines.get('recommendations'), list):
guidelines.extend(style_guidelines['recommendations'][:15])
# Method 3: Best practices field
if isinstance(style_guidelines.get('best_practices'), list):
guidelines.extend(style_guidelines['best_practices'][:10])
# Method 4: Tone recommendations
if isinstance(style_guidelines.get('tone_recommendations'), list):
guidelines.extend(style_guidelines['tone_recommendations'][:5])
# Method 5: Structure guidelines
if isinstance(style_guidelines.get('structure_guidelines'), list):
guidelines.extend(style_guidelines['structure_guidelines'][:5])
# Method 6: Vocabulary suggestions
if isinstance(style_guidelines.get('vocabulary_suggestions'), list):
guidelines.extend(style_guidelines['vocabulary_suggestions'][:5])
# Method 7: Engagement tips
if isinstance(style_guidelines.get('engagement_tips'), list):
guidelines.extend(style_guidelines['engagement_tips'][:5])
# Method 8: Audience considerations
if isinstance(style_guidelines.get('audience_considerations'), list):
guidelines.extend(style_guidelines['audience_considerations'][:5])
# Method 9: SEO optimization (if available)
if isinstance(style_guidelines.get('seo_optimization'), list):
guidelines.extend(style_guidelines['seo_optimization'][:3])
# Method 10: Conversion optimization (if available)
if isinstance(style_guidelines.get('conversion_optimization'), list):
guidelines.extend(style_guidelines['conversion_optimization'][:3])
# Remove duplicates and clean
unique_guidelines = []
seen = set()
for guideline in guidelines:
if guideline and isinstance(guideline, str):
cleaned = guideline.strip()
# Normalize for comparison (lowercase, remove extra spaces)
normalized = ' '.join(cleaned.lower().split())
if cleaned and normalized not in seen and len(cleaned) > 5:
seen.add(normalized)
unique_guidelines.append(cleaned)
return unique_guidelines[:15] # Limit to 15 guidelines
except Exception as e:
logger.debug(f"Error extracting style guidelines: {e}")
return []
def get_json_schema(self) -> Dict[str, Any]:
    """Return the JSON schema used to request a structured LLM response.

    The schema is derived from the ``ResearchPersona`` Pydantic model and
    is passed to ``llm_text_gen(json_struct=...)`` so the provider returns
    JSON matching the persona structure.

    Returns:
        JSON-schema dictionary for ``ResearchPersona``.
    """
    # Imported lazily to keep the prompt builder importable without the
    # models package at module-import time. (Previously also imported
    # ResearchPreset, which was never used — removed.)
    from models.research_persona_models import ResearchPersona
    # NOTE(review): `.schema()` is the Pydantic v1 API; on Pydantic v2 the
    # equivalent is `model_json_schema()` — confirm the pinned version.
    return ResearchPersona.schema()

View File

@@ -0,0 +1,194 @@
"""
Research Persona Scheduler
Handles scheduled generation of research personas after onboarding.
"""
from datetime import datetime, timedelta, timezone
from typing import Dict, Any
from loguru import logger
from services.database import get_db_session
from services.research.research_persona_service import ResearchPersonaService
from models.scheduler_models import SchedulerEventLog
async def generate_research_persona_task(user_id: str):
    """
    Generate a research persona for a user as a scheduled background task.

    Called by the scheduler (normally ~20 minutes after onboarding
    completion, see ``schedule_research_persona_generation``). The task is
    deliberately fire-and-forget: failures are logged to the scheduler
    event log for dashboard visibility but are NEVER retried, because each
    generation attempt makes an expensive LLM API call.

    Args:
        user_id: User ID (Clerk string)
    """
    db = None
    try:
        logger.info(f"Scheduled research persona generation started for user {user_id}")
        # Get database session (owned by this task; closed in `finally`).
        db = get_db_session()
        if not db:
            logger.error(f"Failed to get database session for research persona generation (user: {user_id})")
            return
        # Generate research persona
        persona_service = ResearchPersonaService(db_session=db)
        # Idempotency guard: if a persona already exists, skip entirely so
        # no unnecessary (expensive) API call is made. Freshness/TTL is not
        # checked here — any existing persona short-circuits the task.
        persona_data = persona_service._get_persona_data_record(user_id)
        if persona_data and persona_data.research_persona:
            logger.info(f"Research persona already exists for user {user_id}, skipping generation")
            return
        # NOTE(review): event_date below uses naive datetime.utcnow() while
        # the scheduler creates timezone-aware run dates — confirm the
        # SchedulerEventLog column expects naive UTC.
        start_time = datetime.utcnow()
        try:
            research_persona = persona_service.get_or_generate(user_id, force_refresh=False)
            execution_time = (datetime.utcnow() - start_time).total_seconds()
            if research_persona:
                logger.info(f"✅ Scheduled research persona generation completed for user {user_id}")
                # Log success to scheduler event log for dashboard; logging
                # failures are non-fatal (warn + rollback only).
                try:
                    event_log = SchedulerEventLog(
                        event_type='job_completed',
                        event_date=start_time,
                        job_id=f"research_persona_{user_id}",
                        job_type='one_time',
                        user_id=user_id,
                        event_data={
                            'job_function': 'generate_research_persona_task',
                            'execution_time_seconds': execution_time,
                            'status': 'success'
                        }
                    )
                    db.add(event_log)
                    db.commit()
                except Exception as log_error:
                    logger.warning(f"Failed to log persona generation success to scheduler event log: {log_error}")
                    if db:
                        db.rollback()
            else:
                # get_or_generate returned None: the generation pipeline ran
                # (API call consumed) but produced nothing usable.
                error_msg = (
                    f"Scheduled research persona generation FAILED for user {user_id}. "
                    f"Expensive API call was made but generation failed. "
                    f"Will NOT automatically retry to prevent wasteful API calls."
                )
                logger.error(f"{error_msg}")
                # Log failure to scheduler event log for dashboard visibility
                try:
                    event_log = SchedulerEventLog(
                        event_type='job_failed',
                        event_date=start_time,
                        job_id=f"research_persona_{user_id}",
                        job_type='one_time',
                        user_id=user_id,
                        error_message=error_msg,
                        event_data={
                            'job_function': 'generate_research_persona_task',
                            'execution_time_seconds': execution_time,
                            'status': 'failed',
                            'failure_reason': 'generation_returned_none',
                            'expensive_api_call': True
                        }
                    )
                    db.add(event_log)
                    db.commit()
                except Exception as log_error:
                    logger.warning(f"Failed to log persona generation failure to scheduler event log: {log_error}")
                    if db:
                        db.rollback()
                # DO NOT reschedule - this prevents infinite retry loops
                # User can manually trigger generation from frontend if needed
        except Exception as gen_error:
            # Generation raised (includes re-raised HTTPExceptions such as
            # 429 subscription limits bubbling out of the service layer).
            execution_time = (datetime.utcnow() - start_time).total_seconds()
            error_msg = (
                f"Exception during scheduled research persona generation for user {user_id}: {str(gen_error)}. "
                f"Expensive API call may have been made. Will NOT automatically retry."
            )
            logger.error(f"{error_msg}")
            # Log exception to scheduler event log for dashboard visibility
            try:
                event_log = SchedulerEventLog(
                    event_type='job_failed',
                    event_date=start_time,
                    job_id=f"research_persona_{user_id}",  # Match scheduled job ID format
                    job_type='one_time',
                    user_id=user_id,
                    error_message=error_msg,
                    event_data={
                        'job_function': 'generate_research_persona_task',
                        'execution_time_seconds': execution_time,
                        'status': 'failed',
                        'failure_reason': 'exception',
                        'exception_type': type(gen_error).__name__,
                        'exception_message': str(gen_error),
                        'expensive_api_call': True
                    }
                )
                db.add(event_log)
                db.commit()
            except Exception as log_error:
                logger.warning(f"Failed to log persona generation exception to scheduler event log: {log_error}")
                if db:
                    db.rollback()
            # DO NOT reschedule - prevent infinite retry loops
    except Exception as e:
        # Catch-all so a background task never propagates into the scheduler.
        logger.error(f"Error in scheduled research persona generation for user {user_id}: {e}")
    finally:
        # Always release the DB session this task opened.
        if db:
            try:
                db.close()
            except Exception as e:
                logger.error(f"Error closing database session: {e}")
def schedule_research_persona_generation(user_id: str, delay_minutes: int = 20) -> str:
    """Queue a one-time scheduler job that builds the user's research persona.

    Args:
        user_id: User ID (Clerk string)
        delay_minutes: Delay in minutes before generating persona (default: 20)

    Returns:
        The scheduler job ID.

    Raises:
        Exception: Propagates any scheduling failure after logging it.
    """
    try:
        # Imported lazily to avoid a circular import with the scheduler package.
        from services.scheduler import get_scheduler

        # Timezone-aware UTC run time: now + requested delay.
        run_date = datetime.now(timezone.utc) + timedelta(minutes=delay_minutes)
        # Stable job id (no timestamp suffix) so a restart can find and
        # restore the job with its original scheduled time. Clerk user ids
        # already carry the "user_" prefix, so none is added here.
        job_id = f"research_persona_{user_id}"
        scheduled_job_id = get_scheduler().schedule_one_time_task(
            func=generate_research_persona_task,
            run_date=run_date,
            job_id=job_id,
            kwargs={"user_id": user_id},
            replace_existing=True,
        )
        logger.info(
            f"Scheduled research persona generation for user {user_id} "
            f"at {run_date} (job_id: {scheduled_job_id})"
        )
        return scheduled_job_id
    except Exception as e:
        logger.error(f"Failed to schedule research persona generation for user {user_id}: {e}")
        raise

View File

@@ -0,0 +1,421 @@
"""
Research Persona Service
Handles generation, caching, and retrieval of AI-powered research personas.
"""
from typing import Dict, Any, Optional
from datetime import datetime, timedelta
from loguru import logger
from fastapi import HTTPException
from services.database import get_db_session
from models.onboarding import PersonaData, OnboardingSession
from models.research_persona_models import ResearchPersona
from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
from services.llm_providers.main_text_generation import llm_text_gen
from services.onboarding.database_service import OnboardingDatabaseService
from services.persona_data_service import PersonaDataService
class ResearchPersonaService:
    """Service for generating, caching, and retrieving research personas.

    Wraps the full lifecycle: collecting onboarding data, prompting the
    LLM for a structured persona, validating it, and persisting it on the
    user's ``PersonaData`` row with a time-based cache.
    """
    # Cached personas are considered fresh for this many days.
    CACHE_TTL_DAYS = 7  # 7-day cache TTL
    def __init__(self, db_session=None):
        # Reuse an injected session (e.g. request-scoped) or open a new one.
        self.db = db_session or get_db_session()
        # Builds the LLM prompt and JSON schema for persona generation.
        self.prompt_builder = ResearchPersonaPromptBuilder()
        # Data-access helpers share this service's DB session.
        self.onboarding_service = OnboardingDatabaseService(db=self.db)
        self.persona_data_service = PersonaDataService(db_session=self.db)
def get_cached_only(
    self,
    user_id: str
) -> Optional[ResearchPersona]:
    """Return the user's research persona from cache only — never generate.

    Intended for config endpoints: makes no LLM call and therefore never
    triggers rate-limit / subscription checks.

    Args:
        user_id: User ID (Clerk string)

    Returns:
        The cached ``ResearchPersona`` when present and within TTL,
        otherwise ``None``.
    """
    try:
        record = self._get_persona_data_record(user_id)
        if not record:
            logger.debug(f"No persona data found for user {user_id}")
            return None
        # Serve the cached persona only when both the TTL holds and the
        # stored JSON payload exists.
        if self.is_cache_valid(record) and record.research_persona:
            try:
                logger.debug(f"Returning cached research persona for user {user_id}")
                return ResearchPersona(**record.research_persona)
            except Exception as parse_error:
                logger.warning(f"Failed to parse cached research persona: {parse_error}")
                return None
        # Stale or absent — deliberately do NOT generate here.
        logger.debug(f"No valid cached research persona for user {user_id}")
        return None
    except Exception as e:
        logger.error(f"Error getting cached research persona for user {user_id}: {e}")
        return None
def get_or_generate(
    self,
    user_id: str,
    force_refresh: bool = False
) -> Optional[ResearchPersona]:
    """
    Get the user's research persona, generating one if missing or expired.

    Serves the cached persona when it is within TTL and parseable;
    otherwise (or when ``force_refresh`` is set) runs a fresh LLM-backed
    generation and persists the result.

    Args:
        user_id: User ID (Clerk string)
        force_refresh: If True, regenerate even if cache is valid

    Returns:
        ResearchPersona if successful, None otherwise

    Raises:
        HTTPException: Re-raised unchanged (e.g. 429 subscription limits)
            so API callers see the proper status code.
    """
    try:
        # Get persona data record
        persona_data = self._get_persona_data_record(user_id)
        if not persona_data:
            logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
            return None
        # Check cache if not forcing refresh
        if not force_refresh and self.is_cache_valid(persona_data):
            if persona_data.research_persona:
                logger.info(f"Using cached research persona for user {user_id}")
                try:
                    return ResearchPersona(**persona_data.research_persona)
                except Exception as e:
                    # Corrupt/outdated cached JSON: don't fail — regenerate.
                    logger.warning(f"Failed to parse cached research persona: {e}, regenerating...")
                    # Fall through to regeneration
            else:
                logger.info(f"Research persona missing for user {user_id}, generating...")
        else:
            if force_refresh:
                logger.info(f"Forcing refresh of research persona for user {user_id}")
            else:
                logger.info(f"Cache expired for user {user_id}, regenerating...")
        # Generate new research persona (expensive LLM call).
        try:
            research_persona = self.generate_research_persona(user_id)
        except HTTPException:
            # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
            raise
        if research_persona:
            # Save to database; a save failure is logged but the freshly
            # generated persona is still returned to the caller.
            if self.save_research_persona(user_id, research_persona):
                logger.info(f"✅ Research persona generated and saved for user {user_id}")
            else:
                logger.warning(f"Failed to save research persona for user {user_id}")
            return research_persona
        else:
            # Log detailed error for debugging expensive failures
            logger.error(
                f"❌ Failed to generate research persona for user {user_id} - "
                f"This is an expensive failure (API call consumed). Check logs above for details."
            )
            # Don't return None silently - let the caller know this failed
            return None
    except HTTPException:
        # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
        raise
    except Exception as e:
        logger.error(f"Error getting/generating research persona for user {user_id}: {e}")
        return None
def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
    """
    Generate a new research persona for the user via a structured LLM call.

    Pipeline: collect onboarding data -> build prompt + JSON schema ->
    call ``llm_text_gen`` -> parse/validate the response into a
    ``ResearchPersona``. Does NOT persist the result (see
    ``save_research_persona``).

    Args:
        user_id: User ID (Clerk string)

    Returns:
        ResearchPersona if successful, None otherwise

    Raises:
        HTTPException: Propagated from the LLM layer (and RuntimeErrors
            from subscription limits are converted to HTTP 429).
    """
    try:
        logger.info(f"Generating research persona for user {user_id}")
        # Collect onboarding data
        onboarding_data = self._collect_onboarding_data(user_id)
        if not onboarding_data:
            logger.warning(f"Insufficient onboarding data for user {user_id}")
            return None
        # Build prompt
        prompt = self.prompt_builder.build_research_persona_prompt(onboarding_data)
        # Get JSON schema for structured response
        json_schema = self.prompt_builder.get_json_schema()
        # Call LLM with structured JSON response
        logger.info(f"Calling LLM for research persona generation (user: {user_id})")
        try:
            response_text = llm_text_gen(
                prompt=prompt,
                json_struct=json_schema,
                user_id=user_id
            )
        except HTTPException:
            # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
            logger.warning(f"HTTPException during LLM call for user {user_id} - re-raising")
            raise
        except RuntimeError as e:
            # Re-raise RuntimeError (subscription limits) as HTTPException
            logger.warning(f"RuntimeError during LLM call for user {user_id}: {e}")
            raise HTTPException(status_code=429, detail=str(e))
        if not response_text:
            logger.error("Empty response from LLM")
            return None
        # Parse JSON response
        import json
        try:
            # When json_struct is provided, llm_text_gen may return a dict directly
            if isinstance(response_text, dict):
                # Already parsed, use directly
                persona_dict = response_text
            elif isinstance(response_text, str):
                # Handle case where LLM returns markdown-wrapped JSON or plain JSON string
                # (strips leading ```json / ``` fences and a trailing ``` fence).
                response_text = response_text.strip()
                if response_text.startswith("```json"):
                    response_text = response_text[7:]
                if response_text.startswith("```"):
                    response_text = response_text[3:]
                if response_text.endswith("```"):
                    response_text = response_text[:-3]
                response_text = response_text.strip()
                persona_dict = json.loads(response_text)
            else:
                logger.error(f"Unexpected response type from LLM: {type(response_text)}")
                return None
            # Add generated_at timestamp (naive-UTC ISO string) before validation.
            persona_dict["generated_at"] = datetime.utcnow().isoformat()
            # Validate and create ResearchPersona
            # Log the dict structure for debugging if validation fails
            try:
                research_persona = ResearchPersona(**persona_dict)
                logger.info(f"✅ Research persona generated successfully for user {user_id}")
                return research_persona
            except Exception as validation_error:
                logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
                logger.debug(f"Persona dict keys: {list(persona_dict.keys()) if isinstance(persona_dict, dict) else 'Not a dict'}")
                logger.debug(f"Persona dict sample: {str(persona_dict)[:500]}")
                # Re-raise to be caught by outer exception handler
                raise
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse LLM response as JSON: {e}")
            logger.debug(f"Response text: {response_text[:500] if isinstance(response_text, str) else str(response_text)[:500]}")
            return None
        except Exception as e:
            # Also catches the re-raised validation error above.
            logger.error(f"Failed to create ResearchPersona from response: {e}")
            return None
    except HTTPException:
        # Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
        raise
    except Exception as e:
        logger.error(f"Error generating research persona for user {user_id}: {e}")
        return None
def is_cache_valid(self, persona_data: PersonaData) -> bool:
    """Report whether the stored research persona is still within its TTL.

    Args:
        persona_data: ``PersonaData`` row holding the generation timestamp.

    Returns:
        True when the persona was generated less than ``CACHE_TTL_DAYS``
        days ago; False when the timestamp is missing or expired.
    """
    generated_at = persona_data.research_persona_generated_at
    if not generated_at:
        # Never generated — nothing to serve from cache.
        return False
    # Naive-UTC arithmetic, matching how the timestamp is written.
    age = datetime.utcnow() - generated_at
    if age >= timedelta(days=self.CACHE_TTL_DAYS):
        logger.debug(f"Cache expired (age: {age.days} days, TTL: {self.CACHE_TTL_DAYS} days)")
        return False
    return True
def save_research_persona(
    self,
    user_id: str,
    research_persona: ResearchPersona
) -> bool:
    """
    Persist a research persona on the user's ``PersonaData`` row.

    Stores the persona as JSON and stamps the generation time, which
    ``is_cache_valid`` later uses for TTL checks.

    Args:
        user_id: User ID (Clerk string)
        research_persona: ResearchPersona to save

    Returns:
        True if successful, False otherwise (transaction rolled back).
    """
    try:
        persona_data = self._get_persona_data_record(user_id)
        if not persona_data:
            logger.error(f"No persona data record found for user {user_id}")
            return False
        # Convert ResearchPersona to dict for JSON storage
        # NOTE(review): `.dict()` is the Pydantic v1 API; on Pydantic v2
        # this is `model_dump()` — confirm the pinned version.
        persona_dict = research_persona.dict()
        # Update database record (naive-UTC timestamp, matching is_cache_valid).
        persona_data.research_persona = persona_dict
        persona_data.research_persona_generated_at = datetime.utcnow()
        self.db.commit()
        logger.info(f"✅ Research persona saved for user {user_id}")
        return True
    except Exception as e:
        logger.error(f"Error saving research persona for user {user_id}: {e}")
        self.db.rollback()
        return False
def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
    """Fetch the user's ``PersonaData`` ORM row via their onboarding session.

    Args:
        user_id: User ID (Clerk string)

    Returns:
        The ``PersonaData`` row, or None when the user has no onboarding
        session / persona row or a query error occurs.
    """
    try:
        # Ensure research_persona columns exist before querying
        # (lazy schema migration for older databases).
        self.onboarding_service._ensure_research_persona_columns(self.db)
        # Get onboarding session — PersonaData is keyed by session, not user.
        session = self.db.query(OnboardingSession).filter(
            OnboardingSession.user_id == user_id
        ).first()
        if not session:
            return None
        # Get persona data
        persona_data = self.db.query(PersonaData).filter(
            PersonaData.session_id == session.id
        ).first()
        return persona_data
    except Exception as e:
        logger.error(f"Error getting persona data record for user {user_id}: {e}")
        return None
def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
    """
    Collect all onboarding data needed for research persona generation.

    Aggregates website analysis, persona data, research preferences, and
    competitor analysis, and derives a ``business_info`` dict (industry /
    target audience) with fallbacks: persona data -> website analysis ->
    inferred defaults (flagged via ``business_info['inferred']``).

    Args:
        user_id: User ID (Clerk string)

    Returns:
        Dictionary with website_analysis, persona_data, research_preferences,
        business_info, and competitor_analysis; None when no usable data
        exists at all or on error.
    """
    try:
        # Get website analysis
        website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}
        # Get persona data
        persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}
        # Get research preferences
        research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}
        # Get business info - construct from persona data and website analysis
        business_info = {}
        # Try to extract from persona data (accepts both key spellings).
        if persona_data_dict:
            core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
            if core_persona:
                if core_persona.get('industry'):
                    business_info['industry'] = core_persona['industry']
                if core_persona.get('target_audience'):
                    business_info['target_audience'] = core_persona['target_audience']
        # Fallback to website analysis if not in persona
        if not business_info.get('industry') and website_analysis:
            target_audience_data = website_analysis.get('target_audience', {})
            if isinstance(target_audience_data, dict):
                industry_focus = target_audience_data.get('industry_focus')
                if industry_focus:
                    business_info['industry'] = industry_focus
                demographics = target_audience_data.get('demographics')
                if demographics:
                    # Coerce non-string demographics (e.g. dict/list) to str.
                    business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)
        # Check if we have enough data - be more lenient since we can infer from minimal data
        # We need at least some basic information to generate a meaningful persona
        has_basic_data = bool(
            website_analysis or
            persona_data_dict or
            research_prefs.get('content_types') or
            business_info.get('industry')
        )
        if not has_basic_data:
            logger.warning(f"Insufficient onboarding data for user {user_id} - no basic data found")
            return None
        # If we have minimal data, add intelligent defaults to help the AI
        if not business_info.get('industry'):
            # Try to infer industry from research preferences or content types
            content_types = research_prefs.get('content_types', [])
            if 'blog' in content_types or 'article' in content_types:
                business_info['industry'] = 'Content Marketing'
                business_info['inferred'] = True
            elif 'social_media' in content_types:
                business_info['industry'] = 'Social Media Marketing'
                business_info['inferred'] = True
            elif 'video' in content_types:
                business_info['industry'] = 'Video Content Creation'
                business_info['inferred'] = True
        if not business_info.get('target_audience'):
            # Default to professionals for content creators
            business_info['target_audience'] = 'Professionals and content consumers'
            business_info['inferred'] = True
        # Get competitor analysis data (if available); failures here are
        # non-fatal — the persona can be generated without competitors.
        competitor_analysis = None
        try:
            competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
            if competitor_analysis:
                logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
        except Exception as e:
            logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")
        return {
            "website_analysis": website_analysis,
            "persona_data": persona_data_dict,
            "research_preferences": research_prefs,
            "business_info": business_info,
            "competitor_analysis": competitor_analysis  # Add competitor data for better preset generation
        }
    except Exception as e:
        logger.error(f"Error collecting onboarding data for user {user_id}: {e}")
        return None

View File

@@ -0,0 +1,425 @@
"""
Tavily API Service for ALwrity
This service provides web search and research capabilities using the Tavily API,
which offers AI-powered search with real-time information retrieval.
Key Features:
- Web search with AI-powered results
- Content extraction and summarization
- Real-time information retrieval
- Topic-based search (general, news, finance)
- Advanced search depth options
- Cost-effective API usage with caching
Dependencies:
- aiohttp (for async HTTP requests)
- os (for environment variables)
- logging (for debugging)
Author: ALwrity Team
Version: 1.0
Last Updated: January 2025
"""
import asyncio
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Union
from urllib.parse import urlparse

import aiohttp
from loguru import logger
class TavilyService:
"""
Service for web search and research using the Tavily API.
This service provides AI-powered search capabilities to find relevant
content and information for research purposes.
"""
def __init__(self):
    """Set up the Tavily client, deferring key validation until first use.

    In production, API keys may be injected per-request by middleware, so a
    missing ``TAVILY_API_KEY`` at construction time is not fatal: the service
    simply starts disabled and re-attempts initialization lazily.
    """
    self.base_url = "https://api.tavily.com"
    self.enabled = False
    self.api_key = os.getenv("TAVILY_API_KEY")
    self._try_initialize()
def _try_initialize(self) -> None:
"""Attempt to (re)initialize the Tavily service from current environment."""
if self.enabled and self.api_key:
return
try:
self.api_key = os.getenv("TAVILY_API_KEY")
if not self.api_key:
# Leave disabled; caller may try again after middleware injection
logger.warning("TAVILY_API_KEY not configured; Tavily service will be disabled")
self.enabled = False
return
self.enabled = True
logger.info("Tavily Service initialized successfully")
except Exception as e:
logger.error(f"Failed to initialize Tavily service: {e}")
self.enabled = False
async def search(
    self,
    query: str,
    topic: str = "general",
    search_depth: str = "basic",
    max_results: int = 10,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    include_answer: Union[bool, str] = False,
    include_raw_content: Union[bool, str] = False,
    include_images: bool = False,
    include_image_descriptions: bool = False,
    include_favicon: bool = False,
    time_range: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    country: Optional[str] = None,
    chunks_per_source: int = 3,
    auto_parameters: bool = False
) -> Dict[str, Any]:
    """
    Execute a search query using the Tavily API.

    Args:
        query: The search query to execute.
        topic: Category of search (general, news, finance).
        search_depth: Depth of search (basic = 1 credit, advanced = 2 credits).
        max_results: Maximum number of results to return (clamped to 20).
        include_domains: Domains to specifically include (Tavily caps at 300).
        exclude_domains: Domains to specifically exclude (Tavily caps at 150).
        include_answer: Include LLM-generated answer (basic/advanced/true/false).
        include_raw_content: Include raw content (markdown/text/true/false).
        include_images: Include image search results.
        include_image_descriptions: Include image descriptions.
        include_favicon: Include favicon URLs.
        time_range: Time range filter (day, week, month, year, d, w, m, y).
        start_date: Start date filter (YYYY-MM-DD).
        end_date: End date filter (YYYY-MM-DD).
        country: Country boost; only honored by Tavily for the "general" topic.
        chunks_per_source: Max chunks per source (1-3, advanced search only).
        auto_parameters: Let Tavily auto-configure parameters from the query.

    Returns:
        On success: dict with "success": True plus query, answer, results,
        images, timing metadata and a UTC timestamp.
        On failure: dict with "success": False plus "error" and "details".
    """
    try:
        # Pick up any per-request injected key before deciding we're disabled.
        self._try_initialize()
        if not self.enabled:
            raise ValueError("Tavily Service is not enabled - API key missing")
        logger.info(f"Starting Tavily search for: {query}")
        # Required fields; max_results is clamped to Tavily's hard limit.
        payload = {
            "api_key": self.api_key,
            "query": query,
            "topic": topic,
            "search_depth": search_depth,
            "max_results": min(max_results, 20),  # Tavily limit
            "include_favicon": include_favicon
        }
        # Optional fields are only sent when the caller asked for them.
        if include_domains:
            payload["include_domains"] = include_domains[:300]  # Tavily limit
        if exclude_domains:
            payload["exclude_domains"] = exclude_domains[:150]  # Tavily limit
        if include_answer:
            payload["include_answer"] = include_answer
        if include_raw_content:
            payload["include_raw_content"] = include_raw_content
        if include_images:
            payload["include_images"] = include_images
        if include_image_descriptions:
            payload["include_image_descriptions"] = include_image_descriptions
        if time_range:
            payload["time_range"] = time_range
        if start_date:
            payload["start_date"] = start_date
        if end_date:
            payload["end_date"] = end_date
        if country and topic == "general":
            # Tavily only honors the country boost for the "general" topic.
            payload["country"] = country
        if search_depth == "advanced" and 1 <= chunks_per_source <= 3:
            # chunks_per_source is only meaningful for advanced searches.
            payload["chunks_per_source"] = chunks_per_source
        if auto_parameters:
            payload["auto_parameters"] = True
        # Make the API request.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{self.base_url}/search",
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=aiohttp.ClientTimeout(total=60)
            ) as response:
                if response.status == 200:
                    result = await response.json()
                    logger.info(f"Tavily search completed successfully. Found {len(result.get('results', []))} results.")
                    # Process and structure results.
                    processed_results = self._process_search_results(result, query)
                    return {
                        "success": True,
                        "query": result.get("query", query),
                        "answer": result.get("answer"),  # present if include_answer was requested
                        "results": processed_results,
                        "images": result.get("images", []),
                        "response_time": result.get("response_time"),
                        "request_id": result.get("request_id"),
                        "auto_parameters": result.get("auto_parameters"),
                        "total_results": len(processed_results),
                        "timestamp": datetime.utcnow().isoformat()
                    }
                else:
                    error_text = await response.text()
                    logger.error(f"Tavily API error: {response.status} - {error_text}")
                    raise RuntimeError(f"Tavily API error: {response.status} - {error_text}")
    except asyncio.TimeoutError:
        # BUG FIX: this previously caught aiohttp.ClientTimeout, which is the
        # timeout *configuration* dataclass, not an exception type; a real
        # timeout surfaces as asyncio.TimeoutError (aiohttp's timeout errors
        # subclass it), and catching a non-exception class raises TypeError.
        logger.error("Tavily API request timed out")
        return {
            "success": False,
            "error": "Request timed out",
            "details": "The search request took too long to complete"
        }
    except Exception as e:
        logger.error(f"Error in Tavily search: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "details": "An unexpected error occurred during search"
        }
def _process_search_results(self, api_response: Dict[str, Any], query: str) -> List[Dict[str, Any]]:
"""
Process and structure Tavily API response into standardized format.
Args:
api_response: Raw response from Tavily API
query: Original search query
Returns:
List of processed search results
"""
results = []
raw_results = api_response.get("results", [])
for result in raw_results:
try:
# Extract domain from URL
url = result.get("url", "")
domain = urlparse(url).netloc if url else ""
# Calculate relevance score (Tavily provides score field)
relevance_score = result.get("score", 0.5)
processed_result = {
"url": url,
"domain": domain,
"title": result.get("title", ""),
"content": result.get("content", ""),
"raw_content": result.get("raw_content"), # If include_raw_content was requested
"score": relevance_score,
"relevance_score": relevance_score, # Alias for compatibility
"favicon": result.get("favicon"),
"published_date": result.get("published_date"),
}
results.append(processed_result)
except Exception as e:
logger.warning(f"Error processing Tavily result: {str(e)}")
continue
# Sort by relevance score (highest first)
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
return results
async def search_industry_trends(
    self,
    topic: str,
    industry: str,
    max_results: int = 10,
    search_depth: str = "basic"
) -> Dict[str, Any]:
    """
    Search for current industry trends and insights.

    Args:
        topic: The specific topic to research.
        industry: The industry context for the search.
        max_results: Maximum number of search results to return.
        search_depth: Depth of search (basic or advanced).

    Returns:
        The standard ``search`` response dictionary.
    """
    # Fold the industry context into a trend-focused query.
    trend_query = f"{topic} {industry} trends insights"
    # Basic searches lean on the "news" topic for freshness; advanced
    # searches fall back to "general" for broader coverage.
    chosen_topic = "news" if search_depth == "basic" else "general"
    return await self.search(
        query=trend_query,
        topic=chosen_topic,
        search_depth=search_depth,
        max_results=max_results,
        include_answer="basic",
        include_favicon=True,
        time_range="month",  # restrict to the last month for current trends
    )
async def discover_competitors(
    self,
    user_url: str,
    num_results: int = 10,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    industry_context: Optional[str] = None,
    website_analysis_data: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Discover competitors for a given website using Tavily search.

    Args:
        user_url: The website URL to find competitors for.
        num_results: Number of competitor results to return.
        include_domains: Domains to include in the search.
        exclude_domains: Domains to exclude from the search (never mutated).
        industry_context: Industry context for better competitor discovery.
        website_analysis_data: Optional prior website analysis; its primary
            audience insight, when present, is folded into the search query.

    Returns:
        On success: dict with "success": True, a "competitors" list and
        analysis metadata.
        On failure: dict with "success": False plus "error" and "details".
    """
    try:
        # Pick up any per-request injected key before deciding we're disabled.
        self._try_initialize()
        if not self.enabled:
            raise ValueError("Tavily Service is not enabled - API key missing")
        logger.info(f"Starting competitor discovery for: {user_url}")
        # Always exclude the user's own domain from the results.
        # BUG FIX: copy the caller's list instead of appending to it in place
        # (the old code mutated the exclude_domains argument).
        user_domain = urlparse(user_url).netloc
        exclude_domains_list = list(exclude_domains) if exclude_domains else []
        exclude_domains_list.append(user_domain)
        # Build the search query from static terms plus optional context.
        query_parts = ["similar websites", "competitors"]
        if industry_context:
            query_parts.append(f"in {industry_context}")
        # Fold in audience insight from a prior website analysis, if present.
        if website_analysis_data:
            analysis = website_analysis_data.get('analysis', {})
            if 'target_audience' in analysis:
                audience = analysis['target_audience']
                if isinstance(audience, dict) and 'primary_audience' in audience:
                    query_parts.append(audience['primary_audience'])
        search_query = " ".join(query_parts)
        # Advanced depth gives better competitor coverage (costs 2 credits).
        search_result = await self.search(
            query=search_query,
            topic="general",
            search_depth="advanced",
            max_results=num_results,
            include_domains=include_domains,
            exclude_domains=exclude_domains_list,
            include_favicon=True,
            chunks_per_source=3
        )
        if not search_result.get("success"):
            # Propagate the failure payload untouched.
            return search_result
        # Re-shape raw search hits into the competitor schema.
        competitors = []
        for result in search_result.get("results", []):
            competitor_data = {
                "url": result.get("url"),
                "domain": result.get("domain"),
                "title": result.get("title"),
                "summary": result.get("content", ""),
                "relevance_score": result.get("relevance_score", 0.5),
                "favicon": result.get("favicon"),
                "published_date": result.get("published_date"),
                "highlights": self._extract_highlights(result.get("content", "")),
                "competitive_insights": self._extract_competitive_insights(result),
                "content_insights": self._analyze_content_quality(result)
            }
            competitors.append(competitor_data)
        logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")
        return {
            "success": True,
            "user_url": user_url,
            "competitors": competitors,
            "total_competitors": len(competitors),
            "analysis_timestamp": datetime.utcnow().isoformat(),
            "industry_context": industry_context,
            "request_id": search_result.get("request_id")
        }
    except Exception as e:
        logger.error(f"Error in competitor discovery: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "details": "An unexpected error occurred during competitor discovery"
        }
def _extract_highlights(self, content: str, num_sentences: int = 3) -> List[str]:
"""Extract key highlights from content."""
if not content:
return []
# Simple sentence extraction (can be enhanced with NLP)
sentences = [s.strip() for s in content.split('.') if s.strip()]
return sentences[:num_sentences]
def _extract_competitive_insights(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""Extract competitive insights from search result."""
content = result.get("content", "")
title = result.get("title", "")
return {
"business_model": "unknown",
"target_audience": "unknown",
"key_differentiators": []
}
def _analyze_content_quality(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze content quality metrics."""
content = result.get("content", "")
return {
"content_focus": "general",
"content_quality": "medium",
"publishing_frequency": "unknown"
}