Base code
55
backend/services/research/__init__.py
Normal file
@@ -0,0 +1,55 @@
"""
Research Services Module for ALwrity

This module provides research and grounding capabilities for content generation,
replacing mock research with real-time industry information.

Available Services:
- GoogleSearchService: Real-time industry research using Google Custom Search API
- ExaService: Competitor discovery and analysis using Exa API
- TavilyService: AI-powered web search with real-time information
- Source ranking and credibility assessment
- Content extraction and insight generation

Core Module (v2.0):
- ResearchEngine: Standalone AI research engine for any content tool
- ResearchContext: Unified input schema for research requests
- ParameterOptimizer: AI-driven parameter optimization

Author: ALwrity Team
Version: 2.0
Last Updated: December 2025
"""

from .google_search_service import GoogleSearchService
from .exa_service import ExaService
from .tavily_service import TavilyService

# Core Research Engine (v2.0)
from .core import (
    ResearchEngine,
    ResearchContext,
    ResearchPersonalizationContext,
    ContentType,
    ResearchGoal,
    ResearchDepth,
    ProviderPreference,
    ParameterOptimizer,
)

__all__ = [
    # Legacy services (still used by blog writer)
    "GoogleSearchService",
    "ExaService",
    "TavilyService",

    # Core Research Engine (v2.0)
    "ResearchEngine",
    "ResearchContext",
    "ResearchPersonalizationContext",
    "ContentType",
    "ResearchGoal",
    "ResearchDepth",
    "ProviderPreference",
    "ParameterOptimizer",
]
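A minimal caller sketch (hypothetical, not part of this commit): because the package root re-exports the core classes above, a tool can import them directly from services.research instead of services.research.core.

# Hypothetical caller, assuming only the names exported by this __init__.py.
from services.research import ResearchEngine, ResearchContext, ResearchDepth

async def quick_research() -> None:
    engine = ResearchEngine()
    context = ResearchContext(
        query="AI trends in healthcare 2025",
        depth=ResearchDepth.STANDARD,
    )
    result = await engine.research(context)  # returns a ResearchResult
    print(result.provider_used, len(result.sources))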
270
backend/services/research/competitor_analysis_prompts.py
Normal file
@@ -0,0 +1,270 @@
"""
AI Prompts for Competitor Analysis

This module contains prompts for analyzing competitor data from Exa API
to generate actionable insights for content strategy and competitive positioning.
"""

COMPETITOR_ANALYSIS_PROMPT = """
You are a competitive intelligence analyst specializing in content strategy and market positioning.

**TASK**: Analyze competitor data to provide actionable insights for content strategy and competitive positioning.

**COMPETITOR DATA**:
{competitor_context}

**USER'S WEBSITE**: {user_url}
**INDUSTRY CONTEXT**: {industry_context}

**ANALYSIS REQUIREMENTS**:

1. **Market Position Analysis**
   - Identify the competitive landscape structure
   - Determine market leaders vs. challengers
   - Assess market saturation and opportunities

2. **Content Strategy Insights**
   - Analyze competitor content themes and topics
   - Identify content gaps and opportunities
   - Suggest unique content angles for differentiation

3. **Competitive Advantages**
   - Highlight what makes each competitor unique
   - Identify areas where the user can differentiate
   - Suggest positioning strategies

4. **SEO and Marketing Insights**
   - Analyze competitor positioning and messaging
   - Identify keyword and content opportunities
   - Suggest marketing strategies

**OUTPUT FORMAT** (JSON):
{{
    "market_analysis": {{
        "competitive_landscape": "Description of market structure",
        "market_leaders": ["List of top 3 competitors"],
        "market_opportunities": ["List of 3-5 opportunities"],
        "saturation_level": "high/medium/low"
    }},
    "content_strategy": {{
        "common_themes": ["List of common content themes"],
        "content_gaps": ["List of 5 content opportunities"],
        "unique_angles": ["List of 3 unique content angles"],
        "content_frequency_insights": "Analysis of publishing patterns"
    }},
    "competitive_positioning": {{
        "differentiation_opportunities": ["List of 5 ways to differentiate"],
        "unique_value_propositions": ["List of 3 unique positioning ideas"],
        "target_audience_insights": "Analysis of competitor audience targeting"
    }},
    "seo_opportunities": {{
        "keyword_gaps": ["List of 5 keyword opportunities"],
        "content_topics": ["List of 5 high-value content topics"],
        "marketing_channels": ["List of competitor marketing strategies"]
    }},
    "actionable_recommendations": [
        "List of 5 specific, actionable recommendations"
    ],
    "risk_assessment": {{
        "competitive_threats": ["List of 3 main threats"],
        "market_barriers": ["List of 2-3 barriers to entry"],
        "success_factors": ["List of 3 key success factors"]
    }}
}}

**INSTRUCTIONS**:
- Be specific and actionable in your recommendations
- Focus on opportunities for differentiation
- Consider the user's industry context
- Prioritize recommendations by impact and feasibility
- Use data from the competitor analysis to support insights
- Keep recommendations practical and implementable

**QUALITY STANDARDS**:
- Each recommendation should be specific and actionable
- Insights should be based on actual competitor data
- Focus on differentiation and competitive advantage
- Consider both short-term and long-term strategies
- Ensure recommendations are relevant to the user's industry
"""

CONTENT_GAP_ANALYSIS_PROMPT = """
You are a content strategist analyzing competitor content to identify gaps and opportunities.

**TASK**: Analyze competitor content patterns to identify content gaps and opportunities.

**COMPETITOR CONTENT DATA**:
{competitor_context}

**USER'S INDUSTRY**: {industry_context}
**TARGET AUDIENCE**: {target_audience}

**ANALYSIS FOCUS**:

1. **Content Topic Analysis**
   - Identify most common content topics across competitors
   - Find underserved or missing topics
   - Analyze content depth and quality patterns

2. **Content Format Opportunities**
   - Identify popular content formats among competitors
   - Find format gaps and opportunities
   - Suggest innovative content approaches

3. **Audience Targeting Gaps**
   - Analyze competitor audience targeting
   - Identify underserved audience segments
   - Suggest audience expansion opportunities

4. **SEO Content Opportunities**
   - Identify high-value keywords competitors are missing
   - Find long-tail keyword opportunities
   - Suggest content clusters for SEO

**OUTPUT FORMAT** (JSON):
{{
    "content_gaps": [
        {{
            "topic": "Specific content topic",
            "opportunity_level": "high/medium/low",
            "reasoning": "Why this is an opportunity",
            "content_angle": "Unique angle for this topic",
            "estimated_difficulty": "easy/medium/hard"
        }}
    ],
    "format_opportunities": [
        {{
            "format": "Content format type",
            "gap_reason": "Why competitors aren't using this",
            "potential_impact": "Expected impact level",
            "implementation_tips": "How to implement"
        }}
    ],
    "audience_gaps": [
        {{
            "audience_segment": "Underserved audience",
            "opportunity_size": "large/medium/small",
            "content_needs": "What content this audience needs",
            "engagement_strategy": "How to engage this audience"
        }}
    ],
    "seo_opportunities": [
        {{
            "keyword_theme": "Keyword cluster theme",
            "search_volume": "estimated_high/medium/low",
            "competition_level": "low/medium/high",
            "content_ideas": ["3-5 content ideas for this theme"]
        }}
    ],
    "priority_recommendations": [
        "Top 5 prioritized content opportunities with implementation order"
    ]
}}
"""

COMPETITIVE_INTELLIGENCE_PROMPT = """
You are a competitive intelligence expert providing strategic insights for market positioning.

**TASK**: Generate comprehensive competitive intelligence insights for strategic decision-making.

**COMPETITOR INTELLIGENCE DATA**:
{competitor_context}

**BUSINESS CONTEXT**:
- User Website: {user_url}
- Industry: {industry_context}
- Business Model: {business_model}
- Target Market: {target_market}

**INTELLIGENCE AREAS**:

1. **Competitive Landscape Mapping**
   - Market positioning analysis
   - Competitive strength assessment
   - Market share estimation

2. **Strategic Positioning Opportunities**
   - Blue ocean opportunities
   - Differentiation strategies
   - Competitive moats

3. **Threat Assessment**
   - Competitive threats
   - Market disruption risks
   - Barrier to entry analysis

4. **Growth Strategy Insights**
   - Market expansion opportunities
   - Partnership possibilities
   - Acquisition targets

**OUTPUT FORMAT** (JSON):
{{
    "competitive_landscape": {{
        "market_structure": "Description of market structure",
        "key_players": [
            {{
                "name": "Competitor name",
                "position": "market_leader/challenger/niche",
                "strengths": ["List of key strengths"],
                "weaknesses": ["List of key weaknesses"],
                "market_share": "estimated_percentage"
            }}
        ],
        "market_dynamics": "Analysis of market trends and forces"
    }},
    "positioning_opportunities": {{
        "blue_ocean_opportunities": ["List of uncontested market spaces"],
        "differentiation_strategies": ["List of positioning strategies"],
        "competitive_advantages": ["List of potential advantages to build"]
    }},
    "threat_analysis": {{
        "immediate_threats": ["List of current competitive threats"],
        "future_risks": ["List of potential future risks"],
        "market_barriers": ["List of barriers to success"]
    }},
    "strategic_recommendations": {{
        "short_term_actions": ["List of 3-5 immediate actions"],
        "medium_term_strategy": ["List of 3-5 strategic initiatives"],
        "long_term_vision": ["List of 2-3 long-term strategic goals"]
    }},
    "success_metrics": {{
        "kpis_to_track": ["List of key performance indicators"],
        "competitive_benchmarks": ["List of metrics to benchmark against"],
        "success_thresholds": ["List of success criteria"]
    }}
}}
"""

# Utility functions to format prompts with data
def format_competitor_analysis_prompt(competitor_context: str, user_url: str, industry_context: str = None) -> str:
    """Format the competitor analysis prompt with actual data."""
    return COMPETITOR_ANALYSIS_PROMPT.format(
        competitor_context=competitor_context,
        user_url=user_url,
        industry_context=industry_context or "Not specified"
    )

def format_content_gap_prompt(competitor_context: str, industry_context: str = None, target_audience: str = None) -> str:
    """Format the content gap analysis prompt with actual data."""
    return CONTENT_GAP_ANALYSIS_PROMPT.format(
        competitor_context=competitor_context,
        industry_context=industry_context or "Not specified",
        target_audience=target_audience or "Not specified"
    )

def format_competitive_intelligence_prompt(
    competitor_context: str,
    user_url: str,
    industry_context: str = None,
    business_model: str = None,
    target_market: str = None
) -> str:
    """Format the competitive intelligence prompt with actual data."""
    return COMPETITIVE_INTELLIGENCE_PROMPT.format(
        competitor_context=competitor_context,
        user_url=user_url,
        industry_context=industry_context or "Not specified",
        business_model=business_model or "Not specified",
        target_market=target_market or "Not specified"
    )
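The formatters above only build prompt strings; the LLM call and JSON parsing stay with the caller. A minimal usage sketch with placeholder competitor data and a hypothetical llm_client (not part of this commit):

import json

prompt = format_competitor_analysis_prompt(
    competitor_context="Competitor A: weekly SEO guides. Competitor B: video tutorials and webinars.",
    user_url="https://example.com",
    industry_context="Content marketing SaaS",
)
# response_text = llm_client.generate(prompt)  # hypothetical LLM client call
# insights = json.loads(response_text)         # expected to match the JSON schema embedded in the prompt
# print(insights["actionable_recommendations"])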
51
backend/services/research/core/__init__.py
Normal file
@@ -0,0 +1,51 @@
"""
Research Engine Core Module

This is the standalone AI Research Engine that can be imported by
Blog Writer, Podcast Maker, YouTube Creator, and other ALwrity tools.

Design Goals:
- Tool-agnostic: Any content tool can import and use this
- AI-driven parameter optimization: Users don't need to understand Exa/Tavily internals
- Provider priority: Exa → Tavily → Google (fallback)
- Personalization-aware: Accepts context from calling tools
- Advanced by default: Prioritizes quality over speed

Usage:
    from services.research.core import (
        ResearchEngine, ResearchContext, ResearchPersonalizationContext, ContentType,
    )

    engine = ResearchEngine()
    result = await engine.research(ResearchContext(
        query="AI trends in healthcare 2025",
        personalization=ResearchPersonalizationContext(
            content_type=ContentType.BLOG,
            industry="Healthcare",
            target_audience="Medical professionals",
        ),
    ))

Author: ALwrity Team
Version: 2.0
Last Updated: December 2025
"""

from .research_context import (
    ResearchContext,
    ResearchPersonalizationContext,
    ContentType,
    ResearchGoal,
    ResearchDepth,
    ProviderPreference,
)
from .parameter_optimizer import ParameterOptimizer
from .research_engine import ResearchEngine

__all__ = [
    # Context schemas
    "ResearchContext",
    "ResearchPersonalizationContext",
    "ContentType",
    "ResearchGoal",
    "ResearchDepth",
    "ProviderPreference",
    # Core classes
    "ParameterOptimizer",
    "ResearchEngine",
]
384
backend/services/research/core/parameter_optimizer.py
Normal file
@@ -0,0 +1,384 @@
"""
AI Parameter Optimizer for Research Engine

Uses AI to analyze the research query and context to select optimal
parameters for Exa and Tavily APIs. This abstracts the complexity
from non-technical users.

Key Decisions:
- Provider selection (Exa vs Tavily vs Google)
- Search type (neural vs keyword)
- Category/topic selection
- Depth and result limits
- Domain filtering

Author: ALwrity Team
Version: 2.0
"""

import os
import re
from typing import Dict, Any, Optional, Tuple
from loguru import logger

from .research_context import (
    ResearchContext,
    ResearchGoal,
    ResearchDepth,
    ProviderPreference,
    ContentType,
)
from models.blog_models import ResearchConfig, ResearchProvider, ResearchMode


class ParameterOptimizer:
    """
    AI-driven parameter optimization for research providers.

    Analyzes the research context and selects optimal parameters
    for Exa, Tavily, or Google without requiring user expertise.
    """

    # Query patterns for intelligent routing
    TRENDING_PATTERNS = [
        r'\b(latest|recent|new|2024|2025|current|trending|news)\b',
        r'\b(update|announcement|launch|release)\b',
    ]

    TECHNICAL_PATTERNS = [
        r'\b(api|sdk|framework|library|implementation|architecture)\b',
        r'\b(code|programming|developer|technical|engineering)\b',
    ]

    COMPETITIVE_PATTERNS = [
        r'\b(competitor|alternative|vs|versus|compare|comparison)\b',
        r'\b(market|industry|landscape|players)\b',
    ]

    FACTUAL_PATTERNS = [
        r'\b(statistics|data|research|study|report|survey)\b',
        r'\b(percent|percentage|number|figure|metric)\b',
    ]

    # Exa category mapping based on query analysis
    EXA_CATEGORY_MAP = {
        'research': 'research paper',
        'news': 'news',
        'company': 'company',
        'personal': 'personal site',
        'github': 'github',
        'linkedin': 'linkedin profile',
        'finance': 'financial report',
    }

    # Tavily topic mapping
    TAVILY_TOPIC_MAP = {
        ResearchGoal.TRENDING: 'news',
        ResearchGoal.FACTUAL: 'general',
        ResearchGoal.COMPETITIVE: 'general',
        ResearchGoal.TECHNICAL: 'general',
        ResearchGoal.EDUCATIONAL: 'general',
        ResearchGoal.INSPIRATIONAL: 'general',
    }

    def __init__(self):
        """Initialize the optimizer."""
        self.exa_available = bool(os.getenv("EXA_API_KEY"))
        self.tavily_available = bool(os.getenv("TAVILY_API_KEY"))
        logger.info(f"ParameterOptimizer initialized: exa={self.exa_available}, tavily={self.tavily_available}")

    def optimize(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Analyze research context and return optimized provider and config.

        Args:
            context: The research context from the calling tool

        Returns:
            Tuple of (selected_provider, optimized_config)
        """
        # If advanced mode, use raw parameters
        if context.advanced_mode:
            return self._build_advanced_config(context)

        # Analyze query to determine optimal approach
        query_analysis = self._analyze_query(context.query)

        # Select provider based on analysis and preferences
        provider = self._select_provider(context, query_analysis)

        # Build optimized config for selected provider
        config = self._build_config(context, provider, query_analysis)

        logger.info(f"Optimized research: provider={provider.value}, mode={config.mode.value}")

        return provider, config

    def _analyze_query(self, query: str) -> Dict[str, Any]:
        """
        Analyze the query to understand intent and optimal approach.

        Returns dict with:
        - is_trending: Query is about recent/current events
        - is_technical: Query is technical in nature
        - is_competitive: Query is about competition/comparison
        - is_factual: Query needs data/statistics
        - suggested_category: Exa category if applicable
        - suggested_topic: Tavily topic
        """
        query_lower = query.lower()

        analysis = {
            'is_trending': self._matches_patterns(query_lower, self.TRENDING_PATTERNS),
            'is_technical': self._matches_patterns(query_lower, self.TECHNICAL_PATTERNS),
            'is_competitive': self._matches_patterns(query_lower, self.COMPETITIVE_PATTERNS),
            'is_factual': self._matches_patterns(query_lower, self.FACTUAL_PATTERNS),
            'suggested_category': None,
            'suggested_topic': 'general',
            'suggested_search_type': 'auto',
        }

        # Determine Exa category
        if 'research' in query_lower or 'study' in query_lower or 'paper' in query_lower:
            analysis['suggested_category'] = 'research paper'
        elif 'github' in query_lower or 'repository' in query_lower:
            analysis['suggested_category'] = 'github'
        elif 'linkedin' in query_lower or 'professional' in query_lower:
            analysis['suggested_category'] = 'linkedin profile'
        elif analysis['is_trending']:
            analysis['suggested_category'] = 'news'
        elif 'company' in query_lower or 'startup' in query_lower:
            analysis['suggested_category'] = 'company'

        # Determine Tavily topic
        if analysis['is_trending']:
            analysis['suggested_topic'] = 'news'
        elif 'finance' in query_lower or 'stock' in query_lower or 'investment' in query_lower:
            analysis['suggested_topic'] = 'finance'
        else:
            analysis['suggested_topic'] = 'general'

        # Determine search type
        if analysis['is_technical'] or analysis['is_factual']:
            analysis['suggested_search_type'] = 'neural'  # Better for semantic understanding
        elif analysis['is_trending']:
            analysis['suggested_search_type'] = 'keyword'  # Better for current events

        return analysis

    def _matches_patterns(self, text: str, patterns: list) -> bool:
        """Check if text matches any of the patterns."""
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        return False

    def _select_provider(self, context: ResearchContext, analysis: Dict[str, Any]) -> ResearchProvider:
        """
        Select the optimal provider based on context and query analysis.

        Priority: Exa → Tavily → Google for ALL modes (including basic).
        This provides better semantic search results for content creators.

        Exa's neural search excels at understanding context and meaning,
        which is valuable for all research types, not just technical queries.
        """
        preference = context.provider_preference

        # If user explicitly requested a provider, respect that
        if preference == ProviderPreference.EXA:
            if self.exa_available:
                return ResearchProvider.EXA
            logger.warning("Exa requested but not available, falling back")

        if preference == ProviderPreference.TAVILY:
            if self.tavily_available:
                return ResearchProvider.TAVILY
            logger.warning("Tavily requested but not available, falling back")

        if preference == ProviderPreference.GOOGLE:
            return ResearchProvider.GOOGLE

        # AUTO mode: Always prefer Exa → Tavily → Google
        # Exa provides superior semantic search for all content types
        if self.exa_available:
            logger.info(f"Selected Exa (primary provider): query analysis shows " +
                        f"technical={analysis.get('is_technical', False)}, " +
                        f"trending={analysis.get('is_trending', False)}")
            return ResearchProvider.EXA

        # Tavily as secondary option - good for real-time and news
        if self.tavily_available:
            logger.info(f"Selected Tavily (secondary): Exa unavailable, " +
                        f"trending={analysis.get('is_trending', False)}")
            return ResearchProvider.TAVILY

        # Google grounding as fallback
        logger.info("Selected Google (fallback): Exa and Tavily unavailable")
        return ResearchProvider.GOOGLE

    def _build_config(
        self,
        context: ResearchContext,
        provider: ResearchProvider,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Build optimized ResearchConfig for the selected provider."""

        # Map ResearchDepth to ResearchMode
        mode_map = {
            ResearchDepth.QUICK: ResearchMode.BASIC,
            ResearchDepth.STANDARD: ResearchMode.BASIC,
            ResearchDepth.COMPREHENSIVE: ResearchMode.COMPREHENSIVE,
            ResearchDepth.EXPERT: ResearchMode.COMPREHENSIVE,
        }
        mode = mode_map.get(context.depth, ResearchMode.BASIC)

        # Base config
        config = ResearchConfig(
            mode=mode,
            provider=provider,
            max_sources=context.max_sources,
            include_statistics=context.personalization.include_statistics if context.personalization else True,
            include_expert_quotes=context.personalization.include_expert_quotes if context.personalization else True,
            include_competitors=analysis['is_competitive'],
            include_trends=analysis['is_trending'],
        )

        # Provider-specific optimizations
        if provider == ResearchProvider.EXA:
            config = self._optimize_exa_config(config, context, analysis)
        elif provider == ResearchProvider.TAVILY:
            config = self._optimize_tavily_config(config, context, analysis)

        # Apply domain filters
        if context.include_domains:
            if provider == ResearchProvider.EXA:
                config.exa_include_domains = context.include_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_include_domains = context.include_domains[:300]  # Tavily limit

        if context.exclude_domains:
            if provider == ResearchProvider.EXA:
                config.exa_exclude_domains = context.exclude_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_exclude_domains = context.exclude_domains[:150]  # Tavily limit

        return config

    def _optimize_exa_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Add Exa-specific optimizations."""

        # Set category based on analysis
        if analysis['suggested_category']:
            config.exa_category = analysis['suggested_category']

        # Set search type
        config.exa_search_type = analysis.get('suggested_search_type', 'auto')

        # For comprehensive research, use neural search
        if context.depth in [ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT]:
            config.exa_search_type = 'neural'

        return config

    def _optimize_tavily_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Add Tavily-specific optimizations."""

        # Set topic based on analysis
        config.tavily_topic = analysis.get('suggested_topic', 'general')

        # Set search depth based on research depth
        if context.depth in [ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT]:
            config.tavily_search_depth = 'advanced'  # 2 credits, but better results
            config.tavily_chunks_per_source = 3
        else:
            config.tavily_search_depth = 'basic'  # 1 credit

        # Set time range based on recency
        if context.recency:
            recency_map = {
                'day': 'd',
                'week': 'w',
                'month': 'm',
                'year': 'y',
            }
            config.tavily_time_range = recency_map.get(context.recency, context.recency)
        elif analysis['is_trending']:
            config.tavily_time_range = 'w'  # Last week for trending topics

        # Include answer for comprehensive research
        if context.depth in [ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT]:
            config.tavily_include_answer = 'advanced'

        # Include raw content for expert depth
        if context.depth == ResearchDepth.EXPERT:
            config.tavily_include_raw_content = 'markdown'

        return config

    def _build_advanced_config(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Build config from raw advanced parameters.
        Used when advanced_mode=True and user wants full control.
        """
        # Determine provider from explicit parameters
        provider = ResearchProvider.GOOGLE

        if context.exa_category or context.exa_search_type:
            provider = ResearchProvider.EXA if self.exa_available else ResearchProvider.GOOGLE
        elif context.tavily_topic or context.tavily_search_depth:
            provider = ResearchProvider.TAVILY if self.tavily_available else ResearchProvider.GOOGLE

        # Check preference override
        if context.provider_preference == ProviderPreference.EXA and self.exa_available:
            provider = ResearchProvider.EXA
        elif context.provider_preference == ProviderPreference.TAVILY and self.tavily_available:
            provider = ResearchProvider.TAVILY
        elif context.provider_preference == ProviderPreference.GOOGLE:
            provider = ResearchProvider.GOOGLE

        # Map depth to mode
        mode_map = {
            ResearchDepth.QUICK: ResearchMode.BASIC,
            ResearchDepth.STANDARD: ResearchMode.BASIC,
            ResearchDepth.COMPREHENSIVE: ResearchMode.COMPREHENSIVE,
            ResearchDepth.EXPERT: ResearchMode.COMPREHENSIVE,
        }
        mode = mode_map.get(context.depth, ResearchMode.BASIC)

        # Build config with raw parameters
        config = ResearchConfig(
            mode=mode,
            provider=provider,
            max_sources=context.max_sources,
            # Exa
            exa_category=context.exa_category,
            exa_search_type=context.exa_search_type,
            exa_include_domains=context.include_domains,
            exa_exclude_domains=context.exclude_domains,
            # Tavily
            tavily_topic=context.tavily_topic,
            tavily_search_depth=context.tavily_search_depth,
            tavily_include_domains=context.include_domains[:300] if context.include_domains else [],
            tavily_exclude_domains=context.exclude_domains[:150] if context.exclude_domains else [],
            tavily_include_answer=context.tavily_include_answer,
            tavily_include_raw_content=context.tavily_include_raw_content,
            tavily_time_range=context.tavily_time_range,
            tavily_country=context.tavily_country,
        )

        logger.info(f"Advanced config: provider={provider.value}, mode={mode.value}")

        return provider, config
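A minimal sketch of the optimizer used in isolation (assuming EXA_API_KEY or TAVILY_API_KEY is set in the environment); it shows how a trending query is routed without the caller touching provider parameters:

from services.research.core import ParameterOptimizer, ResearchContext, ResearchDepth

optimizer = ParameterOptimizer()
context = ResearchContext(
    query="latest AI regulation news in the EU",
    depth=ResearchDepth.COMPREHENSIVE,
)
provider, config = optimizer.optimize(context)
# With Exa configured this selects EXA, uses neural search for comprehensive depth,
# and sets include_trends because the query matches TRENDING_PATTERNS.
print(provider.value, config.mode.value)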
198
backend/services/research/core/research_context.py
Normal file
@@ -0,0 +1,198 @@
"""
Research Context Schema

Defines the unified input schema for the Research Engine.
Any tool (Blog Writer, Podcast Maker, YouTube Creator) can create a ResearchContext
and pass it to the Research Engine.

Author: ALwrity Team
Version: 2.0
"""

from enum import Enum
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field


class ContentType(str, Enum):
    """Type of content being created - affects research focus."""
    BLOG = "blog"
    PODCAST = "podcast"
    VIDEO = "video"
    SOCIAL = "social"
    EMAIL = "email"
    NEWSLETTER = "newsletter"
    WHITEPAPER = "whitepaper"
    GENERAL = "general"


class ResearchGoal(str, Enum):
    """Primary goal of the research - affects provider selection and depth."""
    FACTUAL = "factual"              # Stats, data, citations
    TRENDING = "trending"            # Current trends, news
    COMPETITIVE = "competitive"      # Competitor analysis
    EDUCATIONAL = "educational"      # How-to, explanations
    INSPIRATIONAL = "inspirational"  # Stories, quotes
    TECHNICAL = "technical"          # Deep technical content


class ResearchDepth(str, Enum):
    """Depth of research - maps to existing ResearchMode."""
    QUICK = "quick"                  # Fast, surface-level (maps to BASIC)
    STANDARD = "standard"            # Balanced depth (maps to BASIC with more sources)
    COMPREHENSIVE = "comprehensive"  # Deep research (maps to COMPREHENSIVE)
    EXPERT = "expert"                # Maximum depth with expert sources


class ProviderPreference(str, Enum):
    """Provider preference - AUTO lets the engine decide."""
    AUTO = "auto"        # AI decides based on query (default)
    EXA = "exa"          # Force Exa neural search
    TAVILY = "tavily"    # Force Tavily AI search
    GOOGLE = "google"    # Force Google grounding
    HYBRID = "hybrid"    # Use multiple providers


class ResearchPersonalizationContext(BaseModel):
    """
    Context from the calling tool (Blog Writer, Podcast Maker, etc.)
    This personalizes the research without the Research Engine knowing
    the specific tool implementation.
    """
    # Who is creating the content
    creator_id: Optional[str] = None  # Clerk user ID

    # Content context
    content_type: ContentType = ContentType.GENERAL
    industry: Optional[str] = None
    target_audience: Optional[str] = None
    tone: Optional[str] = None  # professional, casual, technical, etc.

    # Persona data (from onboarding)
    persona_id: Optional[str] = None
    brand_voice: Optional[str] = None
    competitor_urls: List[str] = Field(default_factory=list)

    # Content requirements
    word_count_target: Optional[int] = None
    include_statistics: bool = True
    include_expert_quotes: bool = True
    include_case_studies: bool = False
    include_visuals: bool = False

    # Platform-specific hints
    platform: Optional[str] = None  # medium, wordpress, youtube, spotify, etc.

    class Config:
        use_enum_values = True


class ResearchContext(BaseModel):
    """
    Main input schema for the Research Engine.

    This is what any tool passes to the Research Engine to get research results.
    The engine uses AI to optimize parameters based on this context.
    """
    # Primary research input
    query: str = Field(..., description="Main research query or topic")
    keywords: List[str] = Field(default_factory=list, description="Additional keywords")

    # Research configuration
    goal: ResearchGoal = ResearchGoal.FACTUAL
    depth: ResearchDepth = ResearchDepth.STANDARD
    provider_preference: ProviderPreference = ProviderPreference.AUTO

    # Personalization from calling tool
    personalization: Optional[ResearchPersonalizationContext] = None

    # Constraints
    max_sources: int = Field(default=10, ge=1, le=25)
    recency: Optional[str] = None  # "day", "week", "month", "year", None for all-time

    # Domain filtering
    include_domains: List[str] = Field(default_factory=list)
    exclude_domains: List[str] = Field(default_factory=list)

    # Advanced mode (exposes raw provider parameters)
    advanced_mode: bool = False

    # Raw provider parameters (only used if advanced_mode=True)
    # Exa-specific
    exa_category: Optional[str] = None
    exa_search_type: Optional[str] = None  # auto, keyword, neural

    # Tavily-specific
    tavily_topic: Optional[str] = None  # general, news, finance
    tavily_search_depth: Optional[str] = None  # basic, advanced
    tavily_include_answer: bool = False
    tavily_include_raw_content: bool = False
    tavily_time_range: Optional[str] = None
    tavily_country: Optional[str] = None

    class Config:
        use_enum_values = True

    def get_effective_query(self) -> str:
        """Build effective query combining query and keywords."""
        if self.keywords:
            return f"{self.query} {' '.join(self.keywords)}"
        return self.query

    def get_industry(self) -> str:
        """Get industry from personalization or default."""
        if self.personalization and self.personalization.industry:
            return self.personalization.industry
        return "General"

    def get_audience(self) -> str:
        """Get target audience from personalization or default."""
        if self.personalization and self.personalization.target_audience:
            return self.personalization.target_audience
        return "General"

    def get_user_id(self) -> Optional[str]:
        """Get user ID from personalization."""
        if self.personalization:
            return self.personalization.creator_id
        return None


class ResearchResult(BaseModel):
    """
    Output schema from the Research Engine.
    Standardized format that any tool can consume.
    """
    success: bool = True

    # Content
    summary: Optional[str] = None  # AI-generated summary of findings
    raw_content: Optional[str] = None  # Raw aggregated content for LLM processing

    # Sources
    sources: List[Dict[str, Any]] = Field(default_factory=list)

    # Analysis (reuses existing blog writer analysis)
    keyword_analysis: Dict[str, Any] = Field(default_factory=dict)
    competitor_analysis: Dict[str, Any] = Field(default_factory=dict)
    suggested_angles: List[str] = Field(default_factory=list)

    # Metadata
    provider_used: str = "google"  # Which provider was actually used
    search_queries: List[str] = Field(default_factory=list)
    grounding_metadata: Optional[Dict[str, Any]] = None

    # Cost tracking
    estimated_cost: float = 0.0

    # Error handling
    error_message: Optional[str] = None
    error_code: Optional[str] = None
    retry_suggested: bool = False

    # Original context for reference
    original_query: Optional[str] = None

    class Config:
        use_enum_values = True
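A short construction sketch for the schemas above; all values are illustrative. It also exercises the helper methods the engine relies on:

ctx = ResearchContext(
    query="sustainable packaging trends",
    keywords=["biodegradable", "e-commerce"],
    goal=ResearchGoal.TRENDING,
    depth=ResearchDepth.STANDARD,
    personalization=ResearchPersonalizationContext(
        creator_id="user_123",            # placeholder Clerk ID
        content_type=ContentType.NEWSLETTER,
        industry="Retail",
        target_audience="E-commerce founders",
    ),
)
assert ctx.get_effective_query() == "sustainable packaging trends biodegradable e-commerce"
assert ctx.get_industry() == "Retail"
assert ctx.get_user_id() == "user_123"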
558
backend/services/research/core/research_engine.py
Normal file
@@ -0,0 +1,558 @@
|
||||
"""
|
||||
Research Engine - Core Orchestrator
|
||||
|
||||
The main entry point for AI research across all ALwrity tools.
|
||||
This engine wraps existing providers (Exa, Tavily, Google) and provides
|
||||
a unified interface for any content generation tool.
|
||||
|
||||
Usage:
|
||||
from services.research.core import ResearchEngine, ResearchContext, ContentType
|
||||
|
||||
engine = ResearchEngine()
|
||||
result = await engine.research(ResearchContext(
|
||||
query="AI trends in healthcare 2025",
|
||||
content_type=ContentType.PODCAST,
|
||||
personalization=ResearchPersonalizationContext(
|
||||
industry="Healthcare",
|
||||
target_audience="Medical professionals"
|
||||
)
|
||||
))
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 2.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Dict, Any, Optional, Callable
|
||||
from loguru import logger
|
||||
|
||||
from .research_context import (
|
||||
ResearchContext,
|
||||
ResearchResult,
|
||||
ResearchDepth,
|
||||
ContentType,
|
||||
ResearchPersonalizationContext,
|
||||
)
|
||||
from .parameter_optimizer import ParameterOptimizer
|
||||
|
||||
# Reuse existing blog writer models and services
|
||||
from models.blog_models import (
|
||||
BlogResearchRequest,
|
||||
BlogResearchResponse,
|
||||
ResearchConfig,
|
||||
ResearchProvider,
|
||||
ResearchMode,
|
||||
PersonaInfo,
|
||||
ResearchSource,
|
||||
)
|
||||
|
||||
# Research persona for personalization
|
||||
from models.research_persona_models import ResearchPersona
|
||||
|
||||
|
||||
class ResearchEngine:
|
||||
"""
|
||||
AI Research Engine - Standalone module for content research.
|
||||
|
||||
This engine:
|
||||
1. Accepts a ResearchContext from any tool
|
||||
2. Uses AI to optimize parameters for Exa/Tavily
|
||||
3. Integrates research persona for personalization
|
||||
4. Executes research using existing providers
|
||||
5. Returns standardized ResearchResult
|
||||
|
||||
Can be imported by Blog Writer, Podcast Maker, YouTube Creator, etc.
|
||||
"""
|
||||
|
||||
def __init__(self, db_session=None):
|
||||
"""Initialize the Research Engine."""
|
||||
self.optimizer = ParameterOptimizer()
|
||||
self._providers_initialized = False
|
||||
self._exa_provider = None
|
||||
self._tavily_provider = None
|
||||
self._google_provider = None
|
||||
self._db_session = db_session
|
||||
|
||||
# Check provider availability
|
||||
self.exa_available = bool(os.getenv("EXA_API_KEY"))
|
||||
self.tavily_available = bool(os.getenv("TAVILY_API_KEY"))
|
||||
|
||||
logger.info(f"ResearchEngine initialized: exa={self.exa_available}, tavily={self.tavily_available}")
|
||||
|
||||
def _get_research_persona(self, user_id: str, generate_if_missing: bool = True) -> Optional[ResearchPersona]:
|
||||
"""
|
||||
Fetch research persona for user, generating if missing.
|
||||
|
||||
Phase 2: Since onboarding is mandatory and always completes before accessing
|
||||
any tool, we can safely generate research persona on first use. This ensures
|
||||
hyper-personalization without requiring "General" fallbacks.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
generate_if_missing: If True, generate persona if not cached (default: True)
|
||||
|
||||
Returns:
|
||||
ResearchPersona if successful, None only if user has no core persona
|
||||
"""
|
||||
if not user_id:
|
||||
return None
|
||||
|
||||
try:
|
||||
from services.research.research_persona_service import ResearchPersonaService
|
||||
|
||||
db = self._db_session
|
||||
if not db:
|
||||
from services.database import get_db_session
|
||||
db = get_db_session()
|
||||
|
||||
persona_service = ResearchPersonaService(db_session=db)
|
||||
|
||||
if generate_if_missing:
|
||||
# Phase 2: Use get_or_generate() to create persona on first visit
|
||||
# This triggers LLM call if not cached, but onboarding guarantees
|
||||
# core persona exists, so generation will succeed
|
||||
logger.info(f"🔄 Getting/generating research persona for user {user_id}...")
|
||||
persona = persona_service.get_or_generate(user_id, force_refresh=False)
|
||||
|
||||
if persona:
|
||||
logger.info(f"✅ Research persona ready for user {user_id}: industry={persona.default_industry}")
|
||||
else:
|
||||
logger.warning(f"⚠️ Could not get/generate research persona for user {user_id} - using core persona fallback")
|
||||
else:
|
||||
# Fast path: only return cached (for config endpoints)
|
||||
persona = persona_service.get_cached_only(user_id)
|
||||
if persona:
|
||||
logger.debug(f"Research persona loaded from cache for user {user_id}")
|
||||
|
||||
return persona
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load research persona for user {user_id}: {e}")
|
||||
return None
|
||||
|
||||
def _enrich_context_with_persona(
|
||||
self,
|
||||
context: ResearchContext,
|
||||
persona: ResearchPersona
|
||||
) -> ResearchContext:
|
||||
"""
|
||||
Enrich the research context with persona data.
|
||||
|
||||
Only applies persona defaults if the context doesn't already have values.
|
||||
User-provided values always take precedence.
|
||||
"""
|
||||
# Create personalization context if not exists
|
||||
if not context.personalization:
|
||||
context.personalization = ResearchPersonalizationContext()
|
||||
|
||||
# Apply persona defaults only if not already set
|
||||
if not context.personalization.industry or context.personalization.industry == "General":
|
||||
if persona.default_industry:
|
||||
context.personalization.industry = persona.default_industry
|
||||
logger.debug(f"Applied persona industry: {persona.default_industry}")
|
||||
|
||||
if not context.personalization.target_audience or context.personalization.target_audience == "General":
|
||||
if persona.default_target_audience:
|
||||
context.personalization.target_audience = persona.default_target_audience
|
||||
logger.debug(f"Applied persona target_audience: {persona.default_target_audience}")
|
||||
|
||||
# Apply suggested Exa domains if not already set
|
||||
if not context.include_domains and persona.suggested_exa_domains:
|
||||
context.include_domains = persona.suggested_exa_domains[:6] # Limit to 6 domains
|
||||
logger.debug(f"Applied persona domains: {context.include_domains}")
|
||||
|
||||
# Apply suggested Exa category if not already set
|
||||
if not context.exa_category and persona.suggested_exa_category:
|
||||
context.exa_category = persona.suggested_exa_category
|
||||
logger.debug(f"Applied persona exa_category: {persona.suggested_exa_category}")
|
||||
|
||||
return context
|
||||
|
||||
async def research(
|
||||
self,
|
||||
context: ResearchContext,
|
||||
progress_callback: Optional[Callable[[str], None]] = None
|
||||
) -> ResearchResult:
|
||||
"""
|
||||
Execute research based on the given context.
|
||||
|
||||
Args:
|
||||
context: Research context with query, goals, and personalization
|
||||
progress_callback: Optional callback for progress updates
|
||||
|
||||
Returns:
|
||||
ResearchResult with sources, analysis, and content
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Progress update
|
||||
self._progress(progress_callback, "🔍 Analyzing research query...")
|
||||
|
||||
# Enrich context with research persona (Phase 2: generate if missing)
|
||||
user_id = context.get_user_id()
|
||||
if user_id:
|
||||
self._progress(progress_callback, "👤 Loading personalized research profile...")
|
||||
persona = self._get_research_persona(user_id, generate_if_missing=True)
|
||||
if persona:
|
||||
self._progress(progress_callback, "✨ Applying hyper-personalized settings...")
|
||||
context = self._enrich_context_with_persona(context, persona)
|
||||
else:
|
||||
logger.warning(f"No research persona available for user {user_id} - proceeding with provided context")
|
||||
|
||||
# Optimize parameters based on enriched context
|
||||
provider, config = self.optimizer.optimize(context)
|
||||
|
||||
self._progress(progress_callback, f"🤖 Selected {provider.value.upper()} for research")
|
||||
|
||||
# Build the request using existing blog models
|
||||
request = self._build_request(context, config)
|
||||
user_id = context.get_user_id() or ""
|
||||
|
||||
# Execute research using appropriate provider
|
||||
self._progress(progress_callback, f"🌐 Connecting to {provider.value} search...")
|
||||
|
||||
if provider == ResearchProvider.EXA:
|
||||
response = await self._execute_exa_research(request, config, user_id, progress_callback)
|
||||
elif provider == ResearchProvider.TAVILY:
|
||||
response = await self._execute_tavily_research(request, config, user_id, progress_callback)
|
||||
else:
|
||||
response = await self._execute_google_research(request, config, user_id, progress_callback)
|
||||
|
||||
# Transform response to ResearchResult
|
||||
self._progress(progress_callback, "📊 Processing results...")
|
||||
|
||||
result = self._transform_response(response, provider, context)
|
||||
|
||||
duration_ms = (time.time() - start_time) * 1000
|
||||
logger.info(f"Research completed in {duration_ms:.0f}ms: {len(result.sources)} sources")
|
||||
|
||||
self._progress(progress_callback, f"✅ Research complete: {len(result.sources)} sources found")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Research failed: {e}")
|
||||
return ResearchResult(
|
||||
success=False,
|
||||
error_message=str(e),
|
||||
error_code="RESEARCH_FAILED",
|
||||
retry_suggested=True,
|
||||
original_query=context.query
|
||||
)
|
||||
|
||||
def _progress(self, callback: Optional[Callable[[str], None]], message: str):
|
||||
"""Send progress update if callback provided."""
|
||||
if callback:
|
||||
callback(message)
|
||||
logger.info(f"[Research] {message}")
|
||||
|
||||
def _build_request(self, context: ResearchContext, config: ResearchConfig) -> BlogResearchRequest:
|
||||
"""Build BlogResearchRequest from ResearchContext."""
|
||||
|
||||
# Extract keywords from query
|
||||
keywords = context.keywords if context.keywords else [context.query]
|
||||
|
||||
# Build persona info from personalization
|
||||
persona = None
|
||||
if context.personalization:
|
||||
persona = PersonaInfo(
|
||||
persona_id=context.personalization.persona_id,
|
||||
tone=context.personalization.tone,
|
||||
audience=context.personalization.target_audience,
|
||||
industry=context.personalization.industry,
|
||||
)
|
||||
|
||||
return BlogResearchRequest(
|
||||
keywords=keywords,
|
||||
topic=context.query,
|
||||
industry=context.get_industry(),
|
||||
target_audience=context.get_audience(),
|
||||
tone=context.personalization.tone if context.personalization else None,
|
||||
word_count_target=context.personalization.word_count_target if context.personalization else 1500,
|
||||
persona=persona,
|
||||
research_mode=config.mode,
|
||||
config=config,
|
||||
)
|
||||
|
||||
async def _execute_exa_research(
|
||||
self,
|
||||
request: BlogResearchRequest,
|
||||
config: ResearchConfig,
|
||||
user_id: str,
|
||||
progress_callback: Optional[Callable[[str], None]] = None
|
||||
) -> BlogResearchResponse:
|
||||
"""Execute research using Exa provider."""
|
||||
from services.blog_writer.research.exa_provider import ExaResearchProvider
|
||||
from services.blog_writer.research.research_strategies import get_strategy_for_mode
|
||||
|
||||
self._progress(progress_callback, "🔍 Executing Exa neural search...")
|
||||
|
||||
# Get strategy for building prompt
|
||||
strategy = get_strategy_for_mode(config.mode)
|
||||
topic = request.topic or ", ".join(request.keywords)
|
||||
industry = request.industry or "General"
|
||||
target_audience = request.target_audience or "General"
|
||||
|
||||
research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
|
||||
|
||||
# Execute Exa search
|
||||
try:
|
||||
exa_provider = ExaResearchProvider()
|
||||
raw_result = await exa_provider.search(
|
||||
research_prompt, topic, industry, target_audience, config, user_id
|
||||
)
|
||||
|
||||
# Track usage
|
||||
cost = raw_result.get('cost', {}).get('total', 0.005) if isinstance(raw_result.get('cost'), dict) else 0.005
|
||||
exa_provider.track_exa_usage(user_id, cost)
|
||||
|
||||
self._progress(progress_callback, f"📝 Found {len(raw_result.get('sources', []))} sources")
|
||||
|
||||
# Run common analysis
|
||||
return await self._run_analysis(request, raw_result, config, user_id, progress_callback)
|
||||
|
||||
except RuntimeError as e:
|
||||
if "EXA_API_KEY not configured" in str(e):
|
||||
logger.warning("Exa not configured, falling back to Tavily")
|
||||
self._progress(progress_callback, "⚠️ Exa unavailable, trying Tavily...")
|
||||
config.provider = ResearchProvider.TAVILY
|
||||
return await self._execute_tavily_research(request, config, user_id, progress_callback)
|
||||
raise
|
||||
|
||||
async def _execute_tavily_research(
|
||||
self,
|
||||
request: BlogResearchRequest,
|
||||
config: ResearchConfig,
|
||||
user_id: str,
|
||||
progress_callback: Optional[Callable[[str], None]] = None
|
||||
) -> BlogResearchResponse:
|
||||
"""Execute research using Tavily provider."""
|
||||
from services.blog_writer.research.tavily_provider import TavilyResearchProvider
|
||||
from services.blog_writer.research.research_strategies import get_strategy_for_mode
|
||||
|
||||
self._progress(progress_callback, "🔍 Executing Tavily AI search...")
|
||||
|
||||
# Get strategy for building prompt
|
||||
strategy = get_strategy_for_mode(config.mode)
|
||||
topic = request.topic or ", ".join(request.keywords)
|
||||
industry = request.industry or "General"
|
||||
target_audience = request.target_audience or "General"
|
||||
|
||||
research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
|
||||
|
||||
# Execute Tavily search
|
||||
try:
|
||||
tavily_provider = TavilyResearchProvider()
|
||||
raw_result = await tavily_provider.search(
|
||||
research_prompt, topic, industry, target_audience, config, user_id
|
||||
)
|
||||
|
||||
# Track usage
|
||||
cost = raw_result.get('cost', {}).get('total', 0.001) if isinstance(raw_result.get('cost'), dict) else 0.001
|
||||
search_depth = config.tavily_search_depth or "basic"
|
||||
tavily_provider.track_tavily_usage(user_id, cost, search_depth)
|
||||
|
||||
self._progress(progress_callback, f"📝 Found {len(raw_result.get('sources', []))} sources")
|
||||
|
||||
# Run common analysis
|
||||
return await self._run_analysis(request, raw_result, config, user_id, progress_callback)
|
||||
|
||||
except RuntimeError as e:
|
||||
if "TAVILY_API_KEY not configured" in str(e):
|
||||
logger.warning("Tavily not configured, falling back to Google")
|
||||
self._progress(progress_callback, "⚠️ Tavily unavailable, using Google Search...")
|
||||
config.provider = ResearchProvider.GOOGLE
|
||||
return await self._execute_google_research(request, config, user_id, progress_callback)
|
||||
raise
|
||||
|
||||
async def _execute_google_research(
|
||||
self,
|
||||
request: BlogResearchRequest,
|
||||
config: ResearchConfig,
|
||||
user_id: str,
|
||||
progress_callback: Optional[Callable[[str], None]] = None
|
||||
) -> BlogResearchResponse:
|
||||
"""Execute research using Google/Gemini grounding."""
|
||||
from services.blog_writer.research.google_provider import GoogleResearchProvider
|
||||
from services.blog_writer.research.research_strategies import get_strategy_for_mode
|
||||
|
||||
self._progress(progress_callback, "🔍 Executing Google Search grounding...")
|
||||
|
||||
# Get strategy for building prompt
|
||||
strategy = get_strategy_for_mode(config.mode)
|
||||
topic = request.topic or ", ".join(request.keywords)
|
||||
industry = request.industry or "General"
|
||||
target_audience = request.target_audience or "General"
|
||||
|
||||
research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
|
||||
|
||||
# Execute Google search
|
||||
google_provider = GoogleResearchProvider()
|
||||
raw_result = await google_provider.search(
|
||||
research_prompt, topic, industry, target_audience, config, user_id
|
||||
)
|
||||
|
||||
self._progress(progress_callback, "📝 Processing grounded results...")
|
||||
|
||||
# Run common analysis
|
||||
return await self._run_analysis(request, raw_result, config, user_id, progress_callback, is_google=True)
|
||||
|
||||
async def _run_analysis(
|
||||
self,
|
||||
request: BlogResearchRequest,
|
||||
raw_result: Dict[str, Any],
|
||||
config: ResearchConfig,
|
||||
user_id: str,
|
||||
progress_callback: Optional[Callable[[str], None]] = None,
|
||||
is_google: bool = False
|
||||
) -> BlogResearchResponse:
|
||||
"""Run common analysis on raw results."""
|
||||
from services.blog_writer.research.keyword_analyzer import KeywordAnalyzer
|
||||
from services.blog_writer.research.competitor_analyzer import CompetitorAnalyzer
|
||||
from services.blog_writer.research.content_angle_generator import ContentAngleGenerator
|
||||
from services.blog_writer.research.data_filter import ResearchDataFilter
|
||||
|
||||
self._progress(progress_callback, "🔍 Analyzing keywords and content angles...")
|
||||
|
||||
# Extract content for analysis
|
||||
if is_google:
|
||||
content = raw_result.get("content", "")
|
||||
sources = self._extract_sources_from_grounding(raw_result)
|
||||
search_queries = raw_result.get("search_queries", []) or []
|
||||
grounding_metadata = self._extract_grounding_metadata(raw_result)
|
||||
else:
|
||||
content = raw_result.get('content', '')
|
||||
sources = [ResearchSource(**s) if isinstance(s, dict) else s for s in raw_result.get('sources', [])]
|
||||
search_queries = raw_result.get('search_queries', [])
|
||||
grounding_metadata = None
|
||||
|
||||
topic = request.topic or ", ".join(request.keywords)
|
||||
industry = request.industry or "General"
|
||||
|
||||
# Run analyzers
|
||||
keyword_analyzer = KeywordAnalyzer()
|
||||
competitor_analyzer = CompetitorAnalyzer()
|
||||
content_angle_generator = ContentAngleGenerator()
|
||||
data_filter = ResearchDataFilter()
|
||||
|
||||
keyword_analysis = keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
|
||||
competitor_analysis = competitor_analyzer.analyze(content, user_id=user_id)
|
||||
suggested_angles = content_angle_generator.generate(content, topic, industry, user_id=user_id)
|
||||
|
||||
# Build response
|
||||
response = BlogResearchResponse(
|
||||
success=True,
|
||||
sources=sources,
|
||||
keyword_analysis=keyword_analysis,
|
||||
competitor_analysis=competitor_analysis,
|
||||
suggested_angles=suggested_angles,
|
||||
search_widget="",
|
||||
search_queries=search_queries,
|
||||
grounding_metadata=grounding_metadata,
|
||||
original_keywords=request.keywords,
|
||||
)
|
||||
|
||||
# Filter and clean research data
|
||||
self._progress(progress_callback, "✨ Filtering and optimizing results...")
|
||||
filtered_response = data_filter.filter_research_data(response)
|
||||
|
||||
return filtered_response
|
||||
|
||||
def _extract_sources_from_grounding(self, gemini_result: Dict[str, Any]) -> list:
|
||||
"""Extract sources from Gemini grounding metadata."""
|
||||
from models.blog_models import ResearchSource
|
||||
|
||||
sources = []
|
||||
if not gemini_result or not isinstance(gemini_result, dict):
|
||||
return sources
|
||||
|
||||
raw_sources = gemini_result.get("sources", []) or []
|
||||
|
||||
for src in raw_sources:
|
||||
source = ResearchSource(
|
||||
title=src.get("title", "Untitled"),
|
||||
url=src.get("url", ""),
|
||||
excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
|
||||
credibility_score=float(src.get("credibility_score", 0.8)),
|
||||
published_at=str(src.get("publication_date", "2024-01-01")),
|
||||
index=src.get("index"),
|
||||
source_type=src.get("type", "web")
|
||||
)
|
||||
sources.append(source)
|
||||
|
||||
return sources
|
||||
|
||||
def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||
"""Extract grounding metadata from Gemini result."""
|
||||
if not gemini_result or not isinstance(gemini_result, dict):
|
||||
return None
|
||||
|
||||
return gemini_result.get("grounding_metadata")
|
||||
|
||||
def _transform_response(
|
||||
self,
|
||||
response: BlogResearchResponse,
|
||||
provider: ResearchProvider,
|
||||
context: ResearchContext
|
||||
) -> ResearchResult:
|
||||
"""Transform BlogResearchResponse to ResearchResult."""
|
||||
|
||||
# Convert sources to dicts
|
||||
sources = []
|
||||
for s in response.sources:
|
||||
if hasattr(s, 'dict'):
|
||||
sources.append(s.dict())
|
||||
elif isinstance(s, dict):
|
||||
sources.append(s)
|
||||
else:
|
||||
sources.append({
|
||||
'title': getattr(s, 'title', ''),
|
||||
'url': getattr(s, 'url', ''),
|
||||
'excerpt': getattr(s, 'excerpt', ''),
|
||||
})
|
||||
|
||||
# Extract grounding metadata
|
||||
grounding = None
|
||||
if response.grounding_metadata:
|
||||
if hasattr(response.grounding_metadata, 'dict'):
|
||||
grounding = response.grounding_metadata.dict()
|
||||
else:
|
||||
grounding = response.grounding_metadata
|
||||
|
||||
return ResearchResult(
|
||||
success=response.success,
|
||||
sources=sources,
|
||||
keyword_analysis=response.keyword_analysis,
|
||||
competitor_analysis=response.competitor_analysis,
|
||||
suggested_angles=response.suggested_angles,
|
||||
provider_used=provider.value,
|
||||
search_queries=response.search_queries,
|
||||
grounding_metadata=grounding,
|
||||
original_query=context.query,
|
||||
error_message=response.error_message,
|
||||
error_code=response.error_code if hasattr(response, 'error_code') else None,
|
||||
retry_suggested=response.retry_suggested if hasattr(response, 'retry_suggested') else False,
|
||||
)
|
||||
|
||||
    def get_provider_status(self) -> Dict[str, Any]:
        """Get status of available providers."""
        return {
            "exa": {
                "available": self.exa_available,
                "priority": 1,
                "description": "Neural search for semantic understanding"
            },
            "tavily": {
                "available": self.tavily_available,
                "priority": 2,
                "description": "AI-powered web search"
            },
            "google": {
                "available": True,  # Always available via Gemini
                "priority": 3,
                "description": "Google Search grounding"
            }
        }
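For orientation, a minimal sketch (not part of the service) of how a caller could use this status map to pick the highest-priority provider that is actually available; the engine's own fallback logic may differ.

def pick_provider(engine) -> str:
    """Hypothetical helper: choose the available provider with the lowest priority number."""
    status = engine.get_provider_status()
    available = [(info["priority"], name) for name, info in status.items() if info["available"]]
    # "google" is reported as always available, so the list is never empty in practice.
    return min(available)[1] if available else "google"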
794
backend/services/research/exa_service.py
Normal file
@@ -0,0 +1,794 @@
"""
Exa API Service for ALwrity

This service provides competitor discovery and analysis using the Exa API,
which uses neural search to find semantically similar websites and content.

Key Features:
- Competitor discovery using neural search
- Content analysis and summarization
- Competitive intelligence gathering
- Cost-effective API usage with caching
- Integration with onboarding Step 3

Dependencies:
- exa-py (official Exa SDK)
- loguru (for logging)
- os (for environment variables)

Author: ALwrity Team
Version: 1.0
Last Updated: January 2025
"""

import os
import json
import asyncio
from typing import Dict, List, Optional, Any, Union
from datetime import datetime, timedelta
from loguru import logger
from urllib.parse import urlparse
from exa_py import Exa

class ExaService:
|
||||
"""
|
||||
Service for competitor discovery and analysis using the Exa API.
|
||||
|
||||
This service provides neural search capabilities to find semantically similar
|
||||
websites and analyze their content for competitive intelligence.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Exa Service with API credentials."""
|
||||
self.api_key = os.getenv("EXA_API_KEY")
|
||||
self.exa = None
|
||||
self.enabled = False
|
||||
|
||||
# Don't assume key is available at import time in production.
|
||||
# Keys may be injected per-request via middleware, so defer init.
|
||||
self._try_initialize()
|
||||
|
||||
def _try_initialize(self) -> None:
|
||||
"""Attempt to (re)initialize the Exa SDK from current environment."""
|
||||
if self.enabled and self.exa:
|
||||
return
|
||||
try:
|
||||
self.api_key = os.getenv("EXA_API_KEY")
|
||||
if not self.api_key:
|
||||
# Leave disabled; caller may try again after middleware injection
|
||||
logger.warning("EXA_API_KEY not configured; Exa service will be disabled")
|
||||
self.enabled = False
|
||||
self.exa = None
|
||||
return
|
||||
self.exa = Exa(api_key=self.api_key)
|
||||
self.enabled = True
|
||||
logger.info("Exa Service initialized successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize Exa service: {e}")
|
||||
self.enabled = False
|
||||
self.exa = None
|
||||
|
||||
async def discover_competitors(
|
||||
self,
|
||||
user_url: str,
|
||||
num_results: int = 10,
|
||||
include_domains: Optional[List[str]] = None,
|
||||
exclude_domains: Optional[List[str]] = None,
|
||||
industry_context: Optional[str] = None,
|
||||
website_analysis_data: Optional[Dict[str, Any]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Discover competitors for a given website using Exa's neural search.
|
||||
|
||||
Args:
|
||||
user_url: The website URL to find competitors for
|
||||
num_results: Number of competitor results to return (max 100)
|
||||
include_domains: List of domains to include in search
|
||||
exclude_domains: List of domains to exclude from search
|
||||
industry_context: Industry context for better competitor discovery
|
||||
|
||||
Returns:
|
||||
Dictionary containing competitor analysis results
|
||||
"""
|
||||
try:
|
||||
# Ensure we pick up any per-request injected key
|
||||
self._try_initialize()
|
||||
if not self.enabled:
|
||||
raise ValueError("Exa Service is not enabled - API key missing")
|
||||
|
||||
logger.info(f"Starting competitor discovery for: {user_url}")
|
||||
|
||||
# Extract user domain for exclusion
|
||||
user_domain = urlparse(user_url).netloc
|
||||
            exclude_domains_list = list(exclude_domains or [])  # copy so the caller's list is not mutated
|
||||
exclude_domains_list.append(user_domain)
|
||||
|
||||
logger.info(f"Excluding domains: {exclude_domains_list}")
|
||||
|
||||
# Extract insights from website analysis for better targeting
|
||||
include_text_queries = []
|
||||
summary_query = f"Business model, target audience, content strategy{f' in {industry_context}' if industry_context else ''}"
|
||||
|
||||
if website_analysis_data:
|
||||
analysis = website_analysis_data.get('analysis', {})
|
||||
|
||||
# Extract key business terms from the analysis
|
||||
if 'target_audience' in analysis:
|
||||
audience = analysis['target_audience']
|
||||
if isinstance(audience, dict) and 'primary_audience' in audience:
|
||||
primary_audience = audience['primary_audience']
|
||||
if len(primary_audience.split()) <= 5: # Exa limit
|
||||
include_text_queries.append(primary_audience)
|
||||
|
||||
# Use industry context from analysis
|
||||
if 'industry' in analysis and analysis['industry']:
|
||||
industry = analysis['industry']
|
||||
if len(industry.split()) <= 5:
|
||||
include_text_queries.append(industry)
|
||||
|
||||
# Enhance summary query with analysis insights
|
||||
if 'content_type' in analysis:
|
||||
content_type = analysis['content_type']
|
||||
summary_query += f", {content_type} content strategy"
|
||||
|
||||
logger.info(f"Enhanced targeting with analysis data: {include_text_queries}")
|
||||
|
||||
# Use the Exa SDK to find similar links with content and context
|
||||
search_result = self.exa.find_similar_and_contents(
|
||||
url=user_url,
|
||||
num_results=min(num_results, 10), # Exa API limit
|
||||
include_domains=include_domains,
|
||||
exclude_domains=exclude_domains_list,
|
||||
include_text=include_text_queries if include_text_queries else None,
|
||||
text=True,
|
||||
highlights={
|
||||
"numSentences": 2,
|
||||
"highlightsPerUrl": 3,
|
||||
"query": "Unique value proposition, competitive advantages, market position"
|
||||
},
|
||||
summary={
|
||||
"query": summary_query
|
||||
}
|
||||
)
|
||||
|
||||
# TODO: Add context generation once SDK supports it
|
||||
# For now, we'll generate a basic context from the results
|
||||
context_result = None
|
||||
|
||||
# Log the raw Exa API response summary (avoiding verbose markdown content)
|
||||
logger.info(f"📊 Exa API response for {user_url}:")
|
||||
logger.info(f" ├─ Request ID: {getattr(search_result, 'request_id', 'N/A')}")
|
||||
logger.info(f" ├─ Results count: {len(getattr(search_result, 'results', []))}")
|
||||
logger.info(f" └─ Cost: ${getattr(getattr(search_result, 'cost_dollars', None), 'total', 0)}")
|
||||
|
||||
# Note: Full raw response contains verbose markdown content - logging only summary
|
||||
# To see full response, set EXA_DEBUG=true in environment
|
||||
|
||||
# Extract results from search
|
||||
results = getattr(search_result, 'results', [])
|
||||
|
||||
# Log summary of results
|
||||
logger.info(f" - Found {len(results)} competitors")
|
||||
|
||||
# Process and structure the results
|
||||
competitors = self._process_competitor_results(search_result, user_url)
|
||||
|
||||
logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"user_url": user_url,
|
||||
"competitors": competitors,
|
||||
"total_competitors": len(competitors),
|
||||
"analysis_timestamp": datetime.utcnow().isoformat(),
|
||||
"industry_context": industry_context,
|
||||
"api_cost": getattr(getattr(search_result, 'cost_dollars', None), 'total', 0) if hasattr(search_result, 'cost_dollars') and getattr(search_result, 'cost_dollars', None) else 0,
|
||||
"request_id": getattr(search_result, 'request_id', None) if hasattr(search_result, 'request_id') else None
|
||||
}
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.error("Exa API request timed out")
|
||||
return {
|
||||
"success": False,
|
||||
"error": "Request timed out",
|
||||
"details": "The competitor discovery request took too long to complete"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in competitor discovery: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"details": "An unexpected error occurred during competitor discovery"
|
||||
}
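A minimal usage sketch, not taken from the codebase: it assumes EXA_API_KEY is set in the environment and simply prints the discovered competitor domains with their relevance scores (the URL and industry values are hypothetical).

import asyncio

async def demo_discovery() -> None:
    service = ExaService()
    result = await service.discover_competitors(
        user_url="https://example.com",      # hypothetical site
        num_results=5,
        industry_context="Marketing SaaS",   # hypothetical industry
    )
    if result["success"]:
        for comp in result["competitors"]:
            print(comp["domain"], round(comp["relevance_score"], 2))
    else:
        print("Discovery failed:", result["error"])

if __name__ == "__main__":
    asyncio.run(demo_discovery())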
|
||||
|
||||
def _process_competitor_results(self, search_result, user_url: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process and structure the Exa SDK response into competitor data.
|
||||
|
||||
Args:
|
||||
search_result: Response from Exa SDK
|
||||
user_url: Original user URL for reference
|
||||
|
||||
Returns:
|
||||
List of processed competitor data
|
||||
"""
|
||||
competitors = []
|
||||
user_domain = urlparse(user_url).netloc
|
||||
|
||||
# Extract results from the SDK response
|
||||
results = getattr(search_result, 'results', [])
|
||||
|
||||
for result in results:
|
||||
try:
|
||||
# Extract basic information from the result object
|
||||
competitor_url = getattr(result, 'url', '')
|
||||
competitor_domain = urlparse(competitor_url).netloc
|
||||
|
||||
# Skip if it's the same domain as the user
|
||||
if competitor_domain == user_domain:
|
||||
continue
|
||||
|
||||
# Extract content insights
|
||||
summary = getattr(result, 'summary', '')
|
||||
highlights = getattr(result, 'highlights', [])
|
||||
highlight_scores = getattr(result, 'highlight_scores', [])
|
||||
|
||||
# Calculate competitive relevance score
|
||||
relevance_score = self._calculate_relevance_score(result, user_url)
|
||||
|
||||
competitor_data = {
|
||||
"url": competitor_url,
|
||||
"domain": competitor_domain,
|
||||
"title": getattr(result, 'title', ''),
|
||||
"published_date": getattr(result, 'published_date', None),
|
||||
"author": getattr(result, 'author', None),
|
||||
"favicon": getattr(result, 'favicon', None),
|
||||
"image": getattr(result, 'image', None),
|
||||
"summary": summary,
|
||||
"highlights": highlights,
|
||||
"highlight_scores": highlight_scores,
|
||||
"relevance_score": relevance_score,
|
||||
"competitive_insights": self._extract_competitive_insights(summary, highlights),
|
||||
"content_analysis": self._analyze_content_quality(result)
|
||||
}
|
||||
|
||||
competitors.append(competitor_data)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error processing competitor result: {str(e)}")
|
||||
continue
|
||||
|
||||
# Sort by relevance score (highest first)
|
||||
competitors.sort(key=lambda x: x["relevance_score"], reverse=True)
|
||||
|
||||
return competitors
|
||||
|
||||
def _calculate_relevance_score(self, result, user_url: str) -> float:
|
||||
"""
|
||||
Calculate a relevance score for competitor ranking.
|
||||
|
||||
Args:
|
||||
result: Competitor result from Exa SDK
|
||||
user_url: Original user URL
|
||||
|
||||
Returns:
|
||||
Relevance score between 0 and 1
|
||||
"""
|
||||
score = 0.0
|
||||
|
||||
# Base score from highlight scores
|
||||
highlight_scores = getattr(result, 'highlight_scores', [])
|
||||
if highlight_scores:
|
||||
score += sum(highlight_scores) / len(highlight_scores) * 0.4
|
||||
|
||||
# Score from summary quality
|
||||
summary = getattr(result, 'summary', '')
|
||||
if summary and len(summary) > 100:
|
||||
score += 0.3
|
||||
|
||||
# Score from title relevance
|
||||
title = getattr(result, 'title', '').lower()
|
||||
if any(keyword in title for keyword in ["business", "company", "service", "solution", "platform"]):
|
||||
score += 0.2
|
||||
|
||||
# Score from URL structure similarity
|
||||
competitor_url = getattr(result, 'url', '')
|
||||
if self._url_structure_similarity(user_url, competitor_url) > 0.5:
|
||||
score += 0.1
|
||||
|
||||
return min(score, 1.0)
|
||||
|
||||
def _url_structure_similarity(self, url1: str, url2: str) -> float:
|
||||
"""
|
||||
Calculate URL structure similarity.
|
||||
|
||||
Args:
|
||||
url1: First URL
|
||||
url2: Second URL
|
||||
|
||||
Returns:
|
||||
Similarity score between 0 and 1
|
||||
"""
|
||||
try:
|
||||
parsed1 = urlparse(url1)
|
||||
parsed2 = urlparse(url2)
|
||||
|
||||
# Compare path structure
|
||||
path1_parts = [part for part in parsed1.path.split('/') if part]
|
||||
path2_parts = [part for part in parsed2.path.split('/') if part]
|
||||
|
||||
if not path1_parts or not path2_parts:
|
||||
return 0.0
|
||||
|
||||
# Calculate similarity based on path length and structure
|
||||
max_parts = max(len(path1_parts), len(path2_parts))
|
||||
common_parts = sum(1 for p1, p2 in zip(path1_parts, path2_parts) if p1 == p2)
|
||||
|
||||
return common_parts / max_parts
|
||||
|
||||
except Exception:
|
||||
return 0.0
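As a quick illustration of the formula above (the URLs are made up):

# /blog/ai-tools vs /blog/ai-trends: 1 of max(2, 2) path segments match.
expected_similarity = 1 / 2  # _url_structure_similarity would return 0.5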
|
||||
|
||||
def _extract_competitive_insights(self, summary: str, highlights: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract competitive insights from summary and highlights.
|
||||
|
||||
Args:
|
||||
summary: Content summary
|
||||
highlights: Content highlights
|
||||
|
||||
Returns:
|
||||
Dictionary of competitive insights
|
||||
"""
|
||||
insights = {
|
||||
"business_model": "",
|
||||
"target_audience": "",
|
||||
"value_proposition": "",
|
||||
"competitive_advantages": [],
|
||||
"content_strategy": ""
|
||||
}
|
||||
|
||||
# Combine summary and highlights for analysis
|
||||
content = f"{summary} {' '.join(highlights)}".lower()
|
||||
|
||||
# Extract business model indicators
|
||||
business_models = ["saas", "platform", "service", "product", "consulting", "agency", "marketplace"]
|
||||
for model in business_models:
|
||||
if model in content:
|
||||
insights["business_model"] = model.title()
|
||||
break
|
||||
|
||||
# Extract target audience indicators
|
||||
audiences = ["enterprise", "small business", "startups", "developers", "marketers", "consumers"]
|
||||
for audience in audiences:
|
||||
if audience in content:
|
||||
insights["target_audience"] = audience.title()
|
||||
break
|
||||
|
||||
# Extract value proposition from highlights
|
||||
if highlights:
|
||||
insights["value_proposition"] = highlights[0][:100] + "..." if len(highlights[0]) > 100 else highlights[0]
|
||||
|
||||
return insights
|
||||
|
||||
def _analyze_content_quality(self, result) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze the content quality of a competitor.
|
||||
|
||||
Args:
|
||||
result: Competitor result from Exa SDK
|
||||
|
||||
Returns:
|
||||
Dictionary of content quality metrics
|
||||
"""
|
||||
quality_metrics = {
|
||||
"content_depth": "medium",
|
||||
"technical_sophistication": "medium",
|
||||
"content_freshness": "unknown",
|
||||
"engagement_potential": "medium"
|
||||
}
|
||||
|
||||
# Analyze content depth from summary length
|
||||
summary = getattr(result, 'summary', '')
|
||||
if len(summary) > 300:
|
||||
quality_metrics["content_depth"] = "high"
|
||||
elif len(summary) < 100:
|
||||
quality_metrics["content_depth"] = "low"
|
||||
|
||||
# Analyze technical sophistication
|
||||
technical_keywords = ["api", "integration", "automation", "analytics", "data", "platform"]
|
||||
highlights = getattr(result, 'highlights', [])
|
||||
content_text = f"{summary} {' '.join(highlights)}".lower()
|
||||
|
||||
technical_count = sum(1 for keyword in technical_keywords if keyword in content_text)
|
||||
if technical_count >= 3:
|
||||
quality_metrics["technical_sophistication"] = "high"
|
||||
elif technical_count == 0:
|
||||
quality_metrics["technical_sophistication"] = "low"
|
||||
|
||||
return quality_metrics
|
||||
|
||||
async def discover_social_media_accounts(self, user_url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Discover social media accounts for a given website using Exa's answer API.
|
||||
|
||||
Args:
|
||||
user_url: The website URL to find social media accounts for
|
||||
|
||||
Returns:
|
||||
Dictionary containing social media discovery results
|
||||
"""
|
||||
try:
|
||||
# Ensure we pick up any per-request injected key
|
||||
self._try_initialize()
|
||||
if not self.enabled:
|
||||
raise ValueError("Exa Service is not enabled - API key missing")
|
||||
|
||||
logger.info(f"Starting social media discovery for: {user_url}")
|
||||
|
||||
# Extract domain from URL for better targeting
|
||||
domain = urlparse(user_url).netloc.replace('www.', '')
|
||||
|
||||
# Use Exa's answer API to find social media accounts
|
||||
result = self.exa.answer(
|
||||
f"Find all social media accounts of the url: {domain}. Return a JSON object with facebook, twitter, instagram, linkedin, youtube, and tiktok fields containing the URLs or empty strings if not found.",
|
||||
model="exa-pro",
|
||||
text=True
|
||||
)
|
||||
|
||||
# Log the raw Exa API response for debugging
|
||||
logger.info(f"Raw Exa social media response for {user_url}:")
|
||||
logger.info(f" - Request ID: {getattr(result, 'request_id', 'N/A')}")
|
||||
logger.info(f" └─ Cost: ${getattr(getattr(result, 'cost_dollars', None), 'total', 0)}")
|
||||
# Note: Full raw response contains verbose content - logging only summary
|
||||
# To see full response, set EXA_DEBUG=true in environment
|
||||
|
||||
# Extract social media data
|
||||
answer_text = getattr(result, 'answer', '')
|
||||
citations = getattr(result, 'citations', [])
|
||||
|
||||
# Convert AnswerResult objects to dictionaries for JSON serialization
|
||||
citations_dicts = []
|
||||
for citation in citations:
|
||||
if hasattr(citation, '__dict__'):
|
||||
# Convert object to dictionary
|
||||
citation_dict = {
|
||||
'id': getattr(citation, 'id', ''),
|
||||
'title': getattr(citation, 'title', ''),
|
||||
'url': getattr(citation, 'url', ''),
|
||||
'text': getattr(citation, 'text', ''),
|
||||
'snippet': getattr(citation, 'snippet', ''),
|
||||
'published_date': getattr(citation, 'published_date', None),
|
||||
'author': getattr(citation, 'author', None),
|
||||
'image': getattr(citation, 'image', None),
|
||||
'favicon': getattr(citation, 'favicon', None)
|
||||
}
|
||||
citations_dicts.append(citation_dict)
|
||||
else:
|
||||
# If it's already a dict, use as is
|
||||
citations_dicts.append(citation)
|
||||
|
||||
logger.info(f" - Raw answer text: {answer_text}")
|
||||
logger.info(f" - Citations count: {len(citations_dicts)}")
|
||||
|
||||
# Parse the response from the answer (could be JSON or markdown format)
|
||||
try:
|
||||
import json
|
||||
import re
|
||||
|
||||
if answer_text.strip().startswith('{'):
|
||||
# Direct JSON format
|
||||
answer_data = json.loads(answer_text.strip())
|
||||
else:
|
||||
# Parse markdown format with URLs
|
||||
answer_data = {
|
||||
"facebook": "",
|
||||
"twitter": "",
|
||||
"instagram": "",
|
||||
"linkedin": "",
|
||||
"youtube": "",
|
||||
"tiktok": ""
|
||||
}
|
||||
|
||||
# Extract URLs using regex patterns
|
||||
facebook_match = re.search(r'Facebook.*?\[([^\]]+)\]', answer_text)
|
||||
if facebook_match:
|
||||
answer_data["facebook"] = facebook_match.group(1)
|
||||
|
||||
twitter_match = re.search(r'Twitter.*?\[([^\]]+)\]', answer_text)
|
||||
if twitter_match:
|
||||
answer_data["twitter"] = twitter_match.group(1)
|
||||
|
||||
instagram_match = re.search(r'Instagram.*?\[([^\]]+)\]', answer_text)
|
||||
if instagram_match:
|
||||
answer_data["instagram"] = instagram_match.group(1)
|
||||
|
||||
linkedin_match = re.search(r'LinkedIn.*?\[([^\]]+)\]', answer_text)
|
||||
if linkedin_match:
|
||||
answer_data["linkedin"] = linkedin_match.group(1)
|
||||
|
||||
youtube_match = re.search(r'YouTube.*?\[([^\]]+)\]', answer_text)
|
||||
if youtube_match:
|
||||
answer_data["youtube"] = youtube_match.group(1)
|
||||
|
||||
tiktok_match = re.search(r'TikTok.*?\[([^\]]+)\]', answer_text)
|
||||
if tiktok_match:
|
||||
answer_data["tiktok"] = tiktok_match.group(1)
|
||||
|
||||
except (json.JSONDecodeError, AttributeError, KeyError):
|
||||
# If parsing fails, create empty structure
|
||||
answer_data = {
|
||||
"facebook": "",
|
||||
"twitter": "",
|
||||
"instagram": "",
|
||||
"linkedin": "",
|
||||
"youtube": "",
|
||||
"tiktok": ""
|
||||
}
|
||||
|
||||
logger.info(f" - Parsed social media accounts:")
|
||||
for platform, url in answer_data.items():
|
||||
if url:
|
||||
logger.info(f" {platform}: {url}")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"user_url": user_url,
|
||||
"social_media_accounts": answer_data,
|
||||
"citations": citations_dicts,
|
||||
"analysis_timestamp": datetime.utcnow().isoformat(),
|
||||
"api_cost": getattr(getattr(result, 'cost_dollars', None), 'total', 0) if hasattr(result, 'cost_dollars') and getattr(result, 'cost_dollars', None) else 0,
|
||||
"request_id": getattr(result, 'request_id', None) if hasattr(result, 'request_id') else None
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in social media discovery: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"details": "An unexpected error occurred during social media discovery"
|
||||
}
|
||||
|
||||
def _generate_basic_context(self, results: List[Any], user_url: str) -> str:
|
||||
"""
|
||||
Generate a basic context string from competitor results for LLM consumption.
|
||||
|
||||
Args:
|
||||
results: List of competitor results from Exa API
|
||||
user_url: Original user URL for reference
|
||||
|
||||
Returns:
|
||||
Formatted context string
|
||||
"""
|
||||
context_parts = [
|
||||
f"Competitive Analysis for: {user_url}",
|
||||
f"Found {len(results)} similar websites/competitors:",
|
||||
""
|
||||
]
|
||||
|
||||
for i, result in enumerate(results[:5], 1): # Limit to top 5 for context
|
||||
url = getattr(result, 'url', 'Unknown URL')
|
||||
title = getattr(result, 'title', 'Unknown Title')
|
||||
summary = getattr(result, 'summary', 'No summary available')
|
||||
|
||||
context_parts.extend([
|
||||
f"{i}. {title}",
|
||||
f" URL: {url}",
|
||||
f" Summary: {summary[:200]}{'...' if len(summary) > 200 else ''}",
|
||||
""
|
||||
])
|
||||
|
||||
context_parts.append("Key insights:")
|
||||
context_parts.append("- These competitors offer similar services or content")
|
||||
context_parts.append("- Analyze their content strategy and positioning")
|
||||
context_parts.append("- Identify opportunities for differentiation")
|
||||
|
||||
return "\n".join(context_parts)
|
||||
|
||||
async def analyze_competitor_content(
|
||||
self,
|
||||
competitor_url: str,
|
||||
analysis_depth: str = "standard"
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Perform deeper analysis of a specific competitor.
|
||||
|
||||
Args:
|
||||
competitor_url: URL of the competitor to analyze
|
||||
analysis_depth: Depth of analysis ("quick", "standard", "deep")
|
||||
|
||||
Returns:
|
||||
Dictionary containing detailed competitor analysis
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Starting detailed analysis for competitor: {competitor_url}")
|
||||
|
||||
# Get similar content from this competitor
|
||||
similar_results = await self.discover_competitors(
|
||||
competitor_url,
|
||||
num_results=10,
|
||||
include_domains=[urlparse(competitor_url).netloc]
|
||||
)
|
||||
|
||||
if not similar_results["success"]:
|
||||
return similar_results
|
||||
|
||||
# Analyze content patterns
|
||||
content_patterns = self._analyze_content_patterns(similar_results["competitors"])
|
||||
|
||||
# Generate competitive insights
|
||||
competitive_insights = self._generate_competitive_insights(
|
||||
competitor_url,
|
||||
similar_results["competitors"],
|
||||
content_patterns
|
||||
)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"competitor_url": competitor_url,
|
||||
"content_patterns": content_patterns,
|
||||
"competitive_insights": competitive_insights,
|
||||
"analysis_timestamp": datetime.utcnow().isoformat(),
|
||||
"analysis_depth": analysis_depth
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in competitor content analysis: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"details": "An unexpected error occurred during competitor analysis"
|
||||
}
|
||||
|
||||
def _analyze_content_patterns(self, competitors: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze content patterns across competitors.
|
||||
|
||||
Args:
|
||||
competitors: List of competitor data
|
||||
|
||||
Returns:
|
||||
Dictionary of content patterns
|
||||
"""
|
||||
patterns = {
|
||||
"common_themes": [],
|
||||
"content_types": [],
|
||||
"publishing_patterns": {},
|
||||
"target_keywords": [],
|
||||
"content_strategies": []
|
||||
}
|
||||
|
||||
# Analyze common themes
|
||||
all_summaries = [comp.get("summary", "") for comp in competitors]
|
||||
# This would be enhanced with NLP analysis in a full implementation
|
||||
|
||||
# Analyze content types from URLs
|
||||
content_types = set()
|
||||
for comp in competitors:
|
||||
url = comp.get("url", "")
|
||||
if "/blog/" in url:
|
||||
content_types.add("blog")
|
||||
elif "/product/" in url or "/service/" in url:
|
||||
content_types.add("product")
|
||||
elif "/about/" in url:
|
||||
content_types.add("about")
|
||||
elif "/contact/" in url:
|
||||
content_types.add("contact")
|
||||
|
||||
patterns["content_types"] = list(content_types)
|
||||
|
||||
return patterns
|
||||
|
||||
def _generate_competitive_insights(
|
||||
self,
|
||||
competitor_url: str,
|
||||
competitors: List[Dict[str, Any]],
|
||||
content_patterns: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate competitive insights from analysis data.
|
||||
|
||||
Args:
|
||||
competitor_url: URL of the competitor
|
||||
competitors: List of competitor data
|
||||
content_patterns: Content pattern analysis
|
||||
|
||||
Returns:
|
||||
Dictionary of competitive insights
|
||||
"""
|
||||
insights = {
|
||||
"competitive_strengths": [],
|
||||
"content_opportunities": [],
|
||||
"market_positioning": "unknown",
|
||||
"strategic_recommendations": []
|
||||
}
|
||||
|
||||
# Analyze competitive strengths
|
||||
for comp in competitors:
|
||||
if comp.get("relevance_score", 0) > 0.7:
|
||||
insights["competitive_strengths"].append({
|
||||
"strength": comp.get("summary", "")[:100],
|
||||
"relevance": comp.get("relevance_score", 0)
|
||||
})
|
||||
|
||||
# Generate content opportunities
|
||||
if content_patterns.get("content_types"):
|
||||
insights["content_opportunities"] = [
|
||||
f"Develop {content_type} content"
|
||||
for content_type in content_patterns["content_types"]
|
||||
]
|
||||
|
||||
return insights
|
||||
|
||||
def health_check(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check the health of the Exa service.
|
||||
|
||||
Returns:
|
||||
Dictionary containing service health status
|
||||
"""
|
||||
try:
|
||||
# Ensure latest env before health check
|
||||
self._try_initialize()
|
||||
if not self.enabled:
|
||||
return {
|
||||
"status": "disabled",
|
||||
"message": "Exa API key not configured",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
# Test with a simple request using the SDK directly
|
||||
test_result = self.exa.find_similar(
|
||||
url="https://example.com",
|
||||
num_results=1
|
||||
)
|
||||
|
||||
# If we get here without an exception, the API is working
|
||||
return {
|
||||
"status": "healthy",
|
||||
"message": "Exa API is operational",
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"test_successful": True
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"status": "error",
|
||||
"message": f"Health check failed: {str(e)}",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
def get_cost_estimate(self, num_results: int, include_content: bool = True) -> Dict[str, Any]:
|
||||
"""
|
||||
Get cost estimate for Exa API usage.
|
||||
|
||||
Args:
|
||||
num_results: Number of results requested
|
||||
include_content: Whether to include content analysis
|
||||
|
||||
Returns:
|
||||
Dictionary containing cost estimate
|
||||
"""
|
||||
# Exa API pricing (as of documentation)
|
||||
if num_results <= 25:
|
||||
search_cost = 0.005
|
||||
elif num_results <= 100:
|
||||
search_cost = 0.025
|
||||
else:
|
||||
search_cost = 1.0
|
||||
|
||||
content_cost = 0.0
|
||||
if include_content:
|
||||
# Estimate content analysis cost
|
||||
content_cost = num_results * 0.001 # Rough estimate
|
||||
|
||||
total_cost = search_cost + content_cost
|
||||
|
||||
return {
|
||||
"search_cost": search_cost,
|
||||
"content_cost": content_cost,
|
||||
"total_estimated_cost": total_cost,
|
||||
"num_results": num_results,
|
||||
"include_content": include_content
|
||||
}
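A small sketch (hypothetical helper, not in the module) showing how the health check and cost estimate might be combined before a discovery run; the 0.015 figure follows directly from the tiers above (0.005 search + 10 × 0.001 content).

def plan_exa_run(num_results: int = 10) -> dict:
    """Check Exa availability and estimate spend before searching."""
    service = ExaService()  # reads EXA_API_KEY from the environment
    health = service.health_check()
    estimate = service.get_cost_estimate(num_results, include_content=True)
    return {
        "ready": health["status"] == "healthy",
        "estimated_cost": estimate["total_estimated_cost"],  # 0.015 for 10 results
    }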
|
||||
497
backend/services/research/google_search_service.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""
|
||||
Google Search Service for ALwrity
|
||||
|
||||
This service provides real-time industry research using Google Custom Search API,
|
||||
replacing the mock research system with actual web search capabilities.
|
||||
|
||||
Key Features:
|
||||
- Industry-specific search queries
|
||||
- Source credibility scoring and ranking
|
||||
- Content extraction and insight generation
|
||||
- Real-time information from the last month
|
||||
- Fallback mechanisms for API failures
|
||||
|
||||
Dependencies:
|
||||
- aiohttp (for async HTTP requests to the Custom Search API)
- os (for environment variables)
- loguru (for logging)
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
Last Updated: January 2025
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from typing import Dict, List, Optional, Any
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
|
||||
class GoogleSearchService:
|
||||
"""
|
||||
Service for conducting real industry research using Google Custom Search API.
|
||||
|
||||
This service replaces the mock research system with actual web search capabilities,
|
||||
providing current, relevant industry information for content grounding.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the Google Search Service with API credentials."""
|
||||
self.api_key = os.getenv("GOOGLE_SEARCH_API_KEY")
|
||||
self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
|
||||
self.base_url = "https://www.googleapis.com/customsearch/v1"
|
||||
|
||||
if not self.api_key or not self.search_engine_id:
|
||||
raise ValueError("Google Search API credentials not configured. Please set GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables.")
|
||||
else:
|
||||
self.enabled = True
|
||||
logger.info("Google Search Service initialized successfully")
|
||||
|
||||
async def search_industry_trends(
|
||||
self,
|
||||
topic: str,
|
||||
industry: str,
|
||||
max_results: int = 10
|
||||
    ) -> Dict[str, Any]:
|
||||
"""
|
||||
Search for current industry trends and insights.
|
||||
|
||||
Args:
|
||||
topic: The specific topic to research
|
||||
industry: The industry context for the search
|
||||
max_results: Maximum number of search results to return
|
||||
|
||||
Returns:
|
||||
            Dictionary with ranked sources, key insights, statistics, and search metadata
|
||||
"""
|
||||
if not self.enabled:
|
||||
raise RuntimeError("Google Search Service is not enabled. Please configure API credentials.")
|
||||
|
||||
try:
|
||||
# Construct industry-specific search query
|
||||
search_query = self._build_search_query(topic, industry)
|
||||
logger.info(f"Searching for: {search_query}")
|
||||
|
||||
# Perform the search
|
||||
search_results = await self._perform_search(search_query, max_results)
|
||||
|
||||
# Process and rank results
|
||||
processed_results = await self._process_search_results(search_results, topic, industry)
|
||||
|
||||
# Extract insights and statistics
|
||||
insights = await self._extract_insights(processed_results, topic, industry)
|
||||
|
||||
logger.info(f"Search completed successfully. Found {len(processed_results)} relevant sources.")
|
||||
|
||||
return {
|
||||
"sources": processed_results,
|
||||
"key_insights": insights["insights"],
|
||||
"statistics": insights["statistics"],
|
||||
"grounding_enabled": True,
|
||||
"search_query": search_query,
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Google search failed: {str(e)}")
|
||||
raise RuntimeError(f"Google search failed: {str(e)}")
|
||||
|
||||
def _build_search_query(self, topic: str, industry: str) -> str:
|
||||
"""
|
||||
Build an optimized search query for industry research.
|
||||
|
||||
Args:
|
||||
topic: The specific topic to research
|
||||
industry: The industry context
|
||||
|
||||
Returns:
|
||||
Optimized search query string
|
||||
"""
|
||||
# Add industry-specific terms and current year for relevance
|
||||
current_year = datetime.now().year
|
||||
|
||||
# Industry-specific search patterns
|
||||
industry_patterns = {
|
||||
"Technology": ["trends", "innovations", "developments", "insights"],
|
||||
"Healthcare": ["advances", "research", "treatments", "studies"],
|
||||
"Finance": ["market analysis", "trends", "reports", "insights"],
|
||||
"Marketing": ["strategies", "trends", "best practices", "case studies"],
|
||||
"Education": ["innovations", "trends", "research", "best practices"]
|
||||
}
|
||||
|
||||
# Get industry-specific terms
|
||||
industry_terms = industry_patterns.get(industry, ["trends", "insights", "developments"])
|
||||
|
||||
# Build the query
|
||||
query_components = [
|
||||
topic,
|
||||
industry,
|
||||
f"{current_year}",
|
||||
"latest",
|
||||
"trends",
|
||||
"insights"
|
||||
]
|
||||
|
||||
# Add industry-specific terms
|
||||
query_components.extend(industry_terms[:2])
|
||||
|
||||
return " ".join(query_components)
|
||||
|
||||
async def _perform_search(self, query: str, max_results: int) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Perform the actual Google Custom Search API call.
|
||||
|
||||
Args:
|
||||
query: The search query to execute
|
||||
max_results: Maximum number of results to return
|
||||
|
||||
Returns:
|
||||
Raw search results from Google API
|
||||
"""
|
||||
params = {
|
||||
"key": self.api_key,
|
||||
"cx": self.search_engine_id,
|
||||
"q": query,
|
||||
"num": min(max_results, 10), # Google CSE max is 10 per request
|
||||
"dateRestrict": "m1", # Last month
|
||||
"sort": "date", # Sort by date for current information
|
||||
"safe": "active" # Safe search for professional content
|
||||
}
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(self.base_url, params=params) as response:
|
||||
if response.status == 200:
|
||||
data = await response.json()
|
||||
return data.get("items", [])
|
||||
else:
|
||||
error_text = await response.text()
|
||||
logger.error(f"Google Search API error: {response.status} - {error_text}")
|
||||
raise Exception(f"Search API returned status {response.status}")
|
||||
|
||||
async def _process_search_results(
|
||||
self,
|
||||
raw_results: List[Dict[str, Any]],
|
||||
topic: str,
|
||||
industry: str
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process and rank search results by relevance and credibility.
|
||||
|
||||
Args:
|
||||
raw_results: Raw search results from Google API
|
||||
topic: The research topic for relevance scoring
|
||||
industry: The industry context for relevance scoring
|
||||
|
||||
Returns:
|
||||
Processed and ranked search results
|
||||
"""
|
||||
processed_results = []
|
||||
|
||||
for result in raw_results:
|
||||
try:
|
||||
# Extract basic information
|
||||
title = result.get("title", "")
|
||||
url = result.get("link", "")
|
||||
snippet = result.get("snippet", "")
|
||||
|
||||
# Calculate relevance score
|
||||
relevance_score = self._calculate_relevance_score(title, snippet, topic, industry)
|
||||
|
||||
# Calculate credibility score
|
||||
credibility_score = self._calculate_credibility_score(url, title)
|
||||
|
||||
# Extract publication date if available
|
||||
publication_date = self._extract_publication_date(result)
|
||||
|
||||
# Calculate domain authority
|
||||
domain_authority = self._calculate_domain_authority(url)
|
||||
|
||||
processed_result = {
|
||||
"title": title,
|
||||
"url": url,
|
||||
"content": snippet,
|
||||
"relevance_score": relevance_score,
|
||||
"credibility_score": credibility_score,
|
||||
"domain_authority": domain_authority,
|
||||
"publication_date": publication_date,
|
||||
"source_type": self._categorize_source(url, title),
|
||||
"raw_result": result
|
||||
}
|
||||
|
||||
processed_results.append(processed_result)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to process search result: {str(e)}")
|
||||
continue
|
||||
|
||||
# Sort by combined score (relevance + credibility)
|
||||
processed_results.sort(
|
||||
key=lambda x: (x["relevance_score"] + x["credibility_score"]) / 2,
|
||||
reverse=True
|
||||
)
|
||||
|
||||
return processed_results
|
||||
|
||||
def _calculate_relevance_score(self, title: str, snippet: str, topic: str, industry: str) -> float:
|
||||
"""
|
||||
Calculate relevance score based on topic and industry alignment.
|
||||
|
||||
Args:
|
||||
title: The title of the search result
|
||||
snippet: The snippet/description of the result
|
||||
topic: The research topic
|
||||
industry: The industry context
|
||||
|
||||
Returns:
|
||||
Relevance score between 0.0 and 1.0
|
||||
"""
|
||||
score = 0.0
|
||||
text = f"{title} {snippet}".lower()
|
||||
|
||||
# Topic relevance (40% of score)
|
||||
topic_words = topic.lower().split()
|
||||
topic_matches = sum(1 for word in topic_words if word in text)
|
||||
topic_score = min(topic_matches / len(topic_words), 1.0) * 0.4
|
||||
|
||||
# Industry relevance (30% of score)
|
||||
industry_words = industry.lower().split()
|
||||
industry_matches = sum(1 for word in industry_words if word in text)
|
||||
industry_score = min(industry_matches / len(industry_words), 1.0) * 0.3
|
||||
|
||||
# Content quality indicators (30% of score)
|
||||
quality_indicators = [
|
||||
"research", "study", "analysis", "report", "insights",
|
||||
"trends", "data", "statistics", "findings", "expert"
|
||||
]
|
||||
quality_matches = sum(1 for indicator in quality_indicators if indicator in text)
|
||||
quality_score = min(quality_matches / len(quality_indicators), 1.0) * 0.3
|
||||
|
||||
score = topic_score + industry_score + quality_score
|
||||
return round(score, 3)
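A worked example with made-up inputs, tracing the three weighted components above:

# title    = "Remote work trends report 2025"
# snippet  = "New data and insights on remote work adoption in the technology sector"
# topic    = "remote work"  -> 2/2 words matched          -> 0.4
# industry = "Technology"   -> 1/1 words matched          -> 0.3
# quality terms present: trends, report, data, insights   -> 4/10 * 0.3 -> 0.12
expected_relevance = round(0.4 + 0.3 + 0.12, 3)  # == 0.82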
|
||||
|
||||
def _calculate_credibility_score(self, url: str, title: str) -> float:
|
||||
"""
|
||||
Calculate credibility score based on URL and title analysis.
|
||||
|
||||
Args:
|
||||
url: The URL of the source
|
||||
title: The title of the content
|
||||
|
||||
Returns:
|
||||
Credibility score between 0.0 and 1.0
|
||||
"""
|
||||
score = 0.5 # Base score
|
||||
|
||||
# Domain credibility indicators
|
||||
credible_domains = [
|
||||
"harvard.edu", "stanford.edu", "mit.edu", "berkeley.edu", # Academic
|
||||
"forbes.com", "bloomberg.com", "reuters.com", "wsj.com", # Business
|
||||
"nature.com", "science.org", "ieee.org", "acm.org", # Scientific
|
||||
"linkedin.com", "medium.com", "substack.com" # Professional
|
||||
]
|
||||
|
||||
# Check if domain is in credible list
|
||||
domain = self._extract_domain(url)
|
||||
if any(credible_domain in domain for credible_domain in credible_domains):
|
||||
score += 0.3
|
||||
|
||||
# Title credibility indicators
|
||||
credible_indicators = [
|
||||
"research", "study", "analysis", "report", "insights",
|
||||
"expert", "professional", "industry", "trends"
|
||||
]
|
||||
|
||||
title_lower = title.lower()
|
||||
credible_matches = sum(1 for indicator in credible_indicators if indicator in title_lower)
|
||||
score += min(credible_matches * 0.1, 0.2)
|
||||
|
||||
return round(min(score, 1.0), 3)
|
||||
|
||||
def _calculate_domain_authority(self, url: str) -> float:
|
||||
"""
|
||||
Calculate domain authority based on URL analysis.
|
||||
|
||||
Args:
|
||||
url: The URL to analyze
|
||||
|
||||
Returns:
|
||||
Domain authority score between 0.0 and 1.0
|
||||
"""
|
||||
domain = self._extract_domain(url)
|
||||
|
||||
# High authority domains
|
||||
high_authority = [
|
||||
"harvard.edu", "stanford.edu", "mit.edu", "berkeley.edu",
|
||||
"forbes.com", "bloomberg.com", "reuters.com", "wsj.com",
|
||||
"nature.com", "science.org", "ieee.org", "acm.org"
|
||||
]
|
||||
|
||||
# Medium authority domains
|
||||
medium_authority = [
|
||||
"linkedin.com", "medium.com", "substack.com", "techcrunch.com",
|
||||
"venturebeat.com", "wired.com", "theverge.com"
|
||||
]
|
||||
|
||||
if any(auth_domain in domain for auth_domain in high_authority):
|
||||
return 0.9
|
||||
elif any(auth_domain in domain for auth_domain in medium_authority):
|
||||
return 0.7
|
||||
else:
|
||||
# Basic scoring for other domains
|
||||
return 0.5
|
||||
|
||||
def _extract_domain(self, url: str) -> str:
|
||||
"""Extract domain from URL."""
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(url)
|
||||
return parsed.netloc.lower()
|
||||
        except Exception:
|
||||
return url.lower()
|
||||
|
||||
def _extract_publication_date(self, result: Dict[str, Any]) -> Optional[str]:
|
||||
"""Extract publication date from search result if available."""
|
||||
# Check for various date fields
|
||||
date_fields = ["pagemap", "metatags", "date"]
|
||||
|
||||
for field in date_fields:
|
||||
if field in result:
|
||||
date_value = result[field]
|
||||
if isinstance(date_value, dict):
|
||||
# Look for common date keys
|
||||
for date_key in ["date", "pubdate", "article:published_time"]:
|
||||
if date_key in date_value:
|
||||
return date_value[date_key]
|
||||
elif isinstance(date_value, str):
|
||||
return date_value
|
||||
|
||||
return None
|
||||
|
||||
def _categorize_source(self, url: str, title: str) -> str:
|
||||
"""Categorize the source type based on URL and title."""
|
||||
domain = self._extract_domain(url)
|
||||
title_lower = title.lower()
|
||||
|
||||
# Academic sources
|
||||
if any(edu in domain for edu in [".edu", "harvard", "stanford", "mit"]):
|
||||
return "academic"
|
||||
|
||||
# Business/News sources
|
||||
if any(biz in domain for biz in ["forbes", "bloomberg", "reuters", "wsj"]):
|
||||
return "business_news"
|
||||
|
||||
# Professional platforms
|
||||
if any(prof in domain for prof in ["linkedin", "medium", "substack"]):
|
||||
return "professional_platform"
|
||||
|
||||
# Research/Scientific
|
||||
if any(research in domain for research in ["nature", "science", "ieee", "acm"]):
|
||||
return "research_scientific"
|
||||
|
||||
# Industry reports
|
||||
if any(report in title_lower for report in ["report", "study", "analysis", "research"]):
|
||||
return "industry_report"
|
||||
|
||||
return "general"
|
||||
|
||||
async def _extract_insights(
|
||||
self,
|
||||
sources: List[Dict[str, Any]],
|
||||
topic: str,
|
||||
industry: str
|
||||
) -> Dict[str, List[str]]:
|
||||
"""
|
||||
Extract key insights and statistics from search results.
|
||||
|
||||
Args:
|
||||
sources: Processed search results
|
||||
topic: The research topic
|
||||
industry: The industry context
|
||||
|
||||
Returns:
|
||||
Dictionary containing insights and statistics
|
||||
"""
|
||||
insights = []
|
||||
statistics = []
|
||||
|
||||
# Extract insights from top sources
|
||||
top_sources = sources[:5] # Top 5 most relevant sources
|
||||
|
||||
for source in top_sources:
|
||||
content = source.get("content", "")
|
||||
|
||||
# Look for insight patterns
|
||||
insight_patterns = [
|
||||
"shows", "indicates", "suggests", "reveals", "demonstrates",
|
||||
"highlights", "emphasizes", "points to", "suggests that"
|
||||
]
|
||||
|
||||
for pattern in insight_patterns:
|
||||
if pattern in content.lower():
|
||||
# Extract the sentence containing the insight
|
||||
sentences = content.split(". ")
|
||||
for sentence in sentences:
|
||||
if pattern in sentence.lower():
|
||||
insights.append(sentence.strip())
|
||||
break
|
||||
|
||||
# Look for statistical patterns
|
||||
stat_patterns = [
|
||||
r'\d+%', # Percentages
|
||||
r'\d+ percent', # Written percentages
|
||||
r'\$\d+', # Dollar amounts
|
||||
r'\d+ million', # Millions
|
||||
r'\d+ billion', # Billions
|
||||
r'\d+ out of \d+', # Ratios
|
||||
]
|
||||
|
||||
import re
|
||||
for pattern in stat_patterns:
|
||||
matches = re.findall(pattern, content, re.IGNORECASE)
|
||||
for match in matches:
|
||||
statistics.append(f"{match}")
|
||||
|
||||
# Limit the number of insights and statistics
|
||||
insights = insights[:10] # Top 10 insights
|
||||
statistics = statistics[:10] # Top 10 statistics
|
||||
|
||||
return {
|
||||
"insights": insights,
|
||||
"statistics": statistics
|
||||
}
|
||||
|
||||
|
||||
async def test_api_connection(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Test the Google Search API connection.
|
||||
|
||||
Returns:
|
||||
Test results and status information
|
||||
"""
|
||||
if not self.enabled:
|
||||
raise RuntimeError("Google Search Service is not enabled. Please configure API credentials.")
|
||||
|
||||
try:
|
||||
# Perform a simple test search
|
||||
test_query = "AI technology trends 2024"
|
||||
test_results = await self._perform_search(test_query, 1)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"message": "Google Search API connection successful",
|
||||
"enabled": True,
|
||||
"test_results_count": len(test_results),
|
||||
"api_key_configured": bool(self.api_key),
|
||||
"search_engine_configured": bool(self.search_engine_id)
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"status": "error",
|
||||
"message": f"Google Search API connection failed: {str(e)}",
|
||||
"enabled": False,
|
||||
"error": str(e)
|
||||
}
|
||||
23
backend/services/research/intent/__init__.py
Normal file
@@ -0,0 +1,23 @@
"""
Research Intent Package

This package provides intent-driven research capabilities:
- Intent inference from user input
- Targeted query generation
- Intent-aware result analysis

Author: ALwrity Team
Version: 1.0
"""

from .research_intent_inference import ResearchIntentInference
from .intent_query_generator import IntentQueryGenerator
from .intent_aware_analyzer import IntentAwareAnalyzer
from .intent_prompt_builder import IntentPromptBuilder

__all__ = [
    "ResearchIntentInference",
    "IntentQueryGenerator",
    "IntentAwareAnalyzer",
    "IntentPromptBuilder",
]
547
backend/services/research/intent/intent_aware_analyzer.py
Normal file
@@ -0,0 +1,547 @@
"""
Intent-Aware Result Analyzer

Analyzes research results based on user intent.
Extracts exactly what the user needs from raw research data.

This is the key innovation - instead of generic analysis,
we analyze results through the lens of what the user wants to accomplish.

Author: ALwrity Team
Version: 1.0
"""

import json
from typing import Dict, Any, List, Optional
from loguru import logger

from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
IntentDrivenResearchResult,
|
||||
ExpectedDeliverable,
|
||||
StatisticWithCitation,
|
||||
ExpertQuote,
|
||||
CaseStudySummary,
|
||||
TrendAnalysis,
|
||||
ComparisonTable,
|
||||
ComparisonItem,
|
||||
ProsCons,
|
||||
SourceWithRelevance,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .intent_prompt_builder import IntentPromptBuilder
|
||||
|
||||
|
||||
class IntentAwareAnalyzer:
|
||||
"""
|
||||
Analyzes research results based on user intent.
|
||||
|
||||
Instead of generic summaries, this extracts exactly what the user
|
||||
needs: statistics, quotes, case studies, trends, etc.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the analyzer."""
|
||||
self.prompt_builder = IntentPromptBuilder()
|
||||
logger.info("IntentAwareAnalyzer initialized")
|
||||
|
||||
async def analyze(
|
||||
self,
|
||||
raw_results: Dict[str, Any],
|
||||
intent: ResearchIntent,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
) -> IntentDrivenResearchResult:
|
||||
"""
|
||||
Analyze raw research results based on user intent.
|
||||
|
||||
Args:
|
||||
raw_results: Raw results from Exa/Tavily/Google
|
||||
intent: The user's research intent
|
||||
research_persona: Optional persona for context
|
||||
|
||||
Returns:
|
||||
IntentDrivenResearchResult with extracted deliverables
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Analyzing results for intent: {intent.primary_question[:50]}...")
|
||||
|
||||
# Format raw results for analysis
|
||||
formatted_results = self._format_raw_results(raw_results)
|
||||
|
||||
# Build the analysis prompt
|
||||
prompt = self.prompt_builder.build_intent_aware_analysis_prompt(
|
||||
raw_results=formatted_results,
|
||||
intent=intent,
|
||||
research_persona=research_persona,
|
||||
)
|
||||
|
||||
# Define the expected JSON schema
|
||||
analysis_schema = self._build_analysis_schema(intent.expected_deliverables)
|
||||
|
||||
# Call LLM for analysis
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
|
||||
result = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=analysis_schema,
|
||||
user_id=None
|
||||
)
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
logger.error(f"Intent-aware analysis failed: {result.get('error')}")
|
||||
return self._create_fallback_result(raw_results, intent)
|
||||
|
||||
# Parse and validate the result
|
||||
analyzed_result = self._parse_analysis_result(result, intent, raw_results)
|
||||
|
||||
logger.info(
|
||||
f"Analysis complete: {len(analyzed_result.key_takeaways)} takeaways, "
|
||||
f"{len(analyzed_result.statistics)} stats, "
|
||||
f"{len(analyzed_result.sources)} sources"
|
||||
)
|
||||
|
||||
return analyzed_result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in intent-aware analysis: {e}")
|
||||
return self._create_fallback_result(raw_results, intent)
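A minimal sketch of calling the analyzer; the ResearchIntent instance is assumed to come from ResearchIntentInference (whose call signature is defined elsewhere), and the import path assumes the backend package root is on sys.path.

import asyncio
from services.research.intent import IntentAwareAnalyzer

async def analyze_for_intent(raw_results: dict, intent, persona=None):
    # raw_results: dict from Exa/Tavily/Google; intent: a ResearchIntent instance.
    analyzer = IntentAwareAnalyzer()
    return await analyzer.analyze(raw_results, intent, research_persona=persona)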
|
||||
|
||||
def _format_raw_results(self, raw_results: Dict[str, Any]) -> str:
|
||||
"""Format raw research results for LLM analysis."""
|
||||
|
||||
formatted_parts = []
|
||||
|
||||
# Extract content
|
||||
content = raw_results.get("content", "")
|
||||
if content:
|
||||
formatted_parts.append(f"=== MAIN CONTENT ===\n{content[:8000]}")
|
||||
|
||||
# Extract sources with their content
|
||||
sources = raw_results.get("sources", [])
|
||||
if sources:
|
||||
formatted_parts.append("\n=== SOURCES ===")
|
||||
for i, source in enumerate(sources[:15], 1): # Limit to 15 sources
|
||||
title = source.get("title", "Untitled")
|
||||
url = source.get("url", "")
|
||||
excerpt = source.get("excerpt", source.get("text", source.get("content", "")))
|
||||
|
||||
formatted_parts.append(f"\nSource {i}: {title}")
|
||||
formatted_parts.append(f"URL: {url}")
|
||||
if excerpt:
|
||||
formatted_parts.append(f"Content: {excerpt[:500]}")
|
||||
|
||||
# Extract grounding metadata if available (from Google)
|
||||
grounding = raw_results.get("grounding_metadata", {})
|
||||
if grounding:
|
||||
formatted_parts.append("\n=== GROUNDING DATA ===")
|
||||
formatted_parts.append(json.dumps(grounding, indent=2)[:2000])
|
||||
|
||||
# Extract any AI answers (from Tavily)
|
||||
answer = raw_results.get("answer", "")
|
||||
if answer:
|
||||
formatted_parts.append(f"\n=== AI-GENERATED ANSWER ===\n{answer}")
|
||||
|
||||
return "\n".join(formatted_parts)
|
||||
|
||||
def _build_analysis_schema(self, expected_deliverables: List[str]) -> Dict[str, Any]:
|
||||
"""Build JSON schema based on expected deliverables."""
|
||||
|
||||
# Base schema
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"primary_answer": {"type": "string"},
|
||||
"secondary_answers": {
|
||||
"type": "object",
|
||||
"additionalProperties": {"type": "string"}
|
||||
},
|
||||
"executive_summary": {"type": "string"},
|
||||
"key_takeaways": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"maxItems": 7
|
||||
},
|
||||
"confidence": {"type": "number"},
|
||||
"gaps_identified": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"follow_up_queries": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
},
|
||||
"required": ["primary_answer", "executive_summary", "key_takeaways", "confidence"]
|
||||
}
|
||||
|
||||
# Add deliverable-specific properties
|
||||
if ExpectedDeliverable.KEY_STATISTICS.value in expected_deliverables:
|
||||
schema["properties"]["statistics"] = {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"statistic": {"type": "string"},
|
||||
"value": {"type": "string"},
|
||||
"context": {"type": "string"},
|
||||
"source": {"type": "string"},
|
||||
"url": {"type": "string"},
|
||||
"credibility": {"type": "number"},
|
||||
"recency": {"type": "string"}
|
||||
},
|
||||
"required": ["statistic", "context", "source", "url"]
|
||||
}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.EXPERT_QUOTES.value in expected_deliverables:
|
||||
schema["properties"]["expert_quotes"] = {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quote": {"type": "string"},
|
||||
"speaker": {"type": "string"},
|
||||
"title": {"type": "string"},
|
||||
"organization": {"type": "string"},
|
||||
"source": {"type": "string"},
|
||||
"url": {"type": "string"}
|
||||
},
|
||||
"required": ["quote", "speaker", "source", "url"]
|
||||
}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.CASE_STUDIES.value in expected_deliverables:
|
||||
schema["properties"]["case_studies"] = {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"organization": {"type": "string"},
|
||||
"challenge": {"type": "string"},
|
||||
"solution": {"type": "string"},
|
||||
"outcome": {"type": "string"},
|
||||
"key_metrics": {"type": "array", "items": {"type": "string"}},
|
||||
"source": {"type": "string"},
|
||||
"url": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "organization", "challenge", "solution", "outcome"]
|
||||
}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.TRENDS.value in expected_deliverables:
|
||||
schema["properties"]["trends"] = {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"trend": {"type": "string"},
|
||||
"direction": {"type": "string"},
|
||||
"evidence": {"type": "array", "items": {"type": "string"}},
|
||||
"impact": {"type": "string"},
|
||||
"timeline": {"type": "string"},
|
||||
"sources": {"type": "array", "items": {"type": "string"}}
|
||||
},
|
||||
"required": ["trend", "direction", "evidence"]
|
||||
}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.COMPARISONS.value in expected_deliverables:
|
||||
schema["properties"]["comparisons"] = {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"criteria": {"type": "array", "items": {"type": "string"}},
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"pros": {"type": "array", "items": {"type": "string"}},
|
||||
"cons": {"type": "array", "items": {"type": "string"}},
|
||||
"features": {"type": "object"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"verdict": {"type": "string"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.PROS_CONS.value in expected_deliverables:
|
||||
schema["properties"]["pros_cons"] = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"subject": {"type": "string"},
|
||||
"pros": {"type": "array", "items": {"type": "string"}},
|
||||
"cons": {"type": "array", "items": {"type": "string"}},
|
||||
"balanced_verdict": {"type": "string"}
|
||||
}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.BEST_PRACTICES.value in expected_deliverables:
|
||||
schema["properties"]["best_practices"] = {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.STEP_BY_STEP.value in expected_deliverables:
|
||||
schema["properties"]["step_by_step"] = {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.DEFINITIONS.value in expected_deliverables:
|
||||
schema["properties"]["definitions"] = {
|
||||
"type": "object",
|
||||
"additionalProperties": {"type": "string"}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.EXAMPLES.value in expected_deliverables:
|
||||
schema["properties"]["examples"] = {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
|
||||
if ExpectedDeliverable.PREDICTIONS.value in expected_deliverables:
|
||||
schema["properties"]["predictions"] = {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
|
||||
# Always include sources and suggested outline
|
||||
schema["properties"]["sources"] = {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"url": {"type": "string"},
|
||||
"relevance_score": {"type": "number"},
|
||||
"relevance_reason": {"type": "string"},
|
||||
"content_type": {"type": "string"},
|
||||
"credibility_score": {"type": "number"}
|
||||
},
|
||||
"required": ["title", "url"]
|
||||
}
|
||||
}
|
||||
|
||||
schema["properties"]["suggested_outline"] = {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
|
||||
return schema
|
||||
|
||||
def _parse_analysis_result(
|
||||
self,
|
||||
result: Dict[str, Any],
|
||||
intent: ResearchIntent,
|
||||
raw_results: Dict[str, Any],
|
||||
) -> IntentDrivenResearchResult:
|
||||
"""Parse LLM analysis result into structured format."""
|
||||
|
||||
# Parse statistics
|
||||
statistics = []
|
||||
for stat in result.get("statistics", []):
|
||||
try:
|
||||
statistics.append(StatisticWithCitation(
|
||||
statistic=stat.get("statistic", ""),
|
||||
value=stat.get("value"),
|
||||
context=stat.get("context", ""),
|
||||
source=stat.get("source", ""),
|
||||
url=stat.get("url", ""),
|
||||
credibility=float(stat.get("credibility", 0.8)),
|
||||
recency=stat.get("recency"),
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse statistic: {e}")
|
||||
|
||||
# Parse expert quotes
|
||||
expert_quotes = []
|
||||
for quote in result.get("expert_quotes", []):
|
||||
try:
|
||||
expert_quotes.append(ExpertQuote(
|
||||
quote=quote.get("quote", ""),
|
||||
speaker=quote.get("speaker", ""),
|
||||
title=quote.get("title"),
|
||||
organization=quote.get("organization"),
|
||||
context=quote.get("context"),
|
||||
source=quote.get("source", ""),
|
||||
url=quote.get("url", ""),
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse expert quote: {e}")
|
||||
|
||||
# Parse case studies
|
||||
case_studies = []
|
||||
for cs in result.get("case_studies", []):
|
||||
try:
|
||||
case_studies.append(CaseStudySummary(
|
||||
title=cs.get("title", ""),
|
||||
organization=cs.get("organization", ""),
|
||||
challenge=cs.get("challenge", ""),
|
||||
solution=cs.get("solution", ""),
|
||||
outcome=cs.get("outcome", ""),
|
||||
key_metrics=cs.get("key_metrics", []),
|
||||
source=cs.get("source", ""),
|
||||
url=cs.get("url", ""),
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse case study: {e}")
|
||||
|
||||
# Parse trends
|
||||
trends = []
|
||||
for trend in result.get("trends", []):
|
||||
try:
|
||||
trends.append(TrendAnalysis(
|
||||
trend=trend.get("trend", ""),
|
||||
direction=trend.get("direction", "growing"),
|
||||
evidence=trend.get("evidence", []),
|
||||
impact=trend.get("impact"),
|
||||
timeline=trend.get("timeline"),
|
||||
sources=trend.get("sources", []),
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse trend: {e}")
|
||||
|
||||
# Parse comparisons
|
||||
comparisons = []
|
||||
for comp in result.get("comparisons", []):
|
||||
try:
|
||||
items = []
|
||||
for item in comp.get("items", []):
|
||||
items.append(ComparisonItem(
|
||||
name=item.get("name", ""),
|
||||
description=item.get("description"),
|
||||
pros=item.get("pros", []),
|
||||
cons=item.get("cons", []),
|
||||
features=item.get("features", {}),
|
||||
rating=item.get("rating"),
|
||||
source=item.get("source"),
|
||||
))
|
||||
comparisons.append(ComparisonTable(
|
||||
title=comp.get("title", ""),
|
||||
criteria=comp.get("criteria", []),
|
||||
items=items,
|
||||
winner=comp.get("winner"),
|
||||
verdict=comp.get("verdict"),
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse comparison: {e}")
|
||||
|
||||
# Parse pros/cons
|
||||
pros_cons = None
|
||||
pc_data = result.get("pros_cons")
|
||||
if pc_data:
|
||||
try:
|
||||
pros_cons = ProsCons(
|
||||
subject=pc_data.get("subject", intent.original_input),
|
||||
pros=pc_data.get("pros", []),
|
||||
cons=pc_data.get("cons", []),
|
||||
balanced_verdict=pc_data.get("balanced_verdict", ""),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse pros/cons: {e}")
|
||||
|
||||
# Parse sources
|
||||
sources = []
|
||||
for src in result.get("sources", []):
|
||||
try:
|
||||
sources.append(SourceWithRelevance(
|
||||
title=src.get("title", ""),
|
||||
url=src.get("url", ""),
|
||||
excerpt=src.get("excerpt"),
|
||||
relevance_score=float(src.get("relevance_score", 0.8)),
|
||||
relevance_reason=src.get("relevance_reason"),
|
||||
content_type=src.get("content_type"),
|
||||
published_date=src.get("published_date"),
|
||||
credibility_score=float(src.get("credibility_score", 0.8)),
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse source: {e}")
|
||||
|
||||
# If no sources from analysis, extract from raw results
|
||||
if not sources:
|
||||
sources = self._extract_sources_from_raw(raw_results)
|
||||
|
||||
return IntentDrivenResearchResult(
|
||||
success=True,
|
||||
primary_answer=result.get("primary_answer", ""),
|
||||
secondary_answers=result.get("secondary_answers", {}),
|
||||
statistics=statistics,
|
||||
expert_quotes=expert_quotes,
|
||||
case_studies=case_studies,
|
||||
comparisons=comparisons,
|
||||
trends=trends,
|
||||
best_practices=result.get("best_practices", []),
|
||||
step_by_step=result.get("step_by_step", []),
|
||||
pros_cons=pros_cons,
|
||||
definitions=result.get("definitions", {}),
|
||||
examples=result.get("examples", []),
|
||||
predictions=result.get("predictions", []),
|
||||
executive_summary=result.get("executive_summary", ""),
|
||||
key_takeaways=result.get("key_takeaways", []),
|
||||
suggested_outline=result.get("suggested_outline", []),
|
||||
sources=sources,
|
||||
raw_content=self._format_raw_results(raw_results)[:5000],
|
||||
confidence=float(result.get("confidence", 0.7)),
|
||||
gaps_identified=result.get("gaps_identified", []),
|
||||
follow_up_queries=result.get("follow_up_queries", []),
|
||||
original_intent=intent,
|
||||
)
|
||||
|
||||
def _extract_sources_from_raw(self, raw_results: Dict[str, Any]) -> List[SourceWithRelevance]:
|
||||
"""Extract sources from raw results when analysis doesn't provide them."""
|
||||
|
||||
sources = []
|
||||
for src in raw_results.get("sources", [])[:10]:
|
||||
try:
|
||||
sources.append(SourceWithRelevance(
|
||||
title=src.get("title", "Untitled"),
|
||||
url=src.get("url", ""),
|
||||
excerpt=src.get("excerpt", src.get("text", ""))[:200],
|
||||
relevance_score=0.8,
|
||||
credibility_score=float(src.get("credibility_score", 0.8)),
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract source: {e}")
|
||||
|
||||
return sources
|
||||
|
||||
def _create_fallback_result(
|
||||
self,
|
||||
raw_results: Dict[str, Any],
|
||||
intent: ResearchIntent,
|
||||
) -> IntentDrivenResearchResult:
|
||||
"""Create a fallback result when AI analysis fails."""
|
||||
|
||||
# Extract basic information from raw results
|
||||
content = raw_results.get("content", "")
|
||||
sources = self._extract_sources_from_raw(raw_results)
|
||||
|
||||
# Create basic takeaways from content
|
||||
key_takeaways = []
|
||||
if content:
|
||||
sentences = content.split(". ")[:5]
|
||||
key_takeaways = [s.strip() + "." for s in sentences if len(s) > 20]
|
||||
|
||||
return IntentDrivenResearchResult(
|
||||
success=True,
|
||||
primary_answer=f"Research findings for: {intent.primary_question}",
|
||||
secondary_answers={},
|
||||
executive_summary=content[:300] if content else "Research completed",
|
||||
key_takeaways=key_takeaways,
|
||||
sources=sources,
|
||||
raw_content=self._format_raw_results(raw_results)[:5000],
|
||||
confidence=0.5,
|
||||
gaps_identified=[
|
||||
"AI analysis failed - showing raw results",
|
||||
"Manual review recommended"
|
||||
],
|
||||
follow_up_queries=[],
|
||||
original_intent=intent,
|
||||
)
|
||||
627
backend/services/research/intent/intent_prompt_builder.py
Normal file
627
backend/services/research/intent/intent_prompt_builder.py
Normal file
@@ -0,0 +1,627 @@
|
||||
"""
|
||||
Intent Prompt Builder
|
||||
|
||||
Builds comprehensive AI prompts for:
|
||||
1. Intent inference from user input
|
||||
2. Targeted query generation
|
||||
3. Intent-aware result analysis
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchPurpose,
|
||||
ContentOutput,
|
||||
ExpectedDeliverable,
|
||||
ResearchDepthLevel,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
|
||||
|
||||
class IntentPromptBuilder:
|
||||
"""Builds prompts for intent-driven research."""
|
||||
|
||||
# Purpose explanations for the AI
|
||||
PURPOSE_EXPLANATIONS = {
|
||||
ResearchPurpose.LEARN: "User wants to understand a topic for personal knowledge",
|
||||
ResearchPurpose.CREATE_CONTENT: "User will create content (blog, video, podcast) from this research",
|
||||
ResearchPurpose.MAKE_DECISION: "User needs to make a choice/decision based on research",
|
||||
ResearchPurpose.COMPARE: "User wants to compare alternatives or competitors",
|
||||
ResearchPurpose.SOLVE_PROBLEM: "User is looking for a solution to a specific problem",
|
||||
ResearchPurpose.FIND_DATA: "User needs specific statistics, facts, or citations",
|
||||
ResearchPurpose.EXPLORE_TRENDS: "User wants to understand current/future trends",
|
||||
ResearchPurpose.VALIDATE: "User wants to verify or fact-check information",
|
||||
ResearchPurpose.GENERATE_IDEAS: "User wants to brainstorm content ideas",
|
||||
}
|
||||
|
||||
# Deliverable descriptions
|
||||
DELIVERABLE_DESCRIPTIONS = {
|
||||
ExpectedDeliverable.KEY_STATISTICS: "Numbers, percentages, data points with citations",
|
||||
ExpectedDeliverable.EXPERT_QUOTES: "Authoritative quotes from industry experts",
|
||||
ExpectedDeliverable.CASE_STUDIES: "Real examples and success stories",
|
||||
ExpectedDeliverable.COMPARISONS: "Side-by-side analysis tables",
|
||||
ExpectedDeliverable.TRENDS: "Current and emerging industry trends",
|
||||
ExpectedDeliverable.BEST_PRACTICES: "Recommended approaches and guidelines",
|
||||
ExpectedDeliverable.STEP_BY_STEP: "Process guides and how-to instructions",
|
||||
ExpectedDeliverable.PROS_CONS: "Advantages and disadvantages analysis",
|
||||
ExpectedDeliverable.DEFINITIONS: "Clear explanations of concepts and terms",
|
||||
ExpectedDeliverable.CITATIONS: "Authoritative sources for reference",
|
||||
ExpectedDeliverable.EXAMPLES: "Concrete examples to illustrate points",
|
||||
ExpectedDeliverable.PREDICTIONS: "Future outlook and predictions",
|
||||
}
|
||||
|
||||
def build_intent_inference_prompt(
|
||||
self,
|
||||
user_input: str,
|
||||
keywords: List[str],
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
competitor_data: Optional[List[Dict]] = None,
|
||||
industry: Optional[str] = None,
|
||||
target_audience: Optional[str] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Build prompt for inferring user's research intent.
|
||||
|
||||
This prompt analyzes the user's input and determines:
|
||||
- What they want to accomplish
|
||||
- What questions they need answered
|
||||
- What specific deliverables they need
|
||||
"""
|
||||
|
||||
# Build persona context
|
||||
persona_context = self._build_persona_context(research_persona, industry, target_audience)
|
||||
|
||||
# Build competitor context
|
||||
competitor_context = self._build_competitor_context(competitor_data)
|
||||
|
||||
prompt = f"""You are an expert research intent analyzer. Your job is to understand what a content creator REALLY needs from their research.
|
||||
|
||||
## USER INPUT
|
||||
"{user_input}"
|
||||
|
||||
{f"KEYWORDS: {', '.join(keywords)}" if keywords else ""}
|
||||
|
||||
## USER CONTEXT
|
||||
{persona_context}
|
||||
|
||||
{competitor_context}
|
||||
|
||||
## YOUR TASK
|
||||
|
||||
Analyze the user's input and infer their research intent. Determine:
|
||||
|
||||
1. **INPUT TYPE**: Is this:
|
||||
- "keywords": Simple topic keywords (e.g., "AI healthcare 2025")
|
||||
- "question": A specific question (e.g., "What are the best AI tools for healthcare?")
|
||||
- "goal": A goal statement (e.g., "I need to write a blog about AI in healthcare")
|
||||
- "mixed": Combination of above
|
||||
|
||||
2. **PRIMARY QUESTION**: What is the main question to answer? Convert their input into a clear question.
|
||||
|
||||
3. **SECONDARY QUESTIONS**: What related questions should also be answered? (3-5 questions)
|
||||
|
||||
4. **PURPOSE**: Why are they researching? Choose ONE:
|
||||
- "learn": Understand a topic for personal knowledge
|
||||
- "create_content": Create content (blog, video, podcast)
|
||||
- "make_decision": Make a choice between options
|
||||
- "compare": Compare alternatives/competitors
|
||||
- "solve_problem": Find a solution
|
||||
- "find_data": Get specific statistics/facts
|
||||
- "explore_trends": Understand industry trends
|
||||
- "validate": Verify claims/information
|
||||
- "generate_ideas": Brainstorm ideas
|
||||
|
||||
5. **CONTENT OUTPUT**: What will they create? Choose ONE:
|
||||
- "blog", "podcast", "video", "social_post", "newsletter", "presentation", "report", "whitepaper", "email", "general"
|
||||
|
||||
6. **EXPECTED DELIVERABLES**: What specific outputs do they need? Choose ALL that apply:
|
||||
- "key_statistics": Numbers, data points
|
||||
- "expert_quotes": Authoritative quotes
|
||||
- "case_studies": Real examples
|
||||
- "comparisons": Side-by-side analysis
|
||||
- "trends": Industry trends
|
||||
- "best_practices": Recommendations
|
||||
- "step_by_step": How-to guides
|
||||
- "pros_cons": Advantages/disadvantages
|
||||
- "definitions": Concept explanations
|
||||
- "citations": Source references
|
||||
- "examples": Concrete examples
|
||||
- "predictions": Future outlook
|
||||
|
||||
7. **DEPTH**: How deep should the research go?
|
||||
- "overview": Quick summary
|
||||
- "detailed": In-depth analysis
|
||||
- "expert": Comprehensive expert-level
|
||||
|
||||
8. **FOCUS AREAS**: What specific aspects should be researched? (2-4 areas)
|
||||
|
||||
9. **PERSPECTIVE**: From whose viewpoint? (e.g., "marketing manager", "small business owner")
|
||||
|
||||
10. **TIME SENSITIVITY**: Is recency important?
|
||||
- "real_time": Latest only (past 24-48 hours)
|
||||
- "recent": Past week/month
|
||||
- "historical": Include older content
|
||||
- "evergreen": Timeless content
|
||||
|
||||
11. **CONFIDENCE**: How confident are you in this inference? (0.0-1.0)
|
||||
- If < 0.7, set needs_clarification to true and provide clarifying_questions
|
||||
|
||||
## OUTPUT FORMAT
|
||||
|
||||
Return a JSON object:
|
||||
```json
|
||||
{{
|
||||
"input_type": "keywords|question|goal|mixed",
|
||||
"primary_question": "The main question to answer",
|
||||
"secondary_questions": ["question 1", "question 2", "question 3"],
|
||||
"purpose": "one of the purpose options",
|
||||
"content_output": "one of the content options",
|
||||
"expected_deliverables": ["deliverable1", "deliverable2"],
|
||||
"depth": "overview|detailed|expert",
|
||||
"focus_areas": ["area1", "area2"],
|
||||
"perspective": "target perspective or null",
|
||||
"time_sensitivity": "real_time|recent|historical|evergreen",
|
||||
"confidence": 0.85,
|
||||
"needs_clarification": false,
|
||||
"clarifying_questions": [],
|
||||
"analysis_summary": "Brief summary of what the user wants"
|
||||
}}
|
||||
```
|
||||
|
||||
## IMPORTANT RULES
|
||||
|
||||
1. Always convert vague input into a specific primary question
|
||||
2. Infer deliverables based on purpose (e.g., create_content → statistics + examples)
|
||||
3. Use persona context to refine perspective and focus areas
|
||||
4. If input is ambiguous, provide clarifying questions
|
||||
5. Default to "detailed" depth unless input suggests otherwise
|
||||
6. For content creation, include relevant deliverables automatically
|
||||
"""
|
||||
|
||||
return prompt
|
||||
|
||||
def build_query_generation_prompt(
|
||||
self,
|
||||
intent: ResearchIntent,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Build prompt for generating targeted research queries.
|
||||
|
||||
Generates multiple queries, each targeting a specific deliverable.
|
||||
"""
|
||||
|
||||
deliverables_list = "\n".join([
|
||||
f"- {d}: {self.DELIVERABLE_DESCRIPTIONS.get(ExpectedDeliverable(d), d)}"
|
||||
for d in intent.expected_deliverables
|
||||
])
|
||||
|
||||
persona_keywords = ""
|
||||
if research_persona and research_persona.suggested_keywords:
|
||||
persona_keywords = f"\nSUGGESTED KEYWORDS FROM PERSONA: {', '.join(research_persona.suggested_keywords[:10])}"
|
||||
|
||||
prompt = f"""You are a research query optimizer. Generate multiple targeted search queries based on the user's research intent.
|
||||
|
||||
## RESEARCH INTENT
|
||||
|
||||
PRIMARY QUESTION: {intent.primary_question}
|
||||
|
||||
SECONDARY QUESTIONS:
|
||||
{chr(10).join(f'- {q}' for q in intent.secondary_questions) if intent.secondary_questions else 'None'}
|
||||
|
||||
PURPOSE: {intent.purpose} - {self.PURPOSE_EXPLANATIONS.get(ResearchPurpose(intent.purpose), intent.purpose)}
|
||||
|
||||
CONTENT OUTPUT: {intent.content_output}
|
||||
|
||||
EXPECTED DELIVERABLES:
|
||||
{deliverables_list}
|
||||
|
||||
DEPTH: {intent.depth}
|
||||
|
||||
FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'}
|
||||
|
||||
PERSPECTIVE: {intent.perspective or 'General audience'}
|
||||
|
||||
TIME SENSITIVITY: {intent.time_sensitivity or 'No specific requirement'}
|
||||
{persona_keywords}
|
||||
|
||||
## YOUR TASK
|
||||
|
||||
Generate 4-8 targeted research queries. Each query should:
|
||||
1. Target a specific deliverable or question
|
||||
2. Be optimized for semantic search (Exa/Tavily)
|
||||
3. Include relevant context for better results
|
||||
|
||||
For each query, specify:
|
||||
- The query string
|
||||
- What deliverable it targets
|
||||
- Best provider (exa for semantic/deep, tavily for news/real-time, google for factual)
|
||||
- Priority (1-5, higher = more important)
|
||||
- What we expect to find
|
||||
|
||||
## OUTPUT FORMAT
|
||||
|
||||
Return a JSON object:
|
||||
```json
|
||||
{{
|
||||
"queries": [
|
||||
{{
|
||||
"query": "Healthcare AI adoption statistics 2025 hospitals implementation data",
|
||||
"purpose": "key_statistics",
|
||||
"provider": "exa",
|
||||
"priority": 5,
|
||||
"expected_results": "Statistics on hospital AI adoption rates"
|
||||
}},
|
||||
{{
|
||||
"query": "AI healthcare trends predictions future outlook 2025 2026",
|
||||
"purpose": "trends",
|
||||
"provider": "tavily",
|
||||
"priority": 4,
|
||||
"expected_results": "Current trends and future predictions in healthcare AI"
|
||||
}}
|
||||
],
|
||||
"enhanced_keywords": ["keyword1", "keyword2", "keyword3"],
|
||||
"research_angles": [
|
||||
"Angle 1: Focus on adoption challenges",
|
||||
"Angle 2: Focus on ROI and outcomes"
|
||||
]
|
||||
}}
|
||||
```
|
||||
|
||||
## QUERY OPTIMIZATION RULES
|
||||
|
||||
1. For STATISTICS: Include words like "statistics", "data", "percentage", "report", "study"
|
||||
2. For CASE STUDIES: Include "case study", "success story", "implementation", "example"
|
||||
3. For TRENDS: Include "trends", "future", "predictions", "emerging", year numbers
|
||||
4. For EXPERT QUOTES: Include expert names if known, or "expert opinion", "interview"
|
||||
5. For COMPARISONS: Include "vs", "compare", "comparison", "alternative"
|
||||
6. For NEWS/REAL-TIME: Use Tavily, include recent year/month
|
||||
7. For ACADEMIC/DEEP: Use Exa with neural search
|
||||
"""
|
||||
|
||||
return prompt
|
||||
|
||||
def build_intent_aware_analysis_prompt(
|
||||
self,
|
||||
raw_results: str,
|
||||
intent: ResearchIntent,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Build prompt for analyzing research results based on user intent.
|
||||
|
||||
This is the key prompt that extracts exactly what the user needs.
|
||||
"""
|
||||
|
||||
purpose_explanation = self.PURPOSE_EXPLANATIONS.get(
|
||||
ResearchPurpose(intent.purpose),
|
||||
intent.purpose
|
||||
)
|
||||
|
||||
deliverables_instructions = self._build_deliverables_instructions(intent.expected_deliverables)
|
||||
|
||||
perspective_instruction = ""
|
||||
if intent.perspective:
|
||||
perspective_instruction = f"\n**PERSPECTIVE**: Analyze results from the viewpoint of: {intent.perspective}"
|
||||
|
||||
prompt = f"""You are a research analyst helping a content creator find exactly what they need. Your job is to analyze raw research results and extract precisely what the user is looking for.
|
||||
|
||||
## USER'S RESEARCH INTENT
|
||||
|
||||
PRIMARY QUESTION: {intent.primary_question}
|
||||
|
||||
SECONDARY QUESTIONS:
|
||||
{chr(10).join(f'- {q}' for q in intent.secondary_questions) if intent.secondary_questions else 'None specified'}
|
||||
|
||||
PURPOSE: {intent.purpose}
|
||||
→ {purpose_explanation}
|
||||
|
||||
CONTENT OUTPUT: {intent.content_output}
|
||||
|
||||
EXPECTED DELIVERABLES: {', '.join(intent.expected_deliverables)}
|
||||
|
||||
FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'}
|
||||
{perspective_instruction}
|
||||
|
||||
## RAW RESEARCH RESULTS
|
||||
|
||||
{raw_results[:15000]} # Truncated for token limits
|
||||
|
||||
## YOUR TASK
|
||||
|
||||
Analyze the raw research results and extract EXACTLY what the user needs.
|
||||
|
||||
{deliverables_instructions}
|
||||
|
||||
## OUTPUT REQUIREMENTS
|
||||
|
||||
Provide results in this JSON structure:
|
||||
|
||||
```json
|
||||
{{
|
||||
"primary_answer": "Direct 2-3 sentence answer to the primary question",
|
||||
"secondary_answers": {{
|
||||
"Question 1?": "Answer to question 1",
|
||||
"Question 2?": "Answer to question 2"
|
||||
}},
|
||||
"executive_summary": "2-3 sentence executive summary of all findings",
|
||||
"key_takeaways": [
|
||||
"Key takeaway 1 - most important finding",
|
||||
"Key takeaway 2",
|
||||
"Key takeaway 3",
|
||||
"Key takeaway 4",
|
||||
"Key takeaway 5"
|
||||
],
|
||||
"statistics": [
|
||||
{{
|
||||
"statistic": "72% of hospitals plan to adopt AI by 2025",
|
||||
"value": "72%",
|
||||
"context": "Survey of 500 US hospitals in 2024",
|
||||
"source": "Healthcare AI Report 2024",
|
||||
"url": "https://example.com/report",
|
||||
"credibility": 0.9,
|
||||
"recency": "2024"
|
||||
}}
|
||||
],
|
||||
"expert_quotes": [
|
||||
{{
|
||||
"quote": "AI will revolutionize patient care within 5 years",
|
||||
"speaker": "Dr. Jane Smith",
|
||||
"title": "Chief Medical Officer",
|
||||
"organization": "HealthTech Inc",
|
||||
"source": "TechCrunch",
|
||||
"url": "https://example.com/article"
|
||||
}}
|
||||
],
|
||||
"case_studies": [
|
||||
{{
|
||||
"title": "Mayo Clinic AI Implementation",
|
||||
"organization": "Mayo Clinic",
|
||||
"challenge": "High patient wait times",
|
||||
"solution": "AI-powered triage system",
|
||||
"outcome": "40% reduction in wait times",
|
||||
"key_metrics": ["40% faster triage", "95% patient satisfaction"],
|
||||
"source": "Healthcare IT News",
|
||||
"url": "https://example.com"
|
||||
}}
|
||||
],
|
||||
"trends": [
|
||||
{{
|
||||
"trend": "AI-assisted diagnostics adoption",
|
||||
"direction": "growing",
|
||||
"evidence": ["25% YoY growth", "Major hospital chains investing"],
|
||||
"impact": "Could reduce misdiagnosis by 30%",
|
||||
"timeline": "Expected mainstream by 2027",
|
||||
"sources": ["url1", "url2"]
|
||||
}}
|
||||
],
|
||||
"comparisons": [
|
||||
{{
|
||||
"title": "Top AI Healthcare Platforms",
|
||||
"criteria": ["Cost", "Features", "Support"],
|
||||
"items": [
|
||||
{{
|
||||
"name": "Platform A",
|
||||
"pros": ["Easy integration", "Good support"],
|
||||
"cons": ["Higher cost"],
|
||||
"features": {{"Cost": "$500/month", "Support": "24/7"}}
|
||||
}}
|
||||
],
|
||||
"verdict": "Platform A best for large hospitals"
|
||||
}}
|
||||
],
|
||||
"best_practices": [
|
||||
"Start with a pilot program before full deployment",
|
||||
"Ensure staff training is comprehensive"
|
||||
],
|
||||
"step_by_step": [
|
||||
"Step 1: Assess current infrastructure",
|
||||
"Step 2: Define use cases",
|
||||
"Step 3: Select vendor"
|
||||
],
|
||||
"pros_cons": {{
|
||||
"subject": "AI in Healthcare",
|
||||
"pros": ["Improved accuracy", "Cost savings"],
|
||||
"cons": ["Initial investment", "Training required"],
|
||||
"balanced_verdict": "Benefits outweigh costs for most hospitals"
|
||||
}},
|
||||
"definitions": {{
|
||||
"Clinical AI": "AI systems designed for medical diagnosis and treatment recommendations"
|
||||
}},
|
||||
"examples": [
|
||||
"Example: Hospital X reduced readmissions by 25% using predictive AI"
|
||||
],
|
||||
"predictions": [
|
||||
"By 2030, AI will assist in 80% of initial diagnoses"
|
||||
],
|
||||
"suggested_outline": [
|
||||
"1. Introduction: The AI Healthcare Revolution",
|
||||
"2. Current State: Where We Are Today",
|
||||
"3. Key Statistics and Trends",
|
||||
"4. Case Studies: Success Stories",
|
||||
"5. Implementation Guide",
|
||||
"6. Future Outlook"
|
||||
],
|
||||
"sources": [
|
||||
{{
|
||||
"title": "Healthcare AI Report 2024",
|
||||
"url": "https://example.com",
|
||||
"relevance_score": 0.95,
|
||||
"relevance_reason": "Directly addresses adoption statistics",
|
||||
"content_type": "research report",
|
||||
"credibility_score": 0.9
|
||||
}}
|
||||
],
|
||||
"confidence": 0.85,
|
||||
"gaps_identified": [
|
||||
"Specific cost data for small clinics not found",
|
||||
"Limited information on regulatory challenges"
|
||||
],
|
||||
"follow_up_queries": [
|
||||
"AI healthcare regulations FDA 2025",
|
||||
"Small clinic AI implementation costs"
|
||||
]
|
||||
}}
|
||||
```
|
||||
|
||||
## CRITICAL RULES
|
||||
|
||||
1. **ONLY include information directly from the raw results** - do not make up data
|
||||
2. **ALWAYS include source URLs** for every statistic, quote, and case study
|
||||
3. **If a deliverable type has no relevant data**, return an empty array for it
|
||||
4. **Prioritize recency and credibility** when multiple sources conflict
|
||||
5. **Answer the PRIMARY QUESTION directly** in 2-3 clear sentences
|
||||
6. **Keep KEY TAKEAWAYS to 5-7 points** - the most important findings
|
||||
7. **Add to gaps_identified** if expected information is missing
|
||||
8. **Suggest follow_up_queries** for gaps or incomplete areas
|
||||
9. **Rate confidence** based on how well results match the user's intent
|
||||
10. **Include deliverables ONLY if they are in expected_deliverables** or critical to the question
|
||||
"""
|
||||
|
||||
return prompt
|
||||
|
||||
def _build_persona_context(
|
||||
self,
|
||||
research_persona: Optional[ResearchPersona],
|
||||
industry: Optional[str],
|
||||
target_audience: Optional[str],
|
||||
) -> str:
|
||||
"""Build persona context section for prompts."""
|
||||
|
||||
if not research_persona and not industry:
|
||||
return "No specific persona context available."
|
||||
|
||||
context_parts = []
|
||||
|
||||
if research_persona:
|
||||
context_parts.append(f"INDUSTRY: {research_persona.default_industry}")
|
||||
context_parts.append(f"TARGET AUDIENCE: {research_persona.default_target_audience}")
|
||||
if research_persona.suggested_keywords:
|
||||
context_parts.append(f"TYPICAL TOPICS: {', '.join(research_persona.suggested_keywords[:5])}")
|
||||
if research_persona.research_angles:
|
||||
context_parts.append(f"RESEARCH ANGLES: {', '.join(research_persona.research_angles[:3])}")
|
||||
else:
|
||||
if industry:
|
||||
context_parts.append(f"INDUSTRY: {industry}")
|
||||
if target_audience:
|
||||
context_parts.append(f"TARGET AUDIENCE: {target_audience}")
|
||||
|
||||
return "\n".join(context_parts)
|
||||
|
||||
def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
|
||||
"""Build competitor context section for prompts."""
|
||||
|
||||
if not competitor_data:
|
||||
return ""
|
||||
|
||||
competitor_names = []
|
||||
for comp in competitor_data[:5]: # Limit to 5
|
||||
name = comp.get("name") or comp.get("domain") or comp.get("url", "Unknown")
|
||||
competitor_names.append(name)
|
||||
|
||||
if competitor_names:
|
||||
return f"\nKNOWN COMPETITORS: {', '.join(competitor_names)}"
|
||||
|
||||
return ""
|
||||
|
||||
def _build_deliverables_instructions(self, expected_deliverables: List[str]) -> str:
|
||||
"""Build specific extraction instructions for each expected deliverable."""
|
||||
|
||||
instructions = ["### EXTRACTION INSTRUCTIONS\n"]
|
||||
instructions.append("For each requested deliverable, extract the following:\n")
|
||||
|
||||
deliverable_instructions = {
|
||||
ExpectedDeliverable.KEY_STATISTICS: """
|
||||
**STATISTICS**:
|
||||
- Extract ALL relevant statistics with exact numbers
|
||||
- Include source attribution (publication name, URL)
|
||||
- Note the recency of the data
|
||||
- Rate credibility based on source authority
|
||||
- Format: statistic statement, value, context, source, URL, credibility score
|
||||
""",
|
||||
ExpectedDeliverable.EXPERT_QUOTES: """
|
||||
**EXPERT QUOTES**:
|
||||
- Extract authoritative quotes from named experts
|
||||
- Include speaker name, title, and organization
|
||||
- Provide context for the quote
|
||||
- Include source URL
|
||||
""",
|
||||
ExpectedDeliverable.CASE_STUDIES: """
|
||||
**CASE STUDIES**:
|
||||
- Summarize each case study: challenge → solution → outcome
|
||||
- Include key metrics and results
|
||||
- Name the organization involved
|
||||
- Provide source URL
|
||||
""",
|
||||
ExpectedDeliverable.TRENDS: """
|
||||
**TRENDS**:
|
||||
- Identify current and emerging trends
|
||||
- Note direction: growing, declining, emerging, or stable
|
||||
- List supporting evidence
|
||||
- Include timeline predictions if available
|
||||
- Cite sources
|
||||
""",
|
||||
ExpectedDeliverable.COMPARISONS: """
|
||||
**COMPARISONS**:
|
||||
- Build comparison tables where applicable
|
||||
- Define clear comparison criteria
|
||||
- List pros and cons for each option
|
||||
- Provide a verdict/recommendation if data supports it
|
||||
""",
|
||||
ExpectedDeliverable.BEST_PRACTICES: """
|
||||
**BEST PRACTICES**:
|
||||
- Extract recommended approaches
|
||||
- Provide actionable guidelines
|
||||
- Order by importance or sequence
|
||||
""",
|
||||
ExpectedDeliverable.STEP_BY_STEP: """
|
||||
**STEP BY STEP**:
|
||||
- Extract process/how-to instructions
|
||||
- Number steps clearly
|
||||
- Include any prerequisites or requirements
|
||||
""",
|
||||
ExpectedDeliverable.PROS_CONS: """
|
||||
**PROS AND CONS**:
|
||||
- List advantages (pros)
|
||||
- List disadvantages (cons)
|
||||
- Provide a balanced verdict
|
||||
""",
|
||||
ExpectedDeliverable.DEFINITIONS: """
|
||||
**DEFINITIONS**:
|
||||
- Extract clear explanations of key terms and concepts
|
||||
- Keep definitions concise but comprehensive
|
||||
""",
|
||||
ExpectedDeliverable.EXAMPLES: """
|
||||
**EXAMPLES**:
|
||||
- Extract concrete examples that illustrate key points
|
||||
- Include real-world applications
|
||||
""",
|
||||
ExpectedDeliverable.PREDICTIONS: """
|
||||
**PREDICTIONS**:
|
||||
- Extract future outlook and predictions
|
||||
- Note the source and their track record if known
|
||||
- Include timeframes where mentioned
|
||||
""",
|
||||
ExpectedDeliverable.CITATIONS: """
|
||||
**CITATIONS**:
|
||||
- List all authoritative sources with URLs
|
||||
- Rate credibility and relevance
|
||||
- Note content type (research, news, opinion, etc.)
|
||||
""",
|
||||
}
|
||||
|
||||
for deliverable in expected_deliverables:
|
||||
try:
|
||||
d_enum = ExpectedDeliverable(deliverable)
|
||||
if d_enum in deliverable_instructions:
|
||||
instructions.append(deliverable_instructions[d_enum])
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return "\n".join(instructions)
|
||||
387
backend/services/research/intent/intent_query_generator.py
Normal file
387
backend/services/research/intent/intent_query_generator.py
Normal file
@@ -0,0 +1,387 @@
|
||||
"""
|
||||
Intent Query Generator
|
||||
|
||||
Generates multiple targeted research queries based on user intent.
|
||||
Each query targets a specific deliverable or question.
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchQuery,
|
||||
ExpectedDeliverable,
|
||||
ResearchPurpose,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .intent_prompt_builder import IntentPromptBuilder
|
||||
|
||||
|
||||
class IntentQueryGenerator:
|
||||
"""
|
||||
Generates targeted research queries based on user intent.
|
||||
|
||||
Instead of a single generic search, generates multiple queries
|
||||
each targeting a specific deliverable or question.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the query generator."""
|
||||
self.prompt_builder = IntentPromptBuilder()
|
||||
logger.info("IntentQueryGenerator initialized")
|
||||
|
||||
async def generate_queries(
|
||||
self,
|
||||
intent: ResearchIntent,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate targeted research queries based on intent.
|
||||
|
||||
Args:
|
||||
intent: The inferred research intent
|
||||
research_persona: Optional persona for context
|
||||
|
||||
Returns:
|
||||
Dict with queries, enhanced_keywords, and research_angles
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Generating queries for: {intent.primary_question[:50]}...")
|
||||
|
||||
# Build the query generation prompt
|
||||
prompt = self.prompt_builder.build_query_generation_prompt(
|
||||
intent=intent,
|
||||
research_persona=research_persona,
|
||||
)
|
||||
|
||||
# Define the expected JSON schema
|
||||
query_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"queries": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"type": "string"},
|
||||
"purpose": {"type": "string"},
|
||||
"provider": {"type": "string"},
|
||||
"priority": {"type": "integer"},
|
||||
"expected_results": {"type": "string"}
|
||||
},
|
||||
"required": ["query", "purpose", "provider", "priority", "expected_results"]
|
||||
}
|
||||
},
|
||||
"enhanced_keywords": {"type": "array", "items": {"type": "string"}},
|
||||
"research_angles": {"type": "array", "items": {"type": "string"}}
|
||||
},
|
||||
"required": ["queries", "enhanced_keywords", "research_angles"]
|
||||
}
|
||||
|
||||
# Call LLM for query generation
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
|
||||
result = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=query_schema,
|
||||
user_id=None
|
||||
)
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
logger.error(f"Query generation failed: {result.get('error')}")
|
||||
return self._create_fallback_queries(intent)
|
||||
|
||||
# Parse queries
|
||||
queries = self._parse_queries(result.get("queries", []))
|
||||
|
||||
# Ensure we have queries for all expected deliverables
|
||||
queries = self._ensure_deliverable_coverage(queries, intent)
|
||||
|
||||
# Sort by priority
|
||||
queries.sort(key=lambda q: q.priority, reverse=True)
|
||||
|
||||
logger.info(f"Generated {len(queries)} targeted queries")
|
||||
|
||||
return {
|
||||
"queries": queries,
|
||||
"enhanced_keywords": result.get("enhanced_keywords", []),
|
||||
"research_angles": result.get("research_angles", []),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating queries: {e}")
|
||||
return self._create_fallback_queries(intent)
|
||||
|
||||
def _parse_queries(self, raw_queries: List[Dict]) -> List[ResearchQuery]:
|
||||
"""Parse raw query data into ResearchQuery objects."""
|
||||
|
||||
queries = []
|
||||
for q in raw_queries:
|
||||
try:
|
||||
# Validate purpose
|
||||
purpose_str = q.get("purpose", "key_statistics")
|
||||
try:
|
||||
purpose = ExpectedDeliverable(purpose_str)
|
||||
except ValueError:
|
||||
purpose = ExpectedDeliverable.KEY_STATISTICS
|
||||
|
||||
query = ResearchQuery(
|
||||
query=q.get("query", ""),
|
||||
purpose=purpose,
|
||||
provider=q.get("provider", "exa"),
|
||||
priority=min(max(int(q.get("priority", 3)), 1), 5), # Clamp 1-5
|
||||
expected_results=q.get("expected_results", ""),
|
||||
)
|
||||
queries.append(query)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse query: {e}")
|
||||
continue
|
||||
|
||||
return queries
|
||||
|
||||
def _ensure_deliverable_coverage(
|
||||
self,
|
||||
queries: List[ResearchQuery],
|
||||
intent: ResearchIntent,
|
||||
) -> List[ResearchQuery]:
|
||||
"""Ensure we have queries for all expected deliverables."""
|
||||
|
||||
# Get deliverables already covered
|
||||
covered = set(q.purpose.value for q in queries)
|
||||
|
||||
# Check for missing deliverables
|
||||
for deliverable in intent.expected_deliverables:
|
||||
if deliverable not in covered:
|
||||
# Generate a query for this deliverable
|
||||
query = self._generate_query_for_deliverable(
|
||||
deliverable=deliverable,
|
||||
intent=intent,
|
||||
)
|
||||
queries.append(query)
|
||||
|
||||
return queries
|
||||
|
||||
def _generate_query_for_deliverable(
|
||||
self,
|
||||
deliverable: str,
|
||||
intent: ResearchIntent,
|
||||
) -> ResearchQuery:
|
||||
"""Generate a query targeting a specific deliverable."""
|
||||
|
||||
# Extract topic from primary question
|
||||
topic = intent.original_input
|
||||
|
||||
# Query templates by deliverable type
|
||||
templates = {
|
||||
ExpectedDeliverable.KEY_STATISTICS.value: {
|
||||
"query": f"{topic} statistics data report study",
|
||||
"provider": "exa",
|
||||
"priority": 5,
|
||||
"expected": "Statistical data and research findings",
|
||||
},
|
||||
ExpectedDeliverable.EXPERT_QUOTES.value: {
|
||||
"query": f"{topic} expert opinion interview insights",
|
||||
"provider": "exa",
|
||||
"priority": 4,
|
||||
"expected": "Expert opinions and authoritative quotes",
|
||||
},
|
||||
ExpectedDeliverable.CASE_STUDIES.value: {
|
||||
"query": f"{topic} case study success story implementation example",
|
||||
"provider": "exa",
|
||||
"priority": 4,
|
||||
"expected": "Real-world case studies and examples",
|
||||
},
|
||||
ExpectedDeliverable.TRENDS.value: {
|
||||
"query": f"{topic} trends 2025 future predictions emerging",
|
||||
"provider": "tavily",
|
||||
"priority": 4,
|
||||
"expected": "Current trends and future predictions",
|
||||
},
|
||||
ExpectedDeliverable.COMPARISONS.value: {
|
||||
"query": f"{topic} comparison vs versus alternatives",
|
||||
"provider": "exa",
|
||||
"priority": 4,
|
||||
"expected": "Comparison and alternative options",
|
||||
},
|
||||
ExpectedDeliverable.BEST_PRACTICES.value: {
|
||||
"query": f"{topic} best practices recommendations guidelines",
|
||||
"provider": "exa",
|
||||
"priority": 3,
|
||||
"expected": "Best practices and recommendations",
|
||||
},
|
||||
ExpectedDeliverable.STEP_BY_STEP.value: {
|
||||
"query": f"{topic} how to guide tutorial steps",
|
||||
"provider": "exa",
|
||||
"priority": 3,
|
||||
"expected": "Step-by-step guides and tutorials",
|
||||
},
|
||||
ExpectedDeliverable.PROS_CONS.value: {
|
||||
"query": f"{topic} advantages disadvantages pros cons benefits",
|
||||
"provider": "exa",
|
||||
"priority": 3,
|
||||
"expected": "Pros, cons, and trade-offs",
|
||||
},
|
||||
ExpectedDeliverable.DEFINITIONS.value: {
|
||||
"query": f"what is {topic} definition explained",
|
||||
"provider": "exa",
|
||||
"priority": 3,
|
||||
"expected": "Clear definitions and explanations",
|
||||
},
|
||||
ExpectedDeliverable.EXAMPLES.value: {
|
||||
"query": f"{topic} examples real world applications",
|
||||
"provider": "exa",
|
||||
"priority": 3,
|
||||
"expected": "Real-world examples and applications",
|
||||
},
|
||||
ExpectedDeliverable.PREDICTIONS.value: {
|
||||
"query": f"{topic} future outlook predictions 2025 2030",
|
||||
"provider": "tavily",
|
||||
"priority": 4,
|
||||
"expected": "Future predictions and outlook",
|
||||
},
|
||||
ExpectedDeliverable.CITATIONS.value: {
|
||||
"query": f"{topic} research paper study academic",
|
||||
"provider": "exa",
|
||||
"priority": 4,
|
||||
"expected": "Authoritative academic sources",
|
||||
},
|
||||
}
|
||||
|
||||
template = templates.get(deliverable, {
|
||||
"query": f"{topic}",
|
||||
"provider": "exa",
|
||||
"priority": 3,
|
||||
"expected": "General information",
|
||||
})
|
||||
|
||||
return ResearchQuery(
|
||||
query=template["query"],
|
||||
purpose=ExpectedDeliverable(deliverable) if deliverable in [e.value for e in ExpectedDeliverable] else ExpectedDeliverable.KEY_STATISTICS,
|
||||
provider=template["provider"],
|
||||
priority=template["priority"],
|
||||
expected_results=template["expected"],
|
||||
)
|
||||
|
||||
def _create_fallback_queries(self, intent: ResearchIntent) -> Dict[str, Any]:
|
||||
"""Create fallback queries when AI generation fails."""
|
||||
|
||||
topic = intent.original_input
|
||||
|
||||
# Generate basic queries for each expected deliverable
|
||||
queries = []
|
||||
for deliverable in intent.expected_deliverables[:5]: # Limit to 5
|
||||
query = self._generate_query_for_deliverable(deliverable, intent)
|
||||
queries.append(query)
|
||||
|
||||
# Add a general query if we have none
|
||||
if not queries:
|
||||
queries.append(ResearchQuery(
|
||||
query=topic,
|
||||
purpose=ExpectedDeliverable.KEY_STATISTICS,
|
||||
provider="exa",
|
||||
priority=5,
|
||||
expected_results="General information and insights",
|
||||
))
|
||||
|
||||
return {
|
||||
"queries": queries,
|
||||
"enhanced_keywords": topic.split()[:10],
|
||||
"research_angles": [
|
||||
f"Overview of {topic}",
|
||||
f"Latest trends in {topic}",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
class QueryOptimizer:
|
||||
"""
|
||||
Optimizes queries for different research providers.
|
||||
|
||||
Different providers have different strengths:
|
||||
- Exa: Semantic search, good for deep research
|
||||
- Tavily: Real-time search, good for news/trends
|
||||
- Google: Factual search, good for basic info
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def optimize_for_exa(query: str, intent: ResearchIntent) -> Dict[str, Any]:
|
||||
"""Optimize query and parameters for Exa."""
|
||||
|
||||
# Determine best Exa settings based on deliverable
|
||||
deliverables = intent.expected_deliverables
|
||||
|
||||
# Determine category
|
||||
category = None
|
||||
if ExpectedDeliverable.CITATIONS.value in deliverables:
|
||||
category = "research paper"
|
||||
elif ExpectedDeliverable.TRENDS.value in deliverables:
|
||||
category = "news"
|
||||
elif intent.purpose == ResearchPurpose.COMPARE.value:
|
||||
category = "company"
|
||||
|
||||
# Determine search type
|
||||
search_type = "neural" # Default to neural for semantic understanding
|
||||
if ExpectedDeliverable.TRENDS.value in deliverables:
|
||||
search_type = "auto" # Auto is better for time-sensitive queries
|
||||
|
||||
# Number of results
|
||||
num_results = 10
|
||||
if intent.depth == "expert":
|
||||
num_results = 20
|
||||
elif intent.depth == "overview":
|
||||
num_results = 5
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
"type": search_type,
|
||||
"category": category,
|
||||
"num_results": num_results,
|
||||
"text": True,
|
||||
"highlights": True,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def optimize_for_tavily(query: str, intent: ResearchIntent) -> Dict[str, Any]:
|
||||
"""Optimize query and parameters for Tavily."""
|
||||
|
||||
deliverables = intent.expected_deliverables
|
||||
|
||||
# Determine topic
|
||||
topic = "general"
|
||||
if ExpectedDeliverable.TRENDS.value in deliverables:
|
||||
topic = "news"
|
||||
|
||||
# Determine search depth
|
||||
search_depth = "basic"
|
||||
if intent.depth in ["detailed", "expert"]:
|
||||
search_depth = "advanced"
|
||||
|
||||
# Include answer for factual queries
|
||||
include_answer = False
|
||||
if ExpectedDeliverable.DEFINITIONS.value in deliverables:
|
||||
include_answer = "advanced"
|
||||
elif ExpectedDeliverable.KEY_STATISTICS.value in deliverables:
|
||||
include_answer = "basic"
|
||||
|
||||
# Time range for trends
|
||||
time_range = None
|
||||
if intent.time_sensitivity == "real_time":
|
||||
time_range = "day"
|
||||
elif intent.time_sensitivity == "recent":
|
||||
time_range = "week"
|
||||
elif ExpectedDeliverable.TRENDS.value in deliverables:
|
||||
time_range = "month"
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
"topic": topic,
|
||||
"search_depth": search_depth,
|
||||
"include_answer": include_answer,
|
||||
"time_range": time_range,
|
||||
"max_results": 10,
|
||||
}
|
||||
378
backend/services/research/intent/research_intent_inference.py
Normal file
378
backend/services/research/intent/research_intent_inference.py
Normal file
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
Research Intent Inference Service
|
||||
|
||||
Analyzes user input to understand their research intent.
|
||||
Uses AI to infer:
|
||||
- What the user wants to accomplish
|
||||
- What questions need answering
|
||||
- What deliverables they expect
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchPurpose,
|
||||
ContentOutput,
|
||||
ExpectedDeliverable,
|
||||
ResearchDepthLevel,
|
||||
InputType,
|
||||
IntentInferenceRequest,
|
||||
IntentInferenceResponse,
|
||||
ResearchQuery,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .intent_prompt_builder import IntentPromptBuilder
|
||||
|
||||
|
||||
class ResearchIntentInference:
|
||||
"""
|
||||
Infers user research intent from minimal input.
|
||||
|
||||
Instead of asking a formal questionnaire, this service
|
||||
uses AI to understand what the user really wants.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the intent inference service."""
|
||||
self.prompt_builder = IntentPromptBuilder()
|
||||
logger.info("ResearchIntentInference initialized")
|
||||
|
||||
async def infer_intent(
|
||||
self,
|
||||
user_input: str,
|
||||
keywords: Optional[List[str]] = None,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
competitor_data: Optional[List[Dict]] = None,
|
||||
industry: Optional[str] = None,
|
||||
target_audience: Optional[str] = None,
|
||||
) -> IntentInferenceResponse:
|
||||
"""
|
||||
Analyze user input and infer their research intent.
|
||||
|
||||
Args:
|
||||
user_input: User's keywords, question, or goal
|
||||
keywords: Extracted keywords (optional)
|
||||
research_persona: User's research persona (optional)
|
||||
competitor_data: Competitor analysis data (optional)
|
||||
industry: Industry context (optional)
|
||||
target_audience: Target audience context (optional)
|
||||
|
||||
Returns:
|
||||
IntentInferenceResponse with inferred intent and suggested queries
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Inferring intent for: {user_input[:100]}...")
|
||||
|
||||
keywords = keywords or []
|
||||
|
||||
# Build the inference prompt
|
||||
prompt = self.prompt_builder.build_intent_inference_prompt(
|
||||
user_input=user_input,
|
||||
keywords=keywords,
|
||||
research_persona=research_persona,
|
||||
competitor_data=competitor_data,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
)
|
||||
|
||||
# Define the expected JSON schema
|
||||
intent_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
|
||||
"primary_question": {"type": "string"},
|
||||
"secondary_questions": {"type": "array", "items": {"type": "string"}},
|
||||
"purpose": {"type": "string"},
|
||||
"content_output": {"type": "string"},
|
||||
"expected_deliverables": {"type": "array", "items": {"type": "string"}},
|
||||
"depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
|
||||
"focus_areas": {"type": "array", "items": {"type": "string"}},
|
||||
"perspective": {"type": "string"},
|
||||
"time_sensitivity": {"type": "string"},
|
||||
"confidence": {"type": "number"},
|
||||
"needs_clarification": {"type": "boolean"},
|
||||
"clarifying_questions": {"type": "array", "items": {"type": "string"}},
|
||||
"analysis_summary": {"type": "string"}
|
||||
},
|
||||
"required": [
|
||||
"input_type", "primary_question", "purpose", "content_output",
|
||||
"expected_deliverables", "depth", "confidence", "analysis_summary"
|
||||
]
|
||||
}
|
||||
|
||||
# Call LLM for intent inference
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
|
||||
result = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=intent_schema,
|
||||
user_id=None
|
||||
)
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
logger.error(f"Intent inference failed: {result.get('error')}")
|
||||
return self._create_fallback_response(user_input, keywords)
|
||||
|
||||
# Parse and validate the result
|
||||
intent = self._parse_intent_result(result, user_input)
|
||||
|
||||
# Generate quick options for UI
|
||||
quick_options = self._generate_quick_options(intent, result)
|
||||
|
||||
# Create response
|
||||
response = IntentInferenceResponse(
|
||||
success=True,
|
||||
intent=intent,
|
||||
analysis_summary=result.get("analysis_summary", "Research intent analyzed"),
|
||||
suggested_queries=[], # Will be populated by query generator
|
||||
suggested_keywords=self._extract_keywords_from_input(user_input, keywords),
|
||||
suggested_angles=result.get("focus_areas", []),
|
||||
quick_options=quick_options,
|
||||
)
|
||||
|
||||
logger.info(f"Intent inferred: purpose={intent.purpose}, confidence={intent.confidence}")
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error inferring intent: {e}")
|
||||
return self._create_fallback_response(user_input, keywords or [])
|
||||
|
||||
def _parse_intent_result(self, result: Dict[str, Any], user_input: str) -> ResearchIntent:
|
||||
"""Parse LLM result into ResearchIntent model."""
|
||||
|
||||
# Map string values to enums safely
|
||||
input_type = self._safe_enum(InputType, result.get("input_type", "keywords"), InputType.KEYWORDS)
|
||||
purpose = self._safe_enum(ResearchPurpose, result.get("purpose", "learn"), ResearchPurpose.LEARN)
|
||||
content_output = self._safe_enum(ContentOutput, result.get("content_output", "general"), ContentOutput.GENERAL)
|
||||
depth = self._safe_enum(ResearchDepthLevel, result.get("depth", "detailed"), ResearchDepthLevel.DETAILED)
|
||||
|
||||
# Parse expected deliverables
|
||||
raw_deliverables = result.get("expected_deliverables", [])
|
||||
expected_deliverables = []
|
||||
for d in raw_deliverables:
|
||||
try:
|
||||
expected_deliverables.append(ExpectedDeliverable(d).value)
|
||||
except ValueError:
|
||||
# Skip invalid deliverables
|
||||
pass
|
||||
|
||||
# Ensure we have at least some deliverables
|
||||
if not expected_deliverables:
|
||||
expected_deliverables = self._infer_deliverables_from_purpose(purpose)
|
||||
|
||||
return ResearchIntent(
|
||||
primary_question=result.get("primary_question", user_input),
|
||||
secondary_questions=result.get("secondary_questions", []),
|
||||
purpose=purpose.value,
|
||||
content_output=content_output.value,
|
||||
expected_deliverables=expected_deliverables,
|
||||
depth=depth.value,
|
||||
focus_areas=result.get("focus_areas", []),
|
||||
perspective=result.get("perspective"),
|
||||
time_sensitivity=result.get("time_sensitivity"),
|
||||
input_type=input_type.value,
|
||||
original_input=user_input,
|
||||
confidence=float(result.get("confidence", 0.7)),
|
||||
needs_clarification=result.get("needs_clarification", False),
|
||||
clarifying_questions=result.get("clarifying_questions", []),
|
||||
)
|
||||
|
||||
def _safe_enum(self, enum_class, value: str, default):
|
||||
"""Safely convert string to enum, returning default if invalid."""
|
||||
try:
|
||||
return enum_class(value)
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
def _infer_deliverables_from_purpose(self, purpose: ResearchPurpose) -> List[str]:
|
||||
"""Infer expected deliverables based on research purpose."""
|
||||
|
||||
purpose_deliverables = {
|
||||
ResearchPurpose.LEARN: [
|
||||
ExpectedDeliverable.DEFINITIONS.value,
|
||||
ExpectedDeliverable.EXAMPLES.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
],
|
||||
ResearchPurpose.CREATE_CONTENT: [
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
ExpectedDeliverable.EXPERT_QUOTES.value,
|
||||
ExpectedDeliverable.EXAMPLES.value,
|
||||
ExpectedDeliverable.CASE_STUDIES.value,
|
||||
],
|
||||
ResearchPurpose.MAKE_DECISION: [
|
||||
ExpectedDeliverable.PROS_CONS.value,
|
||||
ExpectedDeliverable.COMPARISONS.value,
|
||||
ExpectedDeliverable.BEST_PRACTICES.value,
|
||||
],
|
||||
ResearchPurpose.COMPARE: [
|
||||
ExpectedDeliverable.COMPARISONS.value,
|
||||
ExpectedDeliverable.PROS_CONS.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
],
|
||||
ResearchPurpose.SOLVE_PROBLEM: [
|
||||
ExpectedDeliverable.STEP_BY_STEP.value,
|
||||
ExpectedDeliverable.BEST_PRACTICES.value,
|
||||
ExpectedDeliverable.CASE_STUDIES.value,
|
||||
],
|
||||
ResearchPurpose.FIND_DATA: [
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
ExpectedDeliverable.CITATIONS.value,
|
||||
],
|
||||
ResearchPurpose.EXPLORE_TRENDS: [
|
||||
ExpectedDeliverable.TRENDS.value,
|
||||
ExpectedDeliverable.PREDICTIONS.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
],
|
||||
ResearchPurpose.VALIDATE: [
|
||||
ExpectedDeliverable.CITATIONS.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
ExpectedDeliverable.EXPERT_QUOTES.value,
|
||||
],
|
||||
ResearchPurpose.GENERATE_IDEAS: [
|
||||
ExpectedDeliverable.EXAMPLES.value,
|
||||
ExpectedDeliverable.TRENDS.value,
|
||||
ExpectedDeliverable.CASE_STUDIES.value,
|
||||
],
|
||||
}
|
||||
|
||||
return purpose_deliverables.get(purpose, [ExpectedDeliverable.KEY_STATISTICS.value])
|
||||
|
||||
def _generate_quick_options(self, intent: ResearchIntent, result: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""Generate quick options for UI confirmation."""
|
||||
|
||||
options = []
|
||||
|
||||
# Purpose option
|
||||
options.append({
|
||||
"id": "purpose",
|
||||
"label": "Research Purpose",
|
||||
"value": intent.purpose,
|
||||
"display": self._purpose_display(intent.purpose),
|
||||
"alternatives": [p.value for p in ResearchPurpose],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
})
|
||||
|
||||
# Content output option
|
||||
if intent.content_output != ContentOutput.GENERAL.value:
|
||||
options.append({
|
||||
"id": "content_output",
|
||||
"label": "Content Type",
|
||||
"value": intent.content_output,
|
||||
"display": intent.content_output.replace("_", " ").title(),
|
||||
"alternatives": [c.value for c in ContentOutput],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
})
|
||||
|
||||
# Deliverables option
|
||||
options.append({
|
||||
"id": "deliverables",
|
||||
"label": "What I'll Find",
|
||||
"value": intent.expected_deliverables,
|
||||
"display": [d.replace("_", " ").title() for d in intent.expected_deliverables[:4]],
|
||||
"alternatives": [d.value for d in ExpectedDeliverable],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
"multi_select": True,
|
||||
})
|
||||
|
||||
# Depth option
|
||||
options.append({
|
||||
"id": "depth",
|
||||
"label": "Research Depth",
|
||||
"value": intent.depth,
|
||||
"display": intent.depth.title(),
|
||||
"alternatives": [d.value for d in ResearchDepthLevel],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
})
|
||||
|
||||
return options
|
||||
|
||||
def _purpose_display(self, purpose: str) -> str:
|
||||
"""Get display-friendly purpose text."""
|
||||
display_map = {
|
||||
"learn": "Understand this topic",
|
||||
"create_content": "Create content about this",
|
||||
"make_decision": "Make a decision",
|
||||
"compare": "Compare options",
|
||||
"solve_problem": "Solve a problem",
|
||||
"find_data": "Find specific data",
|
||||
"explore_trends": "Explore trends",
|
||||
"validate": "Validate information",
|
||||
"generate_ideas": "Generate ideas",
|
||||
}
|
||||
return display_map.get(purpose, purpose.replace("_", " ").title())
|
||||
|
||||
def _extract_keywords_from_input(self, user_input: str, keywords: List[str]) -> List[str]:
|
||||
"""Extract and enhance keywords from user input."""
|
||||
|
||||
# Start with provided keywords
|
||||
extracted = list(keywords) if keywords else []
|
||||
|
||||
# Simple extraction from input (split on common delimiters)
|
||||
words = user_input.lower().replace(",", " ").replace(";", " ").split()
|
||||
|
||||
# Filter out common words
|
||||
stop_words = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
||||
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
||||
"should", "may", "might", "must", "shall", "can", "need", "dare",
|
||||
"to", "of", "in", "for", "on", "with", "at", "by", "from", "up",
|
||||
"about", "into", "through", "during", "before", "after", "above",
|
||||
"below", "between", "under", "again", "further", "then", "once",
|
||||
"here", "there", "when", "where", "why", "how", "all", "each",
|
||||
"few", "more", "most", "other", "some", "such", "no", "nor", "not",
|
||||
"only", "own", "same", "so", "than", "too", "very", "just", "and",
|
||||
"but", "if", "or", "because", "as", "until", "while", "i", "we",
|
||||
"you", "they", "what", "which", "who", "whom", "this", "that",
|
||||
"these", "those", "am", "want", "write", "blog", "post", "article",
|
||||
}
|
||||
|
||||
for word in words:
|
||||
if word not in stop_words and len(word) > 2 and word not in extracted:
|
||||
extracted.append(word)
|
||||
|
||||
return extracted[:15] # Limit to 15 keywords
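# Illustrative sketch (hypothetical input, not from the original source): for
# user_input="I want to write a blog post about AI tools for small business"
# and keywords=["ai tools"], the filter above keeps the provided keyword,
# drops stop words such as "want", "write", "blog" and "post", drops "ai"
# (too short), and returns roughly ["ai tools", "tools", "small", "business"],
# capped at 15 entries.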
|
||||
|
||||
def _create_fallback_response(self, user_input: str, keywords: List[str]) -> IntentInferenceResponse:
|
||||
"""Create a fallback response when AI inference fails."""
|
||||
|
||||
# Create a basic intent from the input
|
||||
fallback_intent = ResearchIntent(
|
||||
primary_question=f"What are the key insights about: {user_input}?",
|
||||
secondary_questions=[
|
||||
f"What are the latest trends in {user_input}?",
|
||||
f"What are best practices for {user_input}?",
|
||||
],
|
||||
purpose=ResearchPurpose.LEARN.value,
|
||||
content_output=ContentOutput.GENERAL.value,
|
||||
expected_deliverables=[
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
ExpectedDeliverable.EXAMPLES.value,
|
||||
ExpectedDeliverable.BEST_PRACTICES.value,
|
||||
],
|
||||
depth=ResearchDepthLevel.DETAILED.value,
|
||||
focus_areas=[],
|
||||
input_type=InputType.KEYWORDS.value,
|
||||
original_input=user_input,
|
||||
confidence=0.5,
|
||||
needs_clarification=True,
|
||||
clarifying_questions=[
|
||||
"What type of content are you creating?",
|
||||
"What specific aspects are you most interested in?",
|
||||
],
|
||||
)
|
||||
|
||||
return IntentInferenceResponse(
|
||||
success=True, # Still return success, just with lower confidence
|
||||
intent=fallback_intent,
|
||||
analysis_summary=f"Basic research analysis for: {user_input}",
|
||||
suggested_queries=[],
|
||||
suggested_keywords=keywords,
|
||||
suggested_angles=[],
|
||||
quick_options=[],
|
||||
)
660
backend/services/research/research_persona_prompt_builder.py
Normal file
@@ -0,0 +1,660 @@
|
||||
"""
|
||||
Research Persona Prompt Builder
|
||||
|
||||
Handles building comprehensive prompts for research persona generation.
|
||||
Generates personalized research defaults, suggestions, and configurations.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
import json
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ResearchPersonaPromptBuilder:
|
||||
"""Builds comprehensive prompts for research persona generation."""
|
||||
|
||||
def build_research_persona_prompt(self, onboarding_data: Dict[str, Any]) -> str:
|
||||
"""Build the research persona generation prompt with comprehensive data."""
|
||||
|
||||
# Extract data from onboarding_data
|
||||
website_analysis = onboarding_data.get("website_analysis", {}) or {}
|
||||
persona_data = onboarding_data.get("persona_data", {}) or {}
|
||||
research_prefs = onboarding_data.get("research_preferences", {}) or {}
|
||||
business_info = onboarding_data.get("business_info", {}) or {}
|
||||
competitor_analysis = onboarding_data.get("competitor_analysis", []) or []
|
||||
|
||||
# Extract core persona - handle both camelCase and snake_case
|
||||
core_persona = persona_data.get("corePersona") or persona_data.get("core_persona") or {}
|
||||
|
||||
# Phase 1: Extract key website analysis fields for enhanced personalization
|
||||
writing_style = website_analysis.get("writing_style", {}) or {}
|
||||
content_type = website_analysis.get("content_type", {}) or {}
|
||||
crawl_result = website_analysis.get("crawl_result", {}) or {}
|
||||
|
||||
# Phase 2: Extract additional fields for pattern-based personalization
|
||||
style_patterns = website_analysis.get("style_patterns", {}) or {}
|
||||
content_characteristics = website_analysis.get("content_characteristics", {}) or {}
|
||||
style_guidelines = website_analysis.get("style_guidelines", {}) or {}
|
||||
|
||||
# Extract topics/keywords from crawl_result (if available)
|
||||
extracted_topics = self._extract_topics_from_crawl(crawl_result)
|
||||
extracted_keywords = self._extract_keywords_from_crawl(crawl_result)
|
||||
|
||||
# Phase 2: Extract patterns and vocabulary level
|
||||
extracted_patterns = self._extract_writing_patterns(style_patterns)
|
||||
vocabulary_level = content_characteristics.get("vocabulary_level", "medium") if content_characteristics else "medium"
|
||||
extracted_guidelines = self._extract_style_guidelines(style_guidelines)
|
||||
|
||||
# Phase 3: Full crawl analysis and comprehensive mapping
|
||||
crawl_analysis = self._analyze_crawl_result_comprehensive(crawl_result)
|
||||
writing_style_mapping = self._map_writing_style_comprehensive(writing_style, content_characteristics)
|
||||
content_themes = self._extract_content_themes(crawl_result, extracted_topics)
|
||||
|
||||
prompt = f"""
|
||||
COMPREHENSIVE RESEARCH PERSONA GENERATION TASK: Create a highly detailed, personalized research persona based on the user's business, writing style, and content strategy. This persona will provide intelligent defaults and suggestions for research inputs.
|
||||
|
||||
=== USER CONTEXT ===
|
||||
|
||||
BUSINESS INFORMATION:
|
||||
{json.dumps(business_info, indent=2)}
|
||||
|
||||
WEBSITE ANALYSIS:
|
||||
{json.dumps(website_analysis, indent=2)}
|
||||
|
||||
CORE PERSONA:
|
||||
{json.dumps(core_persona, indent=2)}
|
||||
|
||||
RESEARCH PREFERENCES:
|
||||
{json.dumps(research_prefs, indent=2)}
|
||||
|
||||
COMPETITOR ANALYSIS:
|
||||
{json.dumps(competitor_analysis, indent=2) if competitor_analysis else "No competitor data available"}
|
||||
|
||||
=== PHASE 1: WEBSITE ANALYSIS INTELLIGENCE ===
|
||||
|
||||
WRITING STYLE (for research depth mapping):
|
||||
{json.dumps(writing_style, indent=2) if writing_style else "Not available"}
|
||||
|
||||
CONTENT TYPE (for preset generation):
|
||||
{json.dumps(content_type, indent=2) if content_type else "Not available"}
|
||||
|
||||
EXTRACTED TOPICS FROM WEBSITE CONTENT:
|
||||
{json.dumps(extracted_topics, indent=2) if extracted_topics else "No topics extracted"}
|
||||
|
||||
EXTRACTED KEYWORDS FROM WEBSITE CONTENT:
|
||||
{json.dumps(extracted_keywords[:20], indent=2) if extracted_keywords else "No keywords extracted"}
|
||||
|
||||
=== PHASE 2: WRITING PATTERNS & STYLE INTELLIGENCE ===
|
||||
|
||||
STYLE PATTERNS (for research angles):
|
||||
{json.dumps(style_patterns, indent=2) if style_patterns else "Not available"}
|
||||
|
||||
EXTRACTED WRITING PATTERNS:
|
||||
{json.dumps(extracted_patterns, indent=2) if extracted_patterns else "No patterns extracted"}
|
||||
|
||||
CONTENT CHARACTERISTICS (for keyword sophistication):
|
||||
{json.dumps(content_characteristics, indent=2) if content_characteristics else "Not available"}
|
||||
|
||||
VOCABULARY LEVEL:
|
||||
{vocabulary_level}
|
||||
|
||||
STYLE GUIDELINES (for query enhancement):
|
||||
{json.dumps(style_guidelines, indent=2) if style_guidelines else "Not available"}
|
||||
|
||||
EXTRACTED GUIDELINES:
|
||||
{json.dumps(extracted_guidelines, indent=2) if extracted_guidelines else "No guidelines extracted"}
|
||||
|
||||
=== PHASE 3: COMPREHENSIVE ANALYSIS & MAPPING ===
|
||||
|
||||
CRAWL ANALYSIS (Full Content Intelligence):
|
||||
{json.dumps(crawl_analysis, indent=2) if crawl_analysis else "No crawl analysis available"}
|
||||
|
||||
WRITING STYLE COMPREHENSIVE MAPPING:
|
||||
{json.dumps(writing_style_mapping, indent=2) if writing_style_mapping else "No style mapping available"}
|
||||
|
||||
CONTENT THEMES (Extracted from Website):
|
||||
{json.dumps(content_themes, indent=2) if content_themes else "No themes extracted"}
|
||||
|
||||
=== RESEARCH PERSONA GENERATION REQUIREMENTS ===
|
||||
|
||||
Generate a comprehensive research persona in JSON format with the following structure:
|
||||
|
||||
1. DEFAULT VALUES:
|
||||
- "default_industry": Extract from core_persona.industry, business_info.industry, or website_analysis target_audience. If none available, infer from content patterns in website_analysis or research_preferences. Never use "General" - always provide a specific industry based on context.
|
||||
- "default_target_audience": Extract from core_persona.target_audience, website_analysis.target_audience, or business_info.target_audience. Be specific and descriptive.
|
||||
- "default_research_mode": **PHASE 3 ENHANCEMENT** - Use comprehensive writing_style_mapping:
|
||||
* **PRIMARY**: Use writing_style_mapping.research_depth_preference (from comprehensive analysis)
|
||||
* **SECONDARY**: Map from writing_style.complexity:
|
||||
- If writing_style.complexity == "high": Use "comprehensive" (deep research needed)
|
||||
- If writing_style.complexity == "medium": Use "targeted" (balanced research)
|
||||
- If writing_style.complexity == "low": Use "basic" (quick research)
|
||||
* **FALLBACK**: Use research_preferences.research_depth if complexity not available
|
||||
* This ensures research depth matches the user's writing sophistication level and comprehensive style analysis
|
||||
- "default_provider": **PHASE 3 ENHANCEMENT** - Use writing_style_mapping.provider_preference:
|
||||
* **PRIMARY**: Use writing_style_mapping.provider_preference (from comprehensive style analysis)
|
||||
* **SECONDARY**: Suggest based on user's typical research needs:
|
||||
- Academic/research users: "exa" (semantic search, papers)
|
||||
- News/current events users: "tavily" (real-time, AI answers)
|
||||
- General business users: "exa" (better for content creation)
|
||||
* **DEFAULT**: "exa" (generally better for content creators)
|
||||
|
||||
2. KEYWORD INTELLIGENCE:
|
||||
- "suggested_keywords": **PHASE 1 ENHANCEMENT** - Prioritize extracted keywords from crawl_result:
|
||||
* First, use extracted_keywords from website content (top 8-10 most relevant)
|
||||
* Then, supplement with keywords from user's industry, interests (from core_persona), and content goals
|
||||
* Total: 8-12 keywords, with at least 50% from extracted_keywords if available
|
||||
* This ensures keywords reflect the user's actual content topics
|
||||
- "keyword_expansion_patterns": **PHASE 2 ENHANCEMENT** - Create a dictionary mapping common keywords to expanded, industry-specific terms based on vocabulary_level:
|
||||
* If vocabulary_level == "advanced": Use sophisticated, technical, industry-specific terminology
|
||||
Example: {{"AI": ["machine learning algorithms", "neural network architectures", "deep learning frameworks", "algorithmic intelligence systems"], "tools": ["enterprise software platforms", "integrated development environments", "cloud-native solutions"]}}
|
||||
* If vocabulary_level == "medium": Use balanced, professional terminology
|
||||
Example: {{"AI": ["artificial intelligence", "automated systems", "smart technology", "intelligent automation"], "tools": ["software solutions", "digital platforms", "business applications"]}}
|
||||
* If vocabulary_level == "simple": Use accessible, beginner-friendly terminology
|
||||
Example: {{"AI": ["smart technology", "automated tools", "helpful software", "intelligent helpers"], "tools": ["apps", "software", "platforms", "online services"]}}
|
||||
* Include 10-15 patterns, matching the user's vocabulary sophistication level
|
||||
* Focus on industry-specific terminology from the user's domain, but at the appropriate complexity level
|
||||
|
||||
3. PROVIDER-SPECIFIC OPTIMIZATION:
|
||||
- "suggested_exa_domains": List 4-6 authoritative domains for the user's industry (e.g., Healthcare: ["pubmed.gov", "nejm.org", "thelancet.com"]).
|
||||
- "suggested_exa_category": Suggest appropriate Exa category based on industry:
|
||||
- Healthcare/Science: "research paper"
|
||||
- Finance: "financial report"
|
||||
- Technology/Business: "company" or "news"
|
||||
- Social Media/Marketing: "tweet" or "linkedin profile"
|
||||
- Default: null (empty string for all categories)
|
||||
- "suggested_exa_search_type": Suggest Exa search algorithm:
|
||||
- Academic/research content: "neural" (semantic understanding)
|
||||
- Current news/trends: "fast" (speed optimized)
|
||||
- General research: "auto" (balanced)
|
||||
- Code/technical: "neural"
|
||||
- "suggested_tavily_topic": Choose based on content type:
|
||||
- Financial content: "finance"
|
||||
- News/current events: "news"
|
||||
- General research: "general"
|
||||
- "suggested_tavily_search_depth": Choose based on research needs:
|
||||
- Quick overview: "basic" (1 credit, faster)
|
||||
- In-depth analysis: "advanced" (2 credits, more comprehensive)
|
||||
- Breaking news: "fast" (speed optimized)
|
||||
- "suggested_tavily_include_answer": AI-generated answers:
|
||||
- For factual queries needing quick answers: "advanced"
|
||||
- For research summaries: "basic"
|
||||
- When building custom content: "false" (use raw results)
|
||||
- "suggested_tavily_time_range": Time filtering:
|
||||
- Breaking news: "day"
|
||||
- Recent developments: "week"
|
||||
- Industry analysis: "month"
|
||||
- Historical research: null (no time limit)
|
||||
- "suggested_tavily_raw_content_format": Raw content for LLM processing:
|
||||
- For blog content creation: "markdown" (structured)
|
||||
- For simple text extraction: "text"
|
||||
- No raw content needed: "false"
|
||||
- "provider_recommendations": Map use cases to best providers:
|
||||
{{"trends": "tavily", "deep_research": "exa", "factual": "google", "news": "tavily", "academic": "exa"}}
|
||||
|
||||
4. RESEARCH ANGLES:
|
||||
- "research_angles": **PHASE 2 ENHANCEMENT** - Generate 5-8 alternative research angles/focuses based on:
|
||||
* **PRIMARY SOURCE**: Extract from extracted_patterns (writing patterns from style_patterns):
|
||||
- If "comparison" in patterns: "Compare {{topic}} solutions and alternatives"
|
||||
- If "how-to" or "tutorial" in patterns: "Step-by-step guide to {{topic}} implementation"
|
||||
- If "case-study" or "case_study" in patterns: "Real-world {{topic}} case studies and success stories"
|
||||
- If "trend-analysis" or "trends" in patterns: "Latest {{topic}} trends and future predictions"
|
||||
- If "best-practices" or "best_practices" in patterns: "{{topic}} best practices and industry standards"
|
||||
- If "review" or "evaluation" in patterns: "{{topic}} review and evaluation criteria"
|
||||
- If "problem-solving" in patterns: "{{topic}} problem-solving strategies and solutions"
|
||||
* **SECONDARY SOURCES** (if patterns not available):
|
||||
- User's pain points and challenges (from core_persona.identity or core_persona)
|
||||
- Industry trends and opportunities (from website_analysis or business_info)
|
||||
- Content goals (from research_preferences.content_types)
|
||||
- Audience interests (from core_persona or website_analysis.target_audience)
|
||||
- Competitive landscape (if competitor_analysis exists, include competitive angles)
|
||||
* Make angles specific to the user's industry and actionable for content creation
|
||||
* Use the same language style and structure as the user's writing patterns
|
||||
|
||||
5. QUERY ENHANCEMENT:
|
||||
- "query_enhancement_rules": **PHASE 2 ENHANCEMENT** - Create templates for improving vague user queries based on extracted_guidelines:
|
||||
* **PRIMARY SOURCE**: Use extracted_guidelines (from style_guidelines) to create enhancement rules:
|
||||
- If guidelines include "Use specific examples": {{"vague_query": "Research: {{query}} with specific examples and case studies"}}
|
||||
- If guidelines include "Include data points" or "statistics": {{"general_query": "Research: {{query}} including statistics, metrics, and data analysis"}}
|
||||
- If guidelines include "Reference industry standards": {{"basic_query": "Research: {{query}} with industry benchmarks and best practices"}}
|
||||
- If guidelines include "Cite authoritative sources": {{"factual_query": "Research: {{query}} from authoritative sources and expert opinions"}}
|
||||
- If guidelines include "Provide actionable insights": {{"theoretical_query": "Research: {{query}} with actionable strategies and implementation steps"}}
|
||||
- If guidelines include "Compare alternatives": {{"single_item_query": "Research: Compare {{query}} alternatives and evaluate options"}}
|
||||
* **FALLBACK PATTERNS** (if guidelines not available):
|
||||
{{"vague_ai": "Research: AI applications in {{industry}} for {{audience}}", "vague_tools": "Compare top {{industry}} tools", "vague_trends": "Research latest {{industry}} trends and developments", ...}}
|
||||
* Include 5-8 enhancement patterns
|
||||
* Match the enhancement style to the user's writing guidelines and preferences
|
||||
|
||||
6. RECOMMENDED PRESETS:
|
||||
- "recommended_presets": **PHASE 3 ENHANCEMENT** - Generate 3-5 personalized research preset templates using comprehensive analysis:
|
||||
* **USE CONTENT THEMES**: If content_themes available, create at least one preset per major theme (up to 3 themes)
|
||||
- Example: If themes include ["AI automation", "content marketing", "SEO strategies"], create presets for each
|
||||
- Use theme names in preset keywords: "Research latest {{theme}} trends and best practices"
|
||||
* **USE CRAWL ANALYSIS**: Leverage crawl_analysis.content_categories and crawl_analysis.main_topics for preset generation
|
||||
- Create presets that match the user's actual website content categories
|
||||
- Use main_topics for preset keywords and descriptions
|
||||
* **CONTENT TYPE BASED**: Generate presets based on content_type (from Phase 1):
|
||||
* **Content-Type-Specific Presets**: Use content_type.primary_type and content_type.secondary_types to create presets:
|
||||
- If primary_type == "blog": Create "Blog Topic Research" preset with trending topics
|
||||
- If primary_type == "article": Create "Article Research" preset with in-depth analysis
|
||||
- If primary_type == "case_study": Create "Case Study Research" preset with real-world examples
|
||||
- If primary_type == "tutorial": Create "Tutorial Research" preset with step-by-step guides
|
||||
- If "tutorial" in secondary_types: Add "How-To Guide Research" preset
|
||||
- If "comparison" in secondary_types or style_patterns: Add "Comparison Research" preset
|
||||
- If content_type.purpose == "thought_leadership": Create "Thought Leadership Research" with expert insights
|
||||
- If content_type.purpose == "education": Create "Educational Content Research" preset
|
||||
* **Use Extracted Topics**: If extracted_topics available, create at least one preset using actual website topics:
|
||||
- "Latest {extracted_topic} Trends" preset
|
||||
- "{extracted_topic} Best Practices" preset
|
||||
* Each preset should include:
|
||||
- name: Descriptive, action-oriented name that clearly indicates what research will be done
|
||||
* Use research_angles as inspiration for preset names (e.g., "Compare {{Industry}} Tools", "{{Industry}} ROI Analysis")
|
||||
* If competitor_analysis exists, create at least one competitive analysis preset (e.g., "Competitive Landscape Analysis")
|
||||
* Make names specific and actionable, not generic
|
||||
* **NEW**: Include content type in name when relevant (e.g., "Blog: {{Industry}} Trends", "Tutorial: {{Topic}} Guide")
|
||||
- keywords: Research query string that is:
|
||||
* **NEW**: Use extracted_topics and extracted_keywords when available for more relevant queries
|
||||
* Specific and detailed (not vague like "AI tools")
|
||||
* Industry-focused (includes industry context)
|
||||
* Audience-aware (considers target audience needs)
|
||||
* Actionable (user can immediately understand what research will provide)
|
||||
* Examples: "Research latest AI-powered marketing automation platforms for B2B SaaS companies" (GOOD)
|
||||
* Avoid: "AI tools" or "marketing research" (TOO VAGUE)
|
||||
- industry: User's industry (from business_info or inferred)
|
||||
- target_audience: User's target audience (from business_info or inferred)
|
||||
- research_mode: "basic", "comprehensive", or "targeted" based on:
|
||||
* **NEW**: Also consider content_type.purpose:
|
||||
- "thought_leadership" → "comprehensive" (needs deep research)
|
||||
- "education" → "comprehensive" (needs thorough coverage)
|
||||
- "marketing" → "targeted" (needs specific insights)
|
||||
- "entertainment" → "basic" (needs quick facts)
|
||||
* "comprehensive" for deep analysis, trends, competitive research
|
||||
* "targeted" for specific questions, quick insights
|
||||
* "basic" for simple fact-finding
|
||||
- config: Complete ResearchConfig object with:
|
||||
* provider: Use suggested_exa_category to determine if "exa" or "tavily" is better
|
||||
* exa_category: Use suggested_exa_category if available
|
||||
* exa_include_domains: Use suggested_exa_domains if available (limit to 3-5 most relevant)
|
||||
* exa_search_type: Use suggested_exa_search_type if available
|
||||
* max_sources: 15-25 for comprehensive, 10-15 for targeted, 8-12 for basic
|
||||
* include_competitors: true if competitor_analysis exists and preset is about competitive research
|
||||
* include_trends: true for trend-focused presets
|
||||
* include_statistics: true for data-driven research
|
||||
* include_expert_quotes: true for comprehensive research or thought_leadership content
|
||||
- description: Brief (1-2 sentences) explaining what this preset researches and why it's valuable
|
||||
- icon: Optional emoji that represents the preset (e.g., "📊" for trends, "🎯" for targeted, "🔍" for analysis, "📝" for blog, "📚" for tutorial)
|
||||
- gradient: Optional CSS gradient for visual appeal
|
||||
|
||||
PRESET GENERATION GUIDELINES:
|
||||
- **PHASE 1 PRIORITY**: Create presets that match the user's actual content types (from content_type)
|
||||
- Use extracted_topics to create presets based on actual website content
|
||||
- Create presets that the user would actually want to use for their content creation
|
||||
- Use research_angles to inspire preset names and keywords
|
||||
- If competitor_analysis has data, create at least one competitive analysis preset
|
||||
- Make each preset unique with different research focus (trends, tools, best practices, competitive, etc.)
|
||||
- Ensure keywords are detailed enough to generate meaningful research
|
||||
- Vary research_mode across presets to offer different depth levels
|
||||
- Use industry-specific terminology in preset names and keywords
|
||||
|
||||
7. RESEARCH PREFERENCES:
|
||||
- "research_preferences": Extract and structure research preferences from onboarding:
|
||||
- research_depth: From research_preferences.research_depth
|
||||
- content_types: From research_preferences.content_types
|
||||
- auto_research: From research_preferences.auto_research
|
||||
- factual_content: From research_preferences.factual_content
|
||||
|
||||
=== OUTPUT REQUIREMENTS ===
|
||||
|
||||
Return a valid JSON object matching this exact structure:
|
||||
{{
|
||||
"default_industry": "string",
|
||||
"default_target_audience": "string",
|
||||
"default_research_mode": "basic" | "comprehensive" | "targeted",
|
||||
"default_provider": "google" | "exa",
|
||||
"suggested_keywords": ["keyword1", "keyword2", ...],
|
||||
"keyword_expansion_patterns": {{
|
||||
"keyword": ["expansion1", "expansion2", ...]
|
||||
}},
|
||||
"suggested_exa_domains": ["domain1.com", "domain2.com", ...],
|
||||
"suggested_exa_category": "string or null",
|
||||
"suggested_exa_search_type": "auto | neural | keyword | fast | deep",
|
||||
"suggested_tavily_topic": "general | news | finance",
|
||||
"suggested_tavily_search_depth": "basic | advanced | fast | ultra-fast",
|
||||
"suggested_tavily_include_answer": "false | basic | advanced",
|
||||
"suggested_tavily_time_range": "day | week | month | year or null",
|
||||
"suggested_tavily_raw_content_format": "false | markdown | text",
|
||||
"provider_recommendations": {{
|
||||
"trends": "tavily",
|
||||
"deep_research": "exa",
|
||||
"factual": "google"
|
||||
}},
|
||||
"research_angles": ["angle1", "angle2", ...],
|
||||
"query_enhancement_rules": {{
|
||||
"pattern": "template"
|
||||
}},
|
||||
"recommended_presets": [
|
||||
{{
|
||||
"name": "string",
|
||||
"keywords": "string",
|
||||
"industry": "string",
|
||||
"target_audience": "string",
|
||||
"research_mode": "basic" | "comprehensive" | "targeted",
|
||||
"config": {{
|
||||
"mode": "basic" | "comprehensive" | "targeted",
|
||||
"provider": "google" | "exa",
|
||||
"max_sources": 10 | 15 | 12,
|
||||
"include_statistics": true | false,
|
||||
"include_expert_quotes": true | false,
|
||||
"include_competitors": true | false,
|
||||
"include_trends": true | false,
|
||||
"exa_category": "string or null",
|
||||
"exa_include_domains": ["domain1.com", ...],
|
||||
"exa_search_type": "auto" | "keyword" | "neural"
|
||||
}},
|
||||
"description": "string"
|
||||
}}
|
||||
],
|
||||
"research_preferences": {{
|
||||
"research_depth": "string",
|
||||
"content_types": ["type1", "type2", ...],
|
||||
"auto_research": true | false,
|
||||
"factual_content": true | false
|
||||
}},
|
||||
"version": "1.0",
|
||||
"confidence_score": 85.0
|
||||
}}
|
||||
|
||||
=== IMPORTANT INSTRUCTIONS ===
|
||||
|
||||
1. Be highly specific and personalized - use actual data from the user's business, persona, and preferences.
|
||||
2. NEVER use "General" for industry or target_audience - always infer or create specific categories based on available context.
|
||||
3. For minimal data scenarios:
|
||||
- If industry is unclear, infer from research_preferences.content_types or website_analysis.content_characteristics
|
||||
- If target_audience is unclear, infer from writing_style patterns or content goals
|
||||
- Use business_info to fill gaps when persona_data is incomplete
|
||||
4. Generate industry-specific intelligence even with limited data:
|
||||
- For content creators: assume "Content Marketing" or "Digital Publishing"
|
||||
- For business users: assume "Business Consulting" or "Professional Services"
|
||||
- For technical users: assume "Technology" or "Software Development"
|
||||
5. Ensure all suggested keywords, domains, and angles are relevant to the user's industry and audience.
|
||||
6. Generate realistic, actionable presets that the user would actually want to use.
|
||||
7. Confidence score should reflect data richness (0-100): higher if rich onboarding data, lower if minimal data.
|
||||
8. Return ONLY valid JSON - no markdown formatting, no explanatory text.
|
||||
|
||||
Generate the research persona now:
|
||||
"""
|
||||
|
||||
return prompt
|
||||
|
||||
def _extract_topics_from_crawl(self, crawl_result: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract topics from crawl_result JSON data.
|
||||
|
||||
Args:
|
||||
crawl_result: Dictionary containing crawled website data
|
||||
|
||||
Returns:
|
||||
List of extracted topics (max 15)
|
||||
"""
|
||||
topics = []
|
||||
|
||||
if not crawl_result:
|
||||
return topics
|
||||
|
||||
try:
|
||||
# Try to extract from common crawl result structures
|
||||
# Method 1: Direct topics field
|
||||
if isinstance(crawl_result.get('topics'), list):
|
||||
topics.extend(crawl_result['topics'][:10])
|
||||
|
||||
# Method 2: Extract from headings
|
||||
if isinstance(crawl_result.get('headings'), list):
|
||||
headings = crawl_result['headings']
|
||||
# Filter out common non-topic headings
|
||||
filtered_headings = [
|
||||
h for h in headings[:15]
|
||||
if h and len(h.strip()) > 3
|
||||
and h.lower() not in ['home', 'about', 'contact', 'menu', 'navigation', 'footer', 'header']
|
||||
]
|
||||
topics.extend(filtered_headings)
|
||||
|
||||
# Method 3: Extract from page titles
|
||||
if isinstance(crawl_result.get('titles'), list):
|
||||
titles = crawl_result['titles']
|
||||
topics.extend([t for t in titles[:10] if t and len(t.strip()) > 3])
|
||||
|
||||
# Method 4: Extract from content sections
|
||||
if isinstance(crawl_result.get('sections'), list):
|
||||
sections = crawl_result['sections']
|
||||
for section in sections[:10]:
|
||||
if isinstance(section, dict):
|
||||
section_title = section.get('title') or section.get('heading')
|
||||
if section_title and len(section_title.strip()) > 3:
|
||||
topics.append(section_title)
|
||||
|
||||
# Method 5: Extract from metadata
|
||||
if isinstance(crawl_result.get('metadata'), dict):
|
||||
meta = crawl_result['metadata']
|
||||
if meta.get('title'):
|
||||
topics.append(meta['title'])
|
||||
if isinstance(meta.get('keywords'), list):
|
||||
topics.extend(meta['keywords'][:5])
|
||||
|
||||
# Remove duplicates and clean
|
||||
unique_topics = []
|
||||
seen = set()
|
||||
for topic in topics:
|
||||
if topic and isinstance(topic, str):
|
||||
cleaned = topic.strip()
|
||||
if cleaned and cleaned.lower() not in seen:
|
||||
seen.add(cleaned.lower())
|
||||
unique_topics.append(cleaned)
|
||||
|
||||
return unique_topics[:15] # Limit to 15 topics
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting topics from crawl_result: {e}")
|
||||
return []
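# Illustrative sketch (the crawl_result shape here is an assumption, not a
# guaranteed schema):
#
#   crawl_result = {
#       "headings": ["AI Automation for SMBs", "Pricing", "Contact"],
#       "metadata": {"title": "Acme AI Blog", "keywords": ["ai automation", "workflows"]},
#   }
#
# Method 2 drops "Contact" as a navigation heading, Method 5 adds the page
# title and metadata keywords, and the deduplicated result is roughly:
#   ["AI Automation for SMBs", "Pricing", "Acme AI Blog", "ai automation", "workflows"]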
|
||||
|
||||
def _extract_keywords_from_crawl(self, crawl_result: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract keywords from crawl_result JSON data.
|
||||
|
||||
Args:
|
||||
crawl_result: Dictionary containing crawled website data
|
||||
|
||||
Returns:
|
||||
List of extracted keywords (max 20)
|
||||
"""
|
||||
keywords = []
|
||||
|
||||
if not crawl_result:
|
||||
return keywords
|
||||
|
||||
try:
|
||||
# Method 1: Direct keywords field
|
||||
if isinstance(crawl_result.get('keywords'), list):
|
||||
keywords.extend(crawl_result['keywords'][:15])
|
||||
|
||||
# Method 2: Extract from metadata keywords
|
||||
if isinstance(crawl_result.get('metadata'), dict):
|
||||
meta = crawl_result['metadata']
|
||||
if isinstance(meta.get('keywords'), list):
|
||||
keywords.extend(meta['keywords'][:10])
|
||||
if meta.get('description'):
|
||||
# Extract potential keywords from description (simple word extraction)
|
||||
desc = meta['description']
|
||||
words = [w.strip() for w in desc.split() if len(w.strip()) > 4]
|
||||
keywords.extend(words[:5])
|
||||
|
||||
# Method 3: Extract from tags
|
||||
if isinstance(crawl_result.get('tags'), list):
|
||||
keywords.extend(crawl_result['tags'][:10])
|
||||
|
||||
# Method 4: Extract from content (simple frequency-based, if available)
|
||||
if isinstance(crawl_result.get('content'), str):
|
||||
content = crawl_result['content']
|
||||
# Simple extraction: words that appear multiple times and are > 4 chars
|
||||
words = content.lower().split()
|
||||
word_freq = {}
|
||||
for word in words:
|
||||
cleaned = ''.join(c for c in word if c.isalnum())
|
||||
if len(cleaned) > 4:
|
||||
word_freq[cleaned] = word_freq.get(cleaned, 0) + 1
|
||||
|
||||
# Get top keywords by frequency
|
||||
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
|
||||
keywords.extend([word for word, freq in sorted_words[:10] if freq > 1])
|
||||
|
||||
# Remove duplicates and clean
|
||||
unique_keywords = []
|
||||
seen = set()
|
||||
for keyword in keywords:
|
||||
if keyword and isinstance(keyword, str):
|
||||
cleaned = keyword.strip().lower()
|
||||
if cleaned and len(cleaned) > 2 and cleaned not in seen:
|
||||
seen.add(cleaned)
|
||||
unique_keywords.append(keyword.strip())
|
||||
|
||||
return unique_keywords[:20] # Limit to 20 keywords
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting keywords from crawl_result: {e}")
|
||||
return []
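# Illustrative sketch of the Method 4 frequency heuristic (hypothetical input):
#
#   crawl_result = {"content": "automation saves time. automation tools scale content workflows"}
#
# Words longer than 4 characters are counted after punctuation is stripped and
# only repeats survive, so "automation" (seen twice) is kept while one-off
# words like "saves", "tools" and "workflows" are not promoted to keywords.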
|
||||
|
||||
def _extract_writing_patterns(self, style_patterns: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract writing patterns from style_patterns JSON data.
|
||||
|
||||
Args:
|
||||
style_patterns: Dictionary containing writing patterns analysis
|
||||
|
||||
Returns:
|
||||
List of extracted patterns (max 10)
|
||||
"""
|
||||
patterns = []
|
||||
|
||||
if not style_patterns:
|
||||
return patterns
|
||||
|
||||
try:
|
||||
# Method 1: Direct patterns field
|
||||
if isinstance(style_patterns.get('patterns'), list):
|
||||
patterns.extend(style_patterns['patterns'][:10])
|
||||
|
||||
# Method 2: Common patterns field
|
||||
if isinstance(style_patterns.get('common_patterns'), list):
|
||||
patterns.extend(style_patterns['common_patterns'][:10])
|
||||
|
||||
# Method 3: Writing patterns field
|
||||
if isinstance(style_patterns.get('writing_patterns'), list):
|
||||
patterns.extend(style_patterns['writing_patterns'][:10])
|
||||
|
||||
# Method 4: Content structure patterns
|
||||
if isinstance(style_patterns.get('content_structure'), dict):
|
||||
structure = style_patterns['content_structure']
|
||||
if isinstance(structure.get('patterns'), list):
|
||||
patterns.extend(structure['patterns'][:5])
|
||||
|
||||
# Method 5: Extract from analysis field
|
||||
if isinstance(style_patterns.get('analysis'), dict):
|
||||
analysis = style_patterns['analysis']
|
||||
if isinstance(analysis.get('identified_patterns'), list):
|
||||
patterns.extend(analysis['identified_patterns'][:10])
|
||||
|
||||
# Normalize patterns (lowercase, remove duplicates)
|
||||
normalized_patterns = []
|
||||
seen = set()
|
||||
for pattern in patterns:
|
||||
if pattern and isinstance(pattern, str):
|
||||
cleaned = pattern.strip().lower().replace('_', '-').replace(' ', '-')
|
||||
if cleaned and cleaned not in seen:
|
||||
seen.add(cleaned)
|
||||
normalized_patterns.append(cleaned)
|
||||
|
||||
return normalized_patterns[:10] # Limit to 10 patterns
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting writing patterns: {e}")
|
||||
return []
|
||||
|
||||
def _extract_style_guidelines(self, style_guidelines: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract style guidelines from style_guidelines JSON data.
|
||||
|
||||
Args:
|
||||
style_guidelines: Dictionary containing generated style guidelines
|
||||
|
||||
Returns:
|
||||
List of extracted guidelines (max 15)
|
||||
"""
|
||||
guidelines = []
|
||||
|
||||
if not style_guidelines:
|
||||
return guidelines
|
||||
|
||||
try:
|
||||
# Method 1: Direct guidelines field
|
||||
if isinstance(style_guidelines.get('guidelines'), list):
|
||||
guidelines.extend(style_guidelines['guidelines'][:15])
|
||||
|
||||
# Method 2: Recommendations field
|
||||
if isinstance(style_guidelines.get('recommendations'), list):
|
||||
guidelines.extend(style_guidelines['recommendations'][:15])
|
||||
|
||||
# Method 3: Best practices field
|
||||
if isinstance(style_guidelines.get('best_practices'), list):
|
||||
guidelines.extend(style_guidelines['best_practices'][:10])
|
||||
|
||||
# Method 4: Tone recommendations
|
||||
if isinstance(style_guidelines.get('tone_recommendations'), list):
|
||||
guidelines.extend(style_guidelines['tone_recommendations'][:5])
|
||||
|
||||
# Method 5: Structure guidelines
|
||||
if isinstance(style_guidelines.get('structure_guidelines'), list):
|
||||
guidelines.extend(style_guidelines['structure_guidelines'][:5])
|
||||
|
||||
# Method 6: Vocabulary suggestions
|
||||
if isinstance(style_guidelines.get('vocabulary_suggestions'), list):
|
||||
guidelines.extend(style_guidelines['vocabulary_suggestions'][:5])
|
||||
|
||||
# Method 7: Engagement tips
|
||||
if isinstance(style_guidelines.get('engagement_tips'), list):
|
||||
guidelines.extend(style_guidelines['engagement_tips'][:5])
|
||||
|
||||
# Method 8: Audience considerations
|
||||
if isinstance(style_guidelines.get('audience_considerations'), list):
|
||||
guidelines.extend(style_guidelines['audience_considerations'][:5])
|
||||
|
||||
# Method 9: SEO optimization (if available)
|
||||
if isinstance(style_guidelines.get('seo_optimization'), list):
|
||||
guidelines.extend(style_guidelines['seo_optimization'][:3])
|
||||
|
||||
# Method 10: Conversion optimization (if available)
|
||||
if isinstance(style_guidelines.get('conversion_optimization'), list):
|
||||
guidelines.extend(style_guidelines['conversion_optimization'][:3])
|
||||
|
||||
# Remove duplicates and clean
|
||||
unique_guidelines = []
|
||||
seen = set()
|
||||
for guideline in guidelines:
|
||||
if guideline and isinstance(guideline, str):
|
||||
cleaned = guideline.strip()
|
||||
# Normalize for comparison (lowercase, remove extra spaces)
|
||||
normalized = ' '.join(cleaned.lower().split())
|
||||
if cleaned and normalized not in seen and len(cleaned) > 5:
|
||||
seen.add(normalized)
|
||||
unique_guidelines.append(cleaned)
|
||||
|
||||
return unique_guidelines[:15] # Limit to 15 guidelines
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting style guidelines: {e}")
|
||||
return []
|
||||
|
||||
def get_json_schema(self) -> Dict[str, Any]:
|
||||
"""Return JSON schema for structured LLM response."""
|
||||
# This will be used with llm_text_gen(json_struct=...)
|
||||
from models.research_persona_models import ResearchPersona, ResearchPreset
|
||||
|
||||
# Convert Pydantic model to JSON schema
|
||||
return ResearchPersona.schema()
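# Note: `.schema()` is the Pydantic v1-style API. If the project is on
# Pydantic v2, the equivalent non-deprecated call would be
# `ResearchPersona.model_json_schema()`.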
194
backend/services/research/research_persona_scheduler.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
Research Persona Scheduler
|
||||
Handles scheduled generation of research personas after onboarding.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
from services.database import get_db_session
|
||||
from services.research.research_persona_service import ResearchPersonaService
|
||||
from models.scheduler_models import SchedulerEventLog
|
||||
|
||||
|
||||
async def generate_research_persona_task(user_id: str):
|
||||
"""
|
||||
Async task function to generate research persona for a user.
|
||||
|
||||
This function is called by the scheduler 20 minutes after onboarding completion.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
"""
|
||||
db = None
|
||||
try:
|
||||
logger.info(f"Scheduled research persona generation started for user {user_id}")
|
||||
|
||||
# Get database session
|
||||
db = get_db_session()
|
||||
if not db:
|
||||
logger.error(f"Failed to get database session for research persona generation (user: {user_id})")
|
||||
return
|
||||
|
||||
# Generate research persona
|
||||
persona_service = ResearchPersonaService(db_session=db)
|
||||
|
||||
# Check if persona already exists to avoid unnecessary API calls
|
||||
persona_data = persona_service._get_persona_data_record(user_id)
|
||||
if persona_data and persona_data.research_persona:
|
||||
logger.info(f"Research persona already exists for user {user_id}, skipping generation")
|
||||
return
|
||||
|
||||
start_time = datetime.utcnow()
|
||||
try:
|
||||
research_persona = persona_service.get_or_generate(user_id, force_refresh=False)
|
||||
execution_time = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
if research_persona:
|
||||
logger.info(f"✅ Scheduled research persona generation completed for user {user_id}")
|
||||
|
||||
# Log success to scheduler event log for dashboard
|
||||
try:
|
||||
event_log = SchedulerEventLog(
|
||||
event_type='job_completed',
|
||||
event_date=start_time,
|
||||
job_id=f"research_persona_{user_id}",
|
||||
job_type='one_time',
|
||||
user_id=user_id,
|
||||
event_data={
|
||||
'job_function': 'generate_research_persona_task',
|
||||
'execution_time_seconds': execution_time,
|
||||
'status': 'success'
|
||||
}
|
||||
)
|
||||
db.add(event_log)
|
||||
db.commit()
|
||||
except Exception as log_error:
|
||||
logger.warning(f"Failed to log persona generation success to scheduler event log: {log_error}")
|
||||
if db:
|
||||
db.rollback()
|
||||
else:
|
||||
error_msg = (
|
||||
f"Scheduled research persona generation FAILED for user {user_id}. "
|
||||
f"Expensive API call was made but generation failed. "
|
||||
f"Will NOT automatically retry to prevent wasteful API calls."
|
||||
)
|
||||
logger.error(f"❌ {error_msg}")
|
||||
|
||||
# Log failure to scheduler event log for dashboard visibility
|
||||
try:
|
||||
event_log = SchedulerEventLog(
|
||||
event_type='job_failed',
|
||||
event_date=start_time,
|
||||
job_id=f"research_persona_{user_id}",
|
||||
job_type='one_time',
|
||||
user_id=user_id,
|
||||
error_message=error_msg,
|
||||
event_data={
|
||||
'job_function': 'generate_research_persona_task',
|
||||
'execution_time_seconds': execution_time,
|
||||
'status': 'failed',
|
||||
'failure_reason': 'generation_returned_none',
|
||||
'expensive_api_call': True
|
||||
}
|
||||
)
|
||||
db.add(event_log)
|
||||
db.commit()
|
||||
except Exception as log_error:
|
||||
logger.warning(f"Failed to log persona generation failure to scheduler event log: {log_error}")
|
||||
if db:
|
||||
db.rollback()
|
||||
|
||||
# DO NOT reschedule - this prevents infinite retry loops
|
||||
# User can manually trigger generation from frontend if needed
|
||||
except Exception as gen_error:
|
||||
execution_time = (datetime.utcnow() - start_time).total_seconds()
|
||||
error_msg = (
|
||||
f"Exception during scheduled research persona generation for user {user_id}: {str(gen_error)}. "
|
||||
f"Expensive API call may have been made. Will NOT automatically retry."
|
||||
)
|
||||
logger.error(f"❌ {error_msg}")
|
||||
|
||||
# Log exception to scheduler event log for dashboard visibility
|
||||
try:
|
||||
event_log = SchedulerEventLog(
|
||||
event_type='job_failed',
|
||||
event_date=start_time,
|
||||
job_id=f"research_persona_{user_id}", # Match scheduled job ID format
|
||||
job_type='one_time',
|
||||
user_id=user_id,
|
||||
error_message=error_msg,
|
||||
event_data={
|
||||
'job_function': 'generate_research_persona_task',
|
||||
'execution_time_seconds': execution_time,
|
||||
'status': 'failed',
|
||||
'failure_reason': 'exception',
|
||||
'exception_type': type(gen_error).__name__,
|
||||
'exception_message': str(gen_error),
|
||||
'expensive_api_call': True
|
||||
}
|
||||
)
|
||||
db.add(event_log)
|
||||
db.commit()
|
||||
except Exception as log_error:
|
||||
logger.warning(f"Failed to log persona generation exception to scheduler event log: {log_error}")
|
||||
if db:
|
||||
db.rollback()
|
||||
|
||||
# DO NOT reschedule - prevent infinite retry loops
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in scheduled research persona generation for user {user_id}: {e}")
|
||||
finally:
|
||||
if db:
|
||||
try:
|
||||
db.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Error closing database session: {e}")
|
||||
|
||||
|
||||
def schedule_research_persona_generation(user_id: str, delay_minutes: int = 20) -> str:
|
||||
"""
|
||||
Schedule research persona generation for a user after a delay.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
delay_minutes: Delay in minutes before generating persona (default: 20)
|
||||
|
||||
Returns:
|
||||
Job ID
|
||||
"""
|
||||
try:
|
||||
from services.scheduler import get_scheduler
|
||||
|
||||
scheduler = get_scheduler()
|
||||
|
||||
# Calculate run date (current time + delay) - ensure UTC timezone-aware
|
||||
run_date = datetime.now(timezone.utc) + timedelta(minutes=delay_minutes)
|
||||
|
||||
# Generate consistent job ID (without timestamp) for proper restoration
|
||||
# This allows restoration to find and restore the job with original scheduled time
|
||||
# Note: Clerk user_id already includes "user_" prefix, so we don't add it again
|
||||
job_id = f"research_persona_{user_id}"
|
||||
|
||||
# Schedule the task
|
||||
scheduled_job_id = scheduler.schedule_one_time_task(
|
||||
func=generate_research_persona_task,
|
||||
run_date=run_date,
|
||||
job_id=job_id,
|
||||
kwargs={"user_id": user_id},
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Scheduled research persona generation for user {user_id} "
|
||||
f"at {run_date} (job_id: {scheduled_job_id})"
|
||||
)
|
||||
|
||||
return scheduled_job_id
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to schedule research persona generation for user {user_id}: {e}")
|
||||
raise
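# Illustrative usage sketch (the call site and variable names are hypothetical,
# not part of this module): after onboarding completes, the API layer can queue
# the delayed job like so.
#
#   from services.research.research_persona_scheduler import (
#       schedule_research_persona_generation,
#   )
#
#   job_id = schedule_research_persona_generation(user_id=clerk_user_id)  # default 20-minute delay
#   logger.info(f"Queued research persona generation job {job_id}")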
421
backend/services/research/research_persona_service.py
Normal file
@@ -0,0 +1,421 @@
|
||||
"""
|
||||
Research Persona Service
|
||||
|
||||
Handles generation, caching, and retrieval of AI-powered research personas.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
from fastapi import HTTPException
|
||||
|
||||
from services.database import get_db_session
|
||||
from models.onboarding import PersonaData, OnboardingSession
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
from services.onboarding.database_service import OnboardingDatabaseService
|
||||
from services.persona_data_service import PersonaDataService
|
||||
|
||||
|
||||
class ResearchPersonaService:
|
||||
"""Service for generating and managing research personas."""
|
||||
|
||||
CACHE_TTL_DAYS = 7 # 7-day cache TTL
|
||||
|
||||
def __init__(self, db_session=None):
|
||||
self.db = db_session or get_db_session()
|
||||
self.prompt_builder = ResearchPersonaPromptBuilder()
|
||||
self.onboarding_service = OnboardingDatabaseService(db=self.db)
|
||||
self.persona_data_service = PersonaDataService(db_session=self.db)
|
||||
|
||||
def get_cached_only(
|
||||
self,
|
||||
user_id: str
|
||||
) -> Optional[ResearchPersona]:
|
||||
"""
|
||||
Get research persona for user ONLY if it exists in cache.
|
||||
This method NEVER generates - it only returns cached personas.
|
||||
Use this for config endpoints to avoid triggering rate limit checks.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
|
||||
Returns:
|
||||
ResearchPersona if cached and valid, None otherwise
|
||||
"""
|
||||
try:
|
||||
# Get persona data record
|
||||
persona_data = self._get_persona_data_record(user_id)
|
||||
|
||||
if not persona_data:
|
||||
logger.debug(f"No persona data found for user {user_id}")
|
||||
return None
|
||||
|
||||
# Only return if cache is valid and persona exists
|
||||
if self.is_cache_valid(persona_data) and persona_data.research_persona:
|
||||
try:
|
||||
logger.debug(f"Returning cached research persona for user {user_id}")
|
||||
return ResearchPersona(**persona_data.research_persona)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse cached research persona: {e}")
|
||||
return None
|
||||
|
||||
# Cache invalid or persona missing - return None (don't generate)
|
||||
logger.debug(f"No valid cached research persona for user {user_id}")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting cached research persona for user {user_id}: {e}")
|
||||
return None
|
||||
|
||||
def get_or_generate(
|
||||
self,
|
||||
user_id: str,
|
||||
force_refresh: bool = False
|
||||
) -> Optional[ResearchPersona]:
|
||||
"""
|
||||
Get research persona for user, generating if missing or expired.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
force_refresh: If True, regenerate even if cache is valid
|
||||
|
||||
Returns:
|
||||
ResearchPersona if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
# Get persona data record
|
||||
persona_data = self._get_persona_data_record(user_id)
|
||||
|
||||
if not persona_data:
|
||||
logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
|
||||
return None
|
||||
|
||||
# Check cache if not forcing refresh
|
||||
if not force_refresh and self.is_cache_valid(persona_data):
|
||||
if persona_data.research_persona:
|
||||
logger.info(f"Using cached research persona for user {user_id}")
|
||||
try:
|
||||
return ResearchPersona(**persona_data.research_persona)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse cached research persona: {e}, regenerating...")
|
||||
# Fall through to regeneration
|
||||
else:
|
||||
logger.info(f"Research persona missing for user {user_id}, generating...")
|
||||
else:
|
||||
if force_refresh:
|
||||
logger.info(f"Forcing refresh of research persona for user {user_id}")
|
||||
else:
|
||||
logger.info(f"Cache expired for user {user_id}, regenerating...")
|
||||
|
||||
# Generate new research persona
|
||||
try:
|
||||
research_persona = self.generate_research_persona(user_id)
|
||||
except HTTPException:
|
||||
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
|
||||
raise
|
||||
|
||||
if research_persona:
|
||||
# Save to database
|
||||
if self.save_research_persona(user_id, research_persona):
|
||||
logger.info(f"✅ Research persona generated and saved for user {user_id}")
|
||||
else:
|
||||
logger.warning(f"Failed to save research persona for user {user_id}")
|
||||
|
||||
return research_persona
|
||||
else:
|
||||
# Log detailed error for debugging expensive failures
|
||||
logger.error(
|
||||
f"❌ Failed to generate research persona for user {user_id} - "
|
||||
f"This is an expensive failure (API call consumed). Check logs above for details."
|
||||
)
|
||||
# Don't return None silently - let the caller know this failed
|
||||
return None
|
||||
|
||||
except HTTPException:
|
||||
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting/generating research persona for user {user_id}: {e}")
|
||||
return None
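# Illustrative usage sketch (hypothetical caller; `db` and the error handling
# are assumptions): a request handler can reuse the cached persona or force a
# rebuild.
#
#   service = ResearchPersonaService(db_session=db)
#   persona = service.get_or_generate(user_id, force_refresh=False)
#   if persona is None:
#       # Onboarding data was missing or generation failed; the caller decides
#       # whether to surface an error or fall back to non-personalized defaults.
#       ...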
|
||||
|
||||
def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
|
||||
"""
|
||||
Generate a new research persona for the user.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
|
||||
Returns:
|
||||
ResearchPersona if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Generating research persona for user {user_id}")
|
||||
|
||||
# Collect onboarding data
|
||||
onboarding_data = self._collect_onboarding_data(user_id)
|
||||
|
||||
if not onboarding_data:
|
||||
logger.warning(f"Insufficient onboarding data for user {user_id}")
|
||||
return None
|
||||
|
||||
# Build prompt
|
||||
prompt = self.prompt_builder.build_research_persona_prompt(onboarding_data)
|
||||
|
||||
# Get JSON schema for structured response
|
||||
json_schema = self.prompt_builder.get_json_schema()
|
||||
|
||||
# Call LLM with structured JSON response
|
||||
logger.info(f"Calling LLM for research persona generation (user: {user_id})")
|
||||
try:
|
||||
response_text = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=json_schema,
|
||||
user_id=user_id
|
||||
)
|
||||
except HTTPException:
|
||||
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
|
||||
logger.warning(f"HTTPException during LLM call for user {user_id} - re-raising")
|
||||
raise
|
||||
except RuntimeError as e:
|
||||
# Re-raise RuntimeError (subscription limits) as HTTPException
|
||||
logger.warning(f"RuntimeError during LLM call for user {user_id}: {e}")
|
||||
raise HTTPException(status_code=429, detail=str(e))
|
||||
|
||||
if not response_text:
|
||||
logger.error("Empty response from LLM")
|
||||
return None
|
||||
|
||||
# Parse JSON response
|
||||
import json
|
||||
try:
|
||||
# When json_struct is provided, llm_text_gen may return a dict directly
|
||||
if isinstance(response_text, dict):
|
||||
# Already parsed, use directly
|
||||
persona_dict = response_text
|
||||
elif isinstance(response_text, str):
|
||||
# Handle case where LLM returns markdown-wrapped JSON or plain JSON string
|
||||
response_text = response_text.strip()
|
||||
if response_text.startswith("```json"):
|
||||
response_text = response_text[7:]
|
||||
if response_text.startswith("```"):
|
||||
response_text = response_text[3:]
|
||||
if response_text.endswith("```"):
|
||||
response_text = response_text[:-3]
|
||||
response_text = response_text.strip()
|
||||
|
||||
persona_dict = json.loads(response_text)
|
||||
else:
|
||||
logger.error(f"Unexpected response type from LLM: {type(response_text)}")
|
||||
return None
|
||||
|
||||
# Add generated_at timestamp
|
||||
persona_dict["generated_at"] = datetime.utcnow().isoformat()
|
||||
|
||||
# Validate and create ResearchPersona
|
||||
# Log the dict structure for debugging if validation fails
|
||||
try:
|
||||
research_persona = ResearchPersona(**persona_dict)
|
||||
logger.info(f"✅ Research persona generated successfully for user {user_id}")
|
||||
return research_persona
|
||||
except Exception as validation_error:
|
||||
logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
|
||||
logger.debug(f"Persona dict keys: {list(persona_dict.keys()) if isinstance(persona_dict, dict) else 'Not a dict'}")
|
||||
logger.debug(f"Persona dict sample: {str(persona_dict)[:500]}")
|
||||
# Re-raise to be caught by outer exception handler
|
||||
raise
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse LLM response as JSON: {e}")
|
||||
logger.debug(f"Response text: {response_text[:500] if isinstance(response_text, str) else str(response_text)[:500]}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create ResearchPersona from response: {e}")
|
||||
return None
|
||||
|
||||
except HTTPException:
|
||||
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating research persona for user {user_id}: {e}")
|
||||
return None
|
||||
|
||||
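As an aside, the fence-stripping parse used above can be exercised on its own. A minimal standalone sketch (the helper name parse_llm_json is illustrative, not part of this service):

import json

def parse_llm_json(response):
    """Accept a dict, a plain JSON string, or a ```json-fenced string and return a dict."""
    if isinstance(response, dict):
        return response
    text = response.strip()
    if text.startswith("```json"):
        text = text[7:]
    if text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return json.loads(text.strip())

# Example: parse_llm_json('```json\n{"industry": "SaaS"}\n```') returns {'industry': 'SaaS'}
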
def is_cache_valid(self, persona_data: PersonaData) -> bool:
"""
Check if cached research persona is still valid (within TTL).

Args:
persona_data: PersonaData database record

Returns:
True if cache is valid, False otherwise
"""
if not persona_data.research_persona_generated_at:
return False

# Check if within TTL
cache_age = datetime.utcnow() - persona_data.research_persona_generated_at
is_valid = cache_age < timedelta(days=self.CACHE_TTL_DAYS)

if not is_valid:
logger.debug(f"Cache expired (age: {cache_age.days} days, TTL: {self.CACHE_TTL_DAYS} days)")

return is_valid

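The cache check boils down to a single timedelta comparison. A minimal sketch, assuming a 30-day TTL (the real value comes from CACHE_TTL_DAYS on the service class):

from datetime import datetime, timedelta

CACHE_TTL_DAYS = 30  # assumed value for illustration

generated_at = datetime.utcnow() - timedelta(days=12)
cache_age = datetime.utcnow() - generated_at
print(cache_age < timedelta(days=CACHE_TTL_DAYS))  # True: a 12-day-old persona is still within TTL
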
def save_research_persona(
self,
user_id: str,
research_persona: ResearchPersona
) -> bool:
"""
Save research persona to database.

Args:
user_id: User ID (Clerk string)
research_persona: ResearchPersona to save

Returns:
True if successful, False otherwise
"""
try:
persona_data = self._get_persona_data_record(user_id)

if not persona_data:
logger.error(f"No persona data record found for user {user_id}")
return False

# Convert ResearchPersona to dict for JSON storage
persona_dict = research_persona.dict()

# Update database record
persona_data.research_persona = persona_dict
persona_data.research_persona_generated_at = datetime.utcnow()

self.db.commit()

logger.info(f"✅ Research persona saved for user {user_id}")
return True

except Exception as e:
logger.error(f"Error saving research persona for user {user_id}: {e}")
self.db.rollback()
return False

def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
"""Get PersonaData database record for user."""
try:
# Ensure research_persona columns exist before querying
self.onboarding_service._ensure_research_persona_columns(self.db)

# Get onboarding session
session = self.db.query(OnboardingSession).filter(
OnboardingSession.user_id == user_id
).first()

if not session:
return None

# Get persona data
persona_data = self.db.query(PersonaData).filter(
PersonaData.session_id == session.id
).first()

return persona_data

except Exception as e:
logger.error(f"Error getting persona data record for user {user_id}: {e}")
return None

def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
"""
Collect all onboarding data needed for research persona generation.

Returns:
Dictionary with website_analysis, persona_data, research_preferences,
business_info, and competitor_analysis (when available), or None if there
is not enough onboarding data to work with.
"""
try:
# Get website analysis
website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}

# Get persona data
persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}

# Get research preferences
research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}

# Get business info - construct from persona data and website analysis
business_info = {}

# Try to extract from persona data
if persona_data_dict:
core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
if core_persona:
if core_persona.get('industry'):
business_info['industry'] = core_persona['industry']
if core_persona.get('target_audience'):
business_info['target_audience'] = core_persona['target_audience']

# Fall back to website analysis if the persona has no industry
if not business_info.get('industry') and website_analysis:
target_audience_data = website_analysis.get('target_audience', {})
if isinstance(target_audience_data, dict):
industry_focus = target_audience_data.get('industry_focus')
if industry_focus:
business_info['industry'] = industry_focus
demographics = target_audience_data.get('demographics')
if demographics:
business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)

# Check whether we have enough data - be lenient, since a persona can be
# inferred from minimal signals, but require at least one basic source
has_basic_data = bool(
website_analysis or
persona_data_dict or
research_prefs.get('content_types') or
business_info.get('industry')
)

if not has_basic_data:
logger.warning(f"Insufficient onboarding data for user {user_id} - no basic data found")
return None

# If we only have minimal data, add intelligent defaults to help the AI
if not business_info.get('industry'):
# Try to infer industry from research preferences or content types
content_types = research_prefs.get('content_types', [])
if 'blog' in content_types or 'article' in content_types:
business_info['industry'] = 'Content Marketing'
business_info['inferred'] = True
elif 'social_media' in content_types:
business_info['industry'] = 'Social Media Marketing'
business_info['inferred'] = True
elif 'video' in content_types:
business_info['industry'] = 'Video Content Creation'
business_info['inferred'] = True

if not business_info.get('target_audience'):
# Default to professionals for content creators
business_info['target_audience'] = 'Professionals and content consumers'
business_info['inferred'] = True

# Get competitor analysis data (if available)
competitor_analysis = None
try:
competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
if competitor_analysis:
logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
except Exception as e:
logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")

return {
"website_analysis": website_analysis,
"persona_data": persona_data_dict,
"research_preferences": research_prefs,
"business_info": business_info,
"competitor_analysis": competitor_analysis  # Competitor data improves preset generation
}

except Exception as e:
logger.error(f"Error collecting onboarding data for user {user_id}: {e}")
return None

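Taken together, these methods support a get-or-generate flow: reuse a cached persona while the TTL holds, otherwise regenerate and persist. A hedged sketch (the service wiring is illustrative; the actual class is defined earlier in this file):

def get_or_generate(service, user_id: str):
    """Return a cached ResearchPersona when still valid, else generate and save a new one."""
    record = service._get_persona_data_record(user_id)
    if record and record.research_persona and service.is_cache_valid(record):
        return ResearchPersona(**record.research_persona)  # cache hit, no LLM call
    persona = service.generate_research_persona(user_id)  # expensive LLM call
    if persona:
        service.save_research_persona(user_id, persona)
    return persona
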
425
backend/services/research/tavily_service.py
Normal file
@@ -0,0 +1,425 @@
"""
Tavily API Service for ALwrity

This service provides web search and research capabilities using the Tavily API,
which offers AI-powered search with real-time information retrieval.

Key Features:
- Web search with AI-powered results
- Content extraction and summarization
- Real-time information retrieval
- Topic-based search (general, news, finance)
- Advanced search depth options
- Cost-effective API usage with caching

Dependencies:
- aiohttp (for async HTTP requests)
- os (for environment variables)
- loguru (for logging)

Author: ALwrity Team
Version: 1.0
Last Updated: January 2025
"""

import os
import json
import asyncio
import aiohttp
from typing import Dict, List, Optional, Any, Union
from datetime import datetime, timedelta
from loguru import logger
from urllib.parse import urlparse


class TavilyService:
"""
Service for web search and research using the Tavily API.

This service provides AI-powered search capabilities to find relevant
content and information for research purposes.
"""

def __init__(self):
"""Initialize the Tavily Service with API credentials."""
self.api_key = os.getenv("TAVILY_API_KEY")
self.base_url = "https://api.tavily.com"
self.enabled = False

# Don't assume the key is available at import time in production.
# Keys may be injected per-request via middleware, so defer init.
self._try_initialize()

def _try_initialize(self) -> None:
"""Attempt to (re)initialize the Tavily service from the current environment."""
if self.enabled and self.api_key:
return
try:
self.api_key = os.getenv("TAVILY_API_KEY")
if not self.api_key:
# Leave disabled; caller may try again after middleware injection
logger.warning("TAVILY_API_KEY not configured; Tavily service will be disabled")
self.enabled = False
return
self.enabled = True
logger.info("Tavily Service initialized successfully")
except Exception as e:
logger.error(f"Failed to initialize Tavily service: {e}")
self.enabled = False

async def search(
self,
query: str,
topic: str = "general",
search_depth: str = "basic",
max_results: int = 10,
include_domains: Optional[List[str]] = None,
exclude_domains: Optional[List[str]] = None,
include_answer: Union[bool, str] = False,
include_raw_content: Union[bool, str] = False,
include_images: bool = False,
include_image_descriptions: bool = False,
include_favicon: bool = False,
time_range: Optional[str] = None,
start_date: Optional[str] = None,
end_date: Optional[str] = None,
country: Optional[str] = None,
chunks_per_source: int = 3,
auto_parameters: bool = False
) -> Dict[str, Any]:
"""
Execute a search query using the Tavily API.

Args:
query: The search query to execute
topic: Category of search (general, news, finance)
search_depth: Depth of search (basic, advanced) - basic costs 1 credit, advanced costs 2
max_results: Maximum number of results to return (0-20)
include_domains: List of domains to specifically include
exclude_domains: List of domains to specifically exclude
include_answer: Include LLM-generated answer (basic/advanced/true/false)
include_raw_content: Include raw page content (markdown/text/true/false)
include_images: Include image search results
include_image_descriptions: Include image descriptions
include_favicon: Include favicon URLs
time_range: Time range filter (day, week, month, year, d, w, m, y)
start_date: Start date filter (YYYY-MM-DD)
end_date: End date filter (YYYY-MM-DD)
country: Country filter (boost results from a specific country)
chunks_per_source: Maximum chunks per source (1-3, only for advanced search)
auto_parameters: Auto-configure parameters based on the query

Returns:
Dictionary containing search results
"""
try:
# Ensure we pick up any per-request injected key
self._try_initialize()
if not self.enabled:
raise ValueError("Tavily Service is not enabled - API key missing")

logger.info(f"Starting Tavily search for: {query}")

# Build request payload
payload = {
"api_key": self.api_key,
"query": query,
"topic": topic,
"search_depth": search_depth,
"max_results": min(max_results, 20),  # Tavily limit
"include_favicon": include_favicon
}

# Add optional parameters
if include_domains:
payload["include_domains"] = include_domains[:300]  # Tavily limit

if exclude_domains:
payload["exclude_domains"] = exclude_domains[:150]  # Tavily limit

if include_answer:
payload["include_answer"] = include_answer

if include_raw_content:
payload["include_raw_content"] = include_raw_content

if include_images:
payload["include_images"] = include_images
if include_image_descriptions:
payload["include_image_descriptions"] = include_image_descriptions

if time_range:
payload["time_range"] = time_range

if start_date:
payload["start_date"] = start_date

if end_date:
payload["end_date"] = end_date

if country and topic == "general":
payload["country"] = country

if search_depth == "advanced" and 1 <= chunks_per_source <= 3:
payload["chunks_per_source"] = chunks_per_source

if auto_parameters:
payload["auto_parameters"] = True

# Make API request
async with aiohttp.ClientSession() as session:
async with session.post(
f"{self.base_url}/search",
json=payload,
headers={"Content-Type": "application/json"},
timeout=aiohttp.ClientTimeout(total=60)
) as response:
if response.status == 200:
result = await response.json()
logger.info(f"Tavily search completed successfully. Found {len(result.get('results', []))} results.")

# Process and structure results
processed_results = self._process_search_results(result, query)

return {
"success": True,
"query": result.get("query", query),
"answer": result.get("answer"),  # If include_answer was requested
"results": processed_results,
"images": result.get("images", []),
"response_time": result.get("response_time"),
"request_id": result.get("request_id"),
"auto_parameters": result.get("auto_parameters"),
"total_results": len(processed_results),
"timestamp": datetime.utcnow().isoformat()
}
else:
error_text = await response.text()
logger.error(f"Tavily API error: {response.status} - {error_text}")
raise RuntimeError(f"Tavily API error: {response.status} - {error_text}")

except asyncio.TimeoutError:
# aiohttp.ClientTimeout is a configuration object, not an exception;
# total-timeout expiry surfaces as asyncio.TimeoutError.
logger.error("Tavily API request timed out")
return {
"success": False,
"error": "Request timed out",
"details": "The search request took too long to complete"
}
except Exception as e:
logger.error(f"Error in Tavily search: {str(e)}")
return {
"success": False,
"error": str(e),
"details": "An unexpected error occurred during search"
}

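For callers, a hedged usage sketch of search() (the query and parameter values are illustrative; requires TAVILY_API_KEY in the environment):

import asyncio

async def demo_search():
    service = TavilyService()
    result = await service.search(
        query="AI content marketing trends 2025",
        search_depth="basic",   # 1 credit; "advanced" costs 2
        max_results=5,
        include_answer="basic",
        time_range="month",
    )
    if result.get("success"):
        for item in result["results"]:
            print(item["relevance_score"], item["title"], item["url"])
    else:
        print("Search failed:", result.get("error"))

# asyncio.run(demo_search())
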
def _process_search_results(self, api_response: Dict[str, Any], query: str) -> List[Dict[str, Any]]:
"""
Process and structure Tavily API response into standardized format.

Args:
api_response: Raw response from Tavily API
query: Original search query

Returns:
List of processed search results
"""
results = []
raw_results = api_response.get("results", [])

for result in raw_results:
try:
# Extract domain from URL
url = result.get("url", "")
domain = urlparse(url).netloc if url else ""

# Relevance score (Tavily provides a score field)
relevance_score = result.get("score", 0.5)

processed_result = {
"url": url,
"domain": domain,
"title": result.get("title", ""),
"content": result.get("content", ""),
"raw_content": result.get("raw_content"),  # If include_raw_content was requested
"score": relevance_score,
"relevance_score": relevance_score,  # Alias for compatibility
"favicon": result.get("favicon"),
"published_date": result.get("published_date"),
}

results.append(processed_result)

except Exception as e:
logger.warning(f"Error processing Tavily result: {str(e)}")
continue

# Sort by relevance score (highest first)
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)

return results

async def search_industry_trends(
self,
topic: str,
industry: str,
max_results: int = 10,
search_depth: str = "basic"
) -> Dict[str, Any]:
"""
Search for current industry trends and insights.

Args:
topic: The specific topic to research
industry: The industry context for the search
max_results: Maximum number of search results to return
search_depth: Depth of search (basic or advanced)

Returns:
Dictionary containing search results with industry context
"""
# Build industry-specific query
search_query = f"{topic} {industry} trends insights"

# Use news topic for current trends
return await self.search(
query=search_query,
topic="news" if search_depth == "basic" else "general",
search_depth=search_depth,
max_results=max_results,
include_answer="basic",
include_favicon=True,
time_range="month"  # Last month for current trends
)

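The trend query is a plain string composition; a tiny worked example (values illustrative):

topic, industry = "AI chatbots", "healthcare"
search_query = f"{topic} {industry} trends insights"
print(search_query)  # "AI chatbots healthcare trends insights"
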
async def discover_competitors(
self,
user_url: str,
num_results: int = 10,
include_domains: Optional[List[str]] = None,
exclude_domains: Optional[List[str]] = None,
industry_context: Optional[str] = None,
website_analysis_data: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
Discover competitors for a given website using Tavily search.

Args:
user_url: The website URL to find competitors for
num_results: Number of competitor results to return
include_domains: List of domains to include in search
exclude_domains: List of domains to exclude from search
industry_context: Industry context for better competitor discovery
website_analysis_data: Optional website analysis output used to enrich the query

Returns:
Dictionary containing competitor analysis results
"""
try:
# Ensure we pick up any per-request injected key
self._try_initialize()
if not self.enabled:
raise ValueError("Tavily Service is not enabled - API key missing")

logger.info(f"Starting competitor discovery for: {user_url}")

# Extract user domain for exclusion
user_domain = urlparse(user_url).netloc
exclude_domains_list = list(exclude_domains) if exclude_domains else []  # copy so the caller's list is not mutated
exclude_domains_list.append(user_domain)

# Build search query
query_parts = ["similar websites", "competitors"]
if industry_context:
query_parts.append(f"in {industry_context}")

# Extract insights from website analysis if available
if website_analysis_data:
analysis = website_analysis_data.get('analysis', {})
if 'target_audience' in analysis:
audience = analysis['target_audience']
if isinstance(audience, dict) and 'primary_audience' in audience:
query_parts.append(audience['primary_audience'])

search_query = " ".join(query_parts)

# Perform search
search_result = await self.search(
query=search_query,
topic="general",
search_depth="advanced",  # Use advanced for better competitor discovery
max_results=num_results,
include_domains=include_domains,
exclude_domains=exclude_domains_list,
include_favicon=True,
chunks_per_source=3
)

if not search_result.get("success"):
return search_result

# Process results into competitor format
competitors = []
for result in search_result.get("results", []):
competitor_data = {
"url": result.get("url"),
"domain": result.get("domain"),
"title": result.get("title"),
"summary": result.get("content", ""),
"relevance_score": result.get("relevance_score", 0.5),
"favicon": result.get("favicon"),
"published_date": result.get("published_date"),
"highlights": self._extract_highlights(result.get("content", "")),
"competitive_insights": self._extract_competitive_insights(result),
"content_insights": self._analyze_content_quality(result)
}
competitors.append(competitor_data)

logger.info(f"Successfully discovered {len(competitors)} competitors for {user_url}")

return {
"success": True,
"user_url": user_url,
"competitors": competitors,
"total_competitors": len(competitors),
"analysis_timestamp": datetime.utcnow().isoformat(),
"industry_context": industry_context,
"request_id": search_result.get("request_id")
}

except Exception as e:
logger.error(f"Error in competitor discovery: {str(e)}")
return {
"success": False,
"error": str(e),
"details": "An unexpected error occurred during competitor discovery"
}

def _extract_highlights(self, content: str, num_sentences: int = 3) -> List[str]:
"""Extract key highlights from content."""
if not content:
return []

# Simple sentence extraction (can be enhanced with NLP)
sentences = [s.strip() for s in content.split('.') if s.strip()]
return sentences[:num_sentences]

def _extract_competitive_insights(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""Extract competitive insights from a search result.

Currently a placeholder: content and title are captured for future
heuristics, but static defaults are returned.
"""
content = result.get("content", "")
title = result.get("title", "")

return {
"business_model": "unknown",
"target_audience": "unknown",
"key_differentiators": []
}

def _analyze_content_quality(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze content quality metrics (placeholder defaults for now)."""
content = result.get("content", "")

return {
"content_focus": "general",
"content_quality": "medium",
"publishing_frequency": "unknown"
}

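A hedged end-to-end sketch of competitor discovery (the URL and industry context are illustrative):

import asyncio

async def demo_discover():
    service = TavilyService()
    report = await service.discover_competitors(
        user_url="https://example.com",
        num_results=5,
        industry_context="B2B SaaS marketing",
    )
    if report.get("success"):
        for competitor in report["competitors"]:
            print(competitor["domain"], "-", competitor["title"])

# asyncio.run(demo_discover())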