Added video studio router and endpoints. Added research router and endpoints. Added youtube router and endpoints. Added onboarding utils router and endpoints. Added onboarding utils service. Added onboarding utils models. Added onboarding utils routes. Added onboarding utils utils.
This commit is contained in:
@@ -7,20 +7,49 @@ replacing mock research with real-time industry information.
|
||||
Available Services:
|
||||
- GoogleSearchService: Real-time industry research using Google Custom Search API
|
||||
- ExaService: Competitor discovery and analysis using Exa API
|
||||
- TavilyService: AI-powered web search with real-time information
|
||||
- Source ranking and credibility assessment
|
||||
- Content extraction and insight generation
|
||||
|
||||
Core Module (v2.0):
|
||||
- ResearchEngine: Standalone AI research engine for any content tool
|
||||
- ResearchContext: Unified input schema for research requests
|
||||
- ParameterOptimizer: AI-driven parameter optimization
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
Last Updated: January 2025
|
||||
Version: 2.0
|
||||
Last Updated: December 2025
|
||||
"""
|
||||
|
||||
from .google_search_service import GoogleSearchService
|
||||
from .exa_service import ExaService
|
||||
from .tavily_service import TavilyService
|
||||
|
||||
# Core Research Engine (v2.0)
|
||||
from .core import (
|
||||
ResearchEngine,
|
||||
ResearchContext,
|
||||
ResearchPersonalizationContext,
|
||||
ContentType,
|
||||
ResearchGoal,
|
||||
ResearchDepth,
|
||||
ProviderPreference,
|
||||
ParameterOptimizer,
|
||||
)
|
||||
|
||||
# Public API of the research services package. Every name listed here is
# imported above from the provider service modules or from `.core`.
__all__ = [
    # Legacy services (still used by blog writer)
    "GoogleSearchService",
    "ExaService",
    "TavilyService",

    # Core Research Engine (v2.0)
    "ResearchEngine",
    "ResearchContext",
    "ResearchPersonalizationContext",
    "ContentType",
    "ResearchGoal",
    "ResearchDepth",
    "ProviderPreference",
    "ParameterOptimizer",
]
|
||||
|
||||
51
backend/services/research/core/__init__.py
Normal file
51
backend/services/research/core/__init__.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""
|
||||
Research Engine Core Module
|
||||
|
||||
This is the standalone AI Research Engine that can be imported by
|
||||
Blog Writer, Podcast Maker, YouTube Creator, and other ALwrity tools.
|
||||
|
||||
Design Goals:
|
||||
- Tool-agnostic: Any content tool can import and use this
|
||||
- AI-driven parameter optimization: Users don't need to understand Exa/Tavily internals
|
||||
- Provider priority: Exa → Tavily → Google (fallback)
|
||||
- Personalization-aware: Accepts context from calling tools
|
||||
- Advanced by default: Prioritizes quality over speed
|
||||
|
||||
Usage:
|
||||
    from services.research.core import (
        ResearchEngine,
        ResearchContext,
        ResearchPersonalizationContext,
        ContentType,
    )

    engine = ResearchEngine()
    result = await engine.research(ResearchContext(
        query="AI trends in healthcare 2025",
        personalization=ResearchPersonalizationContext(
            content_type=ContentType.BLOG,
            industry="Healthcare",
            target_audience="Medical professionals",
        ),
    ))
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 2.0
|
||||
Last Updated: December 2025
|
||||
"""
|
||||
|
||||
from .research_context import (
|
||||
ResearchContext,
|
||||
ResearchPersonalizationContext,
|
||||
ContentType,
|
||||
ResearchGoal,
|
||||
ResearchDepth,
|
||||
ProviderPreference,
|
||||
)
|
||||
from .parameter_optimizer import ParameterOptimizer
|
||||
from .research_engine import ResearchEngine
|
||||
|
||||
# Public API of the research core package. Every name listed here is imported
# above from `.research_context`, `.parameter_optimizer` or `.research_engine`.
__all__ = [
    # Context schemas
    "ResearchContext",
    "ResearchPersonalizationContext",
    "ContentType",
    "ResearchGoal",
    "ResearchDepth",
    "ProviderPreference",
    # Core classes
    "ParameterOptimizer",
    "ResearchEngine",
]
|
||||
384
backend/services/research/core/parameter_optimizer.py
Normal file
384
backend/services/research/core/parameter_optimizer.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""
|
||||
AI Parameter Optimizer for Research Engine
|
||||
|
||||
Uses AI to analyze the research query and context to select optimal
|
||||
parameters for Exa and Tavily APIs. This abstracts the complexity
|
||||
from non-technical users.
|
||||
|
||||
Key Decisions:
|
||||
- Provider selection (Exa vs Tavily vs Google)
|
||||
- Search type (neural vs keyword)
|
||||
- Category/topic selection
|
||||
- Depth and result limits
|
||||
- Domain filtering
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 2.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, Any, Optional, Tuple
|
||||
from loguru import logger
|
||||
|
||||
from .research_context import (
|
||||
ResearchContext,
|
||||
ResearchGoal,
|
||||
ResearchDepth,
|
||||
ProviderPreference,
|
||||
ContentType,
|
||||
)
|
||||
from models.blog_models import ResearchConfig, ResearchProvider, ResearchMode
|
||||
|
||||
|
||||
class ParameterOptimizer:
    """
    Choose a research provider and build its ResearchConfig from a ResearchContext.

    The logic in this class is rule-based: the query text is classified with
    regular expressions and keyword checks (no model call is made here), a
    provider is picked in the order Exa -> Tavily -> Google subject to API-key
    availability and the caller's preference, and provider-specific fields of
    ``ResearchConfig`` are filled in so callers do not need to know the
    Exa/Tavily parameters.

    NOTE(review): ``context.goal`` is not read by any method of this class.
    """

    # Query patterns for intelligent routing. A query belongs to a class when
    # ANY pattern in the corresponding list matches.
    # NOTE(review): the years in the first pattern are hard-coded and will go stale.
    TRENDING_PATTERNS = [
        r'\b(latest|recent|new|2024|2025|current|trending|news)\b',
        r'\b(update|announcement|launch|release)\b',
    ]

    TECHNICAL_PATTERNS = [
        r'\b(api|sdk|framework|library|implementation|architecture)\b',
        r'\b(code|programming|developer|technical|engineering)\b',
    ]

    COMPETITIVE_PATTERNS = [
        r'\b(competitor|alternative|vs|versus|compare|comparison)\b',
        r'\b(market|industry|landscape|players)\b',
    ]

    FACTUAL_PATTERNS = [
        r'\b(statistics|data|research|study|report|survey)\b',
        r'\b(percent|percentage|number|figure|metric)\b',
    ]

    # Exa category mapping based on query analysis (short key -> Exa category string)
    EXA_CATEGORY_MAP = {
        'research': 'research paper',
        'news': 'news',
        'company': 'company',
        'personal': 'personal site',
        'github': 'github',
        'linkedin': 'linkedin profile',
        'finance': 'financial report',
    }

    # Tavily topic mapping
    # NOTE(review): not referenced by any method of this class; the Tavily topic
    # is derived from the query text in _analyze_query. Kept as a public
    # attribute in case other modules read it.
    TAVILY_TOPIC_MAP = {
        ResearchGoal.TRENDING: 'news',
        ResearchGoal.FACTUAL: 'general',
        ResearchGoal.COMPETITIVE: 'general',
        ResearchGoal.TECHNICAL: 'general',
        ResearchGoal.EDUCATIONAL: 'general',
        ResearchGoal.INSPIRATIONAL: 'general',
    }

    # ResearchDepth -> ResearchMode. Shared by _build_config and
    # _build_advanced_config so the two code paths cannot drift apart.
    _DEPTH_TO_MODE = {
        ResearchDepth.QUICK: ResearchMode.BASIC,
        ResearchDepth.STANDARD: ResearchMode.BASIC,
        ResearchDepth.COMPREHENSIVE: ResearchMode.COMPREHENSIVE,
        ResearchDepth.EXPERT: ResearchMode.COMPREHENSIVE,
    }

    # Depths that switch on the more expensive provider options.
    _DEEP_DEPTHS = (ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT)

    # Tavily limits on the number of domains per filter list.
    _TAVILY_MAX_INCLUDE_DOMAINS = 300
    _TAVILY_MAX_EXCLUDE_DOMAINS = 150

    def __init__(self):
        """Record provider availability from EXA_API_KEY / TAVILY_API_KEY."""
        # Availability is sampled once, at construction time.
        self.exa_available = bool(os.getenv("EXA_API_KEY"))
        self.tavily_available = bool(os.getenv("TAVILY_API_KEY"))
        logger.info(f"ParameterOptimizer initialized: exa={self.exa_available}, tavily={self.tavily_available}")

    def optimize(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Analyze research context and return optimized provider and config.

        Args:
            context: The research context from the calling tool

        Returns:
            Tuple of (selected_provider, optimized_config)
        """
        # If advanced mode, use raw parameters and skip query analysis entirely
        if context.advanced_mode:
            return self._build_advanced_config(context)

        # Analyze query to determine optimal approach
        query_analysis = self._analyze_query(context.query)

        # Select provider based on analysis and preferences
        provider = self._select_provider(context, query_analysis)

        # Build optimized config for selected provider
        config = self._build_config(context, provider, query_analysis)

        logger.info(f"Optimized research: provider={provider.value}, mode={config.mode.value}")

        return provider, config

    def _analyze_query(self, query: str) -> Dict[str, Any]:
        """
        Analyze the query to understand intent and optimal approach.

        Returns dict with:
        - is_trending: Query is about recent/current events
        - is_technical: Query is technical in nature
        - is_competitive: Query is about competition/comparison
        - is_factual: Query needs data/statistics
        - suggested_category: Exa category if applicable, else None
        - suggested_topic: Tavily topic ('news', 'finance' or 'general')
        - suggested_search_type: Exa search type ('neural', 'keyword' or 'auto')
        """
        query_lower = query.lower()

        def mentions(*keywords: str) -> bool:
            # NOTE(review): plain substring test, so 'paper' also fires on
            # 'newspaper', 'company' on 'accompany', 'stock' on 'livestock'.
            return any(keyword in query_lower for keyword in keywords)

        is_trending = self._matches_patterns(query_lower, self.TRENDING_PATTERNS)
        is_technical = self._matches_patterns(query_lower, self.TECHNICAL_PATTERNS)
        is_competitive = self._matches_patterns(query_lower, self.COMPETITIVE_PATTERNS)
        is_factual = self._matches_patterns(query_lower, self.FACTUAL_PATTERNS)

        # Determine Exa category; the first matching rule wins, so explicit
        # keywords take precedence over the generic "trending -> news" rule.
        categories = self.EXA_CATEGORY_MAP
        if mentions('research', 'study', 'paper'):
            suggested_category = categories['research']
        elif mentions('github', 'repository'):
            suggested_category = categories['github']
        elif mentions('linkedin', 'professional'):
            suggested_category = categories['linkedin']
        elif is_trending:
            suggested_category = categories['news']
        elif mentions('company', 'startup'):
            suggested_category = categories['company']
        else:
            suggested_category = None

        # Determine Tavily topic
        if is_trending:
            suggested_topic = 'news'
        elif mentions('finance', 'stock', 'investment'):
            suggested_topic = 'finance'
        else:
            suggested_topic = 'general'

        # Determine search type
        if is_technical or is_factual:
            suggested_search_type = 'neural'  # Better for semantic understanding
        elif is_trending:
            suggested_search_type = 'keyword'  # Better for current events
        else:
            suggested_search_type = 'auto'

        return {
            'is_trending': is_trending,
            'is_technical': is_technical,
            'is_competitive': is_competitive,
            'is_factual': is_factual,
            'suggested_category': suggested_category,
            'suggested_topic': suggested_topic,
            'suggested_search_type': suggested_search_type,
        }

    def _matches_patterns(self, text: str, patterns: list) -> bool:
        """Return True if any regex in ``patterns`` is found in ``text`` (case-insensitive)."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def _select_provider(self, context: ResearchContext, analysis: Dict[str, Any]) -> ResearchProvider:
        """
        Select the provider based on the caller's preference and key availability.

        An explicit EXA/TAVILY preference is honoured when that provider's API
        key is configured; GOOGLE is always honoured. Otherwise the priority is
        Exa -> Tavily -> Google for ALL modes (including basic).

        ``analysis`` is only used for log output; it does not influence the choice.
        """
        preference = context.provider_preference

        # If user explicitly requested a provider, respect that
        if preference == ProviderPreference.EXA:
            if self.exa_available:
                return ResearchProvider.EXA
            logger.warning("Exa requested but not available, falling back")

        if preference == ProviderPreference.TAVILY:
            if self.tavily_available:
                return ResearchProvider.TAVILY
            logger.warning("Tavily requested but not available, falling back")

        if preference == ProviderPreference.GOOGLE:
            return ResearchProvider.GOOGLE

        # AUTO mode: Always prefer Exa -> Tavily -> Google.
        # NOTE(review): ProviderPreference.HYBRID has no dedicated branch and
        # therefore also ends up here, yielding a single provider.
        is_trending = analysis.get('is_trending', False)

        if self.exa_available:
            logger.info(
                "Selected Exa (primary provider): query analysis shows "
                f"technical={analysis.get('is_technical', False)}, "
                f"trending={is_trending}"
            )
            return ResearchProvider.EXA

        # Tavily as secondary option - good for real-time and news
        if self.tavily_available:
            logger.info(
                "Selected Tavily (secondary): Exa unavailable, "
                f"trending={is_trending}"
            )
            return ResearchProvider.TAVILY

        # Google grounding as fallback
        logger.info("Selected Google (fallback): Exa and Tavily unavailable")
        return ResearchProvider.GOOGLE

    def _build_config(
        self,
        context: ResearchContext,
        provider: ResearchProvider,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """
        Build optimized ResearchConfig for the selected provider.

        Args:
            context: The research request.
            provider: Provider chosen by _select_provider.
            analysis: Result of _analyze_query for ``context.query``.

        Returns:
            A ResearchConfig with the base fields set, provider-specific
            fields filled in for Exa/Tavily, and domain filters applied.
        """
        # Unknown depth values fall back to BASIC
        mode = self._DEPTH_TO_MODE.get(context.depth, ResearchMode.BASIC)

        # Statistics and expert quotes default to enabled when the caller
        # supplied no personalization block.
        personalization = context.personalization
        include_statistics = personalization.include_statistics if personalization else True
        include_expert_quotes = personalization.include_expert_quotes if personalization else True

        # Base config
        config = ResearchConfig(
            mode=mode,
            provider=provider,
            max_sources=context.max_sources,
            include_statistics=include_statistics,
            include_expert_quotes=include_expert_quotes,
            include_competitors=analysis['is_competitive'],
            include_trends=analysis['is_trending'],
        )

        # Provider-specific optimizations (Google needs none here)
        if provider == ResearchProvider.EXA:
            config = self._optimize_exa_config(config, context, analysis)
        elif provider == ResearchProvider.TAVILY:
            config = self._optimize_tavily_config(config, context, analysis)

        # Apply domain filters; only Exa and Tavily receive them
        if context.include_domains:
            if provider == ResearchProvider.EXA:
                config.exa_include_domains = context.include_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_include_domains = context.include_domains[:self._TAVILY_MAX_INCLUDE_DOMAINS]

        if context.exclude_domains:
            if provider == ResearchProvider.EXA:
                config.exa_exclude_domains = context.exclude_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_exclude_domains = context.exclude_domains[:self._TAVILY_MAX_EXCLUDE_DOMAINS]

        return config

    def _optimize_exa_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Set Exa category and search type on ``config`` (mutated in place and returned)."""
        # Set category based on analysis; left untouched when nothing was suggested
        if analysis['suggested_category']:
            config.exa_category = analysis['suggested_category']

        # Set search type
        config.exa_search_type = analysis.get('suggested_search_type', 'auto')

        # For comprehensive research, neural search overrides the suggestion
        if context.depth in self._DEEP_DEPTHS:
            config.exa_search_type = 'neural'

        return config

    def _optimize_tavily_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Set Tavily topic, depth, time range and answer options on ``config`` (mutated in place and returned)."""
        is_deep = context.depth in self._DEEP_DEPTHS

        # Set topic based on analysis
        config.tavily_topic = analysis.get('suggested_topic', 'general')

        # Set search depth based on research depth
        if is_deep:
            config.tavily_search_depth = 'advanced'  # 2 credits, but better results
            config.tavily_chunks_per_source = 3
        else:
            config.tavily_search_depth = 'basic'  # 1 credit

        # Set time range: an explicit recency wins over the trending heuristic
        if context.recency:
            recency_map = {
                'day': 'd',
                'week': 'w',
                'month': 'm',
                'year': 'y',
            }
            # Unrecognized recency values are passed through unchanged
            config.tavily_time_range = recency_map.get(context.recency, context.recency)
        elif analysis['is_trending']:
            config.tavily_time_range = 'w'  # Last week for trending topics

        # Include answer for comprehensive research
        # NOTE(review): these are strings here but bools in advanced mode;
        # confirm ResearchConfig accepts both.
        if is_deep:
            config.tavily_include_answer = 'advanced'

        # Include raw content for expert depth
        if context.depth == ResearchDepth.EXPERT:
            config.tavily_include_raw_content = 'markdown'

        return config

    def _build_advanced_config(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Build config from raw advanced parameters.

        Used when advanced_mode=True and user wants full control. No query
        analysis is performed; the raw Exa/Tavily fields of ``context`` are
        copied into the config as-is.

        Returns:
            Tuple of (selected_provider, config)
        """
        # Determine provider from explicit parameters; Google is the default.
        # Exa parameters are checked first, so when both Exa and Tavily
        # parameters are set and Exa is unavailable the result is Google.
        provider = ResearchProvider.GOOGLE

        if context.exa_category or context.exa_search_type:
            provider = ResearchProvider.EXA if self.exa_available else ResearchProvider.GOOGLE
        elif context.tavily_topic or context.tavily_search_depth:
            provider = ResearchProvider.TAVILY if self.tavily_available else ResearchProvider.GOOGLE

        # An explicit preference overrides the parameter-derived choice, but
        # only when the preferred provider is actually available.
        preference = context.provider_preference
        if preference == ProviderPreference.EXA and self.exa_available:
            provider = ResearchProvider.EXA
        elif preference == ProviderPreference.TAVILY and self.tavily_available:
            provider = ResearchProvider.TAVILY
        elif preference == ProviderPreference.GOOGLE:
            provider = ResearchProvider.GOOGLE

        # Map depth to mode; unknown depth values fall back to BASIC
        mode = self._DEPTH_TO_MODE.get(context.depth, ResearchMode.BASIC)

        include_domains = context.include_domains
        exclude_domains = context.exclude_domains

        # Build config with raw parameters. Both providers' fields are always
        # populated, regardless of which provider was selected.
        config = ResearchConfig(
            mode=mode,
            provider=provider,
            max_sources=context.max_sources,
            # Exa
            exa_category=context.exa_category,
            exa_search_type=context.exa_search_type,
            exa_include_domains=include_domains,
            exa_exclude_domains=exclude_domains,
            # Tavily
            tavily_topic=context.tavily_topic,
            tavily_search_depth=context.tavily_search_depth,
            tavily_include_domains=include_domains[:self._TAVILY_MAX_INCLUDE_DOMAINS] if include_domains else [],
            tavily_exclude_domains=exclude_domains[:self._TAVILY_MAX_EXCLUDE_DOMAINS] if exclude_domains else [],
            tavily_include_answer=context.tavily_include_answer,
            tavily_include_raw_content=context.tavily_include_raw_content,
            tavily_time_range=context.tavily_time_range,
            tavily_country=context.tavily_country,
        )

        logger.info(f"Advanced config: provider={provider.value}, mode={mode.value}")

        return provider, config
|
||||
|
||||
198
backend/services/research/core/research_context.py
Normal file
198
backend/services/research/core/research_context.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
Research Context Schema
|
||||
|
||||
Defines the unified input schema for the Research Engine.
|
||||
Any tool (Blog Writer, Podcast Maker, YouTube Creator) can create a ResearchContext
|
||||
and pass it to the Research Engine.
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 2.0
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
from typing import Optional, List, Dict, Any
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ContentType(str, Enum):
    """
    Type of content being created - affects research focus.

    Members are also ``str`` instances, so they compare equal to their
    lower-case value (``ContentType.BLOG == "blog"``).
    """
    BLOG = "blog"
    PODCAST = "podcast"
    VIDEO = "video"
    SOCIAL = "social"
    EMAIL = "email"
    NEWSLETTER = "newsletter"
    WHITEPAPER = "whitepaper"
    GENERAL = "general"  # default used by ResearchPersonalizationContext
|
||||
|
||||
|
||||
class ResearchGoal(str, Enum):
    """
    Primary goal of the research.

    Members are also ``str`` instances and compare equal to their value.
    """
    FACTUAL = "factual"              # Stats, data, citations
    TRENDING = "trending"            # Current trends, news
    COMPETITIVE = "competitive"      # Competitor analysis
    EDUCATIONAL = "educational"      # How-to, explanations
    INSPIRATIONAL = "inspirational"  # Stories, quotes
    TECHNICAL = "technical"          # Deep technical content
|
||||
|
||||
|
||||
class ResearchDepth(str, Enum):
    """
    Depth of research - maps to existing ResearchMode.

    Members are also ``str`` instances and compare equal to their value.
    """
    QUICK = "quick"                  # Fast, surface-level (maps to BASIC)
    STANDARD = "standard"            # Balanced depth (maps to BASIC)
    COMPREHENSIVE = "comprehensive"  # Deep research (maps to COMPREHENSIVE)
    EXPERT = "expert"                # Maximum depth with expert sources
|
||||
|
||||
|
||||
class ProviderPreference(str, Enum):
    """
    Provider preference - AUTO lets the engine decide.

    Members are also ``str`` instances and compare equal to their value.
    """
    AUTO = "auto"      # Engine decides based on query (default)
    EXA = "exa"        # Force Exa neural search
    TAVILY = "tavily"  # Force Tavily AI search
    GOOGLE = "google"  # Force Google grounding
    HYBRID = "hybrid"  # Use multiple providers
|
||||
|
||||
|
||||
class ResearchPersonalizationContext(BaseModel):
    """
    Context from the calling tool (Blog Writer, Podcast Maker, etc.)

    This personalizes the research without the Research Engine knowing
    the specific tool implementation. Every field is optional or has a
    default, so an empty instance is valid.
    """
    # Who is creating the content
    creator_id: Optional[str] = None  # Clerk user ID

    # Content context
    content_type: ContentType = ContentType.GENERAL
    industry: Optional[str] = None
    target_audience: Optional[str] = None
    tone: Optional[str] = None  # professional, casual, technical, etc.

    # Persona data (from onboarding)
    persona_id: Optional[str] = None
    brand_voice: Optional[str] = None
    competitor_urls: List[str] = Field(default_factory=list)  # fresh list per instance

    # Content requirements
    word_count_target: Optional[int] = None
    include_statistics: bool = True
    include_expert_quotes: bool = True
    include_case_studies: bool = False
    include_visuals: bool = False

    # Platform-specific hints
    platform: Optional[str] = None  # medium, wordpress, youtube, spotify, etc.

    class Config:
        # Store enum-typed fields (content_type) as their plain values
        # rather than as Enum members.
        use_enum_values = True
|
||||
|
||||
|
||||
class ResearchContext(BaseModel):
    """
    Input schema for the Research Engine.

    Any tool builds one of these and hands it to the engine to obtain
    research results. The engine uses AI to optimize provider parameters
    based on what is set here.
    """

    # --- Primary research input ---
    query: str = Field(..., description="Main research query or topic")
    keywords: List[str] = Field(default_factory=list, description="Additional keywords")

    # --- Research configuration ---
    goal: ResearchGoal = ResearchGoal.FACTUAL
    depth: ResearchDepth = ResearchDepth.STANDARD
    provider_preference: ProviderPreference = ProviderPreference.AUTO

    # --- Personalization from the calling tool ---
    personalization: Optional[ResearchPersonalizationContext] = None

    # --- Constraints ---
    max_sources: int = Field(default=10, ge=1, le=25)
    # "day", "week", "month", "year", None for all-time
    recency: Optional[str] = None

    # --- Domain filtering ---
    include_domains: List[str] = Field(default_factory=list)
    exclude_domains: List[str] = Field(default_factory=list)

    # --- Advanced mode (exposes raw provider parameters) ---
    advanced_mode: bool = False

    # Raw provider parameters (only used if advanced_mode=True).
    # Exa-specific
    exa_category: Optional[str] = None
    # auto, keyword, neural
    exa_search_type: Optional[str] = None

    # Tavily-specific
    # general, news, finance
    tavily_topic: Optional[str] = None
    # basic, advanced
    tavily_search_depth: Optional[str] = None
    tavily_include_answer: bool = False
    tavily_include_raw_content: bool = False
    tavily_time_range: Optional[str] = None
    tavily_country: Optional[str] = None

    class Config:
        use_enum_values = True

    def get_effective_query(self) -> str:
        """Return the query, followed by the space-joined keywords when any are set."""
        if not self.keywords:
            return self.query
        joined_keywords = " ".join(self.keywords)
        return f"{self.query} {joined_keywords}"

    def get_industry(self) -> str:
        """Return the personalization industry, or "General" when none is set."""
        profile = self.personalization
        if profile and profile.industry:
            return profile.industry
        return "General"

    def get_audience(self) -> str:
        """Return the personalization target audience, or "General" when none is set."""
        profile = self.personalization
        if profile and profile.target_audience:
            return profile.target_audience
        return "General"

    def get_user_id(self) -> Optional[str]:
        """Return the creator id from personalization, or None without personalization."""
        return self.personalization.creator_id if self.personalization else None
|
||||
|
||||
|
||||
class ResearchResult(BaseModel):
    """
    Output schema of the Research Engine.

    A standardized shape that any calling tool can consume.
    """

    success: bool = True

    # --- Content ---
    # AI-generated summary of findings.
    summary: Optional[str] = None
    # Raw aggregated content for LLM processing.
    raw_content: Optional[str] = None

    # --- Sources ---
    sources: List[Dict[str, Any]] = Field(default_factory=list)

    # --- Analysis (reuses existing blog writer analysis) ---
    keyword_analysis: Dict[str, Any] = Field(default_factory=dict)
    competitor_analysis: Dict[str, Any] = Field(default_factory=dict)
    suggested_angles: List[str] = Field(default_factory=list)

    # --- Metadata ---
    # Which provider was actually used.
    provider_used: str = "google"
    search_queries: List[str] = Field(default_factory=list)
    grounding_metadata: Optional[Dict[str, Any]] = None

    # --- Cost tracking ---
    estimated_cost: float = 0.0

    # --- Error handling ---
    error_message: Optional[str] = None
    error_code: Optional[str] = None
    retry_suggested: bool = False

    # --- Original context for reference ---
    original_query: Optional[str] = None

    class Config:
        use_enum_values = True
|
||||
|
||||
558
backend/services/research/core/research_engine.py
Normal file
558
backend/services/research/core/research_engine.py
Normal file
@@ -0,0 +1,558 @@
|
||||
"""
|
||||
Research Engine - Core Orchestrator
|
||||
|
||||
The main entry point for AI research across all ALwrity tools.
|
||||
This engine wraps existing providers (Exa, Tavily, Google) and provides
|
||||
a unified interface for any content generation tool.
|
||||
|
||||
Usage:
|
||||
from services.research.core import ResearchEngine, ResearchContext, ContentType
|
||||
|
||||
engine = ResearchEngine()
|
||||
    result = await engine.research(ResearchContext(
        query="AI trends in healthcare 2025",
        personalization=ResearchPersonalizationContext(
            content_type=ContentType.PODCAST,
            industry="Healthcare",
            target_audience="Medical professionals"
        )
    ))
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 2.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Dict, Any, Optional, Callable
|
||||
from loguru import logger
|
||||
|
||||
from .research_context import (
|
||||
ResearchContext,
|
||||
ResearchResult,
|
||||
ResearchDepth,
|
||||
ContentType,
|
||||
ResearchPersonalizationContext,
|
||||
)
|
||||
from .parameter_optimizer import ParameterOptimizer
|
||||
|
||||
# Reuse existing blog writer models and services
|
||||
from models.blog_models import (
|
||||
BlogResearchRequest,
|
||||
BlogResearchResponse,
|
||||
ResearchConfig,
|
||||
ResearchProvider,
|
||||
ResearchMode,
|
||||
PersonaInfo,
|
||||
ResearchSource,
|
||||
)
|
||||
|
||||
# Research persona for personalization
|
||||
from models.research_persona_models import ResearchPersona
|
||||
|
||||
|
||||
class ResearchEngine:
|
||||
"""
|
||||
AI Research Engine - Standalone module for content research.
|
||||
|
||||
This engine:
|
||||
1. Accepts a ResearchContext from any tool
|
||||
2. Uses AI to optimize parameters for Exa/Tavily
|
||||
3. Integrates research persona for personalization
|
||||
4. Executes research using existing providers
|
||||
5. Returns standardized ResearchResult
|
||||
|
||||
Can be imported by Blog Writer, Podcast Maker, YouTube Creator, etc.
|
||||
"""
|
||||
|
||||
def __init__(self, db_session=None):
    """
    Set up the engine.

    Args:
        db_session: Optional database session stored for later persona lookups.
    """
    self.optimizer = ParameterOptimizer()

    # Provider handles start out empty.
    self._providers_initialized = False
    self._exa_provider = None
    self._tavily_provider = None
    self._google_provider = None
    self._db_session = db_session

    # A provider counts as available when its API key is present in the environment.
    exa_key = os.getenv("EXA_API_KEY")
    tavily_key = os.getenv("TAVILY_API_KEY")
    self.exa_available = bool(exa_key)
    self.tavily_available = bool(tavily_key)

    logger.info(f"ResearchEngine initialized: exa={self.exa_available}, tavily={self.tavily_available}")
|
||||
|
||||
def _get_research_persona(self, user_id: str, generate_if_missing: bool = True) -> Optional[ResearchPersona]:
    """
    Fetch the research persona for a user, generating it if missing.

    Phase 2: Since onboarding is mandatory and always completes before accessing
    any tool, the research persona can be generated on first use. This ensures
    hyper-personalization without requiring "General" fallbacks.

    When the engine was constructed without a DB session, a session is
    obtained from ``get_db_session()`` for this call and closed again
    before returning, so repeated lookups do not leak sessions.

    Args:
        user_id: User ID (Clerk string).
        generate_if_missing: If True, generate the persona when it is not
            cached (default: True). If False, only the cached persona is
            returned.

    Returns:
        The ResearchPersona, or None when ``user_id`` is empty, no persona
        could be obtained, or any error occurred (errors are logged, not
        raised).
    """
    if not user_id:
        return None

    db = self._db_session
    # True only when this call opened the session and must therefore close it.
    owns_session = False

    try:
        from services.research.research_persona_service import ResearchPersonaService

        if not db:
            from services.database import get_db_session
            db = get_db_session()
            owns_session = db is not None

        persona_service = ResearchPersonaService(db_session=db)

        if generate_if_missing:
            # Phase 2: Use get_or_generate() to create persona on first visit.
            # This triggers an LLM call if not cached, but onboarding guarantees
            # the core persona exists, so generation is expected to succeed.
            logger.info(f"🔄 Getting/generating research persona for user {user_id}...")
            persona = persona_service.get_or_generate(user_id, force_refresh=False)

            if persona:
                logger.info(f"✅ Research persona ready for user {user_id}: industry={persona.default_industry}")
            else:
                logger.warning(f"⚠️ Could not get/generate research persona for user {user_id} - using core persona fallback")
        else:
            # Fast path: only return cached (for config endpoints)
            persona = persona_service.get_cached_only(user_id)
            if persona:
                logger.debug(f"Research persona loaded from cache for user {user_id}")

        return persona

    except Exception as e:
        logger.warning(f"Failed to load research persona for user {user_id}: {e}")
        return None

    finally:
        # NOTE(review): assumes the returned persona is a detached value object
        # that stays usable after the session is closed — confirm.
        if owns_session:
            try:
                db.close()
            except Exception as close_error:
                logger.debug(f"Failed to close research persona DB session: {close_error}")
|
||||
|
||||
def _enrich_context_with_persona(
    self,
    context: ResearchContext,
    persona: ResearchPersona
) -> ResearchContext:
    """
    Fill gaps in the research context from the user's research persona.

    Persona values are used only where the context has nothing set (or
    has the "General" placeholder for industry / audience); anything the
    caller provided wins. The context is modified in place and returned.
    """
    # Make sure there is a personalization object to write into.
    if not context.personalization:
        context.personalization = ResearchPersonalizationContext()
    profile = context.personalization

    industry_unset = not profile.industry or profile.industry == "General"
    if industry_unset and persona.default_industry:
        profile.industry = persona.default_industry
        logger.debug(f"Applied persona industry: {persona.default_industry}")

    audience_unset = not profile.target_audience or profile.target_audience == "General"
    if audience_unset and persona.default_target_audience:
        profile.target_audience = persona.default_target_audience
        logger.debug(f"Applied persona target_audience: {persona.default_target_audience}")

    # Suggested Exa domains, capped at six, only when the caller set none.
    if not context.include_domains and persona.suggested_exa_domains:
        context.include_domains = persona.suggested_exa_domains[:6]
        logger.debug(f"Applied persona domains: {context.include_domains}")

    # Suggested Exa category, only when the caller set none.
    if not context.exa_category and persona.suggested_exa_category:
        context.exa_category = persona.suggested_exa_category
        logger.debug(f"Applied persona exa_category: {persona.suggested_exa_category}")

    return context
|
||||
|
||||
async def research(
    self,
    context: ResearchContext,
    progress_callback: Optional[Callable[[str], None]] = None
) -> ResearchResult:
    """
    Execute research based on the given context.

    Steps: enrich the context with the user's research persona (when a
    user id is present), let the optimizer choose a provider and config,
    run that provider, and convert the response into a ResearchResult.

    Args:
        context: Research context with query, goals, and personalization.
            It is enriched in place when a persona is found.
        progress_callback: Optional callback receiving progress messages.

    Returns:
        ResearchResult with sources, analysis, and content. Failures are
        not raised: they come back as a result with ``success=False``,
        ``error_code="RESEARCH_FAILED"`` and ``retry_suggested=True``.
    """
    start_time = time.time()

    try:
        self._progress(progress_callback, "🔍 Analyzing research query...")

        # Enrich context with research persona (Phase 2: generate if missing)
        user_id = context.get_user_id()
        if user_id:
            self._progress(progress_callback, "👤 Loading personalized research profile...")
            persona = self._get_research_persona(user_id, generate_if_missing=True)
            if persona:
                self._progress(progress_callback, "✨ Applying hyper-personalized settings...")
                context = self._enrich_context_with_persona(context, persona)
            else:
                logger.warning(f"No research persona available for user {user_id} - proceeding with provided context")

        # Optimize parameters based on the enriched context
        provider, config = self.optimizer.optimize(context)

        self._progress(progress_callback, f"🤖 Selected {provider.value.upper()} for research")

        # Build the request using existing blog models
        request = self._build_request(context, config)
        user_id = context.get_user_id() or ""

        self._progress(progress_callback, f"🌐 Connecting to {provider.value} search...")

        # Remember what the config said before execution so a fallback can be detected.
        planned_provider = getattr(config, "provider", None)

        if provider == ResearchProvider.EXA:
            response = await self._execute_exa_research(request, config, user_id, progress_callback)
        elif provider == ResearchProvider.TAVILY:
            response = await self._execute_tavily_research(request, config, user_id, progress_callback)
        else:
            response = await self._execute_google_research(request, config, user_id, progress_callback)

        # The executors fall back (Exa -> Tavily -> Google) by rewriting
        # config.provider. Pick that up so provider_used names the provider
        # that actually produced the results rather than the one first chosen.
        final_provider = getattr(config, "provider", None)
        if final_provider is not None and final_provider != planned_provider:
            try:
                provider = ResearchProvider(final_provider)
            except ValueError:
                logger.warning(f"Unrecognized provider after fallback: {final_provider!r}")

        # Transform response to ResearchResult
        self._progress(progress_callback, "📊 Processing results...")

        result = self._transform_response(response, provider, context)

        duration_ms = (time.time() - start_time) * 1000
        logger.info(f"Research completed in {duration_ms:.0f}ms: {len(result.sources)} sources")

        self._progress(progress_callback, f"✅ Research complete: {len(result.sources)} sources found")

        return result

    except Exception as e:
        # Top-level boundary: report the failure in the result instead of raising.
        logger.error(f"Research failed: {e}")
        return ResearchResult(
            success=False,
            error_message=str(e),
            error_code="RESEARCH_FAILED",
            retry_suggested=True,
            original_query=context.query
        )
|
||||
|
||||
def _progress(self, callback: Optional[Callable[[str], None]], message: str):
    """Forward a progress message to the callback (if any) and always log it."""
    notify = callback
    if notify:
        notify(message)
    logger.info(f"[Research] {message}")
|
||||
|
||||
def _build_request(self, context: ResearchContext, config: ResearchConfig) -> BlogResearchRequest:
    """
    Build a BlogResearchRequest from a ResearchContext.

    Args:
        context: The research context supplied by the calling tool.
        config: Provider configuration; its ``mode`` becomes the request's
            research mode.

    Returns:
        A BlogResearchRequest for the existing blog research providers.
    """
    # Use the explicit keywords, or the query itself when none were given.
    keywords = context.keywords if context.keywords else [context.query]

    personalization = context.personalization

    persona = None
    tone = None
    # Default target length. It must also apply when a personalization
    # object exists but leaves word_count_target unset (None), which is the
    # case for the empty personalization created during persona enrichment.
    word_count_target = 1500

    if personalization:
        persona = PersonaInfo(
            persona_id=personalization.persona_id,
            tone=personalization.tone,
            audience=personalization.target_audience,
            industry=personalization.industry,
        )
        tone = personalization.tone
        if personalization.word_count_target is not None:
            word_count_target = personalization.word_count_target

    return BlogResearchRequest(
        keywords=keywords,
        topic=context.query,
        industry=context.get_industry(),
        target_audience=context.get_audience(),
        tone=tone,
        word_count_target=word_count_target,
        persona=persona,
        research_mode=config.mode,
        config=config,
    )
|
||||
|
||||
async def _execute_exa_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """
    Run the research through the Exa provider.

    If a RuntimeError mentioning "EXA_API_KEY not configured" is raised,
    the config's provider is switched to Tavily and the Tavily path is
    used instead; any other RuntimeError propagates.
    """
    from services.blog_writer.research.exa_provider import ExaResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode

    self._progress(progress_callback, "🔍 Executing Exa neural search...")

    # The mode-specific strategy builds the research prompt.
    prompt_strategy = get_strategy_for_mode(config.mode)
    subject = request.topic or ", ".join(request.keywords)
    sector = request.industry or "General"
    audience = request.target_audience or "General"
    prompt = prompt_strategy.build_research_prompt(subject, sector, audience, config)

    try:
        exa = ExaResearchProvider()
        findings = await exa.search(
            prompt, subject, sector, audience, config, user_id
        )

        # Record usage; 0.005 is used when the result carries no cost dict.
        cost_info = findings.get('cost')
        spend = cost_info.get('total', 0.005) if isinstance(cost_info, dict) else 0.005
        exa.track_exa_usage(user_id, spend)

        self._progress(progress_callback, f"📝 Found {len(findings.get('sources', []))} sources")

        return await self._run_analysis(request, findings, config, user_id, progress_callback)

    except RuntimeError as exc:
        if "EXA_API_KEY not configured" not in str(exc):
            raise
        logger.warning("Exa not configured, falling back to Tavily")
        self._progress(progress_callback, "⚠️ Exa unavailable, trying Tavily...")
        config.provider = ResearchProvider.TAVILY
        return await self._execute_tavily_research(request, config, user_id, progress_callback)
|
||||
|
||||
async def _execute_tavily_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """
    Run the research through the Tavily provider.

    If a RuntimeError mentioning "TAVILY_API_KEY not configured" is
    raised, the config's provider is switched to Google and the Google
    path is used instead; any other RuntimeError propagates.
    """
    from services.blog_writer.research.tavily_provider import TavilyResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode

    self._progress(progress_callback, "🔍 Executing Tavily AI search...")

    # The mode-specific strategy builds the research prompt.
    prompt_strategy = get_strategy_for_mode(config.mode)
    subject = request.topic or ", ".join(request.keywords)
    sector = request.industry or "General"
    audience = request.target_audience or "General"
    prompt = prompt_strategy.build_research_prompt(subject, sector, audience, config)

    try:
        tavily = TavilyResearchProvider()
        findings = await tavily.search(
            prompt, subject, sector, audience, config, user_id
        )

        # Record usage; 0.001 is used when the result carries no cost dict.
        cost_info = findings.get('cost')
        spend = cost_info.get('total', 0.001) if isinstance(cost_info, dict) else 0.001
        depth = config.tavily_search_depth or "basic"
        tavily.track_tavily_usage(user_id, spend, depth)

        self._progress(progress_callback, f"📝 Found {len(findings.get('sources', []))} sources")

        return await self._run_analysis(request, findings, config, user_id, progress_callback)

    except RuntimeError as exc:
        if "TAVILY_API_KEY not configured" not in str(exc):
            raise
        logger.warning("Tavily not configured, falling back to Google")
        self._progress(progress_callback, "⚠️ Tavily unavailable, using Google Search...")
        config.provider = ResearchProvider.GOOGLE
        return await self._execute_google_research(request, config, user_id, progress_callback)
|
||||
|
||||
async def _execute_google_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """Run the research through Google/Gemini grounding and analyze the result."""
    from services.blog_writer.research.google_provider import GoogleResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode

    self._progress(progress_callback, "🔍 Executing Google Search grounding...")

    # The mode-specific strategy builds the research prompt.
    prompt_strategy = get_strategy_for_mode(config.mode)
    subject = request.topic or ", ".join(request.keywords)
    sector = request.industry or "General"
    audience = request.target_audience or "General"
    prompt = prompt_strategy.build_research_prompt(subject, sector, audience, config)

    google = GoogleResearchProvider()
    findings = await google.search(
        prompt, subject, sector, audience, config, user_id
    )

    self._progress(progress_callback, "📝 Processing grounded results...")

    # is_google=True makes the analysis read sources from the grounding payload.
    return await self._run_analysis(
        request, findings, config, user_id, progress_callback, is_google=True
    )
|
||||
|
||||
async def _run_analysis(
    self,
    request: BlogResearchRequest,
    raw_result: Dict[str, Any],
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None,
    is_google: bool = False
) -> BlogResearchResponse:
    """
    Run the shared keyword / competitor / angle analysis over raw provider output.

    Args:
        request: The research request (keywords, topic, industry).
        raw_result: Provider output; "content", "sources" and
            "search_queries" are read from it.
        config: Research configuration (not read by this method).
        user_id: Forwarded to the analyzers.
        progress_callback: Optional callback receiving progress messages.
        is_google: When True, sources and grounding metadata are extracted
            with the Gemini grounding helpers.

    Returns:
        The BlogResearchResponse after passing through the research data filter.
    """
    from services.blog_writer.research.keyword_analyzer import KeywordAnalyzer
    from services.blog_writer.research.competitor_analyzer import CompetitorAnalyzer
    from services.blog_writer.research.content_angle_generator import ContentAngleGenerator
    from services.blog_writer.research.data_filter import ResearchDataFilter

    self._progress(progress_callback, "🔍 Analyzing keywords and content angles...")

    # Pull text, sources and queries out of the provider payload.
    content = raw_result.get("content", "")
    if is_google:
        sources = self._extract_sources_from_grounding(raw_result)
        search_queries = raw_result.get("search_queries", []) or []
        grounding_metadata = self._extract_grounding_metadata(raw_result)
    else:
        sources = []
        for entry in raw_result.get('sources', []):
            sources.append(ResearchSource(**entry) if isinstance(entry, dict) else entry)
        search_queries = raw_result.get('search_queries', [])
        grounding_metadata = None

    subject = request.topic or ", ".join(request.keywords)
    sector = request.industry or "General"

    # Instantiate every analyzer first, then run them.
    keywords_tool = KeywordAnalyzer()
    competitors_tool = CompetitorAnalyzer()
    angles_tool = ContentAngleGenerator()
    cleaner = ResearchDataFilter()

    keyword_analysis = keywords_tool.analyze(content, request.keywords, user_id=user_id)
    competitor_analysis = competitors_tool.analyze(content, user_id=user_id)
    suggested_angles = angles_tool.generate(content, subject, sector, user_id=user_id)

    unfiltered = BlogResearchResponse(
        success=True,
        sources=sources,
        keyword_analysis=keyword_analysis,
        competitor_analysis=competitor_analysis,
        suggested_angles=suggested_angles,
        search_widget="",
        search_queries=search_queries,
        grounding_metadata=grounding_metadata,
        original_keywords=request.keywords,
    )

    # Filter and clean the research data before returning it.
    self._progress(progress_callback, "✨ Filtering and optimizing results...")
    return cleaner.filter_research_data(unfiltered)
|
||||
|
||||
def _extract_sources_from_grounding(self, gemini_result: Dict[str, Any]) -> list:
    """
    Convert the "sources" entries of a Gemini result into ResearchSource objects.

    Returns an empty list when the result is empty or not a dict.
    """
    from models.blog_models import ResearchSource

    if not gemini_result or not isinstance(gemini_result, dict):
        return []

    def _to_source(entry):
        # Excerpt: first 500 characters of the content, or a placeholder built from the title.
        body = entry.get("content")
        if body:
            excerpt = body[:500]
        else:
            excerpt = f"Source from {entry.get('title', 'web')}"
        return ResearchSource(
            title=entry.get("title", "Untitled"),
            url=entry.get("url", ""),
            excerpt=excerpt,
            credibility_score=float(entry.get("credibility_score", 0.8)),
            published_at=str(entry.get("publication_date", "2024-01-01")),
            index=entry.get("index"),
            source_type=entry.get("type", "web")
        )

    entries = gemini_result.get("sources", []) or []
    return [_to_source(entry) for entry in entries]
|
||||
|
||||
def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||
"""Extract grounding metadata from Gemini result."""
|
||||
if not gemini_result or not isinstance(gemini_result, dict):
|
||||
return None
|
||||
|
||||
return gemini_result.get("grounding_metadata")
|
||||
|
||||
def _transform_response(
    self,
    response: BlogResearchResponse,
    provider: ResearchProvider,
    context: ResearchContext
) -> ResearchResult:
    """
    Convert a BlogResearchResponse into the engine's ResearchResult.

    Sources and grounding metadata are turned into plain dicts when they
    expose a ``dict()`` method; ``provider.value`` becomes ``provider_used``
    and ``context.query`` becomes ``original_query``.
    """

    def _as_plain_dict(item):
        # Model objects expose .dict(); plain dicts pass through; anything
        # else is reduced to its title / url / excerpt attributes.
        if hasattr(item, 'dict'):
            return item.dict()
        if isinstance(item, dict):
            return item
        return {
            'title': getattr(item, 'title', ''),
            'url': getattr(item, 'url', ''),
            'excerpt': getattr(item, 'excerpt', ''),
        }

    source_dicts = [_as_plain_dict(item) for item in response.sources]

    grounding = None
    metadata = response.grounding_metadata
    if metadata:
        grounding = metadata.dict() if hasattr(metadata, 'dict') else metadata

    return ResearchResult(
        success=response.success,
        sources=source_dicts,
        keyword_analysis=response.keyword_analysis,
        competitor_analysis=response.competitor_analysis,
        suggested_angles=response.suggested_angles,
        provider_used=provider.value,
        search_queries=response.search_queries,
        grounding_metadata=grounding,
        original_query=context.query,
        error_message=response.error_message,
        error_code=getattr(response, 'error_code', None),
        retry_suggested=getattr(response, 'retry_suggested', False),
    )
|
||||
|
||||
def get_provider_status(self) -> Dict[str, Any]:
    """Report availability, priority and a short description for each provider."""

    def _entry(available, priority, description):
        return {
            "available": available,
            "priority": priority,
            "description": description,
        }

    return {
        "exa": _entry(self.exa_available, 1, "Neural search for semantic understanding"),
        "tavily": _entry(self.tavily_available, 2, "AI-powered web search"),
        # Google is reported as always available (via Gemini).
        "google": _entry(True, 3, "Google Search grounding"),
    }
|
||||
|
||||
23
backend/services/research/intent/__init__.py
Normal file
23
backend/services/research/intent/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""
|
||||
Research Intent Package
|
||||
|
||||
This package provides intent-driven research capabilities:
|
||||
- Intent inference from user input
|
||||
- Targeted query generation
|
||||
- Intent-aware result analysis
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
from .research_intent_inference import ResearchIntentInference
|
||||
from .intent_query_generator import IntentQueryGenerator
|
||||
from .intent_aware_analyzer import IntentAwareAnalyzer
|
||||
from .intent_prompt_builder import IntentPromptBuilder
|
||||
|
||||
# Public API of the research intent package: the four classes imported above.
__all__ = [
    "ResearchIntentInference",
    "IntentQueryGenerator",
    "IntentAwareAnalyzer",
    "IntentPromptBuilder",
]
|
||||
547
backend/services/research/intent/intent_aware_analyzer.py
Normal file
547
backend/services/research/intent/intent_aware_analyzer.py
Normal file
@@ -0,0 +1,547 @@
|
||||
"""
|
||||
Intent-Aware Result Analyzer
|
||||
|
||||
Analyzes research results based on user intent.
|
||||
Extracts exactly what the user needs from raw research data.
|
||||
|
||||
This is the key innovation - instead of generic analysis,
|
||||
we analyze results through the lens of what the user wants to accomplish.
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
IntentDrivenResearchResult,
|
||||
ExpectedDeliverable,
|
||||
StatisticWithCitation,
|
||||
ExpertQuote,
|
||||
CaseStudySummary,
|
||||
TrendAnalysis,
|
||||
ComparisonTable,
|
||||
ComparisonItem,
|
||||
ProsCons,
|
||||
SourceWithRelevance,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .intent_prompt_builder import IntentPromptBuilder
|
||||
|
||||
|
||||
class IntentAwareAnalyzer:
|
||||
"""
|
||||
Analyzes research results based on user intent.
|
||||
|
||||
Instead of generic summaries, this extracts exactly what the user
|
||||
needs: statistics, quotes, case studies, trends, etc.
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Create the analyzer together with the prompt builder it uses."""
    builder = IntentPromptBuilder()
    self.prompt_builder = builder
    logger.info("IntentAwareAnalyzer initialized")
|
||||
|
||||
async def analyze(
    self,
    raw_results: Dict[str, Any],
    intent: ResearchIntent,
    research_persona: Optional[ResearchPersona] = None,
    user_id: Optional[str] = None,
) -> IntentDrivenResearchResult:
    """
    Analyze raw research results based on user intent.

    Args:
        raw_results: Raw results from Exa/Tavily/Google.
        intent: The user's research intent.
        research_persona: Optional persona for context.
        user_id: Optional id of the requesting user, forwarded to the LLM
            call. Defaults to None, which matches the previous behavior.
            NOTE(review): the LLM layer may need a user id for usage
            tracking — confirm and pass it from callers.

    Returns:
        IntentDrivenResearchResult with extracted deliverables. When the
        LLM reports an error or any exception occurs, the fallback result
        built from the raw results is returned instead of raising.
    """
    try:
        logger.info(f"Analyzing results for intent: {intent.primary_question[:50]}...")

        # Format raw results for analysis
        formatted_results = self._format_raw_results(raw_results)

        # Build the analysis prompt
        prompt = self.prompt_builder.build_intent_aware_analysis_prompt(
            raw_results=formatted_results,
            intent=intent,
            research_persona=research_persona,
        )

        # Define the expected JSON schema
        analysis_schema = self._build_analysis_schema(intent.expected_deliverables)

        # Call LLM for analysis
        from services.llm_providers.main_text_generation import llm_text_gen

        result = llm_text_gen(
            prompt=prompt,
            json_struct=analysis_schema,
            user_id=user_id
        )

        # The LLM layer signals failure with a dict carrying an "error" key.
        if isinstance(result, dict) and "error" in result:
            logger.error(f"Intent-aware analysis failed: {result.get('error')}")
            return self._create_fallback_result(raw_results, intent)

        # Parse and validate the result
        analyzed_result = self._parse_analysis_result(result, intent, raw_results)

        logger.info(
            f"Analysis complete: {len(analyzed_result.key_takeaways)} takeaways, "
            f"{len(analyzed_result.statistics)} stats, "
            f"{len(analyzed_result.sources)} sources"
        )

        return analyzed_result

    except Exception as e:
        # Best-effort boundary: degrade to the fallback result rather than fail.
        logger.error(f"Error in intent-aware analysis: {e}")
        return self._create_fallback_result(raw_results, intent)
|
||||
|
||||
def _format_raw_results(self, raw_results: Dict[str, Any]) -> str:
|
||||
"""Format raw research results for LLM analysis."""
|
||||
|
||||
formatted_parts = []
|
||||
|
||||
# Extract content
|
||||
content = raw_results.get("content", "")
|
||||
if content:
|
||||
formatted_parts.append(f"=== MAIN CONTENT ===\n{content[:8000]}")
|
||||
|
||||
# Extract sources with their content
|
||||
sources = raw_results.get("sources", [])
|
||||
if sources:
|
||||
formatted_parts.append("\n=== SOURCES ===")
|
||||
for i, source in enumerate(sources[:15], 1): # Limit to 15 sources
|
||||
title = source.get("title", "Untitled")
|
||||
url = source.get("url", "")
|
||||
excerpt = source.get("excerpt", source.get("text", source.get("content", "")))
|
||||
|
||||
formatted_parts.append(f"\nSource {i}: {title}")
|
||||
formatted_parts.append(f"URL: {url}")
|
||||
if excerpt:
|
||||
formatted_parts.append(f"Content: {excerpt[:500]}")
|
||||
|
||||
# Extract grounding metadata if available (from Google)
|
||||
grounding = raw_results.get("grounding_metadata", {})
|
||||
if grounding:
|
||||
formatted_parts.append("\n=== GROUNDING DATA ===")
|
||||
formatted_parts.append(json.dumps(grounding, indent=2)[:2000])
|
||||
|
||||
# Extract any AI answers (from Tavily)
|
||||
answer = raw_results.get("answer", "")
|
||||
if answer:
|
||||
formatted_parts.append(f"\n=== AI-GENERATED ANSWER ===\n{answer}")
|
||||
|
||||
return "\n".join(formatted_parts)
|
||||
|
||||
def _build_analysis_schema(self, expected_deliverables: List[str]) -> Dict[str, Any]:
    """Assemble the JSON schema the analysis response is expected to follow.

    The schema always contains the core answer fields plus ``sources`` and
    ``suggested_outline``. A deliverable-specific property is added only when
    the matching ``ExpectedDeliverable`` value appears in
    ``expected_deliverables``.

    Args:
        expected_deliverables: Deliverable identifiers (compared against
            ``ExpectedDeliverable.<MEMBER>.value``).

    Returns:
        A JSON-schema style dictionary with ``type``, ``properties`` and
        ``required`` keys.
    """
    # Small factories so every schema fragment is a fresh dict instance.
    def string():
        return {"type": "string"}

    def number():
        return {"type": "number"}

    def string_list():
        return {"type": "array", "items": {"type": "string"}}

    def string_map():
        return {"type": "object", "additionalProperties": {"type": "string"}}

    def record_list(fields, required=None):
        record = {"type": "object", "properties": fields}
        if required is not None:
            record["required"] = required
        return {"type": "array", "items": record}

    # Core properties present in every schema.
    properties = {
        "primary_answer": string(),
        "secondary_answers": string_map(),
        "executive_summary": string(),
        "key_takeaways": {**string_list(), "maxItems": 7},
        "confidence": number(),
        "gaps_identified": string_list(),
        "follow_up_queries": string_list(),
    }

    # (deliverable, property name, schema) in the order they are added.
    deliverable_sections = (
        (ExpectedDeliverable.KEY_STATISTICS, "statistics", record_list(
            {
                "statistic": string(),
                "value": string(),
                "context": string(),
                "source": string(),
                "url": string(),
                "credibility": number(),
                "recency": string(),
            },
            ["statistic", "context", "source", "url"],
        )),
        (ExpectedDeliverable.EXPERT_QUOTES, "expert_quotes", record_list(
            {
                "quote": string(),
                "speaker": string(),
                "title": string(),
                "organization": string(),
                "source": string(),
                "url": string(),
            },
            ["quote", "speaker", "source", "url"],
        )),
        (ExpectedDeliverable.CASE_STUDIES, "case_studies", record_list(
            {
                "title": string(),
                "organization": string(),
                "challenge": string(),
                "solution": string(),
                "outcome": string(),
                "key_metrics": string_list(),
                "source": string(),
                "url": string(),
            },
            ["title", "organization", "challenge", "solution", "outcome"],
        )),
        (ExpectedDeliverable.TRENDS, "trends", record_list(
            {
                "trend": string(),
                "direction": string(),
                "evidence": string_list(),
                "impact": string(),
                "timeline": string(),
                "sources": string_list(),
            },
            ["trend", "direction", "evidence"],
        )),
        (ExpectedDeliverable.COMPARISONS, "comparisons", record_list(
            {
                "title": string(),
                "criteria": string_list(),
                "items": record_list(
                    {
                        "name": string(),
                        "pros": string_list(),
                        "cons": string_list(),
                        "features": {"type": "object"},
                    }
                ),
                "verdict": string(),
            }
        )),
        (ExpectedDeliverable.PROS_CONS, "pros_cons", {
            "type": "object",
            "properties": {
                "subject": string(),
                "pros": string_list(),
                "cons": string_list(),
                "balanced_verdict": string(),
            },
        }),
        (ExpectedDeliverable.BEST_PRACTICES, "best_practices", string_list()),
        (ExpectedDeliverable.STEP_BY_STEP, "step_by_step", string_list()),
        (ExpectedDeliverable.DEFINITIONS, "definitions", string_map()),
        (ExpectedDeliverable.EXAMPLES, "examples", string_list()),
        (ExpectedDeliverable.PREDICTIONS, "predictions", string_list()),
    )

    for deliverable, property_name, section_schema in deliverable_sections:
        if deliverable.value in expected_deliverables:
            properties[property_name] = section_schema

    # Sources and a suggested outline are requested regardless of deliverables.
    properties["sources"] = record_list(
        {
            "title": string(),
            "url": string(),
            "relevance_score": number(),
            "relevance_reason": string(),
            "content_type": string(),
            "credibility_score": number(),
        },
        ["title", "url"],
    )
    properties["suggested_outline"] = string_list()

    return {
        "type": "object",
        "properties": properties,
        "required": ["primary_answer", "executive_summary", "key_takeaways", "confidence"],
    }
|
||||
|
||||
def _parse_analysis_result(
    self,
    result: Dict[str, Any],
    intent: ResearchIntent,
    raw_results: Dict[str, Any],
) -> IntentDrivenResearchResult:
    """Convert the LLM's JSON analysis into an ``IntentDrivenResearchResult``.

    Each structured deliverable (statistics, quotes, case studies, trends,
    comparisons, pros/cons, sources) is parsed item by item; an item that
    cannot be built is logged and skipped so one malformed entry does not
    discard the rest. Fields that arrive as JSON ``null`` or with the wrong
    container type are treated as empty rather than raising, and numeric
    scores that cannot be converted fall back to their defaults.

    Args:
        result: Parsed JSON object returned by the analysis call.
        intent: The research intent the analysis was run for; stored on the
            result and used as the default pros/cons subject.
        raw_results: Raw provider payload, used to back-fill sources when
            the analysis returned none and to populate ``raw_content``.

    Returns:
        The populated ``IntentDrivenResearchResult`` with ``success=True``.

    Raises:
        TypeError: If ``result`` is not a dictionary.
    """
    if not isinstance(result, dict):
        raise TypeError(
            f"Expected a JSON object from the analysis, got {type(result).__name__}"
        )

    def as_list(value):
        """Return *value* if it is a list, otherwise an empty list."""
        return value if isinstance(value, list) else []

    def as_dict(value):
        """Return *value* if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    def as_float(value, default):
        """Convert *value* to float, falling back to *default*."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def build_each(key, label, factory):
        """Build one model per entry under *key*, skipping entries that fail."""
        built = []
        for entry in as_list(result.get(key)):
            try:
                built.append(factory(entry))
            except Exception as e:
                logger.warning(f"Failed to parse {label}: {e}")
        return built

    def make_statistic(stat):
        return StatisticWithCitation(
            statistic=stat.get("statistic", ""),
            value=stat.get("value"),
            context=stat.get("context", ""),
            source=stat.get("source", ""),
            url=stat.get("url", ""),
            credibility=as_float(stat.get("credibility"), 0.8),
            recency=stat.get("recency"),
        )

    def make_quote(quote):
        return ExpertQuote(
            quote=quote.get("quote", ""),
            speaker=quote.get("speaker", ""),
            title=quote.get("title"),
            organization=quote.get("organization"),
            context=quote.get("context"),
            source=quote.get("source", ""),
            url=quote.get("url", ""),
        )

    def make_case_study(cs):
        return CaseStudySummary(
            title=cs.get("title", ""),
            organization=cs.get("organization", ""),
            challenge=cs.get("challenge", ""),
            solution=cs.get("solution", ""),
            outcome=cs.get("outcome", ""),
            key_metrics=as_list(cs.get("key_metrics")),
            source=cs.get("source", ""),
            url=cs.get("url", ""),
        )

    def make_trend(trend):
        return TrendAnalysis(
            trend=trend.get("trend", ""),
            direction=trend.get("direction", "growing"),
            evidence=as_list(trend.get("evidence")),
            impact=trend.get("impact"),
            timeline=trend.get("timeline"),
            sources=as_list(trend.get("sources")),
        )

    def make_comparison(comp):
        # A failure in any nested item discards the whole comparison table.
        items = [
            ComparisonItem(
                name=item.get("name", ""),
                description=item.get("description"),
                pros=as_list(item.get("pros")),
                cons=as_list(item.get("cons")),
                features=as_dict(item.get("features")),
                rating=item.get("rating"),
                source=item.get("source"),
            )
            for item in as_list(comp.get("items"))
        ]
        return ComparisonTable(
            title=comp.get("title", ""),
            criteria=as_list(comp.get("criteria")),
            items=items,
            winner=comp.get("winner"),
            verdict=comp.get("verdict"),
        )

    def make_source(src):
        return SourceWithRelevance(
            title=src.get("title", ""),
            url=src.get("url", ""),
            excerpt=src.get("excerpt"),
            relevance_score=as_float(src.get("relevance_score"), 0.8),
            relevance_reason=src.get("relevance_reason"),
            content_type=src.get("content_type"),
            published_date=src.get("published_date"),
            credibility_score=as_float(src.get("credibility_score"), 0.8),
        )

    statistics = build_each("statistics", "statistic", make_statistic)
    expert_quotes = build_each("expert_quotes", "expert quote", make_quote)
    case_studies = build_each("case_studies", "case study", make_case_study)
    trends = build_each("trends", "trend", make_trend)
    comparisons = build_each("comparisons", "comparison", make_comparison)

    # Pros/cons is a single object rather than a list.
    pros_cons = None
    pc_data = result.get("pros_cons")
    if pc_data:
        try:
            pros_cons = ProsCons(
                subject=pc_data.get("subject", intent.original_input),
                pros=as_list(pc_data.get("pros")),
                cons=as_list(pc_data.get("cons")),
                balanced_verdict=pc_data.get("balanced_verdict", ""),
            )
        except Exception as e:
            logger.warning(f"Failed to parse pros/cons: {e}")

    sources = build_each("sources", "source", make_source)

    # If the analysis produced no usable sources, fall back to the raw ones.
    if not sources:
        sources = self._extract_sources_from_raw(raw_results)

    return IntentDrivenResearchResult(
        success=True,
        primary_answer=result.get("primary_answer") or "",
        secondary_answers=as_dict(result.get("secondary_answers")),
        statistics=statistics,
        expert_quotes=expert_quotes,
        case_studies=case_studies,
        comparisons=comparisons,
        trends=trends,
        best_practices=as_list(result.get("best_practices")),
        step_by_step=as_list(result.get("step_by_step")),
        pros_cons=pros_cons,
        definitions=as_dict(result.get("definitions")),
        examples=as_list(result.get("examples")),
        predictions=as_list(result.get("predictions")),
        executive_summary=result.get("executive_summary") or "",
        key_takeaways=as_list(result.get("key_takeaways")),
        suggested_outline=as_list(result.get("suggested_outline")),
        sources=sources,
        raw_content=self._format_raw_results(raw_results)[:5000],
        confidence=as_float(result.get("confidence"), 0.7),
        gaps_identified=as_list(result.get("gaps_identified")),
        follow_up_queries=as_list(result.get("follow_up_queries")),
        original_intent=intent,
    )
|
||||
|
||||
def _extract_sources_from_raw(self, raw_results: Dict[str, Any]) -> List[SourceWithRelevance]:
    """Build source records straight from the raw provider payload.

    Used when the analysis did not return any sources, and when building the
    fallback result. At most the first 10 raw sources are considered, each
    excerpt is cut to 200 characters, and every source gets a fixed
    relevance score of 0.8. A source that cannot be converted is logged and
    skipped.

    Args:
        raw_results: Provider payload; only its ``sources`` key is read. A
            missing or null value is treated as no sources.

    Returns:
        The successfully built ``SourceWithRelevance`` objects.
    """
    extracted = []
    for src in (raw_results.get("sources") or [])[:10]:
        try:
            # `or` chaining so a null/empty "excerpt" does not hide "text" or
            # "content" (the same keys _format_raw_results falls back to).
            snippet = src.get("excerpt") or src.get("text") or src.get("content") or ""
            credibility = src.get("credibility_score")
            extracted.append(SourceWithRelevance(
                title=src.get("title") or "Untitled",
                url=src.get("url") or "",
                excerpt=str(snippet)[:200],
                relevance_score=0.8,
                credibility_score=0.8 if credibility is None else float(credibility),
            ))
        except Exception as e:
            logger.warning(f"Failed to extract source: {e}")

    return extracted
|
||||
|
||||
def _create_fallback_result(
    self,
    raw_results: Dict[str, Any],
    intent: ResearchIntent,
) -> IntentDrivenResearchResult:
    """Build a minimal result from the raw payload when AI analysis fails.

    The result carries the raw sources, a summary made of the first 300
    characters of the raw content, and up to five takeaways taken from the
    leading sentences of that content. Confidence is fixed at 0.5 and
    ``gaps_identified`` states that the analysis failed.

    Args:
        raw_results: Raw provider payload; ``content`` and ``sources`` are
            read here, and the whole payload is formatted into
            ``raw_content``.
        intent: The research intent; its primary question is echoed in
            ``primary_answer`` and it is stored as ``original_intent``.

    Returns:
        An ``IntentDrivenResearchResult`` with ``success=True``.
    """
    # A null "content" value is treated the same as a missing one.
    content = raw_results.get("content") or ""
    sources = self._extract_sources_from_raw(raw_results)

    # Use the first five ". "-separated fragments as rough takeaways,
    # ignoring fragments of 20 characters or fewer.
    key_takeaways = []
    for fragment in content.split(". ")[:5]:
        if len(fragment) <= 20:
            continue
        sentence = fragment.strip()
        # The final fragment keeps its own terminator, so only add a period
        # when the sentence does not already end with punctuation.
        if not sentence.endswith((".", "!", "?")):
            sentence += "."
        key_takeaways.append(sentence)

    return IntentDrivenResearchResult(
        success=True,
        primary_answer=f"Research findings for: {intent.primary_question}",
        secondary_answers={},
        executive_summary=content[:300] if content else "Research completed",
        key_takeaways=key_takeaways,
        sources=sources,
        raw_content=self._format_raw_results(raw_results)[:5000],
        confidence=0.5,
        gaps_identified=[
            "AI analysis failed - showing raw results",
            "Manual review recommended"
        ],
        follow_up_queries=[],
        original_intent=intent,
    )
|
||||
627
backend/services/research/intent/intent_prompt_builder.py
Normal file
627
backend/services/research/intent/intent_prompt_builder.py
Normal file
@@ -0,0 +1,627 @@
|
||||
"""
|
||||
Intent Prompt Builder
|
||||
|
||||
Builds comprehensive AI prompts for:
|
||||
1. Intent inference from user input
|
||||
2. Targeted query generation
|
||||
3. Intent-aware result analysis
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchPurpose,
|
||||
ContentOutput,
|
||||
ExpectedDeliverable,
|
||||
ResearchDepthLevel,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
|
||||
|
||||
class IntentPromptBuilder:
|
||||
"""Builds prompts for intent-driven research."""
|
||||
|
||||
# One-sentence explanation of each research purpose, keyed by ResearchPurpose
# member. Looked up by build_query_generation_prompt and
# build_intent_aware_analysis_prompt and embedded in the prompt text.
PURPOSE_EXPLANATIONS = {
    ResearchPurpose.LEARN: "User wants to understand a topic for personal knowledge",
    ResearchPurpose.CREATE_CONTENT: "User will create content (blog, video, podcast) from this research",
    ResearchPurpose.MAKE_DECISION: "User needs to make a choice/decision based on research",
    ResearchPurpose.COMPARE: "User wants to compare alternatives or competitors",
    ResearchPurpose.SOLVE_PROBLEM: "User is looking for a solution to a specific problem",
    ResearchPurpose.FIND_DATA: "User needs specific statistics, facts, or citations",
    ResearchPurpose.EXPLORE_TRENDS: "User wants to understand current/future trends",
    ResearchPurpose.VALIDATE: "User wants to verify or fact-check information",
    ResearchPurpose.GENERATE_IDEAS: "User wants to brainstorm content ideas",
}

# Short description of each deliverable type, keyed by ExpectedDeliverable
# member. Looked up by build_query_generation_prompt when listing the
# deliverables the user expects.
DELIVERABLE_DESCRIPTIONS = {
    ExpectedDeliverable.KEY_STATISTICS: "Numbers, percentages, data points with citations",
    ExpectedDeliverable.EXPERT_QUOTES: "Authoritative quotes from industry experts",
    ExpectedDeliverable.CASE_STUDIES: "Real examples and success stories",
    ExpectedDeliverable.COMPARISONS: "Side-by-side analysis tables",
    ExpectedDeliverable.TRENDS: "Current and emerging industry trends",
    ExpectedDeliverable.BEST_PRACTICES: "Recommended approaches and guidelines",
    ExpectedDeliverable.STEP_BY_STEP: "Process guides and how-to instructions",
    ExpectedDeliverable.PROS_CONS: "Advantages and disadvantages analysis",
    ExpectedDeliverable.DEFINITIONS: "Clear explanations of concepts and terms",
    ExpectedDeliverable.CITATIONS: "Authoritative sources for reference",
    ExpectedDeliverable.EXAMPLES: "Concrete examples to illustrate points",
    ExpectedDeliverable.PREDICTIONS: "Future outlook and predictions",
}
|
||||
|
||||
def build_intent_inference_prompt(
    self,
    user_input: str,
    keywords: List[str],
    research_persona: Optional[ResearchPersona] = None,
    competitor_data: Optional[List[Dict]] = None,
    industry: Optional[str] = None,
    target_audience: Optional[str] = None,
) -> str:
    """
    Compose the prompt that asks the model to infer the user's research intent.

    The prompt embeds the raw user input, an optional keyword line, the
    persona context and the competitor context, followed by the list of
    fields to infer and the JSON shape the answer must take.

    Args:
        user_input: The text the user typed; inserted verbatim in quotes.
        keywords: Keywords to list in the prompt; the line is left empty
            when the list is empty.
        research_persona: Optional persona passed to the persona-context helper.
        competitor_data: Optional competitor entries passed to the
            competitor-context helper.
        industry: Optional industry passed to the persona-context helper.
        target_audience: Optional audience passed to the persona-context helper.

    Returns:
        The complete prompt text.
    """
    # Context sections produced by the sibling helper methods.
    persona_section = self._build_persona_context(research_persona, industry, target_audience)
    competitor_section = self._build_competitor_context(competitor_data)

    # The keyword line collapses to an empty line when no keywords are given.
    if keywords:
        keywords_line = f"KEYWORDS: {', '.join(keywords)}"
    else:
        keywords_line = ""

    return f"""You are an expert research intent analyzer. Your job is to understand what a content creator REALLY needs from their research.

## USER INPUT
"{user_input}"

{keywords_line}

## USER CONTEXT
{persona_section}

{competitor_section}

## YOUR TASK

Analyze the user's input and infer their research intent. Determine:

1. **INPUT TYPE**: Is this:
- "keywords": Simple topic keywords (e.g., "AI healthcare 2025")
- "question": A specific question (e.g., "What are the best AI tools for healthcare?")
- "goal": A goal statement (e.g., "I need to write a blog about AI in healthcare")
- "mixed": Combination of above

2. **PRIMARY QUESTION**: What is the main question to answer? Convert their input into a clear question.

3. **SECONDARY QUESTIONS**: What related questions should also be answered? (3-5 questions)

4. **PURPOSE**: Why are they researching? Choose ONE:
- "learn": Understand a topic for personal knowledge
- "create_content": Create content (blog, video, podcast)
- "make_decision": Make a choice between options
- "compare": Compare alternatives/competitors
- "solve_problem": Find a solution
- "find_data": Get specific statistics/facts
- "explore_trends": Understand industry trends
- "validate": Verify claims/information
- "generate_ideas": Brainstorm ideas

5. **CONTENT OUTPUT**: What will they create? Choose ONE:
- "blog", "podcast", "video", "social_post", "newsletter", "presentation", "report", "whitepaper", "email", "general"

6. **EXPECTED DELIVERABLES**: What specific outputs do they need? Choose ALL that apply:
- "key_statistics": Numbers, data points
- "expert_quotes": Authoritative quotes
- "case_studies": Real examples
- "comparisons": Side-by-side analysis
- "trends": Industry trends
- "best_practices": Recommendations
- "step_by_step": How-to guides
- "pros_cons": Advantages/disadvantages
- "definitions": Concept explanations
- "citations": Source references
- "examples": Concrete examples
- "predictions": Future outlook

7. **DEPTH**: How deep should the research go?
- "overview": Quick summary
- "detailed": In-depth analysis
- "expert": Comprehensive expert-level

8. **FOCUS AREAS**: What specific aspects should be researched? (2-4 areas)

9. **PERSPECTIVE**: From whose viewpoint? (e.g., "marketing manager", "small business owner")

10. **TIME SENSITIVITY**: Is recency important?
- "real_time": Latest only (past 24-48 hours)
- "recent": Past week/month
- "historical": Include older content
- "evergreen": Timeless content

11. **CONFIDENCE**: How confident are you in this inference? (0.0-1.0)
- If < 0.7, set needs_clarification to true and provide clarifying_questions

## OUTPUT FORMAT

Return a JSON object:
```json
{{
"input_type": "keywords|question|goal|mixed",
"primary_question": "The main question to answer",
"secondary_questions": ["question 1", "question 2", "question 3"],
"purpose": "one of the purpose options",
"content_output": "one of the content options",
"expected_deliverables": ["deliverable1", "deliverable2"],
"depth": "overview|detailed|expert",
"focus_areas": ["area1", "area2"],
"perspective": "target perspective or null",
"time_sensitivity": "real_time|recent|historical|evergreen",
"confidence": 0.85,
"needs_clarification": false,
"clarifying_questions": [],
"analysis_summary": "Brief summary of what the user wants"
}}
```

## IMPORTANT RULES

1. Always convert vague input into a specific primary question
2. Infer deliverables based on purpose (e.g., create_content → statistics + examples)
3. Use persona context to refine perspective and focus areas
4. If input is ambiguous, provide clarifying questions
5. Default to "detailed" depth unless input suggests otherwise
6. For content creation, include relevant deliverables automatically
"""
|
||||
|
||||
def build_query_generation_prompt(
    self,
    intent: ResearchIntent,
    research_persona: Optional[ResearchPersona] = None,
) -> str:
    """
    Build the prompt that asks the model for targeted research queries.

    The prompt summarises the research intent (questions, purpose, content
    output, deliverables, depth, focus areas, perspective, time sensitivity),
    optionally lists up to 10 persona keywords, and specifies the JSON shape
    of the expected answer.

    A purpose or deliverable value that is not a member of its enum is shown
    as-is instead of raising ``ValueError``.

    Args:
        intent: The research intent to generate queries for.
        research_persona: Optional persona; only its ``suggested_keywords``
            are used here.

    Returns:
        The complete prompt text.
    """

    def describe_deliverable(name):
        """Return the description for *name*, or *name* itself if unknown."""
        # Enum lookup by value raises ValueError for unknown values, which
        # would otherwise bypass the dict's .get() default.
        try:
            return self.DELIVERABLE_DESCRIPTIONS.get(ExpectedDeliverable(name), name)
        except ValueError:
            return name

    deliverables_list = "\n".join(
        f"- {d}: {describe_deliverable(d)}" for d in intent.expected_deliverables
    )

    try:
        purpose_explanation = self.PURPOSE_EXPLANATIONS.get(
            ResearchPurpose(intent.purpose),
            intent.purpose,
        )
    except ValueError:
        purpose_explanation = intent.purpose

    if intent.secondary_questions:
        secondary_questions = "\n".join(f"- {q}" for q in intent.secondary_questions)
    else:
        secondary_questions = "None"

    persona_keywords = ""
    if research_persona and research_persona.suggested_keywords:
        persona_keywords = f"\nSUGGESTED KEYWORDS FROM PERSONA: {', '.join(research_persona.suggested_keywords[:10])}"

    prompt = f"""You are a research query optimizer. Generate multiple targeted search queries based on the user's research intent.

## RESEARCH INTENT

PRIMARY QUESTION: {intent.primary_question}

SECONDARY QUESTIONS:
{secondary_questions}

PURPOSE: {intent.purpose} - {purpose_explanation}

CONTENT OUTPUT: {intent.content_output}

EXPECTED DELIVERABLES:
{deliverables_list}

DEPTH: {intent.depth}

FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'}

PERSPECTIVE: {intent.perspective or 'General audience'}

TIME SENSITIVITY: {intent.time_sensitivity or 'No specific requirement'}
{persona_keywords}

## YOUR TASK

Generate 4-8 targeted research queries. Each query should:
1. Target a specific deliverable or question
2. Be optimized for semantic search (Exa/Tavily)
3. Include relevant context for better results

For each query, specify:
- The query string
- What deliverable it targets
- Best provider (exa for semantic/deep, tavily for news/real-time, google for factual)
- Priority (1-5, higher = more important)
- What we expect to find

## OUTPUT FORMAT

Return a JSON object:
```json
{{
"queries": [
{{
"query": "Healthcare AI adoption statistics 2025 hospitals implementation data",
"purpose": "key_statistics",
"provider": "exa",
"priority": 5,
"expected_results": "Statistics on hospital AI adoption rates"
}},
{{
"query": "AI healthcare trends predictions future outlook 2025 2026",
"purpose": "trends",
"provider": "tavily",
"priority": 4,
"expected_results": "Current trends and future predictions in healthcare AI"
}}
],
"enhanced_keywords": ["keyword1", "keyword2", "keyword3"],
"research_angles": [
"Angle 1: Focus on adoption challenges",
"Angle 2: Focus on ROI and outcomes"
]
}}
```

## QUERY OPTIMIZATION RULES

1. For STATISTICS: Include words like "statistics", "data", "percentage", "report", "study"
2. For CASE STUDIES: Include "case study", "success story", "implementation", "example"
3. For TRENDS: Include "trends", "future", "predictions", "emerging", year numbers
4. For EXPERT QUOTES: Include expert names if known, or "expert opinion", "interview"
5. For COMPARISONS: Include "vs", "compare", "comparison", "alternative"
6. For NEWS/REAL-TIME: Use Tavily, include recent year/month
7. For ACADEMIC/DEEP: Use Exa with neural search
"""

    return prompt
|
||||
|
||||
def build_intent_aware_analysis_prompt(
    self,
    raw_results: str,
    intent: ResearchIntent,
    research_persona: Optional[ResearchPersona] = None,
) -> str:
    """
    Build the prompt that asks the model to analyse raw results against the intent.

    The prompt restates the research intent, embeds at most the first 15000
    characters of the formatted raw results, adds the per-deliverable
    extraction instructions, and shows the JSON structure the answer must
    follow.

    A purpose value that is not a ``ResearchPurpose`` member is shown as-is
    instead of raising ``ValueError``.

    Args:
        raw_results: Raw research results already formatted as text.
        intent: The research intent the analysis should serve.
        research_persona: Accepted for interface compatibility; not used in
            the prompt text built here.

    Returns:
        The complete prompt text.
    """
    try:
        purpose_explanation = self.PURPOSE_EXPLANATIONS.get(
            ResearchPurpose(intent.purpose),
            intent.purpose,
        )
    except ValueError:
        # Enum lookup by value raises for unknown values; fall back to the
        # raw purpose the same way the .get() default would.
        purpose_explanation = intent.purpose

    deliverables_instructions = self._build_deliverables_instructions(intent.expected_deliverables)

    perspective_instruction = ""
    if intent.perspective:
        perspective_instruction = f"\n**PERSPECTIVE**: Analyze results from the viewpoint of: {intent.perspective}"

    if intent.secondary_questions:
        secondary_questions = "\n".join(f"- {q}" for q in intent.secondary_questions)
    else:
        secondary_questions = "None specified"

    # Truncated for token limits. Done here rather than annotated inside the
    # template, where a "#" remark would become part of the prompt itself.
    truncated_results = (raw_results or "")[:15000]

    prompt = f"""You are a research analyst helping a content creator find exactly what they need. Your job is to analyze raw research results and extract precisely what the user is looking for.

## USER'S RESEARCH INTENT

PRIMARY QUESTION: {intent.primary_question}

SECONDARY QUESTIONS:
{secondary_questions}

PURPOSE: {intent.purpose}
→ {purpose_explanation}

CONTENT OUTPUT: {intent.content_output}

EXPECTED DELIVERABLES: {', '.join(intent.expected_deliverables)}

FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'}
{perspective_instruction}

## RAW RESEARCH RESULTS

{truncated_results}

## YOUR TASK

Analyze the raw research results and extract EXACTLY what the user needs.

{deliverables_instructions}

## OUTPUT REQUIREMENTS

Provide results in this JSON structure:

```json
{{
"primary_answer": "Direct 2-3 sentence answer to the primary question",
"secondary_answers": {{
"Question 1?": "Answer to question 1",
"Question 2?": "Answer to question 2"
}},
"executive_summary": "2-3 sentence executive summary of all findings",
"key_takeaways": [
"Key takeaway 1 - most important finding",
"Key takeaway 2",
"Key takeaway 3",
"Key takeaway 4",
"Key takeaway 5"
],
"statistics": [
{{
"statistic": "72% of hospitals plan to adopt AI by 2025",
"value": "72%",
"context": "Survey of 500 US hospitals in 2024",
"source": "Healthcare AI Report 2024",
"url": "https://example.com/report",
"credibility": 0.9,
"recency": "2024"
}}
],
"expert_quotes": [
{{
"quote": "AI will revolutionize patient care within 5 years",
"speaker": "Dr. Jane Smith",
"title": "Chief Medical Officer",
"organization": "HealthTech Inc",
"source": "TechCrunch",
"url": "https://example.com/article"
}}
],
"case_studies": [
{{
"title": "Mayo Clinic AI Implementation",
"organization": "Mayo Clinic",
"challenge": "High patient wait times",
"solution": "AI-powered triage system",
"outcome": "40% reduction in wait times",
"key_metrics": ["40% faster triage", "95% patient satisfaction"],
"source": "Healthcare IT News",
"url": "https://example.com"
}}
],
"trends": [
{{
"trend": "AI-assisted diagnostics adoption",
"direction": "growing",
"evidence": ["25% YoY growth", "Major hospital chains investing"],
"impact": "Could reduce misdiagnosis by 30%",
"timeline": "Expected mainstream by 2027",
"sources": ["url1", "url2"]
}}
],
"comparisons": [
{{
"title": "Top AI Healthcare Platforms",
"criteria": ["Cost", "Features", "Support"],
"items": [
{{
"name": "Platform A",
"pros": ["Easy integration", "Good support"],
"cons": ["Higher cost"],
"features": {{"Cost": "$500/month", "Support": "24/7"}}
}}
],
"verdict": "Platform A best for large hospitals"
}}
],
"best_practices": [
"Start with a pilot program before full deployment",
"Ensure staff training is comprehensive"
],
"step_by_step": [
"Step 1: Assess current infrastructure",
"Step 2: Define use cases",
"Step 3: Select vendor"
],
"pros_cons": {{
"subject": "AI in Healthcare",
"pros": ["Improved accuracy", "Cost savings"],
"cons": ["Initial investment", "Training required"],
"balanced_verdict": "Benefits outweigh costs for most hospitals"
}},
"definitions": {{
"Clinical AI": "AI systems designed for medical diagnosis and treatment recommendations"
}},
"examples": [
"Example: Hospital X reduced readmissions by 25% using predictive AI"
],
"predictions": [
"By 2030, AI will assist in 80% of initial diagnoses"
],
"suggested_outline": [
"1. Introduction: The AI Healthcare Revolution",
"2. Current State: Where We Are Today",
"3. Key Statistics and Trends",
"4. Case Studies: Success Stories",
"5. Implementation Guide",
"6. Future Outlook"
],
"sources": [
{{
"title": "Healthcare AI Report 2024",
"url": "https://example.com",
"relevance_score": 0.95,
"relevance_reason": "Directly addresses adoption statistics",
"content_type": "research report",
"credibility_score": 0.9
}}
],
"confidence": 0.85,
"gaps_identified": [
"Specific cost data for small clinics not found",
"Limited information on regulatory challenges"
],
"follow_up_queries": [
"AI healthcare regulations FDA 2025",
"Small clinic AI implementation costs"
]
}}
```

## CRITICAL RULES

1. **ONLY include information directly from the raw results** - do not make up data
2. **ALWAYS include source URLs** for every statistic, quote, and case study
3. **If a deliverable type has no relevant data**, return an empty array for it
4. **Prioritize recency and credibility** when multiple sources conflict
5. **Answer the PRIMARY QUESTION directly** in 2-3 clear sentences
6. **Keep KEY TAKEAWAYS to 5-7 points** - the most important findings
7. **Add to gaps_identified** if expected information is missing
8. **Suggest follow_up_queries** for gaps or incomplete areas
9. **Rate confidence** based on how well results match the user's intent
10. **Include deliverables ONLY if they are in expected_deliverables** or critical to the question
"""

    return prompt
|
||||
|
||||
def _build_persona_context(
|
||||
self,
|
||||
research_persona: Optional[ResearchPersona],
|
||||
industry: Optional[str],
|
||||
target_audience: Optional[str],
|
||||
) -> str:
|
||||
"""Build persona context section for prompts."""
|
||||
|
||||
if not research_persona and not industry:
|
||||
return "No specific persona context available."
|
||||
|
||||
context_parts = []
|
||||
|
||||
if research_persona:
|
||||
context_parts.append(f"INDUSTRY: {research_persona.default_industry}")
|
||||
context_parts.append(f"TARGET AUDIENCE: {research_persona.default_target_audience}")
|
||||
if research_persona.suggested_keywords:
|
||||
context_parts.append(f"TYPICAL TOPICS: {', '.join(research_persona.suggested_keywords[:5])}")
|
||||
if research_persona.research_angles:
|
||||
context_parts.append(f"RESEARCH ANGLES: {', '.join(research_persona.research_angles[:3])}")
|
||||
else:
|
||||
if industry:
|
||||
context_parts.append(f"INDUSTRY: {industry}")
|
||||
if target_audience:
|
||||
context_parts.append(f"TARGET AUDIENCE: {target_audience}")
|
||||
|
||||
return "\n".join(context_parts)
|
||||
|
||||
def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
|
||||
"""Build competitor context section for prompts."""
|
||||
|
||||
if not competitor_data:
|
||||
return ""
|
||||
|
||||
competitor_names = []
|
||||
for comp in competitor_data[:5]: # Limit to 5
|
||||
name = comp.get("name") or comp.get("domain") or comp.get("url", "Unknown")
|
||||
competitor_names.append(name)
|
||||
|
||||
if competitor_names:
|
||||
return f"\nKNOWN COMPETITORS: {', '.join(competitor_names)}"
|
||||
|
||||
return ""
|
||||
|
||||
def _build_deliverables_instructions(self, expected_deliverables: List[str]) -> str:
    """Assemble the extraction instructions for the requested deliverables.

    Args:
        expected_deliverables: Deliverable identifiers; each is converted
            with ``ExpectedDeliverable(...)`` and silently skipped when the
            conversion raises ``ValueError``.

    Returns:
        The header lines followed by one instruction section per recognised
        deliverable, in request order, joined with newlines.
    """
    per_deliverable = {
        ExpectedDeliverable.KEY_STATISTICS: (
            "\n**STATISTICS**:\n"
            "- Extract ALL relevant statistics with exact numbers\n"
            "- Include source attribution (publication name, URL)\n"
            "- Note the recency of the data\n"
            "- Rate credibility based on source authority\n"
            "- Format: statistic statement, value, context, source, URL, credibility score\n"
        ),
        ExpectedDeliverable.EXPERT_QUOTES: (
            "\n**EXPERT QUOTES**:\n"
            "- Extract authoritative quotes from named experts\n"
            "- Include speaker name, title, and organization\n"
            "- Provide context for the quote\n"
            "- Include source URL\n"
        ),
        ExpectedDeliverable.CASE_STUDIES: (
            "\n**CASE STUDIES**:\n"
            "- Summarize each case study: challenge → solution → outcome\n"
            "- Include key metrics and results\n"
            "- Name the organization involved\n"
            "- Provide source URL\n"
        ),
        ExpectedDeliverable.TRENDS: (
            "\n**TRENDS**:\n"
            "- Identify current and emerging trends\n"
            "- Note direction: growing, declining, emerging, or stable\n"
            "- List supporting evidence\n"
            "- Include timeline predictions if available\n"
            "- Cite sources\n"
        ),
        ExpectedDeliverable.COMPARISONS: (
            "\n**COMPARISONS**:\n"
            "- Build comparison tables where applicable\n"
            "- Define clear comparison criteria\n"
            "- List pros and cons for each option\n"
            "- Provide a verdict/recommendation if data supports it\n"
        ),
        ExpectedDeliverable.BEST_PRACTICES: (
            "\n**BEST PRACTICES**:\n"
            "- Extract recommended approaches\n"
            "- Provide actionable guidelines\n"
            "- Order by importance or sequence\n"
        ),
        ExpectedDeliverable.STEP_BY_STEP: (
            "\n**STEP BY STEP**:\n"
            "- Extract process/how-to instructions\n"
            "- Number steps clearly\n"
            "- Include any prerequisites or requirements\n"
        ),
        ExpectedDeliverable.PROS_CONS: (
            "\n**PROS AND CONS**:\n"
            "- List advantages (pros)\n"
            "- List disadvantages (cons)\n"
            "- Provide a balanced verdict\n"
        ),
        ExpectedDeliverable.DEFINITIONS: (
            "\n**DEFINITIONS**:\n"
            "- Extract clear explanations of key terms and concepts\n"
            "- Keep definitions concise but comprehensive\n"
        ),
        ExpectedDeliverable.EXAMPLES: (
            "\n**EXAMPLES**:\n"
            "- Extract concrete examples that illustrate key points\n"
            "- Include real-world applications\n"
        ),
        ExpectedDeliverable.PREDICTIONS: (
            "\n**PREDICTIONS**:\n"
            "- Extract future outlook and predictions\n"
            "- Note the source and their track record if known\n"
            "- Include timeframes where mentioned\n"
        ),
        ExpectedDeliverable.CITATIONS: (
            "\n**CITATIONS**:\n"
            "- List all authoritative sources with URLs\n"
            "- Rate credibility and relevance\n"
            "- Note content type (research, news, opinion, etc.)\n"
        ),
    }

    sections = [
        "### EXTRACTION INSTRUCTIONS\n",
        "For each requested deliverable, extract the following:\n",
    ]

    for requested in expected_deliverables:
        try:
            key = ExpectedDeliverable(requested)
        except ValueError:
            # Unrecognised deliverable: contributes no section
            continue
        section = per_deliverable.get(key)
        if section is not None:
            sections.append(section)

    return "\n".join(sections)
|
||||
387
backend/services/research/intent/intent_query_generator.py
Normal file
387
backend/services/research/intent/intent_query_generator.py
Normal file
@@ -0,0 +1,387 @@
|
||||
"""
|
||||
Intent Query Generator
|
||||
|
||||
Generates multiple targeted research queries based on user intent.
|
||||
Each query targets a specific deliverable or question.
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchQuery,
|
||||
ExpectedDeliverable,
|
||||
ResearchPurpose,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .intent_prompt_builder import IntentPromptBuilder
|
||||
|
||||
|
||||
class IntentQueryGenerator:
    """
    Generates targeted research queries based on user intent.

    Instead of a single generic search, generates multiple queries
    each targeting a specific deliverable or question.
    """

    def __init__(self):
        """Initialize the query generator."""
        self.prompt_builder = IntentPromptBuilder()
        logger.info("IntentQueryGenerator initialized")

    async def generate_queries(
        self,
        intent: ResearchIntent,
        research_persona: Optional[ResearchPersona] = None,
    ) -> Dict[str, Any]:
        """
        Generate targeted research queries based on intent.

        Any failure (LLM error payload, unexpected result type, exception)
        degrades to template-based fallback queries instead of raising.

        Args:
            intent: The inferred research intent
            research_persona: Optional persona for context

        Returns:
            Dict with queries, enhanced_keywords, and research_angles
        """
        try:
            logger.info(f"Generating queries for: {intent.primary_question[:50]}...")

            # Build the query generation prompt
            prompt = self.prompt_builder.build_query_generation_prompt(
                intent=intent,
                research_persona=research_persona,
            )

            # Define the expected JSON schema
            query_schema = {
                "type": "object",
                "properties": {
                    "queries": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "query": {"type": "string"},
                                "purpose": {"type": "string"},
                                "provider": {"type": "string"},
                                "priority": {"type": "integer"},
                                "expected_results": {"type": "string"}
                            },
                            "required": ["query", "purpose", "provider", "priority", "expected_results"]
                        }
                    },
                    "enhanced_keywords": {"type": "array", "items": {"type": "string"}},
                    "research_angles": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["queries", "enhanced_keywords", "research_angles"]
            }

            # Call LLM for query generation (imported lazily, as in the
            # rest of this package)
            from services.llm_providers.main_text_generation import llm_text_gen

            # NOTE(review): llm_text_gen is called without await inside this
            # coroutine; if it performs blocking I/O it will hold up the
            # event loop while it runs - confirm and offload if so.
            result = llm_text_gen(
                prompt=prompt,
                json_struct=query_schema,
                user_id=None
            )

            # A structured (dict) result is required below; anything else
            # cannot be parsed into queries.
            if not isinstance(result, dict):
                logger.error(
                    f"Query generation returned unexpected result type: {type(result).__name__}"
                )
                return self._create_fallback_queries(intent)

            if "error" in result:
                logger.error(f"Query generation failed: {result.get('error')}")
                return self._create_fallback_queries(intent)

            # Parse queries (`or []` also covers an explicit null)
            queries = self._parse_queries(result.get("queries") or [])

            # Ensure we have queries for all expected deliverables
            queries = self._ensure_deliverable_coverage(queries, intent)

            # Sort by priority, highest first
            queries.sort(key=lambda q: q.priority, reverse=True)

            logger.info(f"Generated {len(queries)} targeted queries")

            return {
                "queries": queries,
                "enhanced_keywords": result.get("enhanced_keywords") or [],
                "research_angles": result.get("research_angles") or [],
            }

        except Exception as e:
            # Top-level boundary: query generation is best-effort, so log
            # and fall back rather than propagate.
            logger.error(f"Error generating queries: {e}")
            return self._create_fallback_queries(intent)

    def _parse_queries(self, raw_queries: List[Dict]) -> List[ResearchQuery]:
        """Parse raw query data into ResearchQuery objects.

        Entries that cannot be parsed, or whose query text is blank, are
        skipped with a warning. An unknown purpose falls back to
        KEY_STATISTICS; a missing or non-numeric priority falls back to 3.
        Priority is clamped to the 1-5 range.
        """
        queries = []
        for q in raw_queries or []:
            try:
                query_text = q.get("query") or ""
                if not str(query_text).strip():
                    # A blank search string cannot return useful results
                    logger.warning("Skipping generated query with empty text")
                    continue

                # Validate purpose
                purpose_str = q.get("purpose", "key_statistics")
                try:
                    purpose = ExpectedDeliverable(purpose_str)
                except ValueError:
                    purpose = ExpectedDeliverable.KEY_STATISTICS

                # A bad priority should not cost us an otherwise valid query
                try:
                    priority = int(q.get("priority", 3))
                except (TypeError, ValueError):
                    priority = 3

                query = ResearchQuery(
                    query=query_text,
                    purpose=purpose,
                    provider=q.get("provider") or "exa",
                    priority=min(max(priority, 1), 5),  # Clamp 1-5
                    expected_results=q.get("expected_results") or "",
                )
                queries.append(query)
            except Exception as e:
                logger.warning(f"Failed to parse query: {e}")
                continue

        return queries

    def _ensure_deliverable_coverage(
        self,
        queries: List[ResearchQuery],
        intent: ResearchIntent,
    ) -> List[ResearchQuery]:
        """Ensure we have queries for all expected deliverables.

        Appends one template query for every expected deliverable that no
        existing query targets. The input list is extended in place and
        returned.
        """
        # Deliverables already covered. getattr() tolerates purposes stored
        # either as enum members or as plain strings.
        covered = {getattr(q.purpose, "value", q.purpose) for q in queries}

        # Check for missing deliverables
        for deliverable in intent.expected_deliverables:
            key = getattr(deliverable, "value", deliverable)
            if key in covered:
                continue

            # Generate a query for this deliverable
            queries.append(
                self._generate_query_for_deliverable(
                    deliverable=key,
                    intent=intent,
                )
            )
            # Record it so a repeated deliverable does not add a second query
            covered.add(key)

        return queries

    def _generate_query_for_deliverable(
        self,
        deliverable: str,
        intent: ResearchIntent,
    ) -> ResearchQuery:
        """Generate a query targeting a specific deliverable.

        Unknown deliverables get a generic query on the topic with purpose
        KEY_STATISTICS.
        """
        # Accept an enum member as well as its string value
        deliverable = getattr(deliverable, "value", deliverable)

        # The user's original input is used verbatim as the topic
        topic = intent.original_input

        # Query templates by deliverable type
        templates = {
            ExpectedDeliverable.KEY_STATISTICS.value: {
                "query": f"{topic} statistics data report study",
                "provider": "exa",
                "priority": 5,
                "expected": "Statistical data and research findings",
            },
            ExpectedDeliverable.EXPERT_QUOTES.value: {
                "query": f"{topic} expert opinion interview insights",
                "provider": "exa",
                "priority": 4,
                "expected": "Expert opinions and authoritative quotes",
            },
            ExpectedDeliverable.CASE_STUDIES.value: {
                "query": f"{topic} case study success story implementation example",
                "provider": "exa",
                "priority": 4,
                "expected": "Real-world case studies and examples",
            },
            ExpectedDeliverable.TRENDS.value: {
                "query": f"{topic} trends 2025 future predictions emerging",
                "provider": "tavily",
                "priority": 4,
                "expected": "Current trends and future predictions",
            },
            ExpectedDeliverable.COMPARISONS.value: {
                "query": f"{topic} comparison vs versus alternatives",
                "provider": "exa",
                "priority": 4,
                "expected": "Comparison and alternative options",
            },
            ExpectedDeliverable.BEST_PRACTICES.value: {
                "query": f"{topic} best practices recommendations guidelines",
                "provider": "exa",
                "priority": 3,
                "expected": "Best practices and recommendations",
            },
            ExpectedDeliverable.STEP_BY_STEP.value: {
                "query": f"{topic} how to guide tutorial steps",
                "provider": "exa",
                "priority": 3,
                "expected": "Step-by-step guides and tutorials",
            },
            ExpectedDeliverable.PROS_CONS.value: {
                "query": f"{topic} advantages disadvantages pros cons benefits",
                "provider": "exa",
                "priority": 3,
                "expected": "Pros, cons, and trade-offs",
            },
            ExpectedDeliverable.DEFINITIONS.value: {
                "query": f"what is {topic} definition explained",
                "provider": "exa",
                "priority": 3,
                "expected": "Clear definitions and explanations",
            },
            ExpectedDeliverable.EXAMPLES.value: {
                "query": f"{topic} examples real world applications",
                "provider": "exa",
                "priority": 3,
                "expected": "Real-world examples and applications",
            },
            ExpectedDeliverable.PREDICTIONS.value: {
                "query": f"{topic} future outlook predictions 2025 2030",
                "provider": "tavily",
                "priority": 4,
                "expected": "Future predictions and outlook",
            },
            ExpectedDeliverable.CITATIONS.value: {
                "query": f"{topic} research paper study academic",
                "provider": "exa",
                "priority": 4,
                "expected": "Authoritative academic sources",
            },
        }

        template = templates.get(deliverable, {
            "query": f"{topic}",
            "provider": "exa",
            "priority": 3,
            "expected": "General information",
        })

        try:
            purpose = ExpectedDeliverable(deliverable)
        except ValueError:
            purpose = ExpectedDeliverable.KEY_STATISTICS

        return ResearchQuery(
            query=template["query"],
            purpose=purpose,
            provider=template["provider"],
            priority=template["priority"],
            expected_results=template["expected"],
        )

    def _create_fallback_queries(self, intent: ResearchIntent) -> Dict[str, Any]:
        """Create fallback queries when AI generation fails.

        Returns the same dict shape as ``generate_queries``: template
        queries for up to five expected deliverables (or one general query
        when there are none), keywords split from the original input, and
        two generic research angles.
        """
        topic = intent.original_input

        # Generate basic queries for each expected deliverable
        queries = [
            self._generate_query_for_deliverable(deliverable, intent)
            for deliverable in intent.expected_deliverables[:5]  # Limit to 5
        ]

        # Add a general query if we have none
        if not queries:
            queries.append(ResearchQuery(
                query=topic,
                purpose=ExpectedDeliverable.KEY_STATISTICS,
                provider="exa",
                priority=5,
                expected_results="General information and insights",
            ))

        return {
            "queries": queries,
            "enhanced_keywords": topic.split()[:10],
            "research_angles": [
                f"Overview of {topic}",
                f"Latest trends in {topic}",
            ],
        }
|
||||
|
||||
|
||||
class QueryOptimizer:
    """
    Optimizes queries for different research providers.

    Different providers have different strengths:
    - Exa: Semantic search, good for deep research
    - Tavily: Real-time search, good for news/trends
    - Google: Factual search, good for basic info
    """

    @staticmethod
    def optimize_for_exa(query: str, intent: ResearchIntent) -> Dict[str, Any]:
        """Build Exa search parameters for ``query`` from the intent.

        Category, search type and result count are chosen from the intent's
        expected deliverables, purpose and depth.
        """
        wanted = intent.expected_deliverables
        wants_trends = ExpectedDeliverable.TRENDS.value in wanted

        # Category: citations win over trends, trends over comparison
        if ExpectedDeliverable.CITATIONS.value in wanted:
            category = "research paper"
        elif wants_trends:
            category = "news"
        elif intent.purpose == ResearchPurpose.COMPARE.value:
            category = "company"
        else:
            category = None

        # Neural (semantic) search by default; auto for time-sensitive trends
        search_type = "auto" if wants_trends else "neural"

        # Result count scales with the requested depth
        depth = intent.depth
        if depth == "expert":
            num_results = 20
        elif depth == "overview":
            num_results = 5
        else:
            num_results = 10

        return {
            "query": query,
            "type": search_type,
            "category": category,
            "num_results": num_results,
            "text": True,
            "highlights": True,
        }

    @staticmethod
    def optimize_for_tavily(query: str, intent: ResearchIntent) -> Dict[str, Any]:
        """Build Tavily search parameters for ``query`` from the intent.

        Topic, search depth, answer mode and time range are chosen from the
        intent's expected deliverables, depth and time sensitivity.
        """
        wanted = intent.expected_deliverables
        wants_trends = ExpectedDeliverable.TRENDS.value in wanted

        topic = "news" if wants_trends else "general"
        search_depth = "advanced" if intent.depth in ["detailed", "expert"] else "basic"

        # Answer mode: definitions take the advanced answer, statistics the
        # basic one, otherwise no answer is requested
        if ExpectedDeliverable.DEFINITIONS.value in wanted:
            include_answer = "advanced"
        elif ExpectedDeliverable.KEY_STATISTICS.value in wanted:
            include_answer = "basic"
        else:
            include_answer = False

        # Time window: explicit sensitivity first, then a month for trends
        sensitivity = intent.time_sensitivity
        if sensitivity == "real_time":
            time_range = "day"
        elif sensitivity == "recent":
            time_range = "week"
        elif wants_trends:
            time_range = "month"
        else:
            time_range = None

        return {
            "query": query,
            "topic": topic,
            "search_depth": search_depth,
            "include_answer": include_answer,
            "time_range": time_range,
            "max_results": 10,
        }
|
||||
378
backend/services/research/intent/research_intent_inference.py
Normal file
378
backend/services/research/intent/research_intent_inference.py
Normal file
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
Research Intent Inference Service
|
||||
|
||||
Analyzes user input to understand their research intent.
|
||||
Uses AI to infer:
|
||||
- What the user wants to accomplish
|
||||
- What questions need answering
|
||||
- What deliverables they expect
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchPurpose,
|
||||
ContentOutput,
|
||||
ExpectedDeliverable,
|
||||
ResearchDepthLevel,
|
||||
InputType,
|
||||
IntentInferenceRequest,
|
||||
IntentInferenceResponse,
|
||||
ResearchQuery,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .intent_prompt_builder import IntentPromptBuilder
|
||||
|
||||
|
||||
class ResearchIntentInference:
|
||||
"""
|
||||
Infers user research intent from minimal input.
|
||||
|
||||
Instead of asking a formal questionnaire, this service
|
||||
uses AI to understand what the user really wants.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the intent inference service."""
|
||||
self.prompt_builder = IntentPromptBuilder()
|
||||
logger.info("ResearchIntentInference initialized")
|
||||
|
||||
async def infer_intent(
|
||||
self,
|
||||
user_input: str,
|
||||
keywords: Optional[List[str]] = None,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
competitor_data: Optional[List[Dict]] = None,
|
||||
industry: Optional[str] = None,
|
||||
target_audience: Optional[str] = None,
|
||||
) -> IntentInferenceResponse:
|
||||
"""
|
||||
Analyze user input and infer their research intent.
|
||||
|
||||
Args:
|
||||
user_input: User's keywords, question, or goal
|
||||
keywords: Extracted keywords (optional)
|
||||
research_persona: User's research persona (optional)
|
||||
competitor_data: Competitor analysis data (optional)
|
||||
industry: Industry context (optional)
|
||||
target_audience: Target audience context (optional)
|
||||
|
||||
Returns:
|
||||
IntentInferenceResponse with inferred intent and suggested queries
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Inferring intent for: {user_input[:100]}...")
|
||||
|
||||
keywords = keywords or []
|
||||
|
||||
# Build the inference prompt
|
||||
prompt = self.prompt_builder.build_intent_inference_prompt(
|
||||
user_input=user_input,
|
||||
keywords=keywords,
|
||||
research_persona=research_persona,
|
||||
competitor_data=competitor_data,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
)
|
||||
|
||||
# Define the expected JSON schema
|
||||
intent_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
|
||||
"primary_question": {"type": "string"},
|
||||
"secondary_questions": {"type": "array", "items": {"type": "string"}},
|
||||
"purpose": {"type": "string"},
|
||||
"content_output": {"type": "string"},
|
||||
"expected_deliverables": {"type": "array", "items": {"type": "string"}},
|
||||
"depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
|
||||
"focus_areas": {"type": "array", "items": {"type": "string"}},
|
||||
"perspective": {"type": "string"},
|
||||
"time_sensitivity": {"type": "string"},
|
||||
"confidence": {"type": "number"},
|
||||
"needs_clarification": {"type": "boolean"},
|
||||
"clarifying_questions": {"type": "array", "items": {"type": "string"}},
|
||||
"analysis_summary": {"type": "string"}
|
||||
},
|
||||
"required": [
|
||||
"input_type", "primary_question", "purpose", "content_output",
|
||||
"expected_deliverables", "depth", "confidence", "analysis_summary"
|
||||
]
|
||||
}
|
||||
|
||||
# Call LLM for intent inference
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
|
||||
result = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=intent_schema,
|
||||
user_id=None
|
||||
)
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
logger.error(f"Intent inference failed: {result.get('error')}")
|
||||
return self._create_fallback_response(user_input, keywords)
|
||||
|
||||
# Parse and validate the result
|
||||
intent = self._parse_intent_result(result, user_input)
|
||||
|
||||
# Generate quick options for UI
|
||||
quick_options = self._generate_quick_options(intent, result)
|
||||
|
||||
# Create response
|
||||
response = IntentInferenceResponse(
|
||||
success=True,
|
||||
intent=intent,
|
||||
analysis_summary=result.get("analysis_summary", "Research intent analyzed"),
|
||||
suggested_queries=[], # Will be populated by query generator
|
||||
suggested_keywords=self._extract_keywords_from_input(user_input, keywords),
|
||||
suggested_angles=result.get("focus_areas", []),
|
||||
quick_options=quick_options,
|
||||
)
|
||||
|
||||
logger.info(f"Intent inferred: purpose={intent.purpose}, confidence={intent.confidence}")
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error inferring intent: {e}")
|
||||
return self._create_fallback_response(user_input, keywords or [])
|
||||
|
||||
def _parse_intent_result(self, result: Dict[str, Any], user_input: str) -> ResearchIntent:
|
||||
"""Parse LLM result into ResearchIntent model."""
|
||||
|
||||
# Map string values to enums safely
|
||||
input_type = self._safe_enum(InputType, result.get("input_type", "keywords"), InputType.KEYWORDS)
|
||||
purpose = self._safe_enum(ResearchPurpose, result.get("purpose", "learn"), ResearchPurpose.LEARN)
|
||||
content_output = self._safe_enum(ContentOutput, result.get("content_output", "general"), ContentOutput.GENERAL)
|
||||
depth = self._safe_enum(ResearchDepthLevel, result.get("depth", "detailed"), ResearchDepthLevel.DETAILED)
|
||||
|
||||
# Parse expected deliverables
|
||||
raw_deliverables = result.get("expected_deliverables", [])
|
||||
expected_deliverables = []
|
||||
for d in raw_deliverables:
|
||||
try:
|
||||
expected_deliverables.append(ExpectedDeliverable(d).value)
|
||||
except ValueError:
|
||||
# Skip invalid deliverables
|
||||
pass
|
||||
|
||||
# Ensure we have at least some deliverables
|
||||
if not expected_deliverables:
|
||||
expected_deliverables = self._infer_deliverables_from_purpose(purpose)
|
||||
|
||||
return ResearchIntent(
|
||||
primary_question=result.get("primary_question", user_input),
|
||||
secondary_questions=result.get("secondary_questions", []),
|
||||
purpose=purpose.value,
|
||||
content_output=content_output.value,
|
||||
expected_deliverables=expected_deliverables,
|
||||
depth=depth.value,
|
||||
focus_areas=result.get("focus_areas", []),
|
||||
perspective=result.get("perspective"),
|
||||
time_sensitivity=result.get("time_sensitivity"),
|
||||
input_type=input_type.value,
|
||||
original_input=user_input,
|
||||
confidence=float(result.get("confidence", 0.7)),
|
||||
needs_clarification=result.get("needs_clarification", False),
|
||||
clarifying_questions=result.get("clarifying_questions", []),
|
||||
)
|
||||
|
||||
def _safe_enum(self, enum_class, value: str, default):
|
||||
"""Safely convert string to enum, returning default if invalid."""
|
||||
try:
|
||||
return enum_class(value)
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
def _infer_deliverables_from_purpose(self, purpose: ResearchPurpose) -> List[str]:
|
||||
"""Infer expected deliverables based on research purpose."""
|
||||
|
||||
purpose_deliverables = {
|
||||
ResearchPurpose.LEARN: [
|
||||
ExpectedDeliverable.DEFINITIONS.value,
|
||||
ExpectedDeliverable.EXAMPLES.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
],
|
||||
ResearchPurpose.CREATE_CONTENT: [
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
ExpectedDeliverable.EXPERT_QUOTES.value,
|
||||
ExpectedDeliverable.EXAMPLES.value,
|
||||
ExpectedDeliverable.CASE_STUDIES.value,
|
||||
],
|
||||
ResearchPurpose.MAKE_DECISION: [
|
||||
ExpectedDeliverable.PROS_CONS.value,
|
||||
ExpectedDeliverable.COMPARISONS.value,
|
||||
ExpectedDeliverable.BEST_PRACTICES.value,
|
||||
],
|
||||
ResearchPurpose.COMPARE: [
|
||||
ExpectedDeliverable.COMPARISONS.value,
|
||||
ExpectedDeliverable.PROS_CONS.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
],
|
||||
ResearchPurpose.SOLVE_PROBLEM: [
|
||||
ExpectedDeliverable.STEP_BY_STEP.value,
|
||||
ExpectedDeliverable.BEST_PRACTICES.value,
|
||||
ExpectedDeliverable.CASE_STUDIES.value,
|
||||
],
|
||||
ResearchPurpose.FIND_DATA: [
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
ExpectedDeliverable.CITATIONS.value,
|
||||
],
|
||||
ResearchPurpose.EXPLORE_TRENDS: [
|
||||
ExpectedDeliverable.TRENDS.value,
|
||||
ExpectedDeliverable.PREDICTIONS.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
],
|
||||
ResearchPurpose.VALIDATE: [
|
||||
ExpectedDeliverable.CITATIONS.value,
|
||||
ExpectedDeliverable.KEY_STATISTICS.value,
|
||||
ExpectedDeliverable.EXPERT_QUOTES.value,
|
||||
],
|
||||
ResearchPurpose.GENERATE_IDEAS: [
|
||||
ExpectedDeliverable.EXAMPLES.value,
|
||||
ExpectedDeliverable.TRENDS.value,
|
||||
ExpectedDeliverable.CASE_STUDIES.value,
|
||||
],
|
||||
}
|
||||
|
||||
return purpose_deliverables.get(purpose, [ExpectedDeliverable.KEY_STATISTICS.value])
|
||||
|
||||
def _generate_quick_options(self, intent: ResearchIntent, result: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""Generate quick options for UI confirmation."""
|
||||
|
||||
options = []
|
||||
|
||||
# Purpose option
|
||||
options.append({
|
||||
"id": "purpose",
|
||||
"label": "Research Purpose",
|
||||
"value": intent.purpose,
|
||||
"display": self._purpose_display(intent.purpose),
|
||||
"alternatives": [p.value for p in ResearchPurpose],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
})
|
||||
|
||||
# Content output option
|
||||
if intent.content_output != ContentOutput.GENERAL.value:
|
||||
options.append({
|
||||
"id": "content_output",
|
||||
"label": "Content Type",
|
||||
"value": intent.content_output,
|
||||
"display": intent.content_output.replace("_", " ").title(),
|
||||
"alternatives": [c.value for c in ContentOutput],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
})
|
||||
|
||||
# Deliverables option
|
||||
options.append({
|
||||
"id": "deliverables",
|
||||
"label": "What I'll Find",
|
||||
"value": intent.expected_deliverables,
|
||||
"display": [d.replace("_", " ").title() for d in intent.expected_deliverables[:4]],
|
||||
"alternatives": [d.value for d in ExpectedDeliverable],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
"multi_select": True,
|
||||
})
|
||||
|
||||
# Depth option
|
||||
options.append({
|
||||
"id": "depth",
|
||||
"label": "Research Depth",
|
||||
"value": intent.depth,
|
||||
"display": intent.depth.title(),
|
||||
"alternatives": [d.value for d in ResearchDepthLevel],
|
||||
"confidence": result.get("confidence", 0.7),
|
||||
})
|
||||
|
||||
return options
|
||||
|
||||
def _purpose_display(self, purpose: str) -> str:
|
||||
"""Get display-friendly purpose text."""
|
||||
display_map = {
|
||||
"learn": "Understand this topic",
|
||||
"create_content": "Create content about this",
|
||||
"make_decision": "Make a decision",
|
||||
"compare": "Compare options",
|
||||
"solve_problem": "Solve a problem",
|
||||
"find_data": "Find specific data",
|
||||
"explore_trends": "Explore trends",
|
||||
"validate": "Validate information",
|
||||
"generate_ideas": "Generate ideas",
|
||||
}
|
||||
return display_map.get(purpose, purpose.replace("_", " ").title())
|
||||
|
||||
def _extract_keywords_from_input(self, user_input: str, keywords: List[str]) -> List[str]:
|
||||
"""Extract and enhance keywords from user input."""
|
||||
|
||||
# Start with provided keywords
|
||||
extracted = list(keywords) if keywords else []
|
||||
|
||||
# Simple extraction from input (split on common delimiters)
|
||||
words = user_input.lower().replace(",", " ").replace(";", " ").split()
|
||||
|
||||
# Filter out common words
|
||||
stop_words = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
||||
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
||||
"should", "may", "might", "must", "shall", "can", "need", "dare",
|
||||
"to", "of", "in", "for", "on", "with", "at", "by", "from", "up",
|
||||
"about", "into", "through", "during", "before", "after", "above",
|
||||
"below", "between", "under", "again", "further", "then", "once",
|
||||
"here", "there", "when", "where", "why", "how", "all", "each",
|
||||
"few", "more", "most", "other", "some", "such", "no", "nor", "not",
|
||||
"only", "own", "same", "so", "than", "too", "very", "just", "and",
|
||||
"but", "if", "or", "because", "as", "until", "while", "i", "we",
|
||||
"you", "they", "what", "which", "who", "whom", "this", "that",
|
||||
"these", "those", "am", "want", "write", "blog", "post", "article",
|
||||
}
|
||||
|
||||
for word in words:
|
||||
if word not in stop_words and len(word) > 2 and word not in extracted:
|
||||
extracted.append(word)
|
||||
|
||||
return extracted[:15] # Limit to 15 keywords
|
||||
|
||||
def _create_fallback_response(self, user_input: str, keywords: List[str]) -> IntentInferenceResponse:
    """Build a generic, low-confidence response for when AI inference fails.

    The returned intent simply wraps the raw input in template questions,
    uses fixed enum defaults for purpose/output/depth, and is flagged as
    needing clarification with a confidence of 0.5.

    Args:
        user_input: The text the user originally entered; embedded verbatim
            in the generated questions and the analysis summary.
        keywords: Keywords passed through unchanged as ``suggested_keywords``.

    Returns:
        An ``IntentInferenceResponse`` with ``success=True`` and empty
        query/angle/quick-option suggestions.
    """
    # Template questions derived directly from the raw input.
    main_question = f"What are the key insights about: {user_input}?"
    follow_up_questions = [
        f"What are the latest trends in {user_input}?",
        f"What are best practices for {user_input}?",
    ]

    # Fixed set of deliverables used for every fallback.
    default_deliverables = [
        ExpectedDeliverable.KEY_STATISTICS.value,
        ExpectedDeliverable.EXAMPLES.value,
        ExpectedDeliverable.BEST_PRACTICES.value,
    ]

    # Questions shown to the user because the intent could not be inferred.
    questions_for_user = [
        "What type of content are you creating?",
        "What specific aspects are you most interested in?",
    ]

    basic_intent = ResearchIntent(
        primary_question=main_question,
        secondary_questions=follow_up_questions,
        purpose=ResearchPurpose.LEARN.value,
        content_output=ContentOutput.GENERAL.value,
        expected_deliverables=default_deliverables,
        depth=ResearchDepthLevel.DETAILED.value,
        focus_areas=[],
        input_type=InputType.KEYWORDS.value,
        original_input=user_input,
        confidence=0.5,
        needs_clarification=True,
        clarifying_questions=questions_for_user,
    )

    # The call is still reported as successful; the reduced confidence on the
    # intent is what signals that this is only a fallback.
    response = IntentInferenceResponse(
        success=True,
        intent=basic_intent,
        analysis_summary=f"Basic research analysis for: {user_input}",
        suggested_queries=[],
        suggested_keywords=keywords,
        suggested_angles=[],
        quick_options=[],
    )
    return response
|
||||
@@ -5,7 +5,7 @@ Handles building comprehensive prompts for research persona generation.
|
||||
Generates personalized research defaults, suggestions, and configurations.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any
|
||||
from typing import Dict, Any, List
|
||||
import json
|
||||
from loguru import logger
|
||||
|
||||
@@ -21,9 +21,34 @@ class ResearchPersonaPromptBuilder:
|
||||
persona_data = onboarding_data.get("persona_data", {}) or {}
|
||||
research_prefs = onboarding_data.get("research_preferences", {}) or {}
|
||||
business_info = onboarding_data.get("business_info", {}) or {}
|
||||
competitor_analysis = onboarding_data.get("competitor_analysis", []) or []
|
||||
|
||||
# Extract core persona
|
||||
core_persona = persona_data.get("core_persona", {}) or {}
|
||||
# Extract core persona - handle both camelCase and snake_case
|
||||
core_persona = persona_data.get("corePersona") or persona_data.get("core_persona") or {}
|
||||
|
||||
# Phase 1: Extract key website analysis fields for enhanced personalization
|
||||
writing_style = website_analysis.get("writing_style", {}) or {}
|
||||
content_type = website_analysis.get("content_type", {}) or {}
|
||||
crawl_result = website_analysis.get("crawl_result", {}) or {}
|
||||
|
||||
# Phase 2: Extract additional fields for pattern-based personalization
|
||||
style_patterns = website_analysis.get("style_patterns", {}) or {}
|
||||
content_characteristics = website_analysis.get("content_characteristics", {}) or {}
|
||||
style_guidelines = website_analysis.get("style_guidelines", {}) or {}
|
||||
|
||||
# Extract topics/keywords from crawl_result (if available)
|
||||
extracted_topics = self._extract_topics_from_crawl(crawl_result)
|
||||
extracted_keywords = self._extract_keywords_from_crawl(crawl_result)
|
||||
|
||||
# Phase 2: Extract patterns and vocabulary level
|
||||
extracted_patterns = self._extract_writing_patterns(style_patterns)
|
||||
vocabulary_level = content_characteristics.get("vocabulary_level", "medium") if content_characteristics else "medium"
|
||||
extracted_guidelines = self._extract_style_guidelines(style_guidelines)
|
||||
|
||||
# Phase 3: Full crawl analysis and comprehensive mapping
|
||||
crawl_analysis = self._analyze_crawl_result_comprehensive(crawl_result)
|
||||
writing_style_mapping = self._map_writing_style_comprehensive(writing_style, content_characteristics)
|
||||
content_themes = self._extract_content_themes(crawl_result, extracted_topics)
|
||||
|
||||
prompt = f"""
|
||||
COMPREHENSIVE RESEARCH PERSONA GENERATION TASK: Create a highly detailed, personalized research persona based on the user's business, writing style, and content strategy. This persona will provide intelligent defaults and suggestions for research inputs.
|
||||
@@ -42,53 +67,233 @@ CORE PERSONA:
|
||||
RESEARCH PREFERENCES:
|
||||
{json.dumps(research_prefs, indent=2)}
|
||||
|
||||
COMPETITOR ANALYSIS:
|
||||
{json.dumps(competitor_analysis, indent=2) if competitor_analysis else "No competitor data available"}
|
||||
|
||||
=== PHASE 1: WEBSITE ANALYSIS INTELLIGENCE ===
|
||||
|
||||
WRITING STYLE (for research depth mapping):
|
||||
{json.dumps(writing_style, indent=2) if writing_style else "Not available"}
|
||||
|
||||
CONTENT TYPE (for preset generation):
|
||||
{json.dumps(content_type, indent=2) if content_type else "Not available"}
|
||||
|
||||
EXTRACTED TOPICS FROM WEBSITE CONTENT:
|
||||
{json.dumps(extracted_topics, indent=2) if extracted_topics else "No topics extracted"}
|
||||
|
||||
EXTRACTED KEYWORDS FROM WEBSITE CONTENT:
|
||||
{json.dumps(extracted_keywords[:20], indent=2) if extracted_keywords else "No keywords extracted"}
|
||||
|
||||
=== PHASE 2: WRITING PATTERNS & STYLE INTELLIGENCE ===
|
||||
|
||||
STYLE PATTERNS (for research angles):
|
||||
{json.dumps(style_patterns, indent=2) if style_patterns else "Not available"}
|
||||
|
||||
EXTRACTED WRITING PATTERNS:
|
||||
{json.dumps(extracted_patterns, indent=2) if extracted_patterns else "No patterns extracted"}
|
||||
|
||||
CONTENT CHARACTERISTICS (for keyword sophistication):
|
||||
{json.dumps(content_characteristics, indent=2) if content_characteristics else "Not available"}
|
||||
|
||||
VOCABULARY LEVEL:
|
||||
{vocabulary_level}
|
||||
|
||||
STYLE GUIDELINES (for query enhancement):
|
||||
{json.dumps(style_guidelines, indent=2) if style_guidelines else "Not available"}
|
||||
|
||||
EXTRACTED GUIDELINES:
|
||||
{json.dumps(extracted_guidelines, indent=2) if extracted_guidelines else "No guidelines extracted"}
|
||||
|
||||
=== PHASE 3: COMPREHENSIVE ANALYSIS & MAPPING ===
|
||||
|
||||
CRAWL ANALYSIS (Full Content Intelligence):
|
||||
{json.dumps(crawl_analysis, indent=2) if crawl_analysis else "No crawl analysis available"}
|
||||
|
||||
WRITING STYLE COMPREHENSIVE MAPPING:
|
||||
{json.dumps(writing_style_mapping, indent=2) if writing_style_mapping else "No style mapping available"}
|
||||
|
||||
CONTENT THEMES (Extracted from Website):
|
||||
{json.dumps(content_themes, indent=2) if content_themes else "No themes extracted"}
|
||||
|
||||
=== RESEARCH PERSONA GENERATION REQUIREMENTS ===
|
||||
|
||||
Generate a comprehensive research persona in JSON format with the following structure:
|
||||
|
||||
1. DEFAULT VALUES:
|
||||
- "default_industry": Extract from core_persona.industry, business_info.industry, or website_analysis target_audience. Use "General" only if none available.
|
||||
- "default_industry": Extract from core_persona.industry, business_info.industry, or website_analysis target_audience. If none available, infer from content patterns in website_analysis or research_preferences. Never use "General" - always provide a specific industry based on context.
|
||||
- "default_target_audience": Extract from core_persona.target_audience, website_analysis.target_audience, or business_info.target_audience. Be specific and descriptive.
|
||||
- "default_research_mode": Suggest "basic", "comprehensive", or "targeted" based on research_preferences.research_depth and content_type preferences.
|
||||
- "default_provider": Suggest "google" for news/trends, "exa" for academic/technical deep-dives, or "google" as default.
|
||||
- "default_research_mode": **PHASE 3 ENHANCEMENT** - Use comprehensive writing_style_mapping:
|
||||
* **PRIMARY**: Use writing_style_mapping.research_depth_preference (from comprehensive analysis)
|
||||
* **SECONDARY**: Map from writing_style.complexity:
|
||||
- If writing_style.complexity == "high": Use "comprehensive" (deep research needed)
|
||||
- If writing_style.complexity == "medium": Use "targeted" (balanced research)
|
||||
- If writing_style.complexity == "low": Use "basic" (quick research)
|
||||
* **FALLBACK**: Use research_preferences.research_depth if complexity not available
|
||||
* This ensures research depth matches the user's writing sophistication level and comprehensive style analysis
|
||||
- "default_provider": **PHASE 3 ENHANCEMENT** - Use writing_style_mapping.provider_preference:
|
||||
* **PRIMARY**: Use writing_style_mapping.provider_preference (from comprehensive style analysis)
|
||||
* **SECONDARY**: Suggest based on user's typical research needs:
|
||||
- Academic/research users: "exa" (semantic search, papers)
|
||||
- News/current events users: "tavily" (real-time, AI answers)
|
||||
- General business users: "exa" (better for content creation)
|
||||
* **DEFAULT**: "exa" (generally better for content creators)
|
||||
|
||||
2. KEYWORD INTELLIGENCE:
|
||||
- "suggested_keywords": Generate 8-12 keywords relevant to the user's industry, interests (from core_persona), and content goals.
|
||||
- "keyword_expansion_patterns": Create a dictionary mapping common keywords to expanded, industry-specific terms. Include 10-15 patterns like:
|
||||
{{"AI": ["healthcare AI", "medical AI", "clinical AI", "diagnostic AI"], "tools": ["medical devices", "clinical tools"], ...}}
|
||||
Focus on industry-specific terminology from the user's domain.
|
||||
- "suggested_keywords": **PHASE 1 ENHANCEMENT** - Prioritize extracted keywords from crawl_result:
|
||||
* First, use extracted_keywords from website content (top 8-10 most relevant)
|
||||
* Then, supplement with keywords from user's industry, interests (from core_persona), and content goals
|
||||
* Total: 8-12 keywords, with at least 50% from extracted_keywords if available
|
||||
* This ensures keywords reflect the user's actual content topics
|
||||
- "keyword_expansion_patterns": **PHASE 2 ENHANCEMENT** - Create a dictionary mapping common keywords to expanded, industry-specific terms based on vocabulary_level:
|
||||
* If vocabulary_level == "advanced": Use sophisticated, technical, industry-specific terminology
|
||||
Example: {{"AI": ["machine learning algorithms", "neural network architectures", "deep learning frameworks", "algorithmic intelligence systems"], "tools": ["enterprise software platforms", "integrated development environments", "cloud-native solutions"]}}
|
||||
* If vocabulary_level == "medium": Use balanced, professional terminology
|
||||
Example: {{"AI": ["artificial intelligence", "automated systems", "smart technology", "intelligent automation"], "tools": ["software solutions", "digital platforms", "business applications"]}}
|
||||
* If vocabulary_level == "simple": Use accessible, beginner-friendly terminology
|
||||
Example: {{"AI": ["smart technology", "automated tools", "helpful software", "intelligent helpers"], "tools": ["apps", "software", "platforms", "online services"]}}
|
||||
* Include 10-15 patterns, matching the user's vocabulary sophistication level
|
||||
* Focus on industry-specific terminology from the user's domain, but at the appropriate complexity level
|
||||
|
||||
3. DOMAIN EXPERTISE:
|
||||
3. PROVIDER-SPECIFIC OPTIMIZATION:
|
||||
- "suggested_exa_domains": List 4-6 authoritative domains for the user's industry (e.g., Healthcare: ["pubmed.gov", "nejm.org", "thelancet.com"]).
|
||||
- "suggested_exa_category": Suggest appropriate Exa category based on industry:
|
||||
- Healthcare/Science: "research paper"
|
||||
- Finance: "financial report"
|
||||
- Technology/Business: "company" or "news"
|
||||
- Social Media/Marketing: "tweet" or "linkedin profile"
|
||||
- Default: null (empty string for all categories)
|
||||
- "suggested_exa_search_type": Suggest Exa search algorithm:
|
||||
- Academic/research content: "neural" (semantic understanding)
|
||||
- Current news/trends: "fast" (speed optimized)
|
||||
- General research: "auto" (balanced)
|
||||
- Code/technical: "neural"
|
||||
- "suggested_tavily_topic": Choose based on content type:
|
||||
- Financial content: "finance"
|
||||
- News/current events: "news"
|
||||
- General research: "general"
|
||||
- "suggested_tavily_search_depth": Choose based on research needs:
|
||||
- Quick overview: "basic" (1 credit, faster)
|
||||
- In-depth analysis: "advanced" (2 credits, more comprehensive)
|
||||
- Breaking news: "fast" (speed optimized)
|
||||
- "suggested_tavily_include_answer": AI-generated answers:
|
||||
- For factual queries needing quick answers: "advanced"
|
||||
- For research summaries: "basic"
|
||||
- When building custom content: "false" (use raw results)
|
||||
- "suggested_tavily_time_range": Time filtering:
|
||||
- Breaking news: "day"
|
||||
- Recent developments: "week"
|
||||
- Industry analysis: "month"
|
||||
- Historical research: null (no time limit)
|
||||
- "suggested_tavily_raw_content_format": Raw content for LLM processing:
|
||||
- For blog content creation: "markdown" (structured)
|
||||
- For simple text extraction: "text"
|
||||
- No raw content needed: "false"
|
||||
- "provider_recommendations": Map use cases to best providers:
|
||||
{{"trends": "tavily", "deep_research": "exa", "factual": "google", "news": "tavily", "academic": "exa"}}
|
||||
|
||||
4. RESEARCH ANGLES:
|
||||
- "research_angles": Generate 5-8 alternative research angles/focuses based on:
|
||||
- User's pain points and challenges (from core_persona)
|
||||
- Industry trends and opportunities
|
||||
- Content goals (from research_preferences)
|
||||
- Audience interests (from core_persona.interests)
|
||||
Examples: "Compare {{topic}} tools", "{{topic}} ROI analysis", "Latest {{topic}} trends", etc.
|
||||
- "research_angles": **PHASE 2 ENHANCEMENT** - Generate 5-8 alternative research angles/focuses based on:
|
||||
* **PRIMARY SOURCE**: Extract from extracted_patterns (writing patterns from style_patterns):
|
||||
- If "comparison" in patterns: "Compare {{topic}} solutions and alternatives"
|
||||
- If "how-to" or "tutorial" in patterns: "Step-by-step guide to {{topic}} implementation"
|
||||
- If "case-study" or "case_study" in patterns: "Real-world {{topic}} case studies and success stories"
|
||||
- If "trend-analysis" or "trends" in patterns: "Latest {{topic}} trends and future predictions"
|
||||
- If "best-practices" or "best_practices" in patterns: "{{topic}} best practices and industry standards"
|
||||
- If "review" or "evaluation" in patterns: "{{topic}} review and evaluation criteria"
|
||||
- If "problem-solving" in patterns: "{{topic}} problem-solving strategies and solutions"
|
||||
* **SECONDARY SOURCES** (if patterns not available):
|
||||
- User's pain points and challenges (from core_persona.identity or core_persona)
|
||||
- Industry trends and opportunities (from website_analysis or business_info)
|
||||
- Content goals (from research_preferences.content_types)
|
||||
- Audience interests (from core_persona or website_analysis.target_audience)
|
||||
- Competitive landscape (if competitor_analysis exists, include competitive angles)
|
||||
* Make angles specific to the user's industry and actionable for content creation
|
||||
* Use the same language style and structure as the user's writing patterns
|
||||
|
||||
5. QUERY ENHANCEMENT:
|
||||
- "query_enhancement_rules": Create templates for improving vague user queries:
|
||||
{{"vague_ai": "Research: AI applications in {{industry}} for {{audience}}", "vague_tools": "Compare top {{industry}} tools", ...}}
|
||||
Include 5-8 enhancement patterns.
|
||||
- "query_enhancement_rules": **PHASE 2 ENHANCEMENT** - Create templates for improving vague user queries based on extracted_guidelines:
|
||||
* **PRIMARY SOURCE**: Use extracted_guidelines (from style_guidelines) to create enhancement rules:
|
||||
- If guidelines include "Use specific examples": {{"vague_query": "Research: {{query}} with specific examples and case studies"}}
|
||||
- If guidelines include "Include data points" or "statistics": {{"general_query": "Research: {{query}} including statistics, metrics, and data analysis"}}
|
||||
- If guidelines include "Reference industry standards": {{"basic_query": "Research: {{query}} with industry benchmarks and best practices"}}
|
||||
- If guidelines include "Cite authoritative sources": {{"factual_query": "Research: {{query}} from authoritative sources and expert opinions"}}
|
||||
- If guidelines include "Provide actionable insights": {{"theoretical_query": "Research: {{query}} with actionable strategies and implementation steps"}}
|
||||
- If guidelines include "Compare alternatives": {{"single_item_query": "Research: Compare {{query}} alternatives and evaluate options"}}
|
||||
* **FALLBACK PATTERNS** (if guidelines not available):
|
||||
{{"vague_ai": "Research: AI applications in {{industry}} for {{audience}}", "vague_tools": "Compare top {{industry}} tools", "vague_trends": "Research latest {{industry}} trends and developments", ...}}
|
||||
* Include 5-8 enhancement patterns
|
||||
* Match the enhancement style to the user's writing guidelines and preferences
|
||||
|
||||
6. RECOMMENDED PRESETS:
|
||||
- "recommended_presets": Generate 3-5 personalized research preset templates. Each preset should include:
|
||||
- name: Descriptive name (e.g., "{{Industry}} Trends", "{{Audience}} Insights")
|
||||
- keywords: Research query string
|
||||
- industry: User's industry
|
||||
- target_audience: User's target audience
|
||||
- research_mode: "basic", "comprehensive", or "targeted"
|
||||
- config: Complete ResearchConfig object with appropriate settings
|
||||
- description: Brief explanation of what this preset researches
|
||||
Make presets relevant to the user's specific industry, audience, and content goals.
|
||||
- "recommended_presets": **PHASE 3 ENHANCEMENT** - Generate 3-5 personalized research preset templates using comprehensive analysis:
|
||||
* **USE CONTENT THEMES**: If content_themes available, create at least one preset per major theme (up to 3 themes)
|
||||
- Example: If themes include ["AI automation", "content marketing", "SEO strategies"], create presets for each
|
||||
- Use theme names in preset keywords: "Research latest {theme} trends and best practices"
|
||||
* **USE CRAWL ANALYSIS**: Leverage crawl_analysis.content_categories and crawl_analysis.main_topics for preset generation
|
||||
- Create presets that match the user's actual website content categories
|
||||
- Use main_topics for preset keywords and descriptions
|
||||
* **CONTENT TYPE BASED**: Generate presets based on content_type (from Phase 1):
|
||||
* **Content-Type-Specific Presets**: Use content_type.primary_type and content_type.secondary_types to create presets:
|
||||
- If primary_type == "blog": Create "Blog Topic Research" preset with trending topics
|
||||
- If primary_type == "article": Create "Article Research" preset with in-depth analysis
|
||||
- If primary_type == "case_study": Create "Case Study Research" preset with real-world examples
|
||||
- If primary_type == "tutorial": Create "Tutorial Research" preset with step-by-step guides
|
||||
- If "tutorial" in secondary_types: Add "How-To Guide Research" preset
|
||||
- If "comparison" in secondary_types or style_patterns: Add "Comparison Research" preset
|
||||
- If content_type.purpose == "thought_leadership": Create "Thought Leadership Research" with expert insights
|
||||
- If content_type.purpose == "education": Create "Educational Content Research" preset
|
||||
* **Use Extracted Topics**: If extracted_topics available, create at least one preset using actual website topics:
|
||||
- "Latest {extracted_topic} Trends" preset
|
||||
- "{extracted_topic} Best Practices" preset
|
||||
* Each preset should include:
|
||||
- name: Descriptive, action-oriented name that clearly indicates what research will be done
|
||||
* Use research_angles as inspiration for preset names (e.g., "Compare {Industry} Tools", "{Industry} ROI Analysis")
|
||||
* If competitor_analysis exists, create at least one competitive analysis preset (e.g., "Competitive Landscape Analysis")
|
||||
* Make names specific and actionable, not generic
|
||||
* **NEW**: Include content type in name when relevant (e.g., "Blog: {Industry} Trends", "Tutorial: {Topic} Guide")
|
||||
- keywords: Research query string that is:
|
||||
* **NEW**: Use extracted_topics and extracted_keywords when available for more relevant queries
|
||||
* Specific and detailed (not vague like "AI tools")
|
||||
* Industry-focused (includes industry context)
|
||||
* Audience-aware (considers target audience needs)
|
||||
* Actionable (user can immediately understand what research will provide)
|
||||
* Examples: "Research latest AI-powered marketing automation platforms for B2B SaaS companies" (GOOD)
|
||||
* Avoid: "AI tools" or "marketing research" (TOO VAGUE)
|
||||
- industry: User's industry (from business_info or inferred)
|
||||
- target_audience: User's target audience (from business_info or inferred)
|
||||
- research_mode: "basic", "comprehensive", or "targeted" based on:
|
||||
* **NEW**: Also consider content_type.purpose:
|
||||
- "thought_leadership" → "comprehensive" (needs deep research)
|
||||
- "education" → "comprehensive" (needs thorough coverage)
|
||||
- "marketing" → "targeted" (needs specific insights)
|
||||
- "entertainment" → "basic" (needs quick facts)
|
||||
* "comprehensive" for deep analysis, trends, competitive research
|
||||
* "targeted" for specific questions, quick insights
|
||||
* "basic" for simple fact-finding
|
||||
- config: Complete ResearchConfig object with:
|
||||
* provider: Use suggested_exa_category to determine if "exa" or "tavily" is better
|
||||
* exa_category: Use suggested_exa_category if available
|
||||
* exa_include_domains: Use suggested_exa_domains if available (limit to 3-5 most relevant)
|
||||
* exa_search_type: Use suggested_exa_search_type if available
|
||||
* max_sources: 15-25 for comprehensive, 10-15 for targeted, 8-12 for basic
|
||||
* include_competitors: true if competitor_analysis exists and preset is about competitive research
|
||||
* include_trends: true for trend-focused presets
|
||||
* include_statistics: true for data-driven research
|
||||
* include_expert_quotes: true for comprehensive research or thought_leadership content
|
||||
- description: Brief (1-2 sentences) explaining what this preset researches and why it's valuable
|
||||
- icon: Optional emoji that represents the preset (e.g., "📊" for trends, "🎯" for targeted, "🔍" for analysis, "📝" for blog, "📚" for tutorial)
|
||||
- gradient: Optional CSS gradient for visual appeal
|
||||
|
||||
PRESET GENERATION GUIDELINES:
|
||||
- **PHASE 1 PRIORITY**: Create presets that match the user's actual content types (from content_type)
|
||||
- Use extracted_topics to create presets based on actual website content
|
||||
- Create presets that the user would actually want to use for their content creation
|
||||
- Use research_angles to inspire preset names and keywords
|
||||
- If competitor_analysis has data, create at least one competitive analysis preset
|
||||
- Make each preset unique with different research focus (trends, tools, best practices, competitive, etc.)
|
||||
- Ensure keywords are detailed enough to generate meaningful research
|
||||
- Vary research_mode across presets to offer different depth levels
|
||||
- Use industry-specific terminology in preset names and keywords
|
||||
|
||||
7. RESEARCH PREFERENCES:
|
||||
- "research_preferences": Extract and structure research preferences from onboarding:
|
||||
@@ -109,8 +314,19 @@ Return a valid JSON object matching this exact structure:
|
||||
"keyword_expansion_patterns": {{
|
||||
"keyword": ["expansion1", "expansion2", ...]
|
||||
}},
|
||||
"suggested_exa_domains": ["domain1.com", "domain2.com", ...],
|
||||
"suggested_exa_category": "string or null",
|
||||
"suggested_exa_domains": ["domain1.com", "domain2.com", ...],
|
||||
"suggested_exa_category": "string or null",
|
||||
"suggested_exa_search_type": "auto | neural | keyword | fast | deep",
|
||||
"suggested_tavily_topic": "general | news | finance",
|
||||
"suggested_tavily_search_depth": "basic | advanced | fast | ultra-fast",
|
||||
"suggested_tavily_include_answer": "false | basic | advanced",
|
||||
"suggested_tavily_time_range": "day | week | month | year or null",
|
||||
"suggested_tavily_raw_content_format": "false | markdown | text",
|
||||
"provider_recommendations": {{
|
||||
"trends": "tavily",
|
||||
"deep_research": "exa",
|
||||
"factual": "google"
|
||||
}},
|
||||
"research_angles": ["angle1", "angle2", ...],
|
||||
"query_enhancement_rules": {{
|
||||
"pattern": "template"
|
||||
@@ -150,18 +366,291 @@ Return a valid JSON object matching this exact structure:
|
||||
=== IMPORTANT INSTRUCTIONS ===
|
||||
|
||||
1. Be highly specific and personalized - use actual data from the user's business, persona, and preferences.
|
||||
2. Avoid generic suggestions - every field should reflect the user's unique context.
|
||||
3. For industries not clearly identified, infer from website_analysis.content_characteristics or writing_style.
|
||||
4. Ensure all suggested keywords, domains, and angles are relevant to the user's industry and audience.
|
||||
5. Generate realistic, actionable presets that the user would actually want to use.
|
||||
6. Confidence score should reflect data richness (0-100): higher if rich onboarding data, lower if minimal data.
|
||||
7. Return ONLY valid JSON - no markdown formatting, no explanatory text.
|
||||
2. NEVER use "General" for industry or target_audience - always infer or create specific categories based on available context.
|
||||
3. For minimal data scenarios:
|
||||
- If industry is unclear, infer from research_preferences.content_types or website_analysis.content_characteristics
|
||||
- If target_audience is unclear, infer from writing_style patterns or content goals
|
||||
- Use business_info to fill gaps when persona_data is incomplete
|
||||
4. Generate industry-specific intelligence even with limited data:
|
||||
- For content creators: assume "Content Marketing" or "Digital Publishing"
|
||||
- For business users: assume "Business Consulting" or "Professional Services"
|
||||
- For technical users: assume "Technology" or "Software Development"
|
||||
5. Ensure all suggested keywords, domains, and angles are relevant to the user's industry and audience.
|
||||
6. Generate realistic, actionable presets that the user would actually want to use.
|
||||
7. Confidence score should reflect data richness (0-100): higher if rich onboarding data, lower if minimal data.
|
||||
8. Return ONLY valid JSON - no markdown formatting, no explanatory text.
|
||||
|
||||
Generate the research persona now:
|
||||
"""
|
||||
|
||||
return prompt
|
||||
|
||||
def _extract_topics_from_crawl(self, crawl_result: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract topics from crawl_result JSON data.
|
||||
|
||||
Args:
|
||||
crawl_result: Dictionary containing crawled website data
|
||||
|
||||
Returns:
|
||||
List of extracted topics (max 15)
|
||||
"""
|
||||
topics = []
|
||||
|
||||
if not crawl_result:
|
||||
return topics
|
||||
|
||||
try:
|
||||
# Try to extract from common crawl result structures
|
||||
# Method 1: Direct topics field
|
||||
if isinstance(crawl_result.get('topics'), list):
|
||||
topics.extend(crawl_result['topics'][:10])
|
||||
|
||||
# Method 2: Extract from headings
|
||||
if isinstance(crawl_result.get('headings'), list):
|
||||
headings = crawl_result['headings']
|
||||
# Filter out common non-topic headings
|
||||
filtered_headings = [
|
||||
h for h in headings[:15]
|
||||
if h and len(h.strip()) > 3
|
||||
and h.lower() not in ['home', 'about', 'contact', 'menu', 'navigation', 'footer', 'header']
|
||||
]
|
||||
topics.extend(filtered_headings)
|
||||
|
||||
# Method 3: Extract from page titles
|
||||
if isinstance(crawl_result.get('titles'), list):
|
||||
titles = crawl_result['titles']
|
||||
topics.extend([t for t in titles[:10] if t and len(t.strip()) > 3])
|
||||
|
||||
# Method 4: Extract from content sections
|
||||
if isinstance(crawl_result.get('sections'), list):
|
||||
sections = crawl_result['sections']
|
||||
for section in sections[:10]:
|
||||
if isinstance(section, dict):
|
||||
section_title = section.get('title') or section.get('heading')
|
||||
if section_title and len(section_title.strip()) > 3:
|
||||
topics.append(section_title)
|
||||
|
||||
# Method 5: Extract from metadata
|
||||
if isinstance(crawl_result.get('metadata'), dict):
|
||||
meta = crawl_result['metadata']
|
||||
if meta.get('title'):
|
||||
topics.append(meta['title'])
|
||||
if isinstance(meta.get('keywords'), list):
|
||||
topics.extend(meta['keywords'][:5])
|
||||
|
||||
# Remove duplicates and clean
|
||||
unique_topics = []
|
||||
seen = set()
|
||||
for topic in topics:
|
||||
if topic and isinstance(topic, str):
|
||||
cleaned = topic.strip()
|
||||
if cleaned and cleaned.lower() not in seen:
|
||||
seen.add(cleaned.lower())
|
||||
unique_topics.append(cleaned)
|
||||
|
||||
return unique_topics[:15] # Limit to 15 topics
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting topics from crawl_result: {e}")
|
||||
return []
|
||||
|
||||
def _extract_keywords_from_crawl(self, crawl_result: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract keywords from crawl_result JSON data.
|
||||
|
||||
Args:
|
||||
crawl_result: Dictionary containing crawled website data
|
||||
|
||||
Returns:
|
||||
List of extracted keywords (max 20)
|
||||
"""
|
||||
keywords = []
|
||||
|
||||
if not crawl_result:
|
||||
return keywords
|
||||
|
||||
try:
|
||||
# Method 1: Direct keywords field
|
||||
if isinstance(crawl_result.get('keywords'), list):
|
||||
keywords.extend(crawl_result['keywords'][:15])
|
||||
|
||||
# Method 2: Extract from metadata keywords
|
||||
if isinstance(crawl_result.get('metadata'), dict):
|
||||
meta = crawl_result['metadata']
|
||||
if isinstance(meta.get('keywords'), list):
|
||||
keywords.extend(meta['keywords'][:10])
|
||||
if meta.get('description'):
|
||||
# Extract potential keywords from description (simple word extraction)
|
||||
desc = meta['description']
|
||||
words = [w.strip() for w in desc.split() if len(w.strip()) > 4]
|
||||
keywords.extend(words[:5])
|
||||
|
||||
# Method 3: Extract from tags
|
||||
if isinstance(crawl_result.get('tags'), list):
|
||||
keywords.extend(crawl_result['tags'][:10])
|
||||
|
||||
# Method 4: Extract from content (simple frequency-based, if available)
|
||||
if isinstance(crawl_result.get('content'), str):
|
||||
content = crawl_result['content']
|
||||
# Simple extraction: words that appear multiple times and are > 4 chars
|
||||
words = content.lower().split()
|
||||
word_freq = {}
|
||||
for word in words:
|
||||
cleaned = ''.join(c for c in word if c.isalnum())
|
||||
if len(cleaned) > 4:
|
||||
word_freq[cleaned] = word_freq.get(cleaned, 0) + 1
|
||||
|
||||
# Get top keywords by frequency
|
||||
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
|
||||
keywords.extend([word for word, freq in sorted_words[:10] if freq > 1])
|
||||
|
||||
# Remove duplicates and clean
|
||||
unique_keywords = []
|
||||
seen = set()
|
||||
for keyword in keywords:
|
||||
if keyword and isinstance(keyword, str):
|
||||
cleaned = keyword.strip().lower()
|
||||
if cleaned and len(cleaned) > 2 and cleaned not in seen:
|
||||
seen.add(cleaned)
|
||||
unique_keywords.append(keyword.strip())
|
||||
|
||||
return unique_keywords[:20] # Limit to 20 keywords
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting keywords from crawl_result: {e}")
|
||||
return []
|
||||
|
||||
def _extract_writing_patterns(self, style_patterns: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract writing patterns from style_patterns JSON data.
|
||||
|
||||
Args:
|
||||
style_patterns: Dictionary containing writing patterns analysis
|
||||
|
||||
Returns:
|
||||
List of extracted patterns (max 10)
|
||||
"""
|
||||
patterns = []
|
||||
|
||||
if not style_patterns:
|
||||
return patterns
|
||||
|
||||
try:
|
||||
# Method 1: Direct patterns field
|
||||
if isinstance(style_patterns.get('patterns'), list):
|
||||
patterns.extend(style_patterns['patterns'][:10])
|
||||
|
||||
# Method 2: Common patterns field
|
||||
if isinstance(style_patterns.get('common_patterns'), list):
|
||||
patterns.extend(style_patterns['common_patterns'][:10])
|
||||
|
||||
# Method 3: Writing patterns field
|
||||
if isinstance(style_patterns.get('writing_patterns'), list):
|
||||
patterns.extend(style_patterns['writing_patterns'][:10])
|
||||
|
||||
# Method 4: Content structure patterns
|
||||
if isinstance(style_patterns.get('content_structure'), dict):
|
||||
structure = style_patterns['content_structure']
|
||||
if isinstance(structure.get('patterns'), list):
|
||||
patterns.extend(structure['patterns'][:5])
|
||||
|
||||
# Method 5: Extract from analysis field
|
||||
if isinstance(style_patterns.get('analysis'), dict):
|
||||
analysis = style_patterns['analysis']
|
||||
if isinstance(analysis.get('identified_patterns'), list):
|
||||
patterns.extend(analysis['identified_patterns'][:10])
|
||||
|
||||
# Normalize patterns (lowercase, remove duplicates)
|
||||
normalized_patterns = []
|
||||
seen = set()
|
||||
for pattern in patterns:
|
||||
if pattern and isinstance(pattern, str):
|
||||
cleaned = pattern.strip().lower().replace('_', '-').replace(' ', '-')
|
||||
if cleaned and cleaned not in seen:
|
||||
seen.add(cleaned)
|
||||
normalized_patterns.append(cleaned)
|
||||
|
||||
return normalized_patterns[:10] # Limit to 10 patterns
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting writing patterns: {e}")
|
||||
return []
|
||||
|
||||
def _extract_style_guidelines(self, style_guidelines: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Extract style guidelines from style_guidelines JSON data.
|
||||
|
||||
Args:
|
||||
style_guidelines: Dictionary containing generated style guidelines
|
||||
|
||||
Returns:
|
||||
List of extracted guidelines (max 15)
|
||||
"""
|
||||
guidelines = []
|
||||
|
||||
if not style_guidelines:
|
||||
return guidelines
|
||||
|
||||
try:
|
||||
# Method 1: Direct guidelines field
|
||||
if isinstance(style_guidelines.get('guidelines'), list):
|
||||
guidelines.extend(style_guidelines['guidelines'][:15])
|
||||
|
||||
# Method 2: Recommendations field
|
||||
if isinstance(style_guidelines.get('recommendations'), list):
|
||||
guidelines.extend(style_guidelines['recommendations'][:15])
|
||||
|
||||
# Method 3: Best practices field
|
||||
if isinstance(style_guidelines.get('best_practices'), list):
|
||||
guidelines.extend(style_guidelines['best_practices'][:10])
|
||||
|
||||
# Method 4: Tone recommendations
|
||||
if isinstance(style_guidelines.get('tone_recommendations'), list):
|
||||
guidelines.extend(style_guidelines['tone_recommendations'][:5])
|
||||
|
||||
# Method 5: Structure guidelines
|
||||
if isinstance(style_guidelines.get('structure_guidelines'), list):
|
||||
guidelines.extend(style_guidelines['structure_guidelines'][:5])
|
||||
|
||||
# Method 6: Vocabulary suggestions
|
||||
if isinstance(style_guidelines.get('vocabulary_suggestions'), list):
|
||||
guidelines.extend(style_guidelines['vocabulary_suggestions'][:5])
|
||||
|
||||
# Method 7: Engagement tips
|
||||
if isinstance(style_guidelines.get('engagement_tips'), list):
|
||||
guidelines.extend(style_guidelines['engagement_tips'][:5])
|
||||
|
||||
# Method 8: Audience considerations
|
||||
if isinstance(style_guidelines.get('audience_considerations'), list):
|
||||
guidelines.extend(style_guidelines['audience_considerations'][:5])
|
||||
|
||||
# Method 9: SEO optimization (if available)
|
||||
if isinstance(style_guidelines.get('seo_optimization'), list):
|
||||
guidelines.extend(style_guidelines['seo_optimization'][:3])
|
||||
|
||||
# Method 10: Conversion optimization (if available)
|
||||
if isinstance(style_guidelines.get('conversion_optimization'), list):
|
||||
guidelines.extend(style_guidelines['conversion_optimization'][:3])
|
||||
|
||||
# Remove duplicates and clean
|
||||
unique_guidelines = []
|
||||
seen = set()
|
||||
for guideline in guidelines:
|
||||
if guideline and isinstance(guideline, str):
|
||||
cleaned = guideline.strip()
|
||||
# Normalize for comparison (lowercase, remove extra spaces)
|
||||
normalized = ' '.join(cleaned.lower().split())
|
||||
if cleaned and normalized not in seen and len(cleaned) > 5:
|
||||
seen.add(normalized)
|
||||
unique_guidelines.append(cleaned)
|
||||
|
||||
return unique_guidelines[:15] # Limit to 15 guidelines
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Error extracting style guidelines: {e}")
|
||||
return []
|
||||
|
||||
def get_json_schema(self) -> Dict[str, Any]:
|
||||
"""Return JSON schema for structured LLM response."""
|
||||
# This will be used with llm_text_gen(json_struct=...)
|
||||
|
||||
@@ -367,16 +367,53 @@ class ResearchPersonaService:
|
||||
if demographics:
|
||||
business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)
|
||||
|
||||
# Check if we have enough data
|
||||
if not website_analysis and not persona_data_dict:
|
||||
logger.warning(f"Insufficient onboarding data for user {user_id}")
|
||||
# Check if we have enough data - be more lenient since we can infer from minimal data
|
||||
# We need at least some basic information to generate a meaningful persona
|
||||
has_basic_data = bool(
|
||||
website_analysis or
|
||||
persona_data_dict or
|
||||
research_prefs.get('content_types') or
|
||||
business_info.get('industry')
|
||||
)
|
||||
|
||||
if not has_basic_data:
|
||||
logger.warning(f"Insufficient onboarding data for user {user_id} - no basic data found")
|
||||
return None
|
||||
|
||||
# If we have minimal data, add intelligent defaults to help the AI
|
||||
if not business_info.get('industry'):
|
||||
# Try to infer industry from research preferences or content types
|
||||
content_types = research_prefs.get('content_types', [])
|
||||
if 'blog' in content_types or 'article' in content_types:
|
||||
business_info['industry'] = 'Content Marketing'
|
||||
business_info['inferred'] = True
|
||||
elif 'social_media' in content_types:
|
||||
business_info['industry'] = 'Social Media Marketing'
|
||||
business_info['inferred'] = True
|
||||
elif 'video' in content_types:
|
||||
business_info['industry'] = 'Video Content Creation'
|
||||
business_info['inferred'] = True
|
||||
|
||||
if not business_info.get('target_audience'):
|
||||
# Default to professionals for content creators
|
||||
business_info['target_audience'] = 'Professionals and content consumers'
|
||||
business_info['inferred'] = True
|
||||
|
||||
# Get competitor analysis data (if available)
|
||||
competitor_analysis = None
|
||||
try:
|
||||
competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
|
||||
if competitor_analysis:
|
||||
logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")
|
||||
|
||||
return {
|
||||
"website_analysis": website_analysis,
|
||||
"persona_data": persona_data_dict,
|
||||
"research_preferences": research_prefs,
|
||||
"business_info": business_info
|
||||
"business_info": business_info,
|
||||
"competitor_analysis": competitor_analysis # Add competitor data for better preset generation
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user