Added video studio router and endpoints. Added research router and endpoints. Added youtube router and endpoints. Added onboarding utils router and endpoints. Added onboarding utils service, models, routes, and utils.

This commit is contained in:
ajaysi
2026-01-01 17:56:25 +05:30
parent 7512933c65
commit b134e9dc7e
252 changed files with 40333 additions and 2712 deletions

View File

@@ -0,0 +1,51 @@
"""
Research Engine Core Module
This is the standalone AI Research Engine that can be imported by
Blog Writer, Podcast Maker, YouTube Creator, and other ALwrity tools.
Design Goals:
- Tool-agnostic: Any content tool can import and use this
- AI-driven parameter optimization: Users don't need to understand Exa/Tavily internals
- Provider priority: Exa → Tavily → Google (fallback)
- Personalization-aware: Accepts context from calling tools
- Advanced by default: Prioritizes quality over speed
Usage:
    from services.research.core import (
        ResearchEngine,
        ResearchContext,
        ResearchPersonalizationContext,
        ContentType,
    )
    engine = ResearchEngine()
    result = await engine.research(ResearchContext(
        query="AI trends in healthcare 2025",
        personalization=ResearchPersonalizationContext(
            content_type=ContentType.BLOG,
            industry="Healthcare",
            target_audience="Medical professionals",
        ),
    ))
Author: ALwrity Team
Version: 2.0
Last Updated: December 2025
"""
# Public surface of the research core package. ResearchResult is re-exported
# alongside the input schemas so callers can import the engine's output type
# from the same place as its input type.
from .research_context import (
    ResearchContext,
    ResearchPersonalizationContext,
    ResearchResult,
    ContentType,
    ResearchGoal,
    ResearchDepth,
    ProviderPreference,
)
from .parameter_optimizer import ParameterOptimizer
from .research_engine import ResearchEngine

__all__ = [
    # Context schemas
    "ResearchContext",
    "ResearchPersonalizationContext",
    "ContentType",
    "ResearchGoal",
    "ResearchDepth",
    "ProviderPreference",
    # Result schema
    "ResearchResult",
    # Core classes
    "ParameterOptimizer",
    "ResearchEngine",
]

View File

@@ -0,0 +1,384 @@
"""
AI Parameter Optimizer for Research Engine
Uses AI to analyze the research query and context to select optimal
parameters for Exa and Tavily APIs. This abstracts the complexity
from non-technical users.
Key Decisions:
- Provider selection (Exa vs Tavily vs Google)
- Search type (neural vs keyword)
- Category/topic selection
- Depth and result limits
- Domain filtering
Author: ALwrity Team
Version: 2.0
"""
import os
import re
from typing import Dict, Any, Optional, Tuple
from loguru import logger
from .research_context import (
ResearchContext,
ResearchGoal,
ResearchDepth,
ProviderPreference,
ContentType,
)
from models.blog_models import ResearchConfig, ResearchProvider, ResearchMode
class ParameterOptimizer:
    """
    Heuristic parameter optimization for research providers.

    Inspects a ResearchContext (query text, depth, provider preference,
    domain filters) and produces a provider choice plus a ResearchConfig
    for Exa, Tavily, or Google, so callers do not have to set
    provider-specific parameters themselves.

    Query analysis is regex/keyword based; provider availability is read
    from the EXA_API_KEY and TAVILY_API_KEY environment variables once, at
    construction time.
    """

    # Query patterns for intelligent routing
    TRENDING_PATTERNS = [
        r'\b(latest|recent|new|2024|2025|current|trending|news)\b',
        r'\b(update|announcement|launch|release)\b',
    ]
    TECHNICAL_PATTERNS = [
        r'\b(api|sdk|framework|library|implementation|architecture)\b',
        r'\b(code|programming|developer|technical|engineering)\b',
    ]
    COMPETITIVE_PATTERNS = [
        r'\b(competitor|alternative|vs|versus|compare|comparison)\b',
        r'\b(market|industry|landscape|players)\b',
    ]
    FACTUAL_PATTERNS = [
        r'\b(statistics|data|research|study|report|survey)\b',
        r'\b(percent|percentage|number|figure|metric)\b',
    ]

    # Exa category mapping based on query analysis.
    # Not referenced by any method of this class; kept as part of the
    # class's public attributes.
    EXA_CATEGORY_MAP = {
        'research': 'research paper',
        'news': 'news',
        'company': 'company',
        'personal': 'personal site',
        'github': 'github',
        'linkedin': 'linkedin profile',
        'finance': 'financial report',
    }

    # Tavily topic mapping.
    # Not referenced by any method of this class; kept as part of the
    # class's public attributes.
    TAVILY_TOPIC_MAP = {
        ResearchGoal.TRENDING: 'news',
        ResearchGoal.FACTUAL: 'general',
        ResearchGoal.COMPETITIVE: 'general',
        ResearchGoal.TECHNICAL: 'general',
        ResearchGoal.EDUCATIONAL: 'general',
        ResearchGoal.INSPIRATIONAL: 'general',
    }

    # Single source of truth for ResearchDepth -> ResearchMode, shared by the
    # optimized and the advanced config builders.
    _DEPTH_TO_MODE = {
        ResearchDepth.QUICK: ResearchMode.BASIC,
        ResearchDepth.STANDARD: ResearchMode.BASIC,
        ResearchDepth.COMPREHENSIVE: ResearchMode.COMPREHENSIVE,
        ResearchDepth.EXPERT: ResearchMode.COMPREHENSIVE,
    }

    # Depths that switch on the "deep" provider options.
    _DEEP_DEPTHS = (ResearchDepth.COMPREHENSIVE, ResearchDepth.EXPERT)

    # Caps applied to domain filter lists before they are sent to Tavily.
    _TAVILY_MAX_INCLUDE_DOMAINS = 300
    _TAVILY_MAX_EXCLUDE_DOMAINS = 150

    def __init__(self):
        """Initialize the optimizer and record which providers have API keys."""
        self.exa_available = bool(os.getenv("EXA_API_KEY"))
        self.tavily_available = bool(os.getenv("TAVILY_API_KEY"))
        logger.info(f"ParameterOptimizer initialized: exa={self.exa_available}, tavily={self.tavily_available}")

    def optimize(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Analyze research context and return optimized provider and config.

        Args:
            context: The research context from the calling tool

        Returns:
            Tuple of (selected_provider, optimized_config)
        """
        # If advanced mode, use raw parameters
        if context.advanced_mode:
            return self._build_advanced_config(context)

        # Analyze query to determine optimal approach
        query_analysis = self._analyze_query(context.query)

        # Select provider based on analysis and preferences
        provider = self._select_provider(context, query_analysis)

        # Build optimized config for selected provider
        config = self._build_config(context, provider, query_analysis)

        logger.info(f"Optimized research: provider={provider.value}, mode={config.mode.value}")
        return provider, config

    def _mode_for_depth(self, depth) -> ResearchMode:
        """Translate a ResearchDepth into a ResearchMode, defaulting to BASIC.

        NOTE(review): ResearchContext appears to store enum *values*; str-mixin
        enum members compare equal to their values, so this lookup is expected
        to work for either form -- confirm.
        """
        return self._DEPTH_TO_MODE.get(depth, ResearchMode.BASIC)

    def _analyze_query(self, query: str) -> Dict[str, Any]:
        """
        Analyze the query to understand intent and optimal approach.

        Returns dict with:
        - is_trending: Query is about recent/current events
        - is_technical: Query is technical in nature
        - is_competitive: Query is about competition/comparison
        - is_factual: Query needs data/statistics
        - suggested_category: Exa category if applicable, else None
        - suggested_topic: Tavily topic ('news', 'finance' or 'general')
        - suggested_search_type: 'neural', 'keyword' or 'auto'
        """
        query_lower = query.lower()

        analysis = {
            'is_trending': self._matches_patterns(query_lower, self.TRENDING_PATTERNS),
            'is_technical': self._matches_patterns(query_lower, self.TECHNICAL_PATTERNS),
            'is_competitive': self._matches_patterns(query_lower, self.COMPETITIVE_PATTERNS),
            'is_factual': self._matches_patterns(query_lower, self.FACTUAL_PATTERNS),
            'suggested_category': None,
            'suggested_topic': 'general',
            'suggested_search_type': 'auto',
        }

        # Determine Exa category (first matching rule wins; order matters)
        if 'research' in query_lower or 'study' in query_lower or 'paper' in query_lower:
            analysis['suggested_category'] = 'research paper'
        elif 'github' in query_lower or 'repository' in query_lower:
            analysis['suggested_category'] = 'github'
        elif 'linkedin' in query_lower or 'professional' in query_lower:
            analysis['suggested_category'] = 'linkedin profile'
        elif analysis['is_trending']:
            analysis['suggested_category'] = 'news'
        elif 'company' in query_lower or 'startup' in query_lower:
            analysis['suggested_category'] = 'company'

        # Determine Tavily topic ('general' is already the default above)
        if analysis['is_trending']:
            analysis['suggested_topic'] = 'news'
        elif 'finance' in query_lower or 'stock' in query_lower or 'investment' in query_lower:
            analysis['suggested_topic'] = 'finance'

        # Determine search type ('auto' is already the default above)
        if analysis['is_technical'] or analysis['is_factual']:
            analysis['suggested_search_type'] = 'neural'  # Better for semantic understanding
        elif analysis['is_trending']:
            analysis['suggested_search_type'] = 'keyword'  # Better for current events

        return analysis

    def _matches_patterns(self, text: str, patterns: list) -> bool:
        """Return True if any regex in `patterns` is found in `text` (case-insensitive)."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def _select_provider(self, context: ResearchContext, analysis: Dict[str, Any]) -> ResearchProvider:
        """
        Select the provider based on context and query analysis.

        An explicit EXA/TAVILY preference is honoured when that provider has
        an API key; GOOGLE is always honoured. Otherwise (AUTO, HYBRID, or an
        unavailable explicit choice) the priority is Exa -> Tavily -> Google
        for all modes. `analysis` is only used for log output here.
        """
        preference = context.provider_preference

        # If user explicitly requested a provider, respect that
        if preference == ProviderPreference.EXA:
            if self.exa_available:
                return ResearchProvider.EXA
            logger.warning("Exa requested but not available, falling back")

        if preference == ProviderPreference.TAVILY:
            if self.tavily_available:
                return ResearchProvider.TAVILY
            logger.warning("Tavily requested but not available, falling back")

        if preference == ProviderPreference.GOOGLE:
            return ResearchProvider.GOOGLE

        # AUTO mode: Always prefer Exa -> Tavily -> Google.
        # ProviderPreference.HYBRID has no dedicated branch and also lands here.
        if self.exa_available:
            logger.info(
                f"Selected Exa (primary provider): query analysis shows "
                f"technical={analysis.get('is_technical', False)}, "
                f"trending={analysis.get('is_trending', False)}"
            )
            return ResearchProvider.EXA

        # Tavily as secondary option
        if self.tavily_available:
            logger.info(
                f"Selected Tavily (secondary): Exa unavailable, "
                f"trending={analysis.get('is_trending', False)}"
            )
            return ResearchProvider.TAVILY

        # Google grounding as fallback
        logger.info("Selected Google (fallback): Exa and Tavily unavailable")
        return ResearchProvider.GOOGLE

    def _build_config(
        self,
        context: ResearchContext,
        provider: ResearchProvider,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Build an optimized ResearchConfig for the selected provider.

        Args:
            context: Research context supplying depth, limits and filters.
            provider: Provider chosen by `_select_provider`.
            analysis: Result of `_analyze_query` for the context's query.

        Returns:
            ResearchConfig with common fields set, provider-specific fields
            filled in for Exa/Tavily, and domain filters applied.
        """
        personalization = context.personalization

        # Base config; statistics and expert quotes default to True when the
        # caller supplied no personalization.
        config = ResearchConfig(
            mode=self._mode_for_depth(context.depth),
            provider=provider,
            max_sources=context.max_sources,
            include_statistics=personalization.include_statistics if personalization else True,
            include_expert_quotes=personalization.include_expert_quotes if personalization else True,
            include_competitors=analysis['is_competitive'],
            include_trends=analysis['is_trending'],
        )

        # Provider-specific optimizations
        if provider == ResearchProvider.EXA:
            config = self._optimize_exa_config(config, context, analysis)
        elif provider == ResearchProvider.TAVILY:
            config = self._optimize_tavily_config(config, context, analysis)

        # Apply domain filters (only Exa and Tavily receive them)
        if context.include_domains:
            if provider == ResearchProvider.EXA:
                config.exa_include_domains = context.include_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_include_domains = context.include_domains[:self._TAVILY_MAX_INCLUDE_DOMAINS]

        if context.exclude_domains:
            if provider == ResearchProvider.EXA:
                config.exa_exclude_domains = context.exclude_domains
            elif provider == ResearchProvider.TAVILY:
                config.tavily_exclude_domains = context.exclude_domains[:self._TAVILY_MAX_EXCLUDE_DOMAINS]

        return config

    def _optimize_exa_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Add Exa-specific settings (category and search type) to `config`.

        Mutates and returns the same `config` object.
        """
        # Set category based on analysis
        if analysis['suggested_category']:
            config.exa_category = analysis['suggested_category']

        # Set search type
        config.exa_search_type = analysis.get('suggested_search_type', 'auto')

        # For comprehensive research, neural search overrides the suggestion
        if context.depth in self._DEEP_DEPTHS:
            config.exa_search_type = 'neural'

        return config

    def _optimize_tavily_config(
        self,
        config: ResearchConfig,
        context: ResearchContext,
        analysis: Dict[str, Any]
    ) -> ResearchConfig:
        """Add Tavily-specific settings (topic, depth, time range, extras) to `config`.

        Mutates and returns the same `config` object.
        """
        is_deep = context.depth in self._DEEP_DEPTHS

        # Set topic based on analysis
        config.tavily_topic = analysis.get('suggested_topic', 'general')

        # Set search depth based on research depth
        if is_deep:
            config.tavily_search_depth = 'advanced'  # 2 credits, but better results
            config.tavily_chunks_per_source = 3
        else:
            config.tavily_search_depth = 'basic'  # 1 credit

        # Set time range: an explicit recency wins over the trending heuristic.
        # Unknown recency strings are passed through unchanged.
        if context.recency:
            recency_map = {
                'day': 'd',
                'week': 'w',
                'month': 'm',
                'year': 'y',
            }
            config.tavily_time_range = recency_map.get(context.recency, context.recency)
        elif analysis['is_trending']:
            config.tavily_time_range = 'w'  # Last week for trending topics

        # Include answer for comprehensive research
        if is_deep:
            config.tavily_include_answer = 'advanced'

        # Include raw content for expert depth
        if context.depth == ResearchDepth.EXPERT:
            config.tavily_include_raw_content = 'markdown'

        return config

    def _build_advanced_config(self, context: ResearchContext) -> Tuple[ResearchProvider, ResearchConfig]:
        """
        Build config from raw advanced parameters.

        Used when advanced_mode=True and the user wants full control.

        Provider resolution:
        1. Infer from which raw parameters are set, trying Exa first and then
           Tavily, and skipping a provider that has no API key.
        2. An explicit provider preference overrides the inferred provider
           when that provider is available (GOOGLE is always available).
        3. Otherwise Google is used.

        Returns:
            Tuple of (selected_provider, config)
        """
        wants_exa = bool(context.exa_category or context.exa_search_type)
        wants_tavily = bool(context.tavily_topic or context.tavily_search_depth)

        # Determine provider from explicit parameters, following the same
        # Exa -> Tavily -> Google priority as the non-advanced path so that an
        # unavailable Exa does not hide usable Tavily parameters.
        provider = ResearchProvider.GOOGLE
        if wants_exa and self.exa_available:
            provider = ResearchProvider.EXA
        elif wants_tavily and self.tavily_available:
            provider = ResearchProvider.TAVILY
        elif wants_exa or wants_tavily:
            logger.warning("Advanced parameters target a provider that is not available, falling back to Google")

        # Check preference override
        preference = context.provider_preference
        if preference == ProviderPreference.EXA:
            if self.exa_available:
                provider = ResearchProvider.EXA
            else:
                logger.warning("Exa requested but not available, falling back")
        elif preference == ProviderPreference.TAVILY:
            if self.tavily_available:
                provider = ResearchProvider.TAVILY
            else:
                logger.warning("Tavily requested but not available, falling back")
        elif preference == ProviderPreference.GOOGLE:
            provider = ResearchProvider.GOOGLE

        mode = self._mode_for_depth(context.depth)

        # Build config with raw parameters; both providers' fields are filled
        # regardless of which provider was selected.
        config = ResearchConfig(
            mode=mode,
            provider=provider,
            max_sources=context.max_sources,
            # Exa
            exa_category=context.exa_category,
            exa_search_type=context.exa_search_type,
            exa_include_domains=context.include_domains,
            exa_exclude_domains=context.exclude_domains,
            # Tavily
            tavily_topic=context.tavily_topic,
            tavily_search_depth=context.tavily_search_depth,
            tavily_include_domains=(
                context.include_domains[:self._TAVILY_MAX_INCLUDE_DOMAINS] if context.include_domains else []
            ),
            tavily_exclude_domains=(
                context.exclude_domains[:self._TAVILY_MAX_EXCLUDE_DOMAINS] if context.exclude_domains else []
            ),
            tavily_include_answer=context.tavily_include_answer,
            tavily_include_raw_content=context.tavily_include_raw_content,
            tavily_time_range=context.tavily_time_range,
            tavily_country=context.tavily_country,
        )

        logger.info(f"Advanced config: provider={provider.value}, mode={mode.value}")
        return provider, config

View File

@@ -0,0 +1,198 @@
"""
Research Context Schema
Defines the unified input schema for the Research Engine.
Any tool (Blog Writer, Podcast Maker, YouTube Creator) can create a ResearchContext
and pass it to the Research Engine.
Author: ALwrity Team
Version: 2.0
"""
from enum import Enum
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
class ContentType(str, Enum):
    """Kind of content the research is for; members are plain strings."""

    # Long-form written content
    BLOG = "blog"
    # Audio / video formats
    PODCAST = "podcast"
    VIDEO = "video"
    # Short-form and direct channels
    SOCIAL = "social"
    EMAIL = "email"
    NEWSLETTER = "newsletter"
    # In-depth documents
    WHITEPAPER = "whitepaper"
    # Catch-all when the caller does not specify a type
    GENERAL = "general"
class ResearchGoal(str, Enum):
    """What the research is primarily trying to achieve; members are plain strings."""

    # Stats, data, citations
    FACTUAL = "factual"
    # Current trends, news
    TRENDING = "trending"
    # Competitor analysis
    COMPETITIVE = "competitive"
    # How-to, explanations
    EDUCATIONAL = "educational"
    # Stories, quotes
    INSPIRATIONAL = "inspirational"
    # Deep technical content
    TECHNICAL = "technical"
class ResearchDepth(str, Enum):
    """How deep the research should go; intended to map onto the existing ResearchMode."""

    # Fast, surface-level (maps to BASIC)
    QUICK = "quick"
    # Balanced depth (maps to BASIC with more sources)
    STANDARD = "standard"
    # Deep research (maps to COMPREHENSIVE)
    COMPREHENSIVE = "comprehensive"
    # Maximum depth with expert sources
    EXPERT = "expert"
class ProviderPreference(str, Enum):
    """Which search provider the caller wants; AUTO leaves the choice to the engine."""

    # AI decides based on query (default)
    AUTO = "auto"
    # Force Exa neural search
    EXA = "exa"
    # Force Tavily AI search
    TAVILY = "tavily"
    # Force Google grounding
    GOOGLE = "google"
    # Use multiple providers
    HYBRID = "hybrid"
class ResearchPersonalizationContext(BaseModel):
    """
    Personalization hints supplied by the calling tool.

    Lets a tool (Blog Writer, Podcast Maker, etc.) describe who the content
    is for and what it should contain, without the Research Engine needing
    to know anything about that tool's implementation. Every field is
    optional or has a default.
    """

    # --- Creator ---
    # Clerk user ID of whoever is creating the content
    creator_id: Optional[str] = None

    # --- Content context ---
    content_type: ContentType = ContentType.GENERAL
    industry: Optional[str] = None
    target_audience: Optional[str] = None
    # e.g. professional, casual, technical
    tone: Optional[str] = None

    # --- Persona data (from onboarding) ---
    persona_id: Optional[str] = None
    brand_voice: Optional[str] = None
    competitor_urls: List[str] = Field(default_factory=list)

    # --- Content requirements ---
    word_count_target: Optional[int] = None
    include_statistics: bool = True
    include_expert_quotes: bool = True
    include_case_studies: bool = False
    include_visuals: bool = False

    # --- Platform-specific hints ---
    # e.g. medium, wordpress, youtube, spotify
    platform: Optional[str] = None

    class Config:
        # Store enum fields as their underlying values
        use_enum_values = True
class ResearchContext(BaseModel):
    """
    Unified request schema for the Research Engine.

    A calling tool fills in the query plus whatever configuration,
    personalization and constraints it has, and hands the object to the
    engine. Raw provider parameters are only meant to be used when
    `advanced_mode` is True.
    """

    # --- Primary research input ---
    query: str = Field(..., description="Main research query or topic")
    keywords: List[str] = Field(default_factory=list, description="Additional keywords")

    # --- Research configuration ---
    goal: ResearchGoal = ResearchGoal.FACTUAL
    depth: ResearchDepth = ResearchDepth.STANDARD
    provider_preference: ProviderPreference = ProviderPreference.AUTO

    # --- Personalization from the calling tool ---
    personalization: Optional[ResearchPersonalizationContext] = None

    # --- Constraints ---
    max_sources: int = Field(default=10, ge=1, le=25)
    # "day", "week", "month", "year"; None means all-time
    recency: Optional[str] = None

    # --- Domain filtering ---
    include_domains: List[str] = Field(default_factory=list)
    exclude_domains: List[str] = Field(default_factory=list)

    # --- Advanced mode (exposes raw provider parameters) ---
    advanced_mode: bool = False

    # Raw provider parameters (only used if advanced_mode=True)
    # Exa-specific
    exa_category: Optional[str] = None
    # auto, keyword, neural
    exa_search_type: Optional[str] = None

    # Tavily-specific
    # general, news, finance
    tavily_topic: Optional[str] = None
    # basic, advanced
    tavily_search_depth: Optional[str] = None
    tavily_include_answer: bool = False
    tavily_include_raw_content: bool = False
    tavily_time_range: Optional[str] = None
    tavily_country: Optional[str] = None

    class Config:
        # Store enum fields as their underlying values
        use_enum_values = True

    def get_effective_query(self) -> str:
        """Return the query, with any keywords appended space-separated."""
        if not self.keywords:
            return self.query
        joined_keywords = ' '.join(self.keywords)
        return f"{self.query} {joined_keywords}"

    def get_industry(self) -> str:
        """Return the personalization industry, or "General" when unset."""
        profile = self.personalization
        industry = profile.industry if profile else None
        return industry or "General"

    def get_audience(self) -> str:
        """Return the personalization target audience, or "General" when unset."""
        profile = self.personalization
        audience = profile.target_audience if profile else None
        return audience or "General"

    def get_user_id(self) -> Optional[str]:
        """Return the creator ID from personalization, or None without personalization."""
        return self.personalization.creator_id if self.personalization else None
class ResearchResult(BaseModel):
    """
    Standardized response schema of the Research Engine.

    Every field has a default, so a result can be built incrementally;
    any tool can consume the same shape regardless of provider.
    """

    success: bool = True

    # --- Content ---
    # AI-generated summary of findings
    summary: Optional[str] = None
    # Raw aggregated content for LLM processing
    raw_content: Optional[str] = None

    # --- Sources ---
    sources: List[Dict[str, Any]] = Field(default_factory=list)

    # --- Analysis (reuses existing blog writer analysis) ---
    keyword_analysis: Dict[str, Any] = Field(default_factory=dict)
    competitor_analysis: Dict[str, Any] = Field(default_factory=dict)
    suggested_angles: List[str] = Field(default_factory=list)

    # --- Metadata ---
    # Which provider was actually used
    provider_used: str = "google"
    search_queries: List[str] = Field(default_factory=list)
    grounding_metadata: Optional[Dict[str, Any]] = None

    # --- Cost tracking ---
    estimated_cost: float = 0.0

    # --- Error handling ---
    error_message: Optional[str] = None
    error_code: Optional[str] = None
    retry_suggested: bool = False

    # --- Original context for reference ---
    original_query: Optional[str] = None

    class Config:
        # Store enum fields as their underlying values
        use_enum_values = True

View File

@@ -0,0 +1,558 @@
"""
Research Engine - Core Orchestrator
The main entry point for AI research across all ALwrity tools.
This engine wraps existing providers (Exa, Tavily, Google) and provides
a unified interface for any content generation tool.
Usage:
    from services.research.core import (
        ResearchEngine,
        ResearchContext,
        ResearchPersonalizationContext,
        ContentType,
    )
    engine = ResearchEngine()
    result = await engine.research(ResearchContext(
        query="AI trends in healthcare 2025",
        personalization=ResearchPersonalizationContext(
            content_type=ContentType.PODCAST,
            industry="Healthcare",
            target_audience="Medical professionals"
        )
    ))
Author: ALwrity Team
Version: 2.0
"""
import os
import time
from typing import Dict, Any, Optional, Callable
from loguru import logger
from .research_context import (
ResearchContext,
ResearchResult,
ResearchDepth,
ContentType,
ResearchPersonalizationContext,
)
from .parameter_optimizer import ParameterOptimizer
# Reuse existing blog writer models and services
from models.blog_models import (
BlogResearchRequest,
BlogResearchResponse,
ResearchConfig,
ResearchProvider,
ResearchMode,
PersonaInfo,
ResearchSource,
)
# Research persona for personalization
from models.research_persona_models import ResearchPersona
class ResearchEngine:
"""
AI Research Engine - Standalone module for content research.
This engine:
1. Accepts a ResearchContext from any tool
2. Uses AI to optimize parameters for Exa/Tavily
3. Integrates research persona for personalization
4. Executes research using existing providers
5. Returns standardized ResearchResult
Can be imported by Blog Writer, Podcast Maker, YouTube Creator, etc.
"""
def __init__(self, db_session=None):
    """Set up the optimizer, empty provider slots and availability flags.

    Args:
        db_session: Optional database session stored on the instance for
            later use by methods that need one.
    """
    self.optimizer = ParameterOptimizer()

    # Provider slots start empty; nothing in this method creates them
    # (presumably filled in later -- confirm against the rest of the class).
    self._providers_initialized = False
    self._exa_provider = self._tavily_provider = self._google_provider = None
    self._db_session = db_session

    # A provider counts as available when its API key env var is non-empty
    exa_key, tavily_key = (os.getenv(name) for name in ("EXA_API_KEY", "TAVILY_API_KEY"))
    self.exa_available = bool(exa_key)
    self.tavily_available = bool(tavily_key)

    logger.info(f"ResearchEngine initialized: exa={self.exa_available}, tavily={self.tavily_available}")
def _get_research_persona(self, user_id: str, generate_if_missing: bool = True) -> Optional[ResearchPersona]:
"""
Fetch research persona for user, generating if missing.
Phase 2: Since onboarding is mandatory and always completes before accessing
any tool, we can safely generate research persona on first use. This ensures
hyper-personalization without requiring "General" fallbacks.
Args:
user_id: User ID (Clerk string)
generate_if_missing: If True, generate persona if not cached (default: True)
Returns:
ResearchPersona if successful, None only if user has no core persona
"""
if not user_id:
return None
try:
from services.research.research_persona_service import ResearchPersonaService
db = self._db_session
if not db:
from services.database import get_db_session
db = get_db_session()
persona_service = ResearchPersonaService(db_session=db)
if generate_if_missing:
# Phase 2: Use get_or_generate() to create persona on first visit
# This triggers LLM call if not cached, but onboarding guarantees
# core persona exists, so generation will succeed
logger.info(f"🔄 Getting/generating research persona for user {user_id}...")
persona = persona_service.get_or_generate(user_id, force_refresh=False)
if persona:
logger.info(f"✅ Research persona ready for user {user_id}: industry={persona.default_industry}")
else:
logger.warning(f"⚠️ Could not get/generate research persona for user {user_id} - using core persona fallback")
else:
# Fast path: only return cached (for config endpoints)
persona = persona_service.get_cached_only(user_id)
if persona:
logger.debug(f"Research persona loaded from cache for user {user_id}")
return persona
except Exception as e:
logger.warning(f"Failed to load research persona for user {user_id}: {e}")
return None
def _enrich_context_with_persona(
self,
context: ResearchContext,
persona: ResearchPersona
) -> ResearchContext:
"""
Enrich the research context with persona data.
Only applies persona defaults if the context doesn't already have values.
User-provided values always take precedence.
"""
# Create personalization context if not exists
if not context.personalization:
context.personalization = ResearchPersonalizationContext()
# Apply persona defaults only if not already set
if not context.personalization.industry or context.personalization.industry == "General":
if persona.default_industry:
context.personalization.industry = persona.default_industry
logger.debug(f"Applied persona industry: {persona.default_industry}")
if not context.personalization.target_audience or context.personalization.target_audience == "General":
if persona.default_target_audience:
context.personalization.target_audience = persona.default_target_audience
logger.debug(f"Applied persona target_audience: {persona.default_target_audience}")
# Apply suggested Exa domains if not already set
if not context.include_domains and persona.suggested_exa_domains:
context.include_domains = persona.suggested_exa_domains[:6] # Limit to 6 domains
logger.debug(f"Applied persona domains: {context.include_domains}")
# Apply suggested Exa category if not already set
if not context.exa_category and persona.suggested_exa_category:
context.exa_category = persona.suggested_exa_category
logger.debug(f"Applied persona exa_category: {persona.suggested_exa_category}")
return context
async def research(
    self,
    context: ResearchContext,
    progress_callback: Optional[Callable[[str], None]] = None
) -> ResearchResult:
    """
    Execute research based on the given context.

    Steps: enrich the context with the user's research persona (when a user
    id is present), let the optimizer pick a provider and config, run the
    provider-specific search, and convert the response to a ResearchResult.

    The provider helpers may fall back to another provider and record that
    by rewriting ``config.provider``; the result's ``provider_used`` reflects
    the provider that actually produced the response, not just the one that
    was initially selected.

    Args:
        context: Research context with query, goals, and personalization
        progress_callback: Optional callback for progress updates

    Returns:
        ResearchResult with sources, analysis, and content. Any exception is
        caught and reported as a failed ResearchResult (``success=False``,
        ``error_code="RESEARCH_FAILED"``) rather than raised.
    """
    # Monotonic clock: elapsed time is unaffected by system clock adjustments.
    start_time = time.perf_counter()

    try:
        self._progress(progress_callback, "🔍 Analyzing research query...")

        # Enrich context with research persona (Phase 2: generate if missing)
        user_id = context.get_user_id()
        if user_id:
            self._progress(progress_callback, "👤 Loading personalized research profile...")
            persona = self._get_research_persona(user_id, generate_if_missing=True)
            if persona:
                self._progress(progress_callback, "✨ Applying hyper-personalized settings...")
                context = self._enrich_context_with_persona(context, persona)
            else:
                logger.warning(f"No research persona available for user {user_id} - proceeding with provided context")

        # Optimize parameters based on enriched context
        provider, config = self.optimizer.optimize(context)
        self._progress(progress_callback, f"🤖 Selected {provider.value.upper()} for research")

        # Build the request using existing blog models
        request = self._build_request(context, config)
        user_id = context.get_user_id() or ""

        # Snapshot so a fallback (which rewrites config.provider) can be detected.
        provider_before = getattr(config, "provider", None)

        # Execute research using appropriate provider
        self._progress(progress_callback, f"🌐 Connecting to {provider.value} search...")
        if provider == ResearchProvider.EXA:
            response = await self._execute_exa_research(request, config, user_id, progress_callback)
        elif provider == ResearchProvider.TAVILY:
            response = await self._execute_tavily_research(request, config, user_id, progress_callback)
        else:
            response = await self._execute_google_research(request, config, user_id, progress_callback)

        # If config.provider changed during execution, a fallback happened.
        actual_provider = provider
        provider_after = getattr(config, "provider", None)
        if provider_after is not None and provider_after != provider_before:
            try:
                actual_provider = ResearchProvider(provider_after)
            except ValueError:
                # Unrecognised value: keep reporting the selected provider.
                actual_provider = provider
            else:
                logger.info(f"Research fell back from {provider.value} to {actual_provider.value}")

        # Transform response to ResearchResult
        self._progress(progress_callback, "📊 Processing results...")
        result = self._transform_response(response, actual_provider, context)

        duration_ms = (time.perf_counter() - start_time) * 1000
        logger.info(f"Research completed in {duration_ms:.0f}ms: {len(result.sources)} sources")
        self._progress(progress_callback, f"✅ Research complete: {len(result.sources)} sources found")

        return result

    except Exception as e:
        # Top-level boundary: callers get a structured failure, not an exception.
        logger.error(f"Research failed: {e}")
        return ResearchResult(
            success=False,
            error_message=str(e),
            error_code="RESEARCH_FAILED",
            retry_suggested=True,
            original_query=context.query
        )
def _progress(self, callback: Optional[Callable[[str], None]], message: str):
    """
    Report a progress message.

    The message is passed to ``callback`` when one is given, and is always
    written to the log with a "[Research]" prefix.
    """
    log_line = f"[Research] {message}"
    if callback:
        callback(message)
    logger.info(log_line)
def _build_request(self, context: ResearchContext, config: ResearchConfig) -> BlogResearchRequest:
    """
    Translate a ResearchContext plus config into a BlogResearchRequest.

    Keywords default to the query itself when the context has none. Persona
    info, tone and word-count target are taken from the context's
    personalization when present; otherwise persona and tone are None and
    the word-count target is 1500.
    """
    # Fall back to the raw query as the single keyword
    keywords = context.keywords or [context.query]

    profile = context.personalization
    if profile:
        persona = PersonaInfo(
            persona_id=profile.persona_id,
            tone=profile.tone,
            audience=profile.target_audience,
            industry=profile.industry,
        )
        tone = profile.tone
        word_count_target = profile.word_count_target
    else:
        persona = None
        tone = None
        word_count_target = 1500

    return BlogResearchRequest(
        keywords=keywords,
        topic=context.query,
        industry=context.get_industry(),
        target_audience=context.get_audience(),
        tone=tone,
        word_count_target=word_count_target,
        persona=persona,
        research_mode=config.mode,
        config=config,
    )
async def _execute_exa_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """
    Run the research through the Exa provider.

    Builds the research prompt with the strategy for ``config.mode``, runs
    the Exa search, records the usage cost (0.005 when the result carries no
    cost dict) and passes the raw result to the shared analysis step. When
    Exa raises a RuntimeError saying its API key is not configured,
    ``config.provider`` is switched to Tavily and the Tavily path is used
    instead; any other RuntimeError propagates.
    """
    from services.blog_writer.research.exa_provider import ExaResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode

    self._progress(progress_callback, "🔍 Executing Exa neural search...")

    # Prompt inputs, with "General" standing in for missing industry/audience
    strategy = get_strategy_for_mode(config.mode)
    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"
    target_audience = request.target_audience or "General"
    research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)

    try:
        exa = ExaResearchProvider()
        raw_result = await exa.search(
            research_prompt, topic, industry, target_audience, config, user_id
        )

        # Track usage
        cost_info = raw_result.get('cost')
        cost = cost_info.get('total', 0.005) if isinstance(cost_info, dict) else 0.005
        exa.track_exa_usage(user_id, cost)

        found = len(raw_result.get('sources', []))
        self._progress(progress_callback, f"📝 Found {found} sources")

        # Run common analysis
        return await self._run_analysis(request, raw_result, config, user_id, progress_callback)

    except RuntimeError as e:
        # Only a missing API key triggers the fallback; everything else re-raises.
        if "EXA_API_KEY not configured" not in str(e):
            raise
        logger.warning("Exa not configured, falling back to Tavily")
        self._progress(progress_callback, "⚠️ Exa unavailable, trying Tavily...")
        config.provider = ResearchProvider.TAVILY
        return await self._execute_tavily_research(request, config, user_id, progress_callback)
async def _execute_tavily_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """
    Run the research through the Tavily provider.

    Builds the research prompt with the strategy for ``config.mode``, runs
    the Tavily search, records the usage cost (0.001 when the result carries
    no cost dict) together with the search depth ("basic" when unset), and
    passes the raw result to the shared analysis step. When Tavily raises a
    RuntimeError saying its API key is not configured, ``config.provider`` is
    switched to Google and the Google path is used instead; any other
    RuntimeError propagates.
    """
    from services.blog_writer.research.tavily_provider import TavilyResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode

    self._progress(progress_callback, "🔍 Executing Tavily AI search...")

    # Prompt inputs, with "General" standing in for missing industry/audience
    strategy = get_strategy_for_mode(config.mode)
    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"
    target_audience = request.target_audience or "General"
    research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)

    try:
        tavily = TavilyResearchProvider()
        raw_result = await tavily.search(
            research_prompt, topic, industry, target_audience, config, user_id
        )

        # Track usage
        cost_info = raw_result.get('cost')
        cost = cost_info.get('total', 0.001) if isinstance(cost_info, dict) else 0.001
        search_depth = config.tavily_search_depth or "basic"
        tavily.track_tavily_usage(user_id, cost, search_depth)

        found = len(raw_result.get('sources', []))
        self._progress(progress_callback, f"📝 Found {found} sources")

        # Run common analysis
        return await self._run_analysis(request, raw_result, config, user_id, progress_callback)

    except RuntimeError as e:
        # Only a missing API key triggers the fallback; everything else re-raises.
        if "TAVILY_API_KEY not configured" not in str(e):
            raise
        logger.warning("Tavily not configured, falling back to Google")
        self._progress(progress_callback, "⚠️ Tavily unavailable, using Google Search...")
        config.provider = ResearchProvider.GOOGLE
        return await self._execute_google_research(request, config, user_id, progress_callback)
async def _execute_google_research(
    self,
    request: BlogResearchRequest,
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> BlogResearchResponse:
    """
    Run the research through the Google/Gemini grounding provider.

    Builds the research prompt with the strategy for ``config.mode``, runs
    the Google search and passes the raw result to the shared analysis step
    with ``is_google=True``. No fallback is attempted here; provider errors
    propagate to the caller.
    """
    from services.blog_writer.research.google_provider import GoogleResearchProvider
    from services.blog_writer.research.research_strategies import get_strategy_for_mode

    self._progress(progress_callback, "🔍 Executing Google Search grounding...")

    # Prompt inputs, with "General" standing in for missing industry/audience
    strategy = get_strategy_for_mode(config.mode)
    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"
    target_audience = request.target_audience or "General"
    research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)

    google = GoogleResearchProvider()
    raw_result = await google.search(
        research_prompt, topic, industry, target_audience, config, user_id
    )

    self._progress(progress_callback, "📝 Processing grounded results...")

    # Run common analysis
    return await self._run_analysis(
        request, raw_result, config, user_id, progress_callback, is_google=True
    )
async def _run_analysis(
    self,
    request: BlogResearchRequest,
    raw_result: Dict[str, Any],
    config: ResearchConfig,
    user_id: str,
    progress_callback: Optional[Callable[[str], None]] = None,
    is_google: bool = False
) -> BlogResearchResponse:
    """
    Run the shared post-search analysis on a provider's raw result.

    Extracts content, sources and search queries from ``raw_result``, runs
    the keyword, competitor and content-angle analyzers on the content,
    assembles a BlogResearchResponse and passes it through the research data
    filter.

    Missing or ``None`` values for content, sources and search queries are
    normalised to empty values for every provider, so a provider that
    returns explicit ``None`` does not break iteration or response building.

    Args:
        request: The research request (keywords, topic, industry).
        raw_result: Raw provider output as a dict.
        config: Research configuration (not read by this method).
        user_id: User ID forwarded to the analyzers.
        progress_callback: Optional callback for progress updates.
        is_google: True when ``raw_result`` comes from Google grounding, in
            which case sources and grounding metadata are extracted with the
            grounding helpers.

    Returns:
        The filtered BlogResearchResponse.
    """
    from services.blog_writer.research.keyword_analyzer import KeywordAnalyzer
    from services.blog_writer.research.competitor_analyzer import CompetitorAnalyzer
    from services.blog_writer.research.content_angle_generator import ContentAngleGenerator
    from services.blog_writer.research.data_filter import ResearchDataFilter

    self._progress(progress_callback, "🔍 Analyzing keywords and content angles...")

    # Extract content for analysis (None is treated like missing)
    content = raw_result.get("content") or ""
    search_queries = raw_result.get("search_queries") or []
    if is_google:
        sources = self._extract_sources_from_grounding(raw_result)
        grounding_metadata = self._extract_grounding_metadata(raw_result)
    else:
        # Providers may hand back plain dicts or ready-made ResearchSource objects.
        sources = [
            ResearchSource(**s) if isinstance(s, dict) else s
            for s in (raw_result.get("sources") or [])
        ]
        grounding_metadata = None

    topic = request.topic or ", ".join(request.keywords)
    industry = request.industry or "General"

    # Run analyzers
    keyword_analyzer = KeywordAnalyzer()
    competitor_analyzer = CompetitorAnalyzer()
    content_angle_generator = ContentAngleGenerator()
    data_filter = ResearchDataFilter()

    keyword_analysis = keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
    competitor_analysis = competitor_analyzer.analyze(content, user_id=user_id)
    suggested_angles = content_angle_generator.generate(content, topic, industry, user_id=user_id)

    # Build response
    response = BlogResearchResponse(
        success=True,
        sources=sources,
        keyword_analysis=keyword_analysis,
        competitor_analysis=competitor_analysis,
        suggested_angles=suggested_angles,
        search_widget="",
        search_queries=search_queries,
        grounding_metadata=grounding_metadata,
        original_keywords=request.keywords,
    )

    # Filter and clean research data
    self._progress(progress_callback, "✨ Filtering and optimizing results...")
    return data_filter.filter_research_data(response)
def _extract_sources_from_grounding(self, gemini_result: Dict[str, Any]) -> list:
"""Extract sources from Gemini grounding metadata."""
from models.blog_models import ResearchSource
sources = []
if not gemini_result or not isinstance(gemini_result, dict):
return sources
raw_sources = gemini_result.get("sources", []) or []
for src in raw_sources:
source = ResearchSource(
title=src.get("title", "Untitled"),
url=src.get("url", ""),
excerpt=src.get("content", "")[:500] if src.get("content") else f"Source from {src.get('title', 'web')}",
credibility_score=float(src.get("credibility_score", 0.8)),
published_at=str(src.get("publication_date", "2024-01-01")),
index=src.get("index"),
source_type=src.get("type", "web")
)
sources.append(source)
return sources
def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract grounding metadata from Gemini result."""
if not gemini_result or not isinstance(gemini_result, dict):
return None
return gemini_result.get("grounding_metadata")
def _transform_response(
    self,
    response: BlogResearchResponse,
    provider: ResearchProvider,
    context: ResearchContext
) -> ResearchResult:
    """
    Convert a BlogResearchResponse into the engine's ResearchResult.

    Sources and grounding metadata are turned into plain dicts where they
    expose a ``dict()`` method; the remaining fields are copied across, with
    ``provider.value`` recorded as the provider used and ``context.query`` as
    the original query.
    """
    def _as_dict(source):
        # Objects exposing dict() serialise themselves; dicts pass through;
        # anything else is reduced to title/url/excerpt.
        if hasattr(source, 'dict'):
            return source.dict()
        if isinstance(source, dict):
            return source
        return {
            'title': getattr(source, 'title', ''),
            'url': getattr(source, 'url', ''),
            'excerpt': getattr(source, 'excerpt', ''),
        }

    sources = [_as_dict(s) for s in response.sources]

    # Grounding metadata stays None when the response carries none
    grounding = None
    metadata = response.grounding_metadata
    if metadata:
        grounding = metadata.dict() if hasattr(metadata, 'dict') else metadata

    return ResearchResult(
        success=response.success,
        sources=sources,
        keyword_analysis=response.keyword_analysis,
        competitor_analysis=response.competitor_analysis,
        suggested_angles=response.suggested_angles,
        provider_used=provider.value,
        search_queries=response.search_queries,
        grounding_metadata=grounding,
        original_query=context.query,
        error_message=response.error_message,
        error_code=getattr(response, 'error_code', None),
        retry_suggested=getattr(response, 'retry_suggested', False),
    )
def get_provider_status(self) -> Dict[str, Any]:
    """
    Describe each research provider: availability, priority and a short label.

    Exa and Tavily availability come from the flags set at construction;
    Google is always reported as available.
    """
    def _entry(available, priority, description):
        return {
            "available": available,
            "priority": priority,
            "description": description,
        }

    status = {}
    status["exa"] = _entry(self.exa_available, 1, "Neural search for semantic understanding")
    status["tavily"] = _entry(self.tavily_available, 2, "AI-powered web search")
    # Always available via Gemini
    status["google"] = _entry(True, 3, "Google Search grounding")
    return status