Files
ALwrity/backend/services/research/core/research_engine.py
ajaysi 928c2f20aa fix: WYSIWYG editor, content generation, and writing assistant bug fixes
- Fix text selection menu not showing: wire contentRef via inputRef on multiline TextField
- Fix blog title not truncating: add min-w-0 for flex item overflow
- Fix outline generation 500: escape curly braces in f-string prompt template
- Fix content generation 'NoneType not callable': replace SessionLocal() with get_session_for_user(), add db param to MediumBlogGenerator, fix signature mismatch in database_task_manager
- Fix writing assistant suggest 500: add auth + user_id to API endpoint and service, replace sync requests with httpx.AsyncClient
- Fix hallucination detector 404: explicitly include router in main.py and app.py
- Fix missing error_data in task failure responses
- Hide CopilotKit web inspector button
- Remove hardcoded fallback suggestions from SmartTypingAssist
- Fix stale closure refs in SmartTypingAssist handleTypingChange
- Add two-column editor layout, stats bar, section hover menu
- Various subscription, billing, and research module improvements
2026-05-14 09:11:51 +05:30

613 lines
26 KiB
Python

"""
Research Engine - Core Orchestrator
The main entry point for AI research across all ALwrity tools.
This engine wraps existing providers (Exa, Tavily, Google) and provides
a unified interface for any content generation tool.
Usage:
from services.research.core import ResearchEngine, ResearchContext, ContentType
engine = ResearchEngine()
result = await engine.research(ResearchContext(
query="AI trends in healthcare 2025",
content_type=ContentType.PODCAST,
personalization=ResearchPersonalizationContext(
industry="Healthcare",
target_audience="Medical professionals"
)
))
Author: ALwrity Team
Version: 2.0
"""
import os
import time
from typing import Dict, Any, Optional, Callable
from loguru import logger
from .research_context import (
ResearchContext,
ResearchResult,
ResearchDepth,
ContentType,
ResearchPersonalizationContext,
)
from .parameter_optimizer import ParameterOptimizer
# Reuse existing blog writer models and services
from models.blog_models import (
BlogResearchRequest,
BlogResearchResponse,
ResearchConfig,
ResearchProvider,
ResearchMode,
PersonaInfo,
ResearchSource,
)
# Research persona for personalization
from models.research_persona_models import ResearchPersona
class ResearchEngine:
    """
    AI Research Engine - Standalone module for content research.

    This engine:
    1. Accepts a ResearchContext from any tool
    2. Uses AI to optimize parameters for Exa/Tavily
    3. Integrates research persona for personalization
    4. Executes research using existing providers
    5. Returns standardized ResearchResult

    Can be imported by Blog Writer, Podcast Maker, YouTube Creator, etc.
    """

    def __init__(self, db_session=None):
        """Initialize the Research Engine.

        Args:
            db_session: Optional database session reused for persona lookups
                and subscription preflight checks. When omitted, a session is
                requested on demand from ``services.database.get_db_session``
                and closed again once the operation finishes.
        """
        self.optimizer = ParameterOptimizer()
        self._providers_initialized = False
        self._exa_provider = None
        self._tavily_provider = None
        self._google_provider = None
        self._db_session = db_session

        # Check provider availability
        self.exa_available = bool(os.getenv("EXA_API_KEY"))
        self.tavily_available = bool(os.getenv("TAVILY_API_KEY"))
        logger.info(f"ResearchEngine initialized: exa={self.exa_available}, tavily={self.tavily_available}")

    # ------------------------------------------------------------------
    # Database session helpers
    # ------------------------------------------------------------------

    def _acquire_db_session(self):
        """Return ``(session, owned)`` for a single database operation.

        ``owned`` is True only when the session was created here instead of
        being injected through the constructor; the caller is then
        responsible for handing it to ``_release_db_session``.
        """
        if self._db_session:
            return self._db_session, False
        from services.database import get_db_session
        db = get_db_session()
        return db, bool(db)

    @staticmethod
    def _release_db_session(db, owned: bool) -> None:
        """Close ``db`` if this engine created it; never touch an injected session."""
        if not owned or db is None:
            return
        close = getattr(db, "close", None)
        if close is None:
            return
        try:
            close()
        except Exception as e:
            # Cleanup must never mask the result of the actual operation.
            logger.debug(f"[ResearchEngine] Failed to close database session: {e}")

    # ------------------------------------------------------------------
    # Persona handling
    # ------------------------------------------------------------------

    def _get_research_persona(self, user_id: str, generate_if_missing: bool = True) -> Optional["ResearchPersona"]:
        """
        Fetch research persona for user, generating if missing.

        Phase 2: Since onboarding is mandatory and always completes before accessing
        any tool, we can safely generate research persona on first use. This ensures
        hyper-personalization without requiring "General" fallbacks.

        Args:
            user_id: User ID (Clerk string)
            generate_if_missing: If True, generate persona if not cached (default: True)

        Returns:
            ResearchPersona if successful, None if ``user_id`` is empty, the
            service returned nothing, or any error occurred while loading.
        """
        if not user_id:
            return None

        db, owned = None, False
        try:
            from services.research.research_persona_service import ResearchPersonaService

            db, owned = self._acquire_db_session()
            persona_service = ResearchPersonaService(db_session=db)

            if generate_if_missing:
                # Phase 2: Use get_or_generate() to create persona on first visit
                # This triggers LLM call if not cached, but onboarding guarantees
                # core persona exists, so generation will succeed
                logger.info(f"🔄 Getting/generating research persona for user {user_id}...")
                persona = persona_service.get_or_generate(user_id, force_refresh=False)
                if persona:
                    logger.info(f"✅ Research persona ready for user {user_id}: industry={persona.default_industry}")
                else:
                    logger.warning(f"⚠️ Could not get/generate research persona for user {user_id} - using core persona fallback")
            else:
                # Fast path: only return cached (for config endpoints)
                persona = persona_service.get_cached_only(user_id)
                if persona:
                    logger.debug(f"Research persona loaded from cache for user {user_id}")

            return persona
        except Exception as e:
            logger.warning(f"Failed to load research persona for user {user_id}: {e}")
            return None
        finally:
            # NOTE(review): assumes the returned persona does not lazy-load
            # from the session after this call - confirm against
            # ResearchPersonaService.
            self._release_db_session(db, owned)

    def _enrich_context_with_persona(
        self,
        context: "ResearchContext",
        persona: "ResearchPersona"
    ) -> "ResearchContext":
        """
        Enrich the research context with persona data.

        Only applies persona defaults if the context doesn't already have values
        (an industry or audience of "General" counts as not set).
        User-provided values always take precedence. The context is modified
        in place and returned.
        """
        # Create personalization context if not exists
        if not context.personalization:
            context.personalization = ResearchPersonalizationContext()
        personalization = context.personalization

        # Apply persona defaults only if not already set
        if not personalization.industry or personalization.industry == "General":
            if persona.default_industry:
                personalization.industry = persona.default_industry
                logger.debug(f"Applied persona industry: {persona.default_industry}")

        if not personalization.target_audience or personalization.target_audience == "General":
            if persona.default_target_audience:
                personalization.target_audience = persona.default_target_audience
                logger.debug(f"Applied persona target_audience: {persona.default_target_audience}")

        # Apply suggested Exa domains if not already set
        if not context.include_domains and persona.suggested_exa_domains:
            context.include_domains = persona.suggested_exa_domains[:6]  # Limit to 6 domains
            logger.debug(f"Applied persona domains: {context.include_domains}")

        # Apply suggested Exa category if not already set
        if not context.exa_category and persona.suggested_exa_category:
            context.exa_category = persona.suggested_exa_category
            logger.debug(f"Applied persona exa_category: {persona.suggested_exa_category}")

        return context

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    async def research(
        self,
        context: "ResearchContext",
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> "ResearchResult":
        """
        Execute research based on the given context.

        Args:
            context: Research context with query, goals, and personalization
            progress_callback: Optional callback for progress updates

        Returns:
            ResearchResult with sources, analysis, and content. Any exception
            raised along the way is logged and converted into a result with
            ``success=False`` and ``error_code="RESEARCH_FAILED"``.
        """
        start_time = time.time()

        try:
            # Progress update
            self._progress(progress_callback, "🔍 Analyzing research query...")

            # Enrich context with research persona (Phase 2: generate if missing)
            user_id = context.get_user_id()
            if user_id:
                self._progress(progress_callback, "👤 Loading personalized research profile...")
                persona = self._get_research_persona(user_id, generate_if_missing=True)
                if persona:
                    self._progress(progress_callback, "✨ Applying hyper-personalized settings...")
                    context = self._enrich_context_with_persona(context, persona)
                else:
                    logger.warning(f"No research persona available for user {user_id} - proceeding with provided context")

            # Optimize parameters based on enriched context
            provider, config = self.optimizer.optimize(context)
            self._progress(progress_callback, f"🤖 Selected {provider.value.upper()} for research")

            # Build the request using existing blog models
            request = self._build_request(context, config)
            user_id = context.get_user_id() or ""

            # Execute research using appropriate provider
            self._progress(progress_callback, f"🌐 Connecting to {provider.value} search...")
            if provider == ResearchProvider.EXA:
                response = await self._execute_exa_research(request, config, user_id, progress_callback)
            elif provider == ResearchProvider.TAVILY:
                response = await self._execute_tavily_research(request, config, user_id, progress_callback)
            else:
                response = await self._execute_google_research(request, config, user_id, progress_callback)

            # Transform response to ResearchResult
            self._progress(progress_callback, "📊 Processing results...")
            result = self._transform_response(response, provider, context)

            duration_ms = (time.time() - start_time) * 1000
            logger.info(f"Research completed in {duration_ms:.0f}ms: {len(result.sources)} sources")
            self._progress(progress_callback, f"✅ Research complete: {len(result.sources)} sources found")
            return result

        except Exception as e:
            logger.error(f"Research failed: {e}")
            return ResearchResult(
                success=False,
                error_message=str(e),
                error_code="RESEARCH_FAILED",
                retry_suggested=True,
                original_query=context.query
            )

    def _progress(self, callback: Optional[Callable[[str], None]], message: str):
        """Send progress update if callback provided, and always log it."""
        if callback:
            callback(message)
        logger.info(f"[Research] {message}")

    def _build_request(self, context: "ResearchContext", config: "ResearchConfig") -> "BlogResearchRequest":
        """Build BlogResearchRequest from ResearchContext."""
        personalization = context.personalization

        # Fall back to the raw query when no explicit keywords were supplied
        keywords = context.keywords if context.keywords else [context.query]

        # Build persona info from personalization
        persona = None
        if personalization:
            persona = PersonaInfo(
                persona_id=personalization.persona_id,
                tone=personalization.tone,
                audience=personalization.target_audience,
                industry=personalization.industry,
            )

        return BlogResearchRequest(
            keywords=keywords,
            topic=context.query,
            industry=context.get_industry(),
            target_audience=context.get_audience(),
            tone=personalization.tone if personalization else None,
            word_count_target=personalization.word_count_target if personalization else 1500,
            persona=persona,
            research_mode=config.mode,
            config=config,
        )

    # ------------------------------------------------------------------
    # Shared helpers for provider execution
    # ------------------------------------------------------------------

    @staticmethod
    def _prepare_search_inputs(request: "BlogResearchRequest", config: "ResearchConfig"):
        """Return ``(research_prompt, topic, industry, target_audience)`` for a provider search.

        Missing topic falls back to the joined keywords; missing industry or
        audience falls back to "General".
        """
        from services.blog_writer.research.research_strategies import get_strategy_for_mode

        strategy = get_strategy_for_mode(config.mode)
        topic = request.topic or ", ".join(request.keywords)
        industry = request.industry or "General"
        target_audience = request.target_audience or "General"
        research_prompt = strategy.build_research_prompt(topic, industry, target_audience, config)
        return research_prompt, topic, industry, target_audience

    @staticmethod
    def _extract_cost(raw_result: Dict[str, Any], default: float):
        """Return ``raw_result['cost']['total']`` when cost is a dict, else ``default``."""
        cost_info = raw_result.get('cost')
        if isinstance(cost_info, dict):
            return cost_info.get('total', default)
        return default

    def _preflight_usage_check(self, user_id: str, provider_name: str) -> None:
        """Verify the user's subscription allows a call to ``provider_name``.

        Args:
            user_id: User the usage is attributed to.
            provider_name: Lower-case provider key ("exa" or "tavily"); its
                upper-case form selects the ``APIProvider`` member.

        Raises:
            HTTPException: status 429 when the pricing service reports that
                the usage limit is exhausted.

        Any other failure of the check itself is logged and ignored so that a
        broken pricing lookup does not block research.
        """
        # NOTE(review): the original code referenced HTTPException without
        # importing it anywhere in this module, so the ``except HTTPException``
        # clause itself raised NameError. FastAPI's exception is assumed to be
        # the intended one - confirm.
        from fastapi import HTTPException

        label = provider_name.capitalize()
        db, owned = None, False
        try:
            db, owned = self._acquire_db_session()
            if db:
                from services.subscription import PricingService
                from models.subscription_models import APIProvider

                pricing_service = PricingService(db)
                can_proceed, message, usage_info = pricing_service.check_usage_limits(
                    user_id=user_id,
                    provider=getattr(APIProvider, provider_name.upper()),
                    tokens_requested=0,
                    actual_provider_name=provider_name,
                )
                if not can_proceed:
                    raise HTTPException(status_code=429, detail={
                        'error': message, 'message': message,
                        'provider': provider_name, 'usage_info': usage_info or {}
                    })
                logger.info(f"[ResearchEngine] {label} preflight check passed for user {user_id}")
        except HTTPException:
            raise
        except Exception as e:
            logger.warning(f"[ResearchEngine] {label} preflight check failed: {e}")
        finally:
            self._release_db_session(db, owned)

    # ------------------------------------------------------------------
    # Provider execution
    # ------------------------------------------------------------------

    async def _execute_exa_research(
        self,
        request: "BlogResearchRequest",
        config: "ResearchConfig",
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> "BlogResearchResponse":
        """Execute research using Exa provider.

        Falls back to Tavily when the provider raises a RuntimeError saying
        that EXA_API_KEY is not configured; other RuntimeErrors propagate.
        """
        from services.blog_writer.research.exa_provider import ExaResearchProvider

        self._progress(progress_callback, "🔍 Executing Exa neural search...")

        research_prompt, topic, industry, target_audience = self._prepare_search_inputs(request, config)

        # Preflight subscription check
        self._preflight_usage_check(user_id, "exa")

        # Execute Exa search
        try:
            exa_provider = ExaResearchProvider()
            raw_result = await exa_provider.search(
                research_prompt, topic, industry, target_audience, config, user_id
            )

            # Track usage
            cost = self._extract_cost(raw_result, 0.005)
            exa_provider.track_exa_usage(user_id, cost)

            self._progress(progress_callback, f"📝 Found {len(raw_result.get('sources', []) or [])} sources")

            # Run common analysis
            return await self._run_analysis(request, raw_result, config, user_id, progress_callback)
        except RuntimeError as e:
            if "EXA_API_KEY not configured" in str(e):
                logger.warning("Exa not configured, falling back to Tavily")
                self._progress(progress_callback, "⚠️ Exa unavailable, trying Tavily...")
                config.provider = ResearchProvider.TAVILY
                return await self._execute_tavily_research(request, config, user_id, progress_callback)
            raise

    async def _execute_tavily_research(
        self,
        request: "BlogResearchRequest",
        config: "ResearchConfig",
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> "BlogResearchResponse":
        """Execute research using Tavily provider.

        Falls back to Google when the provider raises a RuntimeError saying
        that TAVILY_API_KEY is not configured; other RuntimeErrors propagate.
        """
        from services.blog_writer.research.tavily_provider import TavilyResearchProvider

        self._progress(progress_callback, "🔍 Executing Tavily AI search...")

        research_prompt, topic, industry, target_audience = self._prepare_search_inputs(request, config)

        # Preflight subscription check
        self._preflight_usage_check(user_id, "tavily")

        # Execute Tavily search
        try:
            tavily_provider = TavilyResearchProvider()
            raw_result = await tavily_provider.search(
                research_prompt, topic, industry, target_audience, config, user_id
            )

            # Track usage
            cost = self._extract_cost(raw_result, 0.001)
            search_depth = config.tavily_search_depth or "basic"
            tavily_provider.track_tavily_usage(user_id, cost, search_depth)

            self._progress(progress_callback, f"📝 Found {len(raw_result.get('sources', []) or [])} sources")

            # Run common analysis
            return await self._run_analysis(request, raw_result, config, user_id, progress_callback)
        except RuntimeError as e:
            if "TAVILY_API_KEY not configured" in str(e):
                logger.warning("Tavily not configured, falling back to Google")
                self._progress(progress_callback, "⚠️ Tavily unavailable, using Google Search...")
                config.provider = ResearchProvider.GOOGLE
                return await self._execute_google_research(request, config, user_id, progress_callback)
            raise

    async def _execute_google_research(
        self,
        request: "BlogResearchRequest",
        config: "ResearchConfig",
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None
    ) -> "BlogResearchResponse":
        """Execute research using Google/Gemini grounding."""
        from services.blog_writer.research.google_provider import GoogleResearchProvider

        self._progress(progress_callback, "🔍 Executing Google Search grounding...")

        research_prompt, topic, industry, target_audience = self._prepare_search_inputs(request, config)

        # Execute Google search
        google_provider = GoogleResearchProvider()
        raw_result = await google_provider.search(
            research_prompt, topic, industry, target_audience, config, user_id
        )

        self._progress(progress_callback, "📝 Processing grounded results...")

        # Run common analysis
        return await self._run_analysis(request, raw_result, config, user_id, progress_callback, is_google=True)

    async def _run_analysis(
        self,
        request: "BlogResearchRequest",
        raw_result: Dict[str, Any],
        config: "ResearchConfig",
        user_id: str,
        progress_callback: Optional[Callable[[str], None]] = None,
        is_google: bool = False
    ) -> "BlogResearchResponse":
        """Run common analysis on raw results.

        Runs the keyword, competitor and content-angle analyzers over the raw
        content, assembles a BlogResearchResponse and passes it through
        ResearchDataFilter before returning it.
        """
        from services.blog_writer.research.keyword_analyzer import KeywordAnalyzer
        from services.blog_writer.research.competitor_analyzer import CompetitorAnalyzer
        from services.blog_writer.research.content_angle_generator import ContentAngleGenerator
        from services.blog_writer.research.data_filter import ResearchDataFilter

        self._progress(progress_callback, "🔍 Analyzing keywords and content angles...")

        # Extract content for analysis
        content = raw_result.get("content", "")
        search_queries = raw_result.get("search_queries", []) or []
        if is_google:
            sources = self._extract_sources_from_grounding(raw_result)
            grounding_metadata = self._extract_grounding_metadata(raw_result)
        else:
            # ``or []`` guards against a provider returning an explicit None
            sources = [
                ResearchSource(**s) if isinstance(s, dict) else s
                for s in (raw_result.get('sources', []) or [])
            ]
            grounding_metadata = None

        topic = request.topic or ", ".join(request.keywords)
        industry = request.industry or "General"

        # Run analyzers
        keyword_analyzer = KeywordAnalyzer()
        competitor_analyzer = CompetitorAnalyzer()
        content_angle_generator = ContentAngleGenerator()
        data_filter = ResearchDataFilter()

        keyword_analysis = keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
        competitor_analysis = competitor_analyzer.analyze(content, user_id=user_id)
        suggested_angles = content_angle_generator.generate(content, topic, industry, user_id=user_id)

        # Build response
        response = BlogResearchResponse(
            success=True,
            sources=sources,
            keyword_analysis=keyword_analysis,
            competitor_analysis=competitor_analysis,
            suggested_angles=suggested_angles,
            search_widget="",
            search_queries=search_queries,
            grounding_metadata=grounding_metadata,
            original_keywords=request.keywords,
        )

        # Filter and clean research data
        self._progress(progress_callback, "✨ Filtering and optimizing results...")
        return data_filter.filter_research_data(response)

    # ------------------------------------------------------------------
    # Result extraction / transformation
    # ------------------------------------------------------------------

    def _extract_sources_from_grounding(self, gemini_result: Dict[str, Any]) -> list:
        """Extract sources from Gemini grounding metadata.

        Returns an empty list for a missing or non-dict result. Entries of
        ``gemini_result["sources"]`` that are not dicts are skipped.
        """
        if not gemini_result or not isinstance(gemini_result, dict):
            return []

        sources = []
        for src in gemini_result.get("sources", []) or []:
            if not isinstance(src, dict):
                # Malformed entry: nothing to read fields from
                continue

            body = src.get("content")
            excerpt = body[:500] if body else f"Source from {src.get('title', 'web')}"

            # A present-but-None or unparsable score falls back to the default
            try:
                raw_score = src.get("credibility_score")
                credibility = 0.8 if raw_score is None else float(raw_score)
            except (TypeError, ValueError):
                credibility = 0.8

            sources.append(ResearchSource(
                title=src.get("title", "Untitled"),
                url=src.get("url", ""),
                excerpt=excerpt,
                credibility_score=credibility,
                # ``or`` so an explicit None does not become the string "None"
                published_at=str(src.get("publication_date") or "2024-01-01"),
                index=src.get("index"),
                source_type=src.get("type", "web")
            ))
        return sources

    def _extract_grounding_metadata(self, gemini_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract grounding metadata from Gemini result (None if absent or result is not a dict)."""
        if not gemini_result or not isinstance(gemini_result, dict):
            return None
        return gemini_result.get("grounding_metadata")

    def _transform_response(
        self,
        response: "BlogResearchResponse",
        provider: "ResearchProvider",
        context: "ResearchContext"
    ) -> "ResearchResult":
        """Transform BlogResearchResponse to ResearchResult."""
        # Convert sources to dicts
        sources = []
        for s in response.sources:
            if hasattr(s, 'dict'):
                sources.append(s.dict())
            elif isinstance(s, dict):
                sources.append(s)
            else:
                sources.append({
                    'title': getattr(s, 'title', ''),
                    'url': getattr(s, 'url', ''),
                    'excerpt': getattr(s, 'excerpt', ''),
                })

        # Extract grounding metadata
        grounding = None
        if response.grounding_metadata:
            if hasattr(response.grounding_metadata, 'dict'):
                grounding = response.grounding_metadata.dict()
            else:
                grounding = response.grounding_metadata

        return ResearchResult(
            success=response.success,
            sources=sources,
            keyword_analysis=response.keyword_analysis,
            competitor_analysis=response.competitor_analysis,
            suggested_angles=response.suggested_angles,
            provider_used=provider.value,
            search_queries=response.search_queries,
            grounding_metadata=grounding,
            original_query=context.query,
            error_message=response.error_message,
            error_code=getattr(response, 'error_code', None),
            retry_suggested=getattr(response, 'retry_suggested', False),
        )

    def get_provider_status(self) -> Dict[str, Any]:
        """Get status of available providers.

        Exa/Tavily availability reflects whether their API key environment
        variables were set when the engine was constructed.
        """
        return {
            "exa": {
                "available": self.exa_available,
                "priority": 1,
                "description": "Neural search for semantic understanding"
            },
            "tavily": {
                "available": self.tavily_available,
                "priority": 2,
                "description": "AI-powered web search"
            },
            "google": {
                "available": True,  # Always available via Gemini
                "priority": 3,
                "description": "Google Search grounding"
            }
        }