Files
ALwrity/backend/services/blog_writer/research/exa_provider.py

349 lines
16 KiB
Python

"""
Exa Research Provider
Neural search implementation using Exa API for high-quality, citation-rich research.
"""
from exa_py import Exa
import os
from loguru import logger
from models.subscription_models import APIProvider
from .base_provider import ResearchProvider as BaseProvider
class ExaResearchProvider(BaseProvider):
"""Exa neural search provider."""
def __init__(self):
self.api_key = os.getenv("EXA_API_KEY")
if not self.api_key:
raise RuntimeError("EXA_API_KEY not configured")
self.exa = Exa(self.api_key)
logger.info("✅ Exa Research Provider initialized")
async def search(self, prompt, topic, industry, target_audience, config, user_id):
"""Execute Exa neural search and return standardized results."""
# Build Exa query
query = f"{topic} {industry} {target_audience}"
# Determine category: use exa_category if set, otherwise map from source_types
category = config.exa_category if config.exa_category else self._map_source_type_to_category(config.source_types)
# Use exa_num_results if available, otherwise fallback to max_sources
num_results = config.exa_num_results if hasattr(config, 'exa_num_results') and config.exa_num_results else min(config.max_sources, 25)
# Cap at 100 as per Exa API limits
num_results = min(num_results, 100)
# Build search kwargs - use correct Exa API format
search_kwargs = {
'type': config.exa_search_type or "auto",
'num_results': num_results,
'text': {'max_characters': 1000},
'summary': {'query': f"Key insights about {topic}"},
'highlights': {
'num_sentences': 2,
'highlights_per_url': 3
}
}
# Add optional filters
if category:
search_kwargs['category'] = category
if config.exa_include_domains:
search_kwargs['include_domains'] = config.exa_include_domains
if config.exa_exclude_domains:
search_kwargs['exclude_domains'] = config.exa_exclude_domains
# Add date filters if configured
if hasattr(config, 'exa_date_filter') and config.exa_date_filter:
search_kwargs['start_published_date'] = config.exa_date_filter
if hasattr(config, 'exa_end_published_date') and config.exa_end_published_date:
search_kwargs['end_published_date'] = config.exa_end_published_date
if hasattr(config, 'exa_start_crawl_date') and config.exa_start_crawl_date:
search_kwargs['start_crawl_date'] = config.exa_start_crawl_date
if hasattr(config, 'exa_end_crawl_date') and config.exa_end_crawl_date:
search_kwargs['end_crawl_date'] = config.exa_end_crawl_date
# Add context if configured (supports boolean or object with maxCharacters)
if hasattr(config, 'exa_context') and config.exa_context is not None:
if config.exa_context:
if hasattr(config, 'exa_context_max_characters') and config.exa_context_max_characters:
search_kwargs['context'] = {'maxCharacters': config.exa_context_max_characters}
else:
search_kwargs['context'] = True
# If False, don't add context parameter (default behavior)
# Add text filters if configured
if hasattr(config, 'exa_include_text') and config.exa_include_text:
search_kwargs['include_text'] = config.exa_include_text
if hasattr(config, 'exa_exclude_text') and config.exa_exclude_text:
search_kwargs['exclude_text'] = config.exa_exclude_text
logger.info(f"[Exa Research] Executing search: {query}")
# Execute Exa search - pass contents parameters directly, not nested
try:
# Build optional parameters dict
optional_params = {}
if category:
optional_params['category'] = category
if config.exa_include_domains:
optional_params['include_domains'] = config.exa_include_domains
if config.exa_exclude_domains:
optional_params['exclude_domains'] = config.exa_exclude_domains
if hasattr(config, 'exa_date_filter') and config.exa_date_filter:
optional_params['start_published_date'] = config.exa_date_filter
if hasattr(config, 'exa_end_published_date') and config.exa_end_published_date:
optional_params['end_published_date'] = config.exa_end_published_date
if hasattr(config, 'exa_start_crawl_date') and config.exa_start_crawl_date:
optional_params['start_crawl_date'] = config.exa_start_crawl_date
if hasattr(config, 'exa_end_crawl_date') and config.exa_end_crawl_date:
optional_params['end_crawl_date'] = config.exa_end_crawl_date
# Add context if configured (supports boolean or object with maxCharacters)
if hasattr(config, 'exa_context') and config.exa_context:
if hasattr(config, 'exa_context_max_characters') and config.exa_context_max_characters:
optional_params['context'] = {'maxCharacters': config.exa_context_max_characters}
else:
optional_params['context'] = True
# Add text filters if configured
if hasattr(config, 'exa_include_text') and config.exa_include_text:
optional_params['include_text'] = config.exa_include_text
if hasattr(config, 'exa_exclude_text') and config.exa_exclude_text:
optional_params['exclude_text'] = config.exa_exclude_text
# Add additional_queries for Deep search (only works with type="deep")
if config.exa_search_type == 'deep' and hasattr(config, 'exa_additional_queries') and config.exa_additional_queries:
optional_params['additional_queries'] = config.exa_additional_queries
# Build contents parameters (text, summary, highlights)
text_params = {}
if hasattr(config, 'exa_text_max_characters') and config.exa_text_max_characters:
text_params['max_characters'] = config.exa_text_max_characters
else:
text_params['max_characters'] = 1000 # Default
summary_params = {}
if hasattr(config, 'exa_summary_query') and config.exa_summary_query:
summary_params['query'] = config.exa_summary_query
else:
summary_params['query'] = f"Key insights about {topic}" # Default
highlights_params = {}
if hasattr(config, 'exa_highlights') and config.exa_highlights:
if hasattr(config, 'exa_highlights_num_sentences') and config.exa_highlights_num_sentences:
highlights_params['num_sentences'] = config.exa_highlights_num_sentences
else:
highlights_params['num_sentences'] = 2 # Default
if hasattr(config, 'exa_highlights_per_url') and config.exa_highlights_per_url:
highlights_params['highlights_per_url'] = config.exa_highlights_per_url
else:
highlights_params['highlights_per_url'] = 3 # Default
results = self.exa.search_and_contents(
query,
text=text_params,
summary=summary_params,
highlights=highlights_params if highlights_params else None,
type=config.exa_search_type or "auto",
num_results=num_results,
**optional_params
)
except Exception as e:
logger.error(f"[Exa Research] API call failed: {e}")
# Try simpler call without contents if the above fails
try:
logger.info("[Exa Research] Retrying with simplified parameters")
# Build minimal optional parameters for retry
optional_params = {}
if category:
optional_params['category'] = category
if config.exa_include_domains:
optional_params['include_domains'] = config.exa_include_domains
if config.exa_exclude_domains:
optional_params['exclude_domains'] = config.exa_exclude_domains
if hasattr(config, 'exa_date_filter') and config.exa_date_filter:
optional_params['start_published_date'] = config.exa_date_filter
if hasattr(config, 'exa_end_published_date') and config.exa_end_published_date:
optional_params['end_published_date'] = config.exa_end_published_date
if hasattr(config, 'exa_start_crawl_date') and config.exa_start_crawl_date:
optional_params['start_crawl_date'] = config.exa_start_crawl_date
if hasattr(config, 'exa_end_crawl_date') and config.exa_end_crawl_date:
optional_params['end_crawl_date'] = config.exa_end_crawl_date
# Add additional_queries for Deep search (only works with type="deep")
if config.exa_search_type == 'deep' and hasattr(config, 'exa_additional_queries') and config.exa_additional_queries:
optional_params['additional_queries'] = config.exa_additional_queries
results = self.exa.search_and_contents(
query,
type=config.exa_search_type or "auto",
num_results=num_results,
**optional_params
)
except Exception as retry_error:
logger.error(f"[Exa Research] Retry also failed: {retry_error}")
raise RuntimeError(f"Exa search failed: {str(retry_error)}") from retry_error
# Transform to standardized format
sources = self._transform_sources(results.results)
content = self._aggregate_content(results.results)
search_type = getattr(results, 'resolvedSearchType', 'neural') if hasattr(results, 'resolvedSearchType') else 'neural'
# Get cost if available
cost = 0.005 # Default Exa cost for 1-25 results
if hasattr(results, 'costDollars'):
if hasattr(results.costDollars, 'total'):
cost = results.costDollars.total
logger.info(f"[Exa Research] Search completed: {len(sources)} sources, type: {search_type}")
return {
'sources': sources,
'content': content,
'search_type': search_type,
'provider': 'exa',
'search_queries': [query],
'cost': {'total': cost}
}
def get_provider_enum(self):
"""Return EXA provider enum for subscription tracking."""
return APIProvider.EXA
def estimate_tokens(self) -> int:
"""Estimate token usage for Exa (not token-based)."""
return 0 # Exa is per-search, not token-based
def _map_source_type_to_category(self, source_types):
"""Map SourceType enum to Exa category parameter."""
if not source_types:
return None
category_map = {
'research paper': 'research paper',
'news': 'news',
'web': 'personal site',
'industry': 'company',
'expert': 'linkedin profile'
}
for st in source_types:
if st.value in category_map:
return category_map[st.value]
return None
def _transform_sources(self, results):
"""Transform Exa results to ResearchSource format."""
sources = []
for idx, result in enumerate(results):
source_type = self._determine_source_type(result.url if hasattr(result, 'url') else '')
# Extract image if available (some Exa results include image URL)
image_url = result.image if hasattr(result, 'image') else None
sources.append({
'title': result.title if hasattr(result, 'title') else '',
'url': result.url if hasattr(result, 'url') else '',
'excerpt': self._get_excerpt(result),
'credibility_score': 0.85, # Exa results are high quality
'published_at': result.publishedDate if hasattr(result, 'publishedDate') else None,
'index': idx,
'source_type': source_type,
'content': result.text if hasattr(result, 'text') else '',
'highlights': result.highlights if hasattr(result, 'highlights') else [],
'summary': result.summary if hasattr(result, 'summary') else '',
'image': image_url,
'author': result.author if hasattr(result, 'author') else None
})
return sources
def _get_excerpt(self, result):
"""Extract excerpt from Exa result. Prefer highlights if available."""
if hasattr(result, 'highlights') and result.highlights and len(result.highlights) > 0:
return result.highlights[0]
if hasattr(result, 'summary') and result.summary:
return result.summary
if hasattr(result, 'text') and result.text:
return result.text[:500]
return ''
def _determine_source_type(self, url):
"""Determine source type from URL."""
if not url:
return 'web'
url_lower = url.lower()
if 'arxiv.org' in url_lower or 'research' in url_lower:
return 'academic'
elif any(news in url_lower for news in ['cnn.com', 'bbc.com', 'reuters.com', 'theguardian.com']):
return 'news'
elif 'linkedin.com' in url_lower:
return 'expert'
else:
return 'web'
def _aggregate_content(self, results):
"""Aggregate content from Exa results for LLM analysis, including highlights."""
content_parts = []
for idx, result in enumerate(results):
part = [f"Source {idx + 1}: {result.title if hasattr(result, 'title') else 'Untitled'}"]
if hasattr(result, 'url') and result.url:
part.append(f"URL: {result.url}")
# Add highlights if available (most valuable for LLM)
if hasattr(result, 'highlights') and result.highlights:
highlights_text = "\n".join([f"- {h}" for h in result.highlights])
part.append(f"Key Highlights:\n{highlights_text}")
# Add summary if available
if hasattr(result, 'summary') and result.summary:
part.append(f"Summary: {result.summary}")
# Add text snippet if highlights/summary insufficient
elif hasattr(result, 'text') and result.text:
part.append(f"Excerpt: {result.text[:1000]}")
content_parts.append("\n".join(part))
return "\n\n---\n\n".join(content_parts)
def track_exa_usage(self, user_id: str, cost: float):
"""Track Exa API usage after successful call."""
from services.database import get_db
from services.subscription import PricingService
from sqlalchemy import text
db = next(get_db())
try:
pricing_service = PricingService(db)
current_period = pricing_service.get_current_billing_period(user_id)
# Update exa_calls and exa_cost via SQL UPDATE
update_query = text("""
UPDATE usage_summaries
SET exa_calls = COALESCE(exa_calls, 0) + 1,
exa_cost = COALESCE(exa_cost, 0) + :cost,
total_calls = total_calls + 1,
total_cost = total_cost + :cost
WHERE user_id = :user_id AND billing_period = :period
""")
db.execute(update_query, {
'cost': cost,
'user_id': user_id,
'period': current_period
})
db.commit()
logger.info(f"[Exa] Tracked usage: user={user_id}, cost=${cost}")
except Exception as e:
logger.error(f"[Exa] Failed to track usage: {e}")
db.rollback()
finally:
db.close()