Phase 1: Dead Code Cleanup
- Remove GeminiGroundedProvider import and property from linkedin_service.py
- Remove fallback_provider property (gemini_provider imports)
- Fix routers/linkedin.py edit endpoint to use llm_text_gen
- Delete dead LinkedInImageEditor class
- Remove dead _transform_gemini_sources from content_generator.py

Phase 2: Research Infrastructure Alignment
- Add user_id to _conduct_research() for pre-flight validation
- Add validate_exa_research_operations() before Exa/Tavily calls
- Pass user_id to provider.simple_search() for usage tracking
- Inject research content into LLM prompts via _build_research_context()
- Fix Google engine path to fall back to Exa
- Add Exa → Tavily fallback on research failure

Phase 3: Cosmetic Cleanup
- Rename _generate_prompts_with_gemini → _generate_prompts_with_llm
- Rename _build_gemini_prompt → _build_image_prompt
- Rename _parse_gemini_response → _parse_llm_response
- Remove all Gemini references from LinkedIn code (0 remaining)
- Update docstrings and log messages

Additional:
- Research caching using existing ResearchCache
- Shared ExaContentResearchProvider in services/research/
- Persona service uses llm_text_gen instead of gemini_structured_json_response
- LinkedInWriter.tsx ChatMessage → ChatMsg type mapping fix
- RegisterLinkedInActionsEnhanced.tsx content_format_rules typing fix
199 lines
7.6 KiB
Python
199 lines
7.6 KiB
Python
"""
Exa Content Research Provider

Shared Exa neural search provider for content research across ALwrity modules.
Provides simple_search() for fact-checking, content grounding, and research.

Used by:
- LinkedIn Writer (content generation research)
- Blog Writer (fact-checking and writing assistance)

This is the content-research variant. For competitor discovery/analysis,
use ExaService in exa_service.py.
"""
|
|
|
|
import os
|
|
import asyncio
|
|
from typing import List, Dict, Any, Optional
|
|
from loguru import logger
|
|
|
|
|
|
class ExaContentResearchProvider:
    """Exa neural search provider for content research.

    Wraps an ``exa_py.Exa`` client and exposes :meth:`simple_search`. When a
    ``user_id`` is supplied, the search is preceded by a subscription
    preflight check and followed by usage tracking.
    """

    # Flat cost (USD) recorded per successful search: ~0.5 cents.
    SEARCH_COST_USD = 0.005

    def __init__(self):
        """Initialize the Exa content research provider.

        Raises:
            RuntimeError: If the ``EXA_API_KEY`` environment variable is
                unset or empty.
        """
        self.api_key = os.getenv("EXA_API_KEY")
        if not self.api_key:
            raise RuntimeError("EXA_API_KEY not configured")

        # Local import: exa_py is only needed once a provider is constructed.
        from exa_py import Exa

        self.exa = Exa(self.api_key)
        logger.info("✅ Exa Content Research Provider initialized")

    async def simple_search(
        self,
        query: str,
        num_results: int = 5,
        user_id: Optional[str] = None,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        """Run a simple Exa search for content research and fact-checking.

        Handles the subscription preflight check and usage tracking when a
        ``user_id`` is given. If the first API call fails for any reason, the
        search is retried once with simplified parameters (plain ``text=True``
        and no highlights).

        Args:
            query: Search query string.
            num_results: Number of results to request (default 5).
            user_id: Optional user ID; enables the preflight check and usage
                tracking. When falsy, both are skipped.
            include_domains: Only return results from these domains.
            exclude_domains: Exclude results from these domains.

        Returns:
            List of source dicts with ``title``, ``url``, ``text``,
            ``publishedDate``, ``author`` and ``score`` keys. Missing or
            ``None`` fields are replaced by defaults (``'Untitled'``, ``''``
            or ``0.5`` for the score).

        Raises:
            HTTPException(429): If the preflight check reports that the user
                has exceeded subscription limits.
            RuntimeError: If both the initial search and the retry fail.
        """
        # NOTE(review): the preflight check and track_usage() are invoked
        # synchronously from this coroutine; only the Exa call is offloaded
        # to an executor.
        if user_id:
            self._preflight_check(user_id)

        domain_filters: Dict[str, Any] = {}
        if include_domains:
            domain_filters["include_domains"] = include_domains
        if exclude_domains:
            domain_filters["exclude_domains"] = exclude_domains

        primary_kwargs = {
            "type": "auto",
            "num_results": num_results,
            "text": {"max_characters": 1000},
            "highlights": {"num_sentences": 2, "highlights_per_url": 2},
            **domain_filters,
        }
        # Simplified parameter set used when the first attempt fails.
        retry_kwargs = {
            "type": "auto",
            "num_results": num_results,
            "text": True,
            **domain_filters,
        }

        loop = asyncio.get_running_loop()
        try:
            # The exa client call is synchronous, so run it in the default executor.
            response = await loop.run_in_executor(
                None,
                lambda: self.exa.search_and_contents(query, **primary_kwargs),
            )
        except Exception as e:
            logger.error(f"[Exa simple_search] API call failed: {e}")
            try:
                logger.info("[Exa simple_search] Retrying with simplified parameters")
                response = await loop.run_in_executor(
                    None,
                    lambda: self.exa.search_and_contents(query, **retry_kwargs),
                )
            except Exception as retry_error:
                logger.error(f"[Exa simple_search] Retry also failed: {retry_error}")
                raise RuntimeError(f"Exa search failed: {str(retry_error)}") from retry_error

        # Tolerate a response whose ``results`` attribute is missing or None.
        raw_results = getattr(response, "results", None) or []
        sources = [self._result_to_source(item) for item in raw_results]

        # Usage tracking is best-effort: a failure must not fail the search.
        if user_id:
            try:
                self.track_usage(user_id, self.SEARCH_COST_USD)
            except Exception as e:
                logger.warning(f"[Exa simple_search] Failed to track usage: {e}")

        logger.info(f"[Exa simple_search] Found {len(sources)} sources for query: {query[:80]}...")
        return sources

    def _preflight_check(self, user_id: str) -> None:
        """Check the user's Exa usage limits before a search.

        Errors other than the limit rejection are logged and ignored, so a
        broken pricing lookup does not block the search. If no DB session is
        available the check is skipped.

        Raises:
            HTTPException(429): If ``check_usage_limits`` reports the user
                cannot proceed.
        """
        from models.subscription_models import APIProvider
        from services.subscription import PricingService
        from services.database import get_session_for_user
        from fastapi import HTTPException

        db = get_session_for_user(user_id)
        if not db:
            return

        try:
            pricing_service = PricingService(db)
            can_proceed, message, usage_info = pricing_service.check_usage_limits(
                user_id=user_id,
                provider=APIProvider.EXA,
                tokens_requested=0,
                actual_provider_name="exa",
            )
            if not can_proceed:
                raise HTTPException(status_code=429, detail={
                    'error': 'insufficient_balance',
                    'message': message,
                    'provider': 'exa',
                    'usage_info': usage_info or {}
                })
        except HTTPException:
            # The limit rejection must reach the caller untouched.
            raise
        except Exception as e:
            logger.warning(f"[Exa simple_search] Preflight check failed: {e}")
        finally:
            try:
                db.close()
            except Exception as close_error:
                # Closing is best-effort; never mask the outcome of the check.
                logger.debug(f"[Exa simple_search] Failed to close DB session: {close_error}")

    @staticmethod
    def _result_to_source(result: Any) -> Dict[str, Any]:
        """Convert one Exa result object into a plain source dict.

        Attributes that are missing or ``None`` fall back to a default. The
        published date is read from ``published_date`` first and from
        ``publishedDate`` as a fallback.
        """
        def pick(name: str, default: Any) -> Any:
            value = getattr(result, name, None)
            return default if value is None else value

        published = pick('published_date', None)
        if published is None:
            published = pick('publishedDate', '')

        return {
            'title': pick('title', 'Untitled'),
            'url': pick('url', ''),
            'text': pick('text', ''),
            'publishedDate': published,
            'author': pick('author', ''),
            'score': pick('score', 0.5),
        }

    def track_usage(self, user_id: str, cost: float) -> None:
        """Track Exa API usage after a successful call.

        Increments the Exa and total call counters and costs on the user's
        ``usage_summaries`` row for the current billing period. Database
        errors are logged and rolled back rather than raised.

        Args:
            user_id: User whose usage summary is updated.
            cost: Cost to add to ``exa_cost`` and ``total_cost``.
        """
        from services.database import get_session_for_user
        from services.subscription import PricingService
        from sqlalchemy import text

        db = get_session_for_user(user_id)
        if not db:
            logger.warning(f"[track_usage] Could not get DB session for user {user_id}")
            return

        try:
            pricing_service = PricingService(db)
            current_period = pricing_service.get_current_billing_period(user_id)

            # Update exa_calls and exa_cost via SQL UPDATE
            update_query = text("""
                UPDATE usage_summaries
                SET exa_calls = COALESCE(exa_calls, 0) + 1,
                    exa_cost = COALESCE(exa_cost, 0) + :cost,
                    total_calls = total_calls + 1,
                    total_cost = total_cost + :cost
                WHERE user_id = :user_id AND billing_period = :period
            """)
            outcome = db.execute(update_query, {
                'cost': cost,
                'user_id': user_id,
                'period': current_period
            })
            db.commit()

            # An UPDATE that matched no row records nothing; do not report it
            # as tracked.
            if getattr(outcome, "rowcount", None) == 0:
                logger.warning(
                    f"[Exa] No usage summary row updated: user={user_id}, period={current_period}"
                )
            else:
                logger.info(f"[Exa] Tracked usage: user={user_id}, cost=${cost}")
        except Exception as e:
            logger.error(f"[Exa] Failed to track usage: {e}")
            db.rollback()
        finally:
            db.close()
|
|
|
|
|
|
# Module-level cache for the single shared provider instance (created lazily).
_exa_content_provider: Optional[ExaContentResearchProvider] = None


def get_exa_content_provider() -> ExaContentResearchProvider:
    """Return the shared Exa content research provider, creating it on first use.

    The instance is stored in the module-level ``_exa_content_provider`` and
    reused by later calls. If construction raises, nothing is cached, so the
    next call attempts construction again.
    """
    global _exa_content_provider
    if _exa_content_provider is not None:
        return _exa_content_provider

    _exa_content_provider = ExaContentResearchProvider()
    return _exa_content_provider
|