560 lines
24 KiB
Python
560 lines
24 KiB
Python
"""
|
|
Unified Research Analyzer
|
|
|
|
Combines intent inference, query generation, and parameter optimization
|
|
into a single AI call with justifications for each decision.
|
|
|
|
This reduces 2 LLM calls to 1, improves coherence, and provides
|
|
user-friendly justifications for all settings.
|
|
|
|
Author: ALwrity Team
|
|
Version: 1.0
|
|
"""
|
|
|
|
import json
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from loguru import logger
|
|
|
|
from models.research_intent_models import (
|
|
ResearchIntent,
|
|
ResearchQuery,
|
|
IntentInferenceResponse,
|
|
ResearchPurpose,
|
|
ContentOutput,
|
|
ExpectedDeliverable,
|
|
ResearchDepthLevel,
|
|
InputType,
|
|
)
|
|
from models.research_persona_models import ResearchPersona
|
|
|
|
|
|
class UnifiedResearchAnalyzer:
|
|
"""
|
|
Unified AI-driven analyzer that performs:
|
|
1. Intent inference (what user wants)
|
|
2. Query generation (how to search)
|
|
3. Parameter optimization (Exa/Tavily settings)
|
|
|
|
All in a single LLM call with justifications.
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the unified analyzer."""
|
|
logger.info("UnifiedResearchAnalyzer initialized")
|
|
|
|
async def analyze(
|
|
self,
|
|
user_input: str,
|
|
keywords: Optional[List[str]] = None,
|
|
research_persona: Optional[ResearchPersona] = None,
|
|
competitor_data: Optional[List[Dict]] = None,
|
|
industry: Optional[str] = None,
|
|
target_audience: Optional[str] = None,
|
|
user_id: Optional[str] = None,
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Perform unified analysis of user research request.
|
|
|
|
Returns:
|
|
Dict containing:
|
|
- intent: ResearchIntent
|
|
- queries: List[ResearchQuery]
|
|
- exa_config: Dict with settings and justifications
|
|
- tavily_config: Dict with settings and justifications
|
|
- recommended_provider: str
|
|
- provider_justification: str
|
|
"""
|
|
try:
|
|
logger.info(f"Unified analysis for: {user_input[:100]}...")
|
|
|
|
keywords = keywords or []
|
|
|
|
# Build the unified prompt
|
|
prompt = self._build_unified_prompt(
|
|
user_input=user_input,
|
|
keywords=keywords,
|
|
research_persona=research_persona,
|
|
competitor_data=competitor_data,
|
|
industry=industry,
|
|
target_audience=target_audience,
|
|
)
|
|
|
|
# Define the comprehensive JSON schema
|
|
unified_schema = self._build_unified_schema()
|
|
|
|
# Call LLM (single call for everything)
|
|
from services.llm_providers.main_text_generation import llm_text_gen
|
|
|
|
result = llm_text_gen(
|
|
prompt=prompt,
|
|
json_struct=unified_schema,
|
|
user_id=user_id
|
|
)
|
|
|
|
if isinstance(result, dict) and "error" in result:
|
|
logger.error(f"Unified analysis failed: {result.get('error')}")
|
|
return self._create_fallback_response(user_input, keywords)
|
|
|
|
# Parse the unified result
|
|
return self._parse_unified_result(result, user_input)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in unified analysis: {e}")
|
|
return self._create_fallback_response(user_input, keywords or [])
|
|
|
|
def _build_unified_prompt(
|
|
self,
|
|
user_input: str,
|
|
keywords: List[str],
|
|
research_persona: Optional[ResearchPersona] = None,
|
|
competitor_data: Optional[List[Dict]] = None,
|
|
industry: Optional[str] = None,
|
|
target_audience: Optional[str] = None,
|
|
) -> str:
|
|
"""Build the unified prompt for intent + queries + parameters."""
|
|
|
|
# Build persona context
|
|
persona_context = self._build_persona_context(research_persona, industry, target_audience)
|
|
|
|
# Build competitor context
|
|
competitor_context = self._build_competitor_context(competitor_data)
|
|
|
|
prompt = f'''You are an expert AI research strategist. Analyze the user's research request and provide a complete research plan including intent understanding, search queries, and optimal API settings.
|
|
|
|
## USER INPUT
|
|
"{user_input}"
|
|
{f"KEYWORDS: {', '.join(keywords)}" if keywords else ""}
|
|
|
|
## USER CONTEXT
|
|
{persona_context}
|
|
{competitor_context}
|
|
|
|
## YOUR TASK: Provide a Complete Research Plan
|
|
|
|
### PART 1: INTENT ANALYSIS
|
|
Understand what the user really wants from their research.
|
|
|
|
### PART 2: SEARCH QUERIES
|
|
Generate 4-8 targeted search queries optimized for semantic search.
|
|
|
|
### PART 3: PROVIDER SETTINGS
|
|
Configure Exa and Tavily API parameters with justifications.
|
|
|
|
### PART 4: GOOGLE TRENDS KEYWORDS (if trends in deliverables)
|
|
If "trends" is in expected_deliverables OR purpose is "explore_trends":
|
|
- Suggest 1-3 optimized keywords for Google Trends analysis
|
|
- These may differ from research queries (trends need broader, searchable terms)
|
|
- Consider: What keywords will show meaningful trends over time?
|
|
- Consider: What timeframe will show relevant trends? (1 year, 12 months, etc.)
|
|
- Consider: What geographic region is most relevant for the user?
|
|
- Explain what insights trends will uncover for content generation:
|
|
* Search interest trends over time (optimal publication timing)
|
|
* Regional interest distribution (audience targeting)
|
|
* Related topics for content expansion
|
|
* Related queries for FAQ sections
|
|
* Rising topics for timely content opportunities
|
|
|
|
---
|
|
|
|
## AVAILABLE PROVIDER OPTIONS
|
|
|
|
### EXA API OPTIONS (Semantic Search Engine)
|
|
| Parameter | Options | Description |
|
|
|-----------|---------|-------------|
|
|
| type | "auto", "neural", "fast", "deep" | "neural" = semantic understanding, "deep" = comprehensive with query expansion |
|
|
| category | "company", "research paper", "news", "github", "tweet", "personal site", "pdf", "financial report", "people" | Focus on specific content types |
|
|
| numResults | 5-25 | Number of results (10 recommended) |
|
|
| includeDomains | string[] | Domains to include (e.g., ["arxiv.org", "nature.com"]) |
|
|
| excludeDomains | string[] | Domains to exclude |
|
|
| startPublishedDate | ISO date | Filter by publish date (e.g., "2024-01-01T00:00:00.000Z") |
|
|
| text | boolean | Include full text content |
|
|
| highlights | boolean | Extract key highlights |
|
|
| context | boolean | Return as single context string for RAG |
|
|
|
|
**WHEN TO USE EXA:**
|
|
- Semantic understanding needed (finding similar content)
|
|
- Academic/research papers
|
|
- Company/competitor research
|
|
- Deep, comprehensive results
|
|
- Historical content
|
|
|
|
### TAVILY API OPTIONS (AI-Powered Search)
|
|
| Parameter | Options | Description |
|
|
|-----------|---------|-------------|
|
|
| topic | "general", "news", "finance" | Search topic category |
|
|
| search_depth | "basic", "advanced" | "advanced" = multiple semantic snippets per URL |
|
|
| include_answer | false, true, "basic", "advanced" | AI-generated answer from results |
|
|
| include_raw_content | false, true, "markdown", "text" | Raw page content format |
|
|
| time_range | "day", "week", "month", "year" | Filter by recency |
|
|
| max_results | 5-20 | Number of results |
|
|
| include_domains | string[] | Domains to include |
|
|
| exclude_domains | string[] | Domains to exclude |
|
|
|
|
**WHEN TO USE TAVILY:**
|
|
- Real-time/current events
|
|
- News and trending topics
|
|
- Quick facts with AI answers
|
|
- Financial data
|
|
- Recent time-sensitive content
|
|
|
|
---
|
|
|
|
## OUTPUT FORMAT
|
|
|
|
Return a JSON object with this exact structure:
|
|
|
|
```json
|
|
{{
|
|
"intent": {{
|
|
"input_type": "keywords|question|goal|mixed",
|
|
"primary_question": "The main question to answer",
|
|
"secondary_questions": ["question 1", "question 2"],
|
|
"purpose": "learn|create_content|make_decision|compare|solve_problem|find_data|explore_trends|validate|generate_ideas",
|
|
"content_output": "blog|podcast|video|social_post|newsletter|presentation|report|whitepaper|email|general",
|
|
"expected_deliverables": ["key_statistics", "expert_quotes", "case_studies", "trends", "best_practices"],
|
|
"depth": "overview|detailed|expert",
|
|
"focus_areas": ["area1", "area2"],
|
|
"perspective": "target perspective or null",
|
|
"time_sensitivity": "real_time|recent|historical|evergreen",
|
|
"confidence": 0.85,
|
|
"confidence_reason": "Why this confidence level",
|
|
"great_example": "Example of better input if confidence < 0.8",
|
|
"needs_clarification": false,
|
|
"clarifying_questions": [],
|
|
"analysis_summary": "Brief summary of research plan"
|
|
}},
|
|
"queries": [
|
|
{{
|
|
"query": "Optimized search query string",
|
|
"purpose": "key_statistics|expert_quotes|case_studies|trends|etc",
|
|
"provider": "exa|tavily",
|
|
"priority": 5,
|
|
"expected_results": "What we expect to find",
|
|
"justification": "Why this query and provider"
|
|
}}
|
|
],
|
|
"enhanced_keywords": ["expanded", "related", "keywords"],
|
|
"research_angles": ["Angle 1: ...", "Angle 2: ..."],
|
|
"recommended_provider": "exa|tavily",
|
|
"provider_justification": "Why this provider is best for this research",
|
|
"exa_config": {{
|
|
"enabled": true,
|
|
"type": "auto|neural|fast|deep",
|
|
"type_justification": "Why this search type",
|
|
"category": "news|research paper|company|etc or null",
|
|
"category_justification": "Why this category or null",
|
|
"numResults": 10,
|
|
"numResults_justification": "Why this number",
|
|
"includeDomains": [],
|
|
"includeDomains_justification": "Why these domains or empty",
|
|
"startPublishedDate": "2024-01-01T00:00:00.000Z or null",
|
|
"date_justification": "Why this date filter or null",
|
|
"highlights": true,
|
|
"highlights_justification": "Why enable/disable highlights",
|
|
"context": true,
|
|
"context_justification": "Why enable/disable context string"
|
|
}},
|
|
"tavily_config": {{
|
|
"enabled": true,
|
|
"topic": "general|news|finance",
|
|
"topic_justification": "Why this topic",
|
|
"search_depth": "basic|advanced",
|
|
"search_depth_justification": "Why this depth",
|
|
"include_answer": "true|false|basic|advanced",
|
|
"include_answer_justification": "Why this answer mode",
|
|
"time_range": "day|week|month|year|null",
|
|
"time_range_justification": "Why this time range or null",
|
|
"max_results": 10,
|
|
"max_results_justification": "Why this number",
|
|
"include_raw_content": "false|true|markdown|text",
|
|
"include_raw_content_justification": "Why this content mode"
|
|
}},
|
|
"trends_config": {{
|
|
"enabled": true|false,
|
|
"keywords": ["keyword1", "keyword2"],
|
|
"keywords_justification": "Why these keywords for trends analysis",
|
|
"timeframe": "today 1-y|today 12-m|all",
|
|
"timeframe_justification": "Why this timeframe",
|
|
"geo": "US|GB|IN|etc",
|
|
"geo_justification": "Why this geographic region",
|
|
"expected_insights": [
|
|
"Search interest trends over the past year",
|
|
"Regional interest distribution",
|
|
"Related topics for content expansion",
|
|
"Related queries for FAQ sections",
|
|
"Optimal publication timing based on interest peaks"
|
|
]
|
|
}}
|
|
}}
|
|
```
|
|
|
|
## DECISION RULES
|
|
|
|
1. **Provider Selection:**
|
|
- Use EXA for: academic research, competitor analysis, deep understanding, finding similar content
|
|
- Use TAVILY for: news, current events, quick facts, financial data, real-time info
|
|
|
|
2. **Query Optimization:**
|
|
- Include relevant keywords for semantic matching
|
|
- Add context words based on deliverables (e.g., "statistics 2024" for key_statistics)
|
|
- Match query style to provider (natural language for Exa, keyword-rich for Tavily)
|
|
|
|
3. **Parameter Selection:**
|
|
- ALWAYS provide justification for each parameter choice
|
|
- Consider time sensitivity when setting date filters
|
|
- Match category/topic to content type
|
|
- Use "advanced" depth when quality matters more than speed
|
|
|
|
4. **Google Trends Keywords (if trends enabled):**
|
|
- Suggest 1-3 keywords optimized for trends analysis
|
|
- Keywords should be broader than research queries (e.g., "AI marketing" vs "AI marketing tools for small businesses")
|
|
- Consider what will show meaningful search interest trends
|
|
- Choose timeframe based on content type (12 months for blogs, 1 year for comprehensive)
|
|
- Select geo based on user's target audience or industry
|
|
- List specific insights trends will uncover
|
|
|
|
5. **Justifications:**
|
|
- Keep justifications concise (1 sentence)
|
|
- Explain the "why" not the "what"
|
|
- Reference user's intent when relevant
|
|
'''
|
|
|
|
return prompt
|
|
|
|
def _build_unified_schema(self) -> Dict[str, Any]:
|
|
"""Build the JSON schema for unified response."""
|
|
return {
|
|
"type": "object",
|
|
"properties": {
|
|
"intent": {
|
|
"type": "object",
|
|
"properties": {
|
|
"input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
|
|
"primary_question": {"type": "string"},
|
|
"secondary_questions": {"type": "array", "items": {"type": "string"}},
|
|
"purpose": {"type": "string"},
|
|
"content_output": {"type": "string"},
|
|
"expected_deliverables": {"type": "array", "items": {"type": "string"}},
|
|
"depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
|
|
"focus_areas": {"type": "array", "items": {"type": "string"}},
|
|
"perspective": {"type": "string"},
|
|
"time_sensitivity": {"type": "string"},
|
|
"confidence": {"type": "number"},
|
|
"confidence_reason": {"type": "string"},
|
|
"great_example": {"type": "string"},
|
|
"needs_clarification": {"type": "boolean"},
|
|
"clarifying_questions": {"type": "array", "items": {"type": "string"}},
|
|
"analysis_summary": {"type": "string"}
|
|
},
|
|
"required": ["primary_question", "purpose", "expected_deliverables", "confidence"]
|
|
},
|
|
"queries": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "object",
|
|
"properties": {
|
|
"query": {"type": "string"},
|
|
"purpose": {"type": "string"},
|
|
"provider": {"type": "string"},
|
|
"priority": {"type": "integer"},
|
|
"expected_results": {"type": "string"},
|
|
"justification": {"type": "string"}
|
|
},
|
|
"required": ["query", "purpose", "provider", "priority"]
|
|
}
|
|
},
|
|
"enhanced_keywords": {"type": "array", "items": {"type": "string"}},
|
|
"research_angles": {"type": "array", "items": {"type": "string"}},
|
|
"recommended_provider": {"type": "string"},
|
|
"provider_justification": {"type": "string"},
|
|
"exa_config": {
|
|
"type": "object",
|
|
"properties": {
|
|
"enabled": {"type": "boolean"},
|
|
"type": {"type": "string"},
|
|
"type_justification": {"type": "string"},
|
|
"category": {"type": "string"},
|
|
"category_justification": {"type": "string"},
|
|
"numResults": {"type": "integer"},
|
|
"numResults_justification": {"type": "string"},
|
|
"includeDomains": {"type": "array", "items": {"type": "string"}},
|
|
"includeDomains_justification": {"type": "string"},
|
|
"startPublishedDate": {"type": "string"},
|
|
"date_justification": {"type": "string"},
|
|
"highlights": {"type": "boolean"},
|
|
"highlights_justification": {"type": "string"},
|
|
"context": {"type": "boolean"},
|
|
"context_justification": {"type": "string"}
|
|
}
|
|
},
|
|
"tavily_config": {
|
|
"type": "object",
|
|
"properties": {
|
|
"enabled": {"type": "boolean"},
|
|
"topic": {"type": "string"},
|
|
"topic_justification": {"type": "string"},
|
|
"search_depth": {"type": "string"},
|
|
"search_depth_justification": {"type": "string"},
|
|
"include_answer": {"type": "string"},
|
|
"include_answer_justification": {"type": "string"},
|
|
"time_range": {"type": "string"},
|
|
"time_range_justification": {"type": "string"},
|
|
"max_results": {"type": "integer"},
|
|
"max_results_justification": {"type": "string"},
|
|
"include_raw_content": {"type": "string"},
|
|
"include_raw_content_justification": {"type": "string"}
|
|
}
|
|
},
|
|
"trends_config": {
|
|
"type": "object",
|
|
"properties": {
|
|
"enabled": {"type": "boolean"},
|
|
"keywords": {"type": "array", "items": {"type": "string"}},
|
|
"keywords_justification": {"type": "string"},
|
|
"timeframe": {"type": "string"},
|
|
"timeframe_justification": {"type": "string"},
|
|
"geo": {"type": "string"},
|
|
"geo_justification": {"type": "string"},
|
|
"expected_insights": {"type": "array", "items": {"type": "string"}}
|
|
}
|
|
}
|
|
},
|
|
"required": ["intent", "queries", "recommended_provider", "exa_config", "tavily_config"]
|
|
}
|
|
|
|
def _build_persona_context(
|
|
self,
|
|
research_persona: Optional[ResearchPersona],
|
|
industry: Optional[str],
|
|
target_audience: Optional[str],
|
|
) -> str:
|
|
"""Build persona context section."""
|
|
parts = []
|
|
|
|
if research_persona:
|
|
if research_persona.default_industry:
|
|
parts.append(f"Industry: {research_persona.default_industry}")
|
|
if research_persona.default_target_audience:
|
|
parts.append(f"Target Audience: {research_persona.default_target_audience}")
|
|
if research_persona.research_angles:
|
|
parts.append(f"Preferred Research Angles: {', '.join(research_persona.research_angles[:3])}")
|
|
if research_persona.suggested_keywords:
|
|
parts.append(f"Relevant Keywords: {', '.join(research_persona.suggested_keywords[:5])}")
|
|
else:
|
|
if industry:
|
|
parts.append(f"Industry: {industry}")
|
|
if target_audience:
|
|
parts.append(f"Target Audience: {target_audience}")
|
|
|
|
if not parts:
|
|
return "No specific user context available. Use general best practices."
|
|
|
|
return "\n".join(parts)
|
|
|
|
def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
|
|
"""Build competitor context section."""
|
|
if not competitor_data:
|
|
return ""
|
|
|
|
competitor_names = [c.get("name", c.get("url", "")) for c in competitor_data[:5]]
|
|
if competitor_names:
|
|
return f"\nKnown Competitors: {', '.join(competitor_names)}"
|
|
return ""
|
|
|
|
def _parse_unified_result(self, result: Dict[str, Any], user_input: str) -> Dict[str, Any]:
|
|
"""Parse the unified LLM result into structured response."""
|
|
|
|
intent_data = result.get("intent", {})
|
|
|
|
# Build ResearchIntent
|
|
intent = ResearchIntent(
|
|
primary_question=intent_data.get("primary_question", user_input),
|
|
secondary_questions=intent_data.get("secondary_questions", []),
|
|
purpose=intent_data.get("purpose", "learn"),
|
|
content_output=intent_data.get("content_output", "general"),
|
|
expected_deliverables=intent_data.get("expected_deliverables", ["key_statistics"]),
|
|
depth=intent_data.get("depth", "detailed"),
|
|
focus_areas=intent_data.get("focus_areas", []),
|
|
perspective=intent_data.get("perspective"),
|
|
time_sensitivity=intent_data.get("time_sensitivity"),
|
|
input_type=intent_data.get("input_type", "keywords"),
|
|
original_input=user_input,
|
|
confidence=float(intent_data.get("confidence", 0.7)),
|
|
confidence_reason=intent_data.get("confidence_reason"),
|
|
great_example=intent_data.get("great_example"),
|
|
needs_clarification=intent_data.get("needs_clarification", False),
|
|
clarifying_questions=intent_data.get("clarifying_questions", []),
|
|
)
|
|
|
|
# Build queries
|
|
queries = []
|
|
for q in result.get("queries", []):
|
|
try:
|
|
queries.append(ResearchQuery(
|
|
query=q.get("query", ""),
|
|
purpose=q.get("purpose", "key_statistics"),
|
|
provider=q.get("provider", "exa"),
|
|
priority=int(q.get("priority", 3)),
|
|
expected_results=q.get("expected_results", ""),
|
|
))
|
|
except Exception as e:
|
|
logger.warning(f"Failed to parse query: {e}")
|
|
|
|
return {
|
|
"success": True,
|
|
"intent": intent,
|
|
"queries": queries,
|
|
"enhanced_keywords": result.get("enhanced_keywords", []),
|
|
"research_angles": result.get("research_angles", []),
|
|
"recommended_provider": result.get("recommended_provider", "exa"),
|
|
"provider_justification": result.get("provider_justification", ""),
|
|
"exa_config": result.get("exa_config", {}),
|
|
"tavily_config": result.get("tavily_config", {}),
|
|
"trends_config": result.get("trends_config", {}), # NEW: Google Trends configuration
|
|
"analysis_summary": intent_data.get("analysis_summary", ""),
|
|
}
|
|
|
|
def _create_fallback_response(self, user_input: str, keywords: List[str]) -> Dict[str, Any]:
|
|
"""Create fallback response when analysis fails."""
|
|
return {
|
|
"success": False,
|
|
"intent": ResearchIntent(
|
|
primary_question=f"What are the key insights about: {user_input}?",
|
|
purpose="learn",
|
|
content_output="general",
|
|
expected_deliverables=["key_statistics", "best_practices"],
|
|
depth="detailed",
|
|
original_input=user_input,
|
|
confidence=0.5,
|
|
),
|
|
"queries": [
|
|
ResearchQuery(
|
|
query=user_input,
|
|
purpose="key_statistics",
|
|
provider="exa",
|
|
priority=5,
|
|
expected_results="General research results",
|
|
)
|
|
],
|
|
"enhanced_keywords": keywords,
|
|
"research_angles": [],
|
|
"recommended_provider": "exa",
|
|
"provider_justification": "Default fallback to Exa for semantic search",
|
|
"exa_config": {
|
|
"enabled": True,
|
|
"type": "auto",
|
|
"type_justification": "Auto mode for balanced results",
|
|
"numResults": 10,
|
|
"highlights": True,
|
|
},
|
|
"tavily_config": {
|
|
"enabled": True,
|
|
"topic": "general",
|
|
"search_depth": "advanced",
|
|
"include_answer": True,
|
|
},
|
|
"trends_config": {
|
|
"enabled": False, # Disabled in fallback
|
|
},
|
|
}
|