AI Analysis and Content Strategy fixes. Enhanced Strategy Routes refactoring.
This commit is contained in:
@@ -154,7 +154,17 @@ class IntentAwareAnalyzer:
|
||||
"primary_answer": {"type": "string"},
|
||||
"secondary_answers": {
|
||||
"type": "object",
|
||||
"additionalProperties": {"type": "string"}
|
||||
"additionalProperties": {"oneOf": [{"type": "string"}, {"type": "null"}]}
|
||||
},
|
||||
"focus_areas_coverage": {
|
||||
"type": "object",
|
||||
"additionalProperties": {"oneOf": [{"type": "string"}, {"type": "null"}]},
|
||||
"description": "Summary of what was found for each focus area, or null if not covered"
|
||||
},
|
||||
"also_answering_coverage": {
|
||||
"type": "object",
|
||||
"additionalProperties": {"oneOf": [{"type": "string"}, {"type": "null"}]},
|
||||
"description": "Information found about each 'also answering' topic, or null if not found"
|
||||
},
|
||||
"executive_summary": {"type": "string"},
|
||||
"key_takeaways": {
|
||||
@@ -469,10 +479,21 @@ class IntentAwareAnalyzer:
|
||||
if not sources:
|
||||
sources = self._extract_sources_from_raw(raw_results)
|
||||
|
||||
# Parse coverage fields (handle null values)
|
||||
focus_areas_coverage = {}
|
||||
for area, coverage in result.get("focus_areas_coverage", {}).items():
|
||||
focus_areas_coverage[area] = coverage if coverage else None
|
||||
|
||||
also_answering_coverage = {}
|
||||
for topic, coverage in result.get("also_answering_coverage", {}).items():
|
||||
also_answering_coverage[topic] = coverage if coverage else None
|
||||
|
||||
return IntentDrivenResearchResult(
|
||||
success=True,
|
||||
primary_answer=result.get("primary_answer", ""),
|
||||
secondary_answers=result.get("secondary_answers", {}),
|
||||
focus_areas_coverage=focus_areas_coverage,
|
||||
also_answering_coverage=also_answering_coverage,
|
||||
statistics=statistics,
|
||||
expert_quotes=expert_quotes,
|
||||
case_studies=case_studies,
|
||||
@@ -534,6 +555,8 @@ class IntentAwareAnalyzer:
|
||||
success=True,
|
||||
primary_answer=f"Research findings for: {intent.primary_question}",
|
||||
secondary_answers={},
|
||||
focus_areas_coverage={area: None for area in intent.focus_areas} if intent.focus_areas else {},
|
||||
also_answering_coverage={topic: None for topic in intent.also_answering} if intent.also_answering else {},
|
||||
executive_summary=content[:300] if content else "Research completed",
|
||||
key_takeaways=key_takeaways,
|
||||
sources=sources,
|
||||
|
||||
@@ -11,6 +11,7 @@ Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
@@ -27,6 +28,14 @@ from models.research_persona_models import ResearchPersona
|
||||
class IntentPromptBuilder:
|
||||
"""Builds prompts for intent-driven research."""
|
||||
|
||||
def _get_current_date_context(self) -> str:
    """Describe today's date for embedding in an LLM prompt.

    Returns:
        Two lines built from a single local-clock reading:
        ``CURRENT DATE: YYYY-MM-DD (<full month name> <year>)`` followed by
        ``CURRENT YEAR: <year>``.
    """
    today = datetime.now()
    year = today.year
    # Format specs inside the f-string delegate to strftime, so the ISO date
    # and the full month name come from the same timestamp.
    return f"CURRENT DATE: {today:%Y-%m-%d} ({today:%B} {year})\nCURRENT YEAR: {year}"
|
||||
|
||||
# Purpose explanations for the AI
|
||||
PURPOSE_EXPLANATIONS = {
|
||||
ResearchPurpose.LEARN: "User wants to understand a topic for personal knowledge",
|
||||
@@ -74,6 +83,11 @@ class IntentPromptBuilder:
|
||||
- What specific deliverables they need
|
||||
"""
|
||||
|
||||
# Get current date context
|
||||
date_context = self._get_current_date_context()
|
||||
now = datetime.now()
|
||||
current_year = now.year
|
||||
|
||||
# Build persona context
|
||||
persona_context = self._build_persona_context(research_persona, industry, target_audience)
|
||||
|
||||
@@ -82,6 +96,11 @@ class IntentPromptBuilder:
|
||||
|
||||
prompt = f"""You are an expert research intent analyzer. Your job is to understand what a content creator REALLY needs from their research.
|
||||
|
||||
## CURRENT DATE/TIME CONTEXT
|
||||
{date_context}
|
||||
|
||||
**NOTE**: When user mentions time-sensitive terms (latest, current, recent, trends, predictions), prioritize {current_year} data.
|
||||
|
||||
## USER INPUT
|
||||
"{user_input}"
|
||||
|
||||
@@ -97,7 +116,7 @@ class IntentPromptBuilder:
|
||||
Analyze the user's input and infer their research intent. Determine:
|
||||
|
||||
1. **INPUT TYPE**: Is this:
|
||||
- "keywords": Simple topic keywords (e.g., "AI healthcare 2025")
|
||||
- "keywords": Simple topic keywords (e.g., "AI healthcare {current_year}")
|
||||
- "question": A specific question (e.g., "What are the best AI tools for healthcare?")
|
||||
- "goal": A goal statement (e.g., "I need to write a blog about AI in healthcare")
|
||||
- "mixed": Combination of above
|
||||
@@ -210,8 +229,25 @@ Return a JSON object:
|
||||
if research_persona and research_persona.suggested_keywords:
|
||||
persona_keywords = f"\nSUGGESTED KEYWORDS FROM PERSONA: {', '.join(research_persona.suggested_keywords[:10])}"
|
||||
|
||||
# Get current date context
|
||||
date_context = self._get_current_date_context()
|
||||
now = datetime.now()
|
||||
current_year = now.year
|
||||
next_year = current_year + 1
|
||||
current_month_year = now.strftime("%B %Y")
|
||||
|
||||
prompt = f"""You are a research query optimizer. Generate multiple targeted search queries based on the user's research intent.
|
||||
|
||||
## CURRENT DATE/TIME CONTEXT
|
||||
{date_context}
|
||||
|
||||
**CRITICAL**: When generating queries:
|
||||
- ALWAYS use the CURRENT YEAR ({current_year}) for time-sensitive queries
|
||||
- For trends, predictions, or future-looking queries, use {current_year} or {next_year}
|
||||
- For recent/real-time queries, use current month/year: {current_month_year}
|
||||
- NEVER use outdated years from training data (e.g., 2024, 2025 if we're past those dates)
|
||||
- When user mentions "latest", "current", "recent", or time-sensitive terms, prioritize {current_year} data
|
||||
|
||||
## RESEARCH INTENT
|
||||
|
||||
PRIMARY QUESTION: {intent.primary_question}
|
||||
@@ -256,14 +292,14 @@ Return a JSON object:
|
||||
{{
|
||||
"queries": [
|
||||
{{
|
||||
"query": "Healthcare AI adoption statistics 2025 hospitals implementation data",
|
||||
"query": "Healthcare AI adoption statistics {current_year} hospitals implementation data",
|
||||
"purpose": "key_statistics",
|
||||
"provider": "exa",
|
||||
"priority": 5,
|
||||
"expected_results": "Statistics on hospital AI adoption rates"
|
||||
}},
|
||||
{{
|
||||
"query": "AI healthcare trends predictions future outlook 2025 2026",
|
||||
"query": "AI healthcare trends predictions future outlook {current_year} {next_year}",
|
||||
"purpose": "trends",
|
||||
"provider": "tavily",
|
||||
"priority": 4,
|
||||
@@ -280,13 +316,14 @@ Return a JSON object:
|
||||
|
||||
## QUERY OPTIMIZATION RULES
|
||||
|
||||
1. For STATISTICS: Include words like "statistics", "data", "percentage", "report", "study"
|
||||
1. For STATISTICS: Include words like "statistics", "data", "percentage", "report", "study", and CURRENT YEAR ({current_year})
|
||||
2. For CASE STUDIES: Include "case study", "success story", "implementation", "example"
|
||||
3. For TRENDS: Include "trends", "future", "predictions", "emerging", year numbers
|
||||
3. For TRENDS: Include "trends", "future", "predictions", "emerging", and CURRENT YEAR ({current_year}) or {next_year}
|
||||
4. For EXPERT QUOTES: Include expert names if known, or "expert opinion", "interview"
|
||||
5. For COMPARISONS: Include "vs", "compare", "comparison", "alternative"
|
||||
6. For NEWS/REAL-TIME: Use Tavily, include recent year/month
|
||||
6. For NEWS/REAL-TIME: Use Tavily, include CURRENT YEAR ({current_year}) and current month/year ({current_month_year})
|
||||
7. For ACADEMIC/DEEP: Use Exa with neural search
|
||||
8. **CRITICAL**: Always use {current_year} (not outdated years) for time-sensitive queries
|
||||
"""
|
||||
|
||||
return prompt
|
||||
@@ -314,23 +351,43 @@ Return a JSON object:
|
||||
if intent.perspective:
|
||||
perspective_instruction = f"\n**PERSPECTIVE**: Analyze results from the viewpoint of: {intent.perspective}"
|
||||
|
||||
# Get current date context
|
||||
date_context = self._get_current_date_context()
|
||||
now = datetime.now()
|
||||
current_year = now.year
|
||||
|
||||
prompt = f"""You are a research analyst helping a content creator find exactly what they need. Your job is to analyze raw research results and extract precisely what the user is looking for.
|
||||
|
||||
## CURRENT DATE/TIME CONTEXT
|
||||
{date_context}
|
||||
|
||||
**CRITICAL**: When analyzing results:
|
||||
- Prioritize data from CURRENT YEAR ({current_year}) or recent dates
|
||||
- If statistics/quotes mention outdated years, note the recency in context
|
||||
- For trends/predictions, ensure timelines reference {current_year} or future years
|
||||
- NEVER present outdated data as "current" or "latest" - always check dates
|
||||
|
||||
## USER'S RESEARCH INTENT
|
||||
|
||||
PRIMARY QUESTION: {intent.primary_question}
|
||||
**PRIMARY QUESTION**: {intent.primary_question}
|
||||
|
||||
SECONDARY QUESTIONS:
|
||||
**SECONDARY QUESTIONS TO ANSWER**:
|
||||
{chr(10).join(f'- {q}' for q in intent.secondary_questions) if intent.secondary_questions else 'None specified'}
|
||||
|
||||
PURPOSE: {intent.purpose}
|
||||
**FOCUS AREAS** (prioritize information related to these):
|
||||
{', '.join(intent.focus_areas) if intent.focus_areas else 'General - no specific focus areas'}
|
||||
|
||||
**ALSO ANSWERING** (address these topics if found in results):
|
||||
{', '.join(intent.also_answering) if intent.also_answering else 'None specified'}
|
||||
|
||||
**PURPOSE**: {intent.purpose}
|
||||
→ {purpose_explanation}
|
||||
|
||||
CONTENT OUTPUT: {intent.content_output}
|
||||
**CONTENT OUTPUT**: {intent.content_output}
|
||||
|
||||
EXPECTED DELIVERABLES: {', '.join(intent.expected_deliverables)}
|
||||
**EXPECTED DELIVERABLES**: {', '.join(intent.expected_deliverables)}
|
||||
|
||||
FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'}
|
||||
**PERSPECTIVE**: {intent.perspective or 'General audience'}
|
||||
{perspective_instruction}
|
||||
|
||||
## RAW RESEARCH RESULTS
|
||||
@@ -339,7 +396,33 @@ FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'
|
||||
|
||||
## YOUR TASK
|
||||
|
||||
Analyze the raw research results and extract EXACTLY what the user needs.
|
||||
Analyze the raw research results and extract EXACTLY what the user needs. Use a **generalized approach** - don't over-optimize for specific fields, but ensure all intent aspects are considered naturally.
|
||||
|
||||
### ANALYSIS GUIDELINES:
|
||||
|
||||
1. **PRIMARY QUESTION**: Always provide a direct, clear answer to the primary question in 2-3 sentences.
|
||||
|
||||
2. **SECONDARY QUESTIONS**: For each secondary question, provide an answer if information is available in the results. If not available, note it in gaps_identified. Don't force answers - only include what's actually in the results.
|
||||
|
||||
3. **FOCUS AREAS**: When extracting deliverables, prioritize information that relates to the focus areas. If focus areas are specified:
|
||||
- Weight relevance scores higher for sources/content matching focus areas
|
||||
- Include focus area context in extracted statistics, quotes, case studies
|
||||
- If results don't address focus areas, note this in gaps_identified
|
||||
- Provide a brief summary of what was found for each focus area in focus_areas_coverage
|
||||
|
||||
4. **ALSO ANSWERING**: If results contain information about "also answering" topics, include it naturally in the analysis. Don't create separate sections unless the information is substantial. Provide a brief summary of what was found for each topic in also_answering_coverage.
|
||||
|
||||
5. **GENERALIZED EXTRACTION**:
|
||||
- Extract deliverables based on expected_deliverables
|
||||
- Use perspective to frame information appropriately
|
||||
- Consider content_output when structuring results
|
||||
- Don't over-optimize - let the results guide what's extracted
|
||||
|
||||
6. **CONTEXTUAL LINKING**: When extracting information, consider:
|
||||
- How it relates to the primary question
|
||||
- Which secondary questions it answers
|
||||
- Which focus areas it addresses
|
||||
- This helps create a cohesive research result
|
||||
|
||||
{deliverables_instructions}
|
||||
|
||||
@@ -351,8 +434,16 @@ Provide results in this JSON structure:
|
||||
{{
|
||||
"primary_answer": "Direct 2-3 sentence answer to the primary question",
|
||||
"secondary_answers": {{
|
||||
"Question 1?": "Answer to question 1",
|
||||
"Question 2?": "Answer to question 2"
|
||||
"Secondary Question 1?": "Answer if found in results, or null if not available",
|
||||
"Secondary Question 2?": "Answer if found in results, or null if not available"
|
||||
}},
|
||||
"focus_areas_coverage": {{
|
||||
"Focus Area 1": "Brief summary of what was found related to this focus area, or null if not covered",
|
||||
"Focus Area 2": "Brief summary of what was found related to this focus area, or null if not covered"
|
||||
}},
|
||||
"also_answering_coverage": {{
|
||||
"Topic 1": "Information found about this topic, or null if not found",
|
||||
"Topic 2": "Information found about this topic, or null if not found"
|
||||
}},
|
||||
"executive_summary": "2-3 sentence executive summary of all findings",
|
||||
"key_takeaways": [
|
||||
@@ -364,13 +455,13 @@ Provide results in this JSON structure:
|
||||
],
|
||||
"statistics": [
|
||||
{{
|
||||
"statistic": "72% of hospitals plan to adopt AI by 2025",
|
||||
"statistic": "72% of hospitals plan to adopt AI by {current_year}",
|
||||
"value": "72%",
|
||||
"context": "Survey of 500 US hospitals in 2024",
|
||||
"source": "Healthcare AI Report 2024",
|
||||
"context": "Survey of 500 US hospitals in {current_year}",
|
||||
"source": "Healthcare AI Report {current_year}",
|
||||
"url": "https://example.com/report",
|
||||
"credibility": 0.9,
|
||||
"recency": "2024"
|
||||
"recency": "{current_year}"
|
||||
}}
|
||||
],
|
||||
"expert_quotes": [
|
||||
@@ -401,7 +492,7 @@ Provide results in this JSON structure:
|
||||
"direction": "growing",
|
||||
"evidence": ["25% YoY growth", "Major hospital chains investing"],
|
||||
"impact": "Could reduce misdiagnosis by 30%",
|
||||
"timeline": "Expected mainstream by 2027",
|
||||
"timeline": "Expected mainstream by {current_year + 2}",
|
||||
"sources": ["url1", "url2"]
|
||||
}}
|
||||
],
|
||||
@@ -442,7 +533,7 @@ Provide results in this JSON structure:
|
||||
"Example: Hospital X reduced readmissions by 25% using predictive AI"
|
||||
],
|
||||
"predictions": [
|
||||
"By 2030, AI will assist in 80% of initial diagnoses"
|
||||
"By {current_year + 5}, AI will assist in 80% of initial diagnoses"
|
||||
],
|
||||
"suggested_outline": [
|
||||
"1. Introduction: The AI Healthcare Revolution",
|
||||
@@ -454,7 +545,7 @@ Provide results in this JSON structure:
|
||||
],
|
||||
"sources": [
|
||||
{{
|
||||
"title": "Healthcare AI Report 2024",
|
||||
"title": "Healthcare AI Report {current_year}",
|
||||
"url": "https://example.com",
|
||||
"relevance_score": 0.95,
|
||||
"relevance_reason": "Directly addresses adoption statistics",
|
||||
@@ -468,7 +559,7 @@ Provide results in this JSON structure:
|
||||
"Limited information on regulatory challenges"
|
||||
],
|
||||
"follow_up_queries": [
|
||||
"AI healthcare regulations FDA 2025",
|
||||
"AI healthcare regulations FDA {current_year}",
|
||||
"Small clinic AI implementation costs"
|
||||
]
|
||||
}}
|
||||
@@ -486,6 +577,8 @@ Provide results in this JSON structure:
|
||||
8. **Suggest follow_up_queries** for gaps or incomplete areas
|
||||
9. **Rate confidence** based on how well results match the user's intent
|
||||
10. **Include deliverables ONLY if they are in expected_deliverables** or critical to the question
|
||||
11. **Don't over-optimize** - use a natural, generalized approach that considers all intent fields without forcing connections
|
||||
12. **For focus_areas_coverage and also_answering_coverage**: Only include entries for focus areas/topics that actually have information in the results. Use null for areas/topics not covered.
|
||||
"""
|
||||
|
||||
return prompt
|
||||
|
||||
@@ -137,6 +137,11 @@ class IntentQueryGenerator:
|
||||
provider=q.get("provider", "exa"),
|
||||
priority=min(max(int(q.get("priority", 3)), 1), 5), # Clamp 1-5
|
||||
expected_results=q.get("expected_results", ""),
|
||||
addresses_primary_question=q.get("addresses_primary_question", False),
|
||||
addresses_secondary_questions=q.get("addresses_secondary_questions", []),
|
||||
targets_focus_areas=q.get("targets_focus_areas", []),
|
||||
covers_also_answering=q.get("covers_also_answering", []),
|
||||
justification=q.get("justification"),
|
||||
)
|
||||
queries.append(query)
|
||||
except Exception as e:
|
||||
@@ -266,6 +271,10 @@ class IntentQueryGenerator:
|
||||
provider=template["provider"],
|
||||
priority=template["priority"],
|
||||
expected_results=template["expected"],
|
||||
addresses_primary_question=False,
|
||||
addresses_secondary_questions=[],
|
||||
targets_focus_areas=[],
|
||||
covers_also_answering=[],
|
||||
)
|
||||
|
||||
def _create_fallback_queries(self, intent: ResearchIntent) -> Dict[str, Any]:
|
||||
@@ -287,6 +296,10 @@ class IntentQueryGenerator:
|
||||
provider="exa",
|
||||
priority=5,
|
||||
expected_results="General information and insights",
|
||||
addresses_primary_question=True,
|
||||
addresses_secondary_questions=[],
|
||||
targets_focus_areas=[],
|
||||
covers_also_answering=[],
|
||||
))
|
||||
|
||||
return {
|
||||
@@ -357,10 +370,17 @@ class QueryOptimizer:
|
||||
if ExpectedDeliverable.TRENDS.value in deliverables:
|
||||
topic = "news"
|
||||
|
||||
# Determine search depth
|
||||
search_depth = "basic"
|
||||
if intent.depth in ["detailed", "expert"]:
|
||||
search_depth = "advanced"
|
||||
# Determine search depth based on depth and time sensitivity
|
||||
# advanced = 2 credits (best quality), basic/fast/ultra-fast = 1 credit
|
||||
search_depth = "basic" # Default: balanced
|
||||
if intent.depth == "expert":
|
||||
search_depth = "advanced" # Best quality for expert research
|
||||
elif intent.depth == "detailed":
|
||||
search_depth = "advanced" # Better snippets for detailed research
|
||||
elif intent.time_sensitivity == "real_time":
|
||||
search_depth = "ultra-fast" # Minimize latency for real-time
|
||||
elif intent.time_sensitivity == "recent":
|
||||
search_depth = "fast" # Good balance for recent content
|
||||
|
||||
# Include answer for factual queries
|
||||
include_answer = False
|
||||
|
||||
121
backend/services/research/intent/query_deduplicator.py
Normal file
121
backend/services/research/intent/query_deduplicator.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""
|
||||
Query deduplication logic for unified research analyzer.
|
||||
|
||||
Removes redundant queries that would return similar results
|
||||
and ensures queries are linked to intent fields.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import ResearchIntent, ResearchQuery
|
||||
|
||||
|
||||
def deduplicate_queries(
    queries: List[ResearchQuery],
    intent: ResearchIntent
) -> List[ResearchQuery]:
    """
    Remove redundant queries that would return similar results.

    Rules:
    1. Exact duplicates (same lower-cased, stripped text and same provider)
       are always dropped; the first occurrence wins.
    2. With more than 8 input queries, a near-duplicate is merged into the
       query already kept: word-level Jaccard similarity above 0.9, same
       purpose, same provider, and identical targets_focus_areas /
       covers_also_answering sets.
    3. Queries that target different focus areas or also_answering topics
       are never merged, however similar their wording is.
    4. Maximum 8 queries - candidates are considered highest priority first.
    5. Primary queries (addresses_primary_question=True) are placed first,
       so at least one of them is always kept.

    Args:
        queries: Candidate queries. The list itself is never reordered or
            modified. A kept query object IS updated in place when a
            near-duplicate is merged into it (linking lists and
            expected_results).
        intent: Research intent the queries belong to. Not used by the
            current rules; kept so callers do not need to change.

    Returns:
        A new list of at most 8 queries. With 8 or fewer inputs the original
        order is preserved and only exact duplicates are removed.
    """
    max_queries = 8

    if len(queries) <= max_queries:
        # Already within the cap: only drop exact duplicates.
        seen_keys = set()
        unique = []
        for query in queries:
            key = _query_key(query)
            if key not in seen_keys:
                seen_keys.add(key)
                unique.append(query)
        return unique

    # sorted() rather than list.sort() so the caller's list is left untouched.
    ranked = sorted(queries, key=lambda q: q.priority, reverse=True)
    primary_queries = [q for q in ranked if q.addresses_primary_question]
    other_queries = [q for q in ranked if not q.addresses_primary_question]

    deduplicated = []
    seen_keys = set()

    # Primary queries first (normally just one, but tolerate several).
    for query in primary_queries:
        if len(deduplicated) >= max_queries:
            break
        key = _query_key(query)
        if key not in seen_keys:
            seen_keys.add(key)
            deduplicated.append(query)

    for query in other_queries:
        # Check the cap before adding anything so it also holds when the
        # primary queries alone already filled the result.
        if len(deduplicated) >= max_queries:
            break

        key = _query_key(query)
        if key in seen_keys:
            continue

        kept_twin = next(
            (kept for kept in deduplicated if _is_near_duplicate(query, kept)),
            None,
        )
        if kept_twin is not None:
            _merge_query_links(kept_twin, query)
            continue

        deduplicated.append(query)
        seen_keys.add(key)

    logger.info(f"Deduplicated queries: {len(queries)} -> {len(deduplicated)}")
    return deduplicated


def _query_key(query):
    """Return the exact-duplicate key: normalised query text plus provider."""
    return (query.query.lower().strip(), query.provider)


def _ordered_union(first, second):
    """Concatenate two lists (None counts as empty), dropping repeats but keeping first-seen order."""
    return list(dict.fromkeys(list(first or []) + list(second or [])))


def _is_near_duplicate(query, existing, threshold=0.9):
    """Tell whether `query` may be merged into the already kept `existing` query."""
    if query.purpose != existing.purpose or query.provider != existing.provider:
        return False
    # Different focus areas / also_answering topics must stay separate queries.
    if set(query.targets_focus_areas or []) != set(existing.targets_focus_areas or []):
        return False
    if set(query.covers_also_answering or []) != set(existing.covers_also_answering or []):
        return False

    query_words = set(query.query.lower().split())
    existing_words = set(existing.query.lower().split())
    union = query_words | existing_words
    if not union:
        return False
    # Jaccard similarity: shared words over all words.
    return len(query_words & existing_words) / len(union) > threshold


def _merge_query_links(existing, query):
    """Fold the linking lists and expected results of `query` into `existing` (in place)."""
    existing.addresses_secondary_questions = _ordered_union(
        existing.addresses_secondary_questions, query.addresses_secondary_questions
    )
    existing.targets_focus_areas = _ordered_union(
        existing.targets_focus_areas, query.targets_focus_areas
    )
    existing.covers_also_answering = _ordered_union(
        existing.covers_also_answering, query.covers_also_answering
    )
    # Record the merged coverage once; skip text that is already mentioned.
    current_text = existing.expected_results or ""
    if query.expected_results and query.expected_results not in current_text:
        existing.expected_results = current_text + f" Also covers: {query.expected_results}"
|
||||
112
backend/services/research/intent/unified_analyzer_utils.py
Normal file
112
backend/services/research/intent/unified_analyzer_utils.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Utility functions for unified research analyzer.
|
||||
|
||||
Provides helper functions for date context, persona context,
|
||||
competitor context, and fallback response creation.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from models.research_intent_models import ResearchIntent, ResearchQuery
|
||||
from models.research_persona_models import ResearchPersona
|
||||
|
||||
|
||||
def get_current_date_context() -> str:
    """Build the date header that research prompts embed.

    Returns:
        ``CURRENT DATE: YYYY-MM-DD (<full month name> <year>)`` and
        ``CURRENT YEAR: <year>`` joined by a newline, all derived from one
        reading of the local clock.
    """
    moment = datetime.now()
    header_lines = (
        f"CURRENT DATE: {moment.strftime('%Y-%m-%d')} ({moment.strftime('%B')} {moment.year})",
        f"CURRENT YEAR: {moment.year}",
    )
    return "\n".join(header_lines)
|
||||
|
||||
|
||||
def build_persona_context(
    research_persona: Optional[ResearchPersona],
    industry: Optional[str],
    target_audience: Optional[str],
) -> str:
    """Build persona context section.

    Args:
        research_persona: Persona whose default_industry,
            default_target_audience, research_angles and suggested_keywords
            are read. May be None.
        industry: Explicit industry. Used when there is no persona, or when
            the persona has no default_industry.
        target_audience: Explicit audience. Used when there is no persona, or
            when the persona has no default_target_audience.

    Returns:
        One "Label: value" line per available item, newline-joined, in the
        order industry, target audience, research angles (first 3), keywords
        (first 5); or a generic notice when nothing is available.
    """
    effective_industry = industry
    effective_audience = target_audience
    if research_persona:
        # Persona defaults win; the explicit arguments fill the gaps instead
        # of being discarded just because a persona object was supplied.
        effective_industry = research_persona.default_industry or industry
        effective_audience = research_persona.default_target_audience or target_audience

    parts = []
    if effective_industry:
        parts.append(f"Industry: {effective_industry}")
    if effective_audience:
        parts.append(f"Target Audience: {effective_audience}")

    if research_persona:
        if research_persona.research_angles:
            parts.append(f"Preferred Research Angles: {', '.join(research_persona.research_angles[:3])}")
        if research_persona.suggested_keywords:
            parts.append(f"Relevant Keywords: {', '.join(research_persona.suggested_keywords[:5])}")

    if not parts:
        return "No specific user context available. Use general best practices."

    return "\n".join(parts)
|
||||
|
||||
|
||||
def build_competitor_context(competitor_data: Optional[List[Dict]]) -> str:
    """Build competitor context section.

    Args:
        competitor_data: Competitor records; only the first five are
            considered. Each record is labelled by its "name", falling back
            to its "url" when the name is missing or empty.

    Returns:
        "\\nKnown Competitors: a, b, ..." for the records that have a usable
        label, or an empty string when there is no data or no record has one.
    """
    if not competitor_data:
        return ""

    competitor_names = []
    for competitor in competitor_data[:5]:
        # `or` (not a .get default) so a present-but-None/empty name still
        # falls back to the URL instead of breaking the join below.
        label = competitor.get("name") or competitor.get("url")
        if label:
            competitor_names.append(str(label))

    if not competitor_names:
        return ""
    return f"\nKnown Competitors: {', '.join(competitor_names)}"
|
||||
|
||||
|
||||
def create_fallback_response(user_input: str, keywords: List[str]) -> Dict[str, Any]:
    """Assemble the default research plan returned when analysis fails.

    Args:
        user_input: Raw user request; reused as the single search query and
            inside the generic primary question.
        keywords: Passed through unchanged as "enhanced_keywords".

    Returns:
        A dict with "success" set to False, a generic "learn" intent, one
        Exa query flagged as addressing the primary question, and default
        Exa / Tavily settings (trends disabled).
    """
    fallback_intent = ResearchIntent(
        primary_question=f"What are the key insights about: {user_input}?",
        purpose="learn",
        content_output="general",
        expected_deliverables=["key_statistics", "best_practices"],
        depth="detailed",
        focus_areas=[],
        also_answering=[],
        original_input=user_input,
        confidence=0.5,
    )

    # One catch-all query that simply searches for the raw input.
    fallback_query = ResearchQuery(
        query=user_input,
        purpose="key_statistics",
        provider="exa",
        priority=5,
        expected_results="General research results",
        addresses_primary_question=True,
        addresses_secondary_questions=[],
        targets_focus_areas=[],
        covers_also_answering=[],
    )

    exa_settings = {
        "enabled": True,
        "type": "auto",
        "type_justification": "Auto mode for balanced results",
        "numResults": 10,
        "highlights": True,
    }
    tavily_settings = {
        "enabled": True,
        "topic": "general",
        "search_depth": "advanced",
        "include_answer": True,
    }
    trends_settings = {
        "enabled": False,  # Disabled in fallback
    }

    return {
        "success": False,
        "intent": fallback_intent,
        "queries": [fallback_query],
        "enhanced_keywords": keywords,
        "research_angles": [],
        "recommended_provider": "exa",
        "provider_justification": "Default fallback to Exa for semantic search",
        "exa_config": exa_settings,
        "tavily_config": tavily_settings,
        "trends_config": trends_settings,
    }
|
||||
277
backend/services/research/intent/unified_prompt_builder.py
Normal file
277
backend/services/research/intent/unified_prompt_builder.py
Normal file
@@ -0,0 +1,277 @@
|
||||
"""
|
||||
Prompt builder for unified research analyzer.
|
||||
|
||||
Builds the comprehensive LLM prompt that guides intent inference,
|
||||
query generation, and parameter optimization in a single call.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .unified_analyzer_utils import (
|
||||
get_current_date_context,
|
||||
build_persona_context,
|
||||
build_competitor_context,
|
||||
)
|
||||
|
||||
|
||||
def _format_user_settings_section(
    user_provided_purpose: Optional[str],
    user_provided_content_output: Optional[str],
    user_provided_depth: Optional[str],
) -> str:
    """Render the "USER-PROVIDED INTENT SETTINGS" prompt section.

    Only settings the user actually supplied are listed, so the LLM is never
    told to use the literal value ``None``. Returns an empty string when no
    setting was supplied.
    """
    provided = [
        (name, value)
        for name, value in (
            ("purpose", user_provided_purpose),
            ("content_output", user_provided_content_output),
            ("depth", user_provided_depth),
        )
        if value
    ]
    if not provided:
        return ""

    names = [name for name, _ in provided]
    if len(names) > 2:
        joined_names = ", ".join(names[:-1]) + f", and {names[-1]}"
    else:
        joined_names = " and ".join(names)
    noun = "values" if len(names) > 1 else "value"
    settings_lines = "\n".join(
        f"- {name}: {value} (USE THIS EXACT VALUE)" for name, value in provided
    )

    # Leading/trailing newlines match the layout the section had when it was
    # embedded directly in the main template.
    return (
        "\n"
        "## USER-PROVIDED INTENT SETTINGS\n"
        "The user has explicitly selected these settings - USE THESE VALUES, do NOT infer different ones:\n"
        f"{settings_lines}\n"
        "\n"
        "IMPORTANT: Since the user has explicitly selected these, you should:\n"
        f"1. Use the provided {joined_names} {noun} exactly as given\n"
        "2. Still infer secondary_questions, focus_areas, also_answering, and expected_deliverables based on the user input and these provided settings\n"
        "3. Generate queries that align with the user's explicit selections\n"
    )


def build_unified_prompt(
    user_input: str,
    keywords: List[str],
    research_persona: Optional[ResearchPersona] = None,
    competitor_data: Optional[List[Dict]] = None,
    industry: Optional[str] = None,
    target_audience: Optional[str] = None,
    user_provided_purpose: Optional[str] = None,
    user_provided_content_output: Optional[str] = None,
    user_provided_depth: Optional[str] = None,
) -> str:
    """
    Build the unified prompt for intent + queries + parameters.

    This prompt guides the LLM to:
    1. Infer research intent (or use user-provided purpose/content_output/depth)
    2. Generate targeted queries linked to intent fields
    3. Optimize provider settings based on queries and intent

    Args:
        user_input: Raw research request; embedded verbatim in the prompt.
        keywords: Keywords to list under the user input; the line is omitted
            when the list is empty.
        research_persona: Passed to ``build_persona_context`` together with
            ``industry`` and ``target_audience``.
        competitor_data: Passed to ``build_competitor_context``.
        industry: Optional industry, forwarded to ``build_persona_context``.
        target_audience: Optional audience, forwarded to
            ``build_persona_context``.
        user_provided_purpose: Explicit purpose chosen by the user, if any.
        user_provided_content_output: Explicit content output chosen by the
            user, if any.
        user_provided_depth: Explicit depth chosen by the user, if any.

    Returns:
        The complete prompt text.
    """
    # Get current date context
    date_context = get_current_date_context()
    current_year = datetime.now().year

    # Build persona context
    persona_context = build_persona_context(research_persona, industry, target_audience)

    # Build competitor context
    competitor_context = build_competitor_context(competitor_data)

    # Conditional fragments are computed up front instead of nesting
    # triple-quoted f-strings inside the template: reusing the outer quote
    # style inside a replacement field only parses on Python 3.12+ (PEP 701).
    has_user_settings = bool(
        user_provided_purpose or user_provided_content_output or user_provided_depth
    )
    user_settings_section = _format_user_settings_section(
        user_provided_purpose, user_provided_content_output, user_provided_depth
    )
    keywords_line = f"KEYWORDS: {', '.join(keywords)}" if keywords else ""

    if has_user_settings:
        intent_instruction = (
            "Use the user-provided settings above. For fields not provided, "
            "infer what the user really wants from their research."
        )
    else:
        intent_instruction = "Understand what the user really wants from their research."

    infer_hint = "- Infer from user input"
    purpose_hint = (
        f"**USER PROVIDED: {user_provided_purpose} - USE THIS EXACT VALUE**"
        if user_provided_purpose
        else infer_hint
    )
    content_output_hint = (
        f"**USER PROVIDED: {user_provided_content_output} - USE THIS EXACT VALUE**"
        if user_provided_content_output
        else infer_hint
    )
    depth_hint = (
        f"**USER PROVIDED: {user_provided_depth} - USE THIS EXACT VALUE**"
        if user_provided_depth
        else infer_hint
    )

    prompt = f'''You are an expert AI research strategist. Analyze the user's research request and provide a complete research plan including intent understanding, search queries, and optimal API settings.

## CURRENT DATE/TIME CONTEXT
{date_context}

**NOTE**: When user mentions time-sensitive terms (latest, current, recent, trends, predictions), prioritize {current_year} data.

## USER INPUT
"{user_input}"
{keywords_line}

## USER CONTEXT
{persona_context}
{competitor_context}
{user_settings_section}

## YOUR TASK: Provide a Complete Research Plan

### PART 1: INTENT ANALYSIS
{intent_instruction}

**CRITICAL: Use EXACT enum values - do NOT return descriptive strings.**
- purpose: Must be one of: "learn", "create_content", "make_decision", "compare", "solve_problem", "find_data", "explore_trends", "validate", "generate_ideas"
{purpose_hint}
- content_output: Must be one of: "blog", "podcast", "video", "social_post", "newsletter", "presentation", "report", "whitepaper", "email", "general"
{content_output_hint}
- depth: Must be one of: "overview", "detailed", "expert"
{depth_hint}
- expected_deliverables: Must be an array of exact values: "key_statistics", "expert_quotes", "case_studies", "comparisons", "trends", "best_practices", "step_by_step", "pros_cons", "definitions", "citations", "examples", "predictions"
- Infer based on purpose, content_output, and user input

**CRITICAL: ALWAYS generate focus_areas and also_answering fields:**
- focus_areas: Generate 2-5 specific focus areas based on user input (e.g., "academic research", "industry trends", "company analysis", "practical applications", "safety considerations")
- also_answering: Generate 2-4 related topics or questions that should also be addressed (e.g., "benefits and drawbacks", "alternatives", "implementation steps", "cost considerations")
- These fields are REQUIRED and MUST be populated - do NOT leave them empty
- Think about what additional aspects of the topic would be valuable to cover

### PART 2: SEARCH QUERIES
Generate 4-8 targeted, diverse search queries optimized for semantic search.

**CRITICAL: Generate MULTIPLE DIVERSE queries (minimum 4, maximum 8). Do NOT generate just one query.**

**QUERY GENERATION RULES:**

1. **PRIMARY QUERY**: Generate 1 query that directly addresses the primary_question
- This should be the highest priority (priority: 5)
- Should comprehensively cover the main research goal
- Set addresses_primary_question: true

2. **SECONDARY QUERY MAPPING**: For EACH secondary_question, generate a SEPARATE query that addresses it
- Link each query to its corresponding secondary_question in addresses_secondary_questions array
- Priority: 4 (high but secondary to primary)
- **CRITICAL**: Create SEPARATE queries for each secondary question UNLESS they are extremely similar (same keywords, same search intent)
- Only merge if queries would return identical results

3. **FOCUS AREA QUERIES**: Generate SEPARATE queries for EACH focus_area
- **CRITICAL**: If focus_areas exist, generate AT LEAST one query per focus_area
- Add each focus area to targets_focus_areas array for its corresponding query
- Priority: 3-4 depending on importance
- **CRITICAL**: Create SEPARATE queries for each focus_area UNLESS they are extremely similar (same search intent, same keywords)
- Each focus area should have its own dedicated query to ensure comprehensive coverage

4. **ALSO ANSWERING QUERIES**: Generate queries for EACH also_answering topic
- **CRITICAL**: Generate at least one query per also_answering topic that is NOT covered by primary/secondary queries
- Lower priority (priority: 2-3)
- Add each topic to covers_also_answering array for its corresponding query
- Only skip if the topic is already fully covered by existing queries

5. **QUERY DIVERSITY RULES** (IMPORTANT):
- **CRITICAL**: Ensure queries are DISTINCT and target DIFFERENT aspects
- Vary search terms: use synonyms, related terms, different angles
- Vary query structure: some specific, some broader
- Vary providers: mix Exa and Tavily when appropriate
- Target different content types: academic, news, practical guides, etc.
- **DO NOT** create queries that are just slight variations of each other
- **DO NOT** merge queries that target different focus areas or also_answering topics

6. **MINIMUM QUERY REQUIREMENTS**:
- **ALWAYS generate at least 4 queries** (even for simple topics)
- If you have: 1 primary + 1 secondary + 2 focus areas = generate at least 4 queries
- If you have: 1 primary + 3 secondary + 2 focus areas + 2 also_answering = generate 6-8 queries
- **If focus_areas or also_answering are empty, generate queries covering different angles/aspects of the primary question**

7. **QUERY-TO-INTENT LINKING**: For each query, specify:
- addresses_primary_question: true/false (only one query should be true)
- addresses_secondary_questions: array of secondary question strings (can be empty, or contain one/multiple)
- targets_focus_areas: array of focus area strings (should match focus_areas when relevant)
- covers_also_answering: array of also_answering topic strings (should match also_answering when relevant)
- justification: brief explanation explaining how this query differs from others and what it will find

**OUTPUT FORMAT FOR QUERIES:**
Each query must include these linking fields. Ensure queries are DIVERSE and target different aspects, not just variations of the same search.

### PART 3: PROVIDER SETTINGS
Configure Exa and Tavily API parameters with justifications.

**Provider settings should be optimized based on:**
1. **Primary query characteristics** (most important - this is what will be executed)
2. **Secondary questions** (if they require different settings for comprehensive coverage)
3. **Focus areas** (if they need specific content types or sources)
4. **Also answering topics** (if they need different time ranges or sources)
5. **Time sensitivity** from intent (real_time, recent, historical, evergreen)
6. **Depth requirements** from intent (overview, detailed, expert)

**SETTING OPTIMIZATION RULES:**

1. **Time Sensitivity Based on Intent**:
- If time_sensitivity = "real_time" OR any secondary_question/focus_area needs recent data:
- Tavily: time_range = "day" or "week", topic = "news"
- Exa: startPublishedDate = current year, type = "auto" or "fast"
- If time_sensitivity = "historical":
- Exa: No date filters, use historical content, type = "deep" or "neural"
- Tavily: time_range = "year" or null, topic = "general"
- If time_sensitivity = "recent":
- Exa: startPublishedDate = current year or last 6 months
- Tavily: time_range = "month" or "week"
- If time_sensitivity = "evergreen":
- Exa: No date filters, type = "deep" for comprehensive coverage
- Tavily: time_range = null, topic = "general"

2. **Content Type Based on Focus Areas**:
- If focus_areas include "academic" or "research" or "studies":
- Exa: category = "research paper", includeDomains = ["arxiv.org", "nature.com", "pubmed.ncbi.nlm.nih.gov"]
- Exa: type = "deep" or "neural" for comprehensive academic coverage
- If focus_areas include "companies" or "competitors" or "business":
- Exa: category = "company"
- Exa: type = "auto" or "deep" for company research
- If focus_areas include "news" or "trends" or "current events":
- Tavily: topic = "news", search_depth = "advanced"
- Exa: category = "news" (if using Exa for news)
- If focus_areas include "social" or "twitter" or "social media":
- Exa: category = "tweet"
- If focus_areas include "github" or "code" or "technical":
- Exa: category = "github"

3. **Depth Based on Intent Depth and Secondary Questions**:
- If depth = "expert" OR secondary_questions require detailed analysis:
- Exa: type = "deep", context = true, contextMaxCharacters = 15000+, numResults = 20-50
- Tavily: search_depth = "advanced", chunks_per_source = 3, max_results = 15-20
- If depth = "detailed":
- Exa: type = "auto" or "deep", context = true, contextMaxCharacters = 10000+, numResults = 10-20
- Tavily: search_depth = "advanced" or "basic", chunks_per_source = 3, max_results = 10-15
- If depth = "overview":
- Exa: type = "auto" or "fast", numResults = 5-10
- Tavily: search_depth = "basic" or "fast", max_results = 5-10

4. **Query-Specific Settings (Primary Query Focus)**:
- If primary query needs comprehensive results (addresses multiple secondary questions or focus areas):
- Exa: type = "deep", context = true, contextMaxCharacters = 15000+
- Tavily: search_depth = "advanced", chunks_per_source = 3
- If primary query needs speed (simple factual answer):
- Exa: type = "fast", numResults = 5-10
- Tavily: search_depth = "ultra-fast", max_results = 5
- If primary query targets specific content type:
- Match Exa category or Tavily topic to content type
- If primary query is time-sensitive:
- Apply time filters based on urgency

5. **Also Answering Topics Considerations**:
- If also_answering topics need different time ranges:
- Use broader time_range in Tavily (e.g., "year" instead of "month")
- Don't apply strict date filters in Exa
- If also_answering topics need different sources:
- Consider including additional domains in includeDomains
- Use more comprehensive search (type = "deep" in Exa)

6. **Provider Selection Based on Intent**:
- Use EXA when:
* Primary query needs semantic understanding
* Focus areas include "academic", "research", "companies"
* Depth = "expert" or "detailed"
* Need comprehensive context (context = true)
- Use TAVILY when:
* Time sensitivity = "real_time" or "recent"
* Focus areas include "news", "trends", "current events"
* Need quick AI-generated answers
* Primary query is about recent developments

**NOTE**: Since we're executing only the PRIMARY query initially, optimize settings for the primary query, but ensure settings can accommodate secondary questions and focus areas in the results. The settings should be comprehensive enough to capture information relevant to all intent aspects.

### PART 4: GOOGLE TRENDS KEYWORDS (if trends in deliverables)
If "trends" is in expected_deliverables OR purpose is "explore_trends":
- Suggest 1-3 optimized keywords for Google Trends analysis
- These may differ from research queries (trends need broader, searchable terms)
- Consider: What keywords will show meaningful trends over time?
- Consider: What timeframe will show relevant trends? (1 year, 12 months, etc.)
- Consider: What geographic region is most relevant for the user?
- Explain what insights trends will uncover for content generation:
* Search interest trends over time (optimal publication timing)
* Regional interest distribution (audience targeting)
* Related topics for content expansion
* Related queries for FAQ sections
* Rising topics for timely content opportunities

---

## PROVIDER OPTIONS

**EXA**: type (auto/fast/deep/neural/keyword), category (company/research paper/news/etc), numResults (1-100), includeDomains, startPublishedDate, highlights, context (required for deep). Best for: academic, companies, deep analysis.

**TAVILY**: topic (general/news/finance), search_depth (advanced/basic/fast/ultra-fast), time_range, max_results (0-20), chunks_per_source (1-3). Best for: news, real-time, quick facts.

---

## OUTPUT FORMAT

Return JSON with: intent (all fields), queries (with linking fields), enhanced_keywords, research_angles, recommended_provider, provider_justification, exa_config (enabled, type, category, numResults, includeDomains, excludeDomains, startPublishedDate, highlights, context, contextMaxCharacters, and justifications), tavily_config (enabled, topic, search_depth, include_answer, time_range, max_results, chunks_per_source, and justifications), trends_config (if trends enabled).

**Key Requirements:**
- Provide brief justifications (1 sentence) for all config parameters
- Reference intent fields (depth, time_sensitivity, focus_areas) in justifications
- Include current year ({current_year}) in time-sensitive queries
- Use EXA for academic/companies/deep analysis, TAVILY for news/real-time
'''

    return prompt
|
||||
@@ -8,24 +8,17 @@ This reduces 2 LLM calls to 1, improves coherence, and provides
|
||||
user-friendly justifications for all settings.
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
Version: 2.0 (Refactored)
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from typing import Dict, Any, List, Optional
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchQuery,
|
||||
IntentInferenceResponse,
|
||||
ResearchPurpose,
|
||||
ContentOutput,
|
||||
ExpectedDeliverable,
|
||||
ResearchDepthLevel,
|
||||
InputType,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .unified_prompt_builder import build_unified_prompt
|
||||
from .unified_schema_builder import build_unified_schema
|
||||
from .unified_result_parser import parse_unified_result
|
||||
from .unified_analyzer_utils import create_fallback_response
|
||||
|
||||
|
||||
class UnifiedResearchAnalyzer:
|
||||
@@ -36,6 +29,13 @@ class UnifiedResearchAnalyzer:
|
||||
3. Parameter optimization (Exa/Tavily settings)
|
||||
|
||||
All in a single LLM call with justifications.
|
||||
|
||||
Refactored to use modular components for better maintainability:
|
||||
- unified_prompt_builder: Builds the comprehensive LLM prompt
|
||||
- unified_schema_builder: Defines the JSON schema for structured output
|
||||
- unified_result_parser: Parses LLM response into structured models
|
||||
- unified_analyzer_utils: Utility functions for context and fallback
|
||||
- query_deduplicator: Removes redundant queries (used by parser)
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -51,36 +51,56 @@ class UnifiedResearchAnalyzer:
|
||||
industry: Optional[str] = None,
|
||||
target_audience: Optional[str] = None,
|
||||
user_id: Optional[str] = None,
|
||||
user_provided_purpose: Optional[str] = None,
|
||||
user_provided_content_output: Optional[str] = None,
|
||||
user_provided_depth: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Perform unified analysis of user research request.
|
||||
|
||||
Args:
|
||||
user_input: The user's research input (keywords, question, etc.)
|
||||
keywords: Optional list of keywords
|
||||
research_persona: Optional research persona for personalization
|
||||
competitor_data: Optional competitor analysis data
|
||||
industry: Optional industry context
|
||||
target_audience: Optional target audience context
|
||||
user_id: User ID for subscription checks (required)
|
||||
|
||||
Returns:
|
||||
Dict containing:
|
||||
- success: bool
|
||||
- intent: ResearchIntent
|
||||
- queries: List[ResearchQuery]
|
||||
- exa_config: Dict with settings and justifications
|
||||
- tavily_config: Dict with settings and justifications
|
||||
- recommended_provider: str
|
||||
- provider_justification: str
|
||||
- trends_config: Dict with Google Trends settings (optional)
|
||||
- enhanced_keywords: List[str]
|
||||
- research_angles: List[str]
|
||||
- analysis_summary: str
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Unified analysis for: {user_input[:100]}...")
|
||||
|
||||
keywords = keywords or []
|
||||
|
||||
# Build the unified prompt
|
||||
prompt = self._build_unified_prompt(
|
||||
# Build the unified prompt using the prompt builder module
|
||||
prompt = build_unified_prompt(
|
||||
user_input=user_input,
|
||||
keywords=keywords,
|
||||
research_persona=research_persona,
|
||||
competitor_data=competitor_data,
|
||||
industry=industry,
|
||||
target_audience=target_audience,
|
||||
user_provided_purpose=user_provided_purpose,
|
||||
user_provided_content_output=user_provided_content_output,
|
||||
user_provided_depth=user_provided_depth,
|
||||
)
|
||||
|
||||
# Define the comprehensive JSON schema
|
||||
unified_schema = self._build_unified_schema()
|
||||
# Define the comprehensive JSON schema using the schema builder module
|
||||
unified_schema = build_unified_schema()
|
||||
|
||||
# Call LLM (single call for everything)
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
@@ -93,467 +113,11 @@ class UnifiedResearchAnalyzer:
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
logger.error(f"Unified analysis failed: {result.get('error')}")
|
||||
return self._create_fallback_response(user_input, keywords)
|
||||
return create_fallback_response(user_input, keywords)
|
||||
|
||||
# Parse the unified result
|
||||
return self._parse_unified_result(result, user_input)
|
||||
# Parse the unified result using the result parser module
|
||||
return parse_unified_result(result, user_input)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in unified analysis: {e}")
|
||||
return self._create_fallback_response(user_input, keywords or [])
|
||||
|
||||
def _build_unified_prompt(
|
||||
self,
|
||||
user_input: str,
|
||||
keywords: List[str],
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
competitor_data: Optional[List[Dict]] = None,
|
||||
industry: Optional[str] = None,
|
||||
target_audience: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Build the unified prompt for intent + queries + parameters."""
|
||||
|
||||
# Build persona context
|
||||
persona_context = self._build_persona_context(research_persona, industry, target_audience)
|
||||
|
||||
# Build competitor context
|
||||
competitor_context = self._build_competitor_context(competitor_data)
|
||||
|
||||
prompt = f'''You are an expert AI research strategist. Analyze the user's research request and provide a complete research plan including intent understanding, search queries, and optimal API settings.
|
||||
|
||||
## USER INPUT
|
||||
"{user_input}"
|
||||
{f"KEYWORDS: {', '.join(keywords)}" if keywords else ""}
|
||||
|
||||
## USER CONTEXT
|
||||
{persona_context}
|
||||
{competitor_context}
|
||||
|
||||
## YOUR TASK: Provide a Complete Research Plan
|
||||
|
||||
### PART 1: INTENT ANALYSIS
|
||||
Understand what the user really wants from their research.
|
||||
|
||||
### PART 2: SEARCH QUERIES
|
||||
Generate 4-8 targeted search queries optimized for semantic search.
|
||||
|
||||
### PART 3: PROVIDER SETTINGS
|
||||
Configure Exa and Tavily API parameters with justifications.
|
||||
|
||||
### PART 4: GOOGLE TRENDS KEYWORDS (if trends in deliverables)
|
||||
If "trends" is in expected_deliverables OR purpose is "explore_trends":
|
||||
- Suggest 1-3 optimized keywords for Google Trends analysis
|
||||
- These may differ from research queries (trends need broader, searchable terms)
|
||||
- Consider: What keywords will show meaningful trends over time?
|
||||
- Consider: What timeframe will show relevant trends? (1 year, 12 months, etc.)
|
||||
- Consider: What geographic region is most relevant for the user?
|
||||
- Explain what insights trends will uncover for content generation:
|
||||
* Search interest trends over time (optimal publication timing)
|
||||
* Regional interest distribution (audience targeting)
|
||||
* Related topics for content expansion
|
||||
* Related queries for FAQ sections
|
||||
* Rising topics for timely content opportunities
|
||||
|
||||
---
|
||||
|
||||
## AVAILABLE PROVIDER OPTIONS
|
||||
|
||||
### EXA API OPTIONS (Semantic Search Engine)
|
||||
| Parameter | Options | Description |
|
||||
|-----------|---------|-------------|
|
||||
| type | "auto", "neural", "fast", "deep" | "neural" = semantic understanding, "deep" = comprehensive with query expansion |
|
||||
| category | "company", "research paper", "news", "github", "tweet", "personal site", "pdf", "financial report", "people" | Focus on specific content types |
|
||||
| numResults | 5-25 | Number of results (10 recommended) |
|
||||
| includeDomains | string[] | Domains to include (e.g., ["arxiv.org", "nature.com"]) |
|
||||
| excludeDomains | string[] | Domains to exclude |
|
||||
| startPublishedDate | ISO date | Filter by publish date (e.g., "2024-01-01T00:00:00.000Z") |
|
||||
| text | boolean | Include full text content |
|
||||
| highlights | boolean | Extract key highlights |
|
||||
| context | boolean | Return as single context string for RAG |
|
||||
|
||||
**WHEN TO USE EXA:**
|
||||
- Semantic understanding needed (finding similar content)
|
||||
- Academic/research papers
|
||||
- Company/competitor research
|
||||
- Deep, comprehensive results
|
||||
- Historical content
|
||||
|
||||
### TAVILY API OPTIONS (AI-Powered Search)
|
||||
| Parameter | Options | Description |
|
||||
|-----------|---------|-------------|
|
||||
| topic | "general", "news", "finance" | Search topic category |
|
||||
| search_depth | "basic", "advanced" | "advanced" = multiple semantic snippets per URL |
|
||||
| include_answer | false, true, "basic", "advanced" | AI-generated answer from results |
|
||||
| include_raw_content | false, true, "markdown", "text" | Raw page content format |
|
||||
| time_range | "day", "week", "month", "year" | Filter by recency |
|
||||
| max_results | 5-20 | Number of results |
|
||||
| include_domains | string[] | Domains to include |
|
||||
| exclude_domains | string[] | Domains to exclude |
|
||||
|
||||
**WHEN TO USE TAVILY:**
|
||||
- Real-time/current events
|
||||
- News and trending topics
|
||||
- Quick facts with AI answers
|
||||
- Financial data
|
||||
- Recent time-sensitive content
|
||||
|
||||
---
|
||||
|
||||
## OUTPUT FORMAT
|
||||
|
||||
Return a JSON object with this exact structure:
|
||||
|
||||
```json
|
||||
{{
|
||||
"intent": {{
|
||||
"input_type": "keywords|question|goal|mixed",
|
||||
"primary_question": "The main question to answer",
|
||||
"secondary_questions": ["question 1", "question 2"],
|
||||
"purpose": "learn|create_content|make_decision|compare|solve_problem|find_data|explore_trends|validate|generate_ideas",
|
||||
"content_output": "blog|podcast|video|social_post|newsletter|presentation|report|whitepaper|email|general",
|
||||
"expected_deliverables": ["key_statistics", "expert_quotes", "case_studies", "trends", "best_practices"],
|
||||
"depth": "overview|detailed|expert",
|
||||
"focus_areas": ["area1", "area2"],
|
||||
"perspective": "target perspective or null",
|
||||
"time_sensitivity": "real_time|recent|historical|evergreen",
|
||||
"confidence": 0.85,
|
||||
"confidence_reason": "Why this confidence level",
|
||||
"great_example": "Example of better input if confidence < 0.8",
|
||||
"needs_clarification": false,
|
||||
"clarifying_questions": [],
|
||||
"analysis_summary": "Brief summary of research plan"
|
||||
}},
|
||||
"queries": [
|
||||
{{
|
||||
"query": "Optimized search query string",
|
||||
"purpose": "key_statistics|expert_quotes|case_studies|trends|etc",
|
||||
"provider": "exa|tavily",
|
||||
"priority": 5,
|
||||
"expected_results": "What we expect to find",
|
||||
"justification": "Why this query and provider"
|
||||
}}
|
||||
],
|
||||
"enhanced_keywords": ["expanded", "related", "keywords"],
|
||||
"research_angles": ["Angle 1: ...", "Angle 2: ..."],
|
||||
"recommended_provider": "exa|tavily",
|
||||
"provider_justification": "Why this provider is best for this research",
|
||||
"exa_config": {{
|
||||
"enabled": true,
|
||||
"type": "auto|neural|fast|deep",
|
||||
"type_justification": "Why this search type",
|
||||
"category": "news|research paper|company|etc or null",
|
||||
"category_justification": "Why this category or null",
|
||||
"numResults": 10,
|
||||
"numResults_justification": "Why this number",
|
||||
"includeDomains": [],
|
||||
"includeDomains_justification": "Why these domains or empty",
|
||||
"startPublishedDate": "2024-01-01T00:00:00.000Z or null",
|
||||
"date_justification": "Why this date filter or null",
|
||||
"highlights": true,
|
||||
"highlights_justification": "Why enable/disable highlights",
|
||||
"context": true,
|
||||
"context_justification": "Why enable/disable context string"
|
||||
}},
|
||||
"tavily_config": {{
|
||||
"enabled": true,
|
||||
"topic": "general|news|finance",
|
||||
"topic_justification": "Why this topic",
|
||||
"search_depth": "basic|advanced",
|
||||
"search_depth_justification": "Why this depth",
|
||||
"include_answer": "true|false|basic|advanced",
|
||||
"include_answer_justification": "Why this answer mode",
|
||||
"time_range": "day|week|month|year|null",
|
||||
"time_range_justification": "Why this time range or null",
|
||||
"max_results": 10,
|
||||
"max_results_justification": "Why this number",
|
||||
"include_raw_content": "false|true|markdown|text",
|
||||
"include_raw_content_justification": "Why this content mode"
|
||||
}},
|
||||
"trends_config": {{
|
||||
"enabled": true|false,
|
||||
"keywords": ["keyword1", "keyword2"],
|
||||
"keywords_justification": "Why these keywords for trends analysis",
|
||||
"timeframe": "today 1-y|today 12-m|all",
|
||||
"timeframe_justification": "Why this timeframe",
|
||||
"geo": "US|GB|IN|etc",
|
||||
"geo_justification": "Why this geographic region",
|
||||
"expected_insights": [
|
||||
"Search interest trends over the past year",
|
||||
"Regional interest distribution",
|
||||
"Related topics for content expansion",
|
||||
"Related queries for FAQ sections",
|
||||
"Optimal publication timing based on interest peaks"
|
||||
]
|
||||
}}
|
||||
}}
|
||||
```
|
||||
|
||||
## DECISION RULES
|
||||
|
||||
1. **Provider Selection:**
|
||||
- Use EXA for: academic research, competitor analysis, deep understanding, finding similar content
|
||||
- Use TAVILY for: news, current events, quick facts, financial data, real-time info
|
||||
|
||||
2. **Query Optimization:**
|
||||
- Include relevant keywords for semantic matching
|
||||
- Add context words based on deliverables (e.g., "statistics 2024" for key_statistics)
|
||||
- Match query style to provider (natural language for Exa, keyword-rich for Tavily)
|
||||
|
||||
3. **Parameter Selection:**
|
||||
- ALWAYS provide justification for each parameter choice
|
||||
- Consider time sensitivity when setting date filters
|
||||
- Match category/topic to content type
|
||||
- Use "advanced" depth when quality matters more than speed
|
||||
|
||||
4. **Google Trends Keywords (if trends enabled):**
|
||||
- Suggest 1-3 keywords optimized for trends analysis
|
||||
- Keywords should be broader than research queries (e.g., "AI marketing" vs "AI marketing tools for small businesses")
|
||||
- Consider what will show meaningful search interest trends
|
||||
- Choose timeframe based on content type (12 months for blogs, 1 year for comprehensive)
|
||||
- Select geo based on user's target audience or industry
|
||||
- List specific insights trends will uncover
|
||||
|
||||
5. **Justifications:**
|
||||
- Keep justifications concise (1 sentence)
|
||||
- Explain the "why" not the "what"
|
||||
- Reference user's intent when relevant
|
||||
'''
|
||||
|
||||
return prompt
|
||||
|
||||
def _build_unified_schema(self) -> Dict[str, Any]:
    """Build the JSON schema for unified response.

    The schema describes a single object carrying the analysed ``intent``,
    the generated ``queries``, keyword and angle suggestions, the recommended
    provider, and per-provider settings (``exa_config``, ``tavily_config``,
    ``trends_config``). Most provider settings are paired with a
    ``*_justification`` string.

    ``tavily_config.include_answer`` and ``tavily_config.include_raw_content``
    accept either a string or a boolean, so the schema agrees with the
    fallback response (which emits ``True``) instead of forcing a string.

    Returns:
        A freshly built schema dictionary. No sub-dictionary is shared
        between properties, so callers may mutate the result safely.
    """

    # Factories rather than shared constants: every property gets its own dict.
    def _str() -> Dict[str, Any]:
        return {"type": "string"}

    def _bool() -> Dict[str, Any]:
        return {"type": "boolean"}

    def _int() -> Dict[str, Any]:
        return {"type": "integer"}

    def _str_list() -> Dict[str, Any]:
        return {"type": "array", "items": {"type": "string"}}

    def _str_or_bool() -> Dict[str, Any]:
        return {"oneOf": [{"type": "string"}, {"type": "boolean"}]}

    intent_schema = {
        "type": "object",
        "properties": {
            "input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
            "primary_question": _str(),
            "secondary_questions": _str_list(),
            "purpose": _str(),
            "content_output": _str(),
            "expected_deliverables": _str_list(),
            "depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
            "focus_areas": _str_list(),
            "perspective": _str(),
            "time_sensitivity": _str(),
            "confidence": {"type": "number"},
            "confidence_reason": _str(),
            "great_example": _str(),
            "needs_clarification": _bool(),
            "clarifying_questions": _str_list(),
            "analysis_summary": _str(),
        },
        "required": ["primary_question", "purpose", "expected_deliverables", "confidence"],
    }

    queries_schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "query": _str(),
                "purpose": _str(),
                "provider": _str(),
                "priority": _int(),
                "expected_results": _str(),
                "justification": _str(),
            },
            "required": ["query", "purpose", "provider", "priority"],
        },
    }

    exa_schema = {
        "type": "object",
        "properties": {
            "enabled": _bool(),
            "type": _str(),
            "type_justification": _str(),
            "category": _str(),
            "category_justification": _str(),
            "numResults": _int(),
            "numResults_justification": _str(),
            "includeDomains": _str_list(),
            "includeDomains_justification": _str(),
            "startPublishedDate": _str(),
            "date_justification": _str(),
            "highlights": _bool(),
            "highlights_justification": _str(),
            "context": _bool(),
            "context_justification": _str(),
        },
    }

    tavily_schema = {
        "type": "object",
        "properties": {
            "enabled": _bool(),
            "topic": _str(),
            "topic_justification": _str(),
            "search_depth": _str(),
            "search_depth_justification": _str(),
            # String or boolean: the fallback config uses a plain ``True``.
            "include_answer": _str_or_bool(),
            "include_answer_justification": _str(),
            "time_range": _str(),
            "time_range_justification": _str(),
            "max_results": _int(),
            "max_results_justification": _str(),
            "include_raw_content": _str_or_bool(),
            "include_raw_content_justification": _str(),
        },
    }

    trends_schema = {
        "type": "object",
        "properties": {
            "enabled": _bool(),
            "keywords": _str_list(),
            "keywords_justification": _str(),
            "timeframe": _str(),
            "timeframe_justification": _str(),
            "geo": _str(),
            "geo_justification": _str(),
            "expected_insights": _str_list(),
        },
    }

    return {
        "type": "object",
        "properties": {
            "intent": intent_schema,
            "queries": queries_schema,
            "enhanced_keywords": _str_list(),
            "research_angles": _str_list(),
            "recommended_provider": _str(),
            "provider_justification": _str(),
            "exa_config": exa_schema,
            "tavily_config": tavily_schema,
            "trends_config": trends_schema,
        },
        "required": ["intent", "queries", "recommended_provider", "exa_config", "tavily_config"],
    }
|
||||
|
||||
def _build_persona_context(
    self,
    research_persona: Optional[ResearchPersona],
    industry: Optional[str],
    target_audience: Optional[str],
) -> str:
    """Build persona context section.

    Args:
        research_persona: Optional persona; its ``default_industry``,
            ``default_target_audience``, ``research_angles`` and
            ``suggested_keywords`` attributes are read when it is truthy.
        industry: Explicit industry, used when the persona supplies none.
        target_audience: Explicit audience, used when the persona supplies none.

    Returns:
        Newline-joined context lines, or a generic placeholder sentence when
        nothing is known about the user.
    """
    persona_industry = None
    persona_audience = None
    angles = None
    keywords = None

    if research_persona:
        persona_industry = research_persona.default_industry
        persona_audience = research_persona.default_target_audience
        angles = research_persona.research_angles
        keywords = research_persona.suggested_keywords

    # Persona values win; the explicit arguments fill the gaps so caller-supplied
    # context is not discarded just because a (partially empty) persona exists.
    effective_industry = persona_industry or industry
    effective_audience = persona_audience or target_audience

    parts = []
    if effective_industry:
        parts.append(f"Industry: {effective_industry}")
    if effective_audience:
        parts.append(f"Target Audience: {effective_audience}")
    if angles:
        # Only the first three angles, to keep the section short.
        parts.append(f"Preferred Research Angles: {', '.join(angles[:3])}")
    if keywords:
        # Only the first five keywords.
        parts.append(f"Relevant Keywords: {', '.join(keywords[:5])}")

    if not parts:
        return "No specific user context available. Use general best practices."

    return "\n".join(parts)
|
||||
|
||||
def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
    """Build competitor context section.

    Args:
        competitor_data: Optional list of competitor dictionaries. Each entry
            is labelled by its ``"name"``, falling back to ``"url"``.

    Returns:
        ``"\\nKnown Competitors: a, b, ..."`` built from at most the first five
        entries, or an empty string when no usable label exists.
    """
    if not competitor_data:
        return ""

    labels = []
    for entry in competitor_data[:5]:
        # ``or`` (rather than a .get default) also falls back to the URL when
        # "name" is present but None/empty, which would otherwise break join().
        label = entry.get("name") or entry.get("url")
        if label:
            labels.append(str(label))

    if not labels:
        return ""
    return f"\nKnown Competitors: {', '.join(labels)}"
|
||||
|
||||
def _parse_unified_result(self, result: Dict[str, Any], user_input: str) -> Dict[str, Any]:
    """Parse the unified LLM result into structured response.

    Args:
        result: Raw dictionary returned by the LLM.
        user_input: Original user input; used as the primary question when
            the LLM did not supply one, and stored as ``original_input``.

    Returns:
        Dictionary with ``success``, the built ``intent``, the parsed
        ``queries``, keyword/angle lists, provider recommendation and the
        provider configuration dictionaries.
    """
    # dict.get() only applies its default when the key is missing, so an
    # explicit ``"intent": null`` must be normalised before calling .get() on it.
    intent_data = result.get("intent") or {}

    # A null or non-numeric confidence degrades to the default instead of
    # aborting the whole parse.
    try:
        confidence = float(intent_data.get("confidence", 0.7))
    except (TypeError, ValueError):
        confidence = 0.7

    # Build ResearchIntent
    intent = ResearchIntent(
        primary_question=intent_data.get("primary_question", user_input),
        secondary_questions=intent_data.get("secondary_questions", []),
        purpose=intent_data.get("purpose", "learn"),
        content_output=intent_data.get("content_output", "general"),
        expected_deliverables=intent_data.get("expected_deliverables", ["key_statistics"]),
        depth=intent_data.get("depth", "detailed"),
        focus_areas=intent_data.get("focus_areas", []),
        perspective=intent_data.get("perspective"),
        time_sensitivity=intent_data.get("time_sensitivity"),
        input_type=intent_data.get("input_type", "keywords"),
        original_input=user_input,
        confidence=confidence,
        confidence_reason=intent_data.get("confidence_reason"),
        great_example=intent_data.get("great_example"),
        needs_clarification=intent_data.get("needs_clarification", False),
        clarifying_questions=intent_data.get("clarifying_questions", []),
    )

    # Build queries; a malformed entry is logged and skipped rather than
    # failing the whole response.
    queries = []
    for q in result.get("queries") or []:
        try:
            queries.append(ResearchQuery(
                query=q.get("query", ""),
                purpose=q.get("purpose", "key_statistics"),
                provider=q.get("provider", "exa"),
                priority=int(q.get("priority", 3)),
                expected_results=q.get("expected_results", ""),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse query: {e}")

    return {
        "success": True,
        "intent": intent,
        "queries": queries,
        "enhanced_keywords": result.get("enhanced_keywords", []),
        "research_angles": result.get("research_angles", []),
        "recommended_provider": result.get("recommended_provider", "exa"),
        "provider_justification": result.get("provider_justification", ""),
        "exa_config": result.get("exa_config", {}),
        "tavily_config": result.get("tavily_config", {}),
        "trends_config": result.get("trends_config", {}),  # Google Trends configuration
        "analysis_summary": intent_data.get("analysis_summary", ""),
    }
|
||||
|
||||
def _create_fallback_response(self, user_input: str, keywords: List[str]) -> Dict[str, Any]:
    """Create fallback response when analysis fails.

    Args:
        user_input: Original user input; becomes both the generic primary
            question and the single fallback query.
        keywords: Keywords echoed back as ``enhanced_keywords``; ``None`` is
            treated as an empty list.

    Returns:
        A response dictionary with ``success`` set to ``False``, a generic
        intent, one Exa query and default provider configurations. Google
        Trends is disabled in the fallback.
    """
    return {
        "success": False,
        "intent": ResearchIntent(
            primary_question=f"What are the key insights about: {user_input}?",
            purpose="learn",
            content_output="general",
            expected_deliverables=["key_statistics", "best_practices"],
            depth="detailed",
            original_input=user_input,
            confidence=0.5,
        ),
        "queries": [
            ResearchQuery(
                query=user_input,
                purpose="key_statistics",
                provider="exa",
                priority=5,
                expected_results="General research results",
            )
        ],
        # Always hand back a list, even when the caller had no keywords.
        "enhanced_keywords": keywords or [],
        "research_angles": [],
        "recommended_provider": "exa",
        "provider_justification": "Default fallback to Exa for semantic search",
        "exa_config": {
            "enabled": True,
            "type": "auto",
            "type_justification": "Auto mode for balanced results",
            "numResults": 10,
            "highlights": True,
        },
        "tavily_config": {
            "enabled": True,
            "topic": "general",
            "search_depth": "advanced",
            "include_answer": True,
        },
        "trends_config": {
            "enabled": False,  # Disabled in fallback
        },
    }
|
||||
|
||||
209
backend/services/research/intent/unified_result_parser.py
Normal file
209
backend/services/research/intent/unified_result_parser.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""
|
||||
Result parsing logic for unified research analyzer.
|
||||
|
||||
Parses LLM response into structured ResearchIntent, ResearchQuery,
|
||||
and configuration dictionaries.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent, ResearchQuery,
|
||||
ResearchPurpose, ContentOutput, ExpectedDeliverable,
|
||||
ResearchDepthLevel, InputType
|
||||
)
|
||||
from .query_deduplicator import deduplicate_queries
|
||||
|
||||
|
||||
def _normalize_purpose(value: str) -> str:
    """Map a free-form purpose string onto a ``ResearchPurpose`` value.

    Resolution order: case-insensitive exact match against an enum member's
    value or name, then the first keyword rule whose substring occurs in the
    text, then the default ``"learn"``. Empty or non-string input also yields
    ``"learn"``.
    """
    if not (value and isinstance(value, str)):
        return "learn"

    lowered = value.lower()

    # Exact match on enum value or member name.
    for member in ResearchPurpose:
        if lowered in (member.value, member.name.lower()):
            return member.value

    # Ordered keyword rules: the first rule with any matching substring wins.
    keyword_rules = (
        (("content", "write", "create", "blog"), "create_content"),
        (("compare", "comparison"), "compare"),
        (("decision", "choose"), "make_decision"),
        (("problem", "solve"), "solve_problem"),
        (("data", "statistic", "fact"), "find_data"),
        (("trend",), "explore_trends"),
        (("validat", "verify"), "validate"),
        (("idea", "brainstorm"), "generate_ideas"),
    )
    for needles, normalized in keyword_rules:
        if any(needle in lowered for needle in needles):
            return normalized

    return "learn"
|
||||
|
||||
|
||||
def _normalize_content_output(value: str) -> str:
    """Map a free-form content-output string onto a ``ContentOutput`` value.

    Resolution order: case-insensitive exact match against an enum member's
    value or name, then the first keyword rule whose substring occurs in the
    text, then the default ``"general"``. Empty or non-string input also
    yields ``"general"``.
    """
    if not (value and isinstance(value, str)):
        return "general"

    lowered = value.lower()

    # Exact match on enum value or member name.
    for member in ContentOutput:
        if lowered in (member.value, member.name.lower()):
            return member.value

    # Ordered keyword rules: the first rule with any matching substring wins.
    keyword_rules = (
        (("blog", "article"), "blog"),
        (("podcast",), "podcast"),
        (("video",), "video"),
        (("social", "post"), "social_post"),
        (("newsletter",), "newsletter"),
        (("presentation", "slide"), "presentation"),
        (("report",), "report"),
        (("whitepaper", "white paper"), "whitepaper"),
        (("email",), "email"),
    )
    for needles, normalized in keyword_rules:
        if any(needle in lowered for needle in needles):
            return normalized

    return "general"
|
||||
|
||||
|
||||
def _normalize_deliverable(value: str) -> str:
    """Map a free-form deliverable string onto an ``ExpectedDeliverable`` value.

    The input is lower-cased and stripped, then resolved by exact match
    against an enum member's value or name, then by the first matching keyword
    rule. Anything unrecognised, empty, or not a string becomes
    ``"key_statistics"``.
    """
    if not (value and isinstance(value, str)):
        return "key_statistics"

    text = value.lower().strip()

    # Exact match on enum value or member name comes first.
    for member in ExpectedDeliverable:
        if text in (member.value, member.name.lower()):
            return member.value

    def _mentions(*needles: str) -> bool:
        return any(needle in text for needle in needles)

    # Aggressive substring rules, evaluated in order.
    leading_rules = (
        ("key_statistics", ("statistic", "data", "number", "metric", "report")),
        ("expert_quotes", ("quote", "expert")),
        ("case_studies", ("case", "study")),
        # "compar" already covers "compare"/"comparison".
        ("comparisons", ("compar", "landscape", "matrix")),
        ("trends", ("trend", "keyword", "seo")),
        ("best_practices", ("practice", "best", "guideline", "recommendation", "calendar")),
        ("step_by_step", ("step", "how", "process", "guide", "outline", "heading")),
    )
    for normalized, needles in leading_rules:
        if _mentions(*needles):
            return normalized

    # Pros/cons needs both halves present, or an explicit (dis)advantage mention.
    if ("pro" in text and "con" in text) or _mentions("advantage", "disadvantage"):
        return "pros_cons"

    trailing_rules = (
        ("definitions", ("defin", "explain")),
        ("citations", ("citation", "source", "reference")),
        ("examples", ("example", "sample")),
        ("predictions", ("prediction", "future", "outlook")),
    )
    for normalized, needles in trailing_rules:
        if _mentions(*needles):
            return normalized

    # Default fallback
    return "key_statistics"
|
||||
|
||||
|
||||
def parse_unified_result(result: Dict[str, Any], user_input: str) -> Dict[str, Any]:
    """
    Parse the unified LLM result into structured response.

    Free-form ``purpose``, ``content_output`` and deliverable strings are
    normalised before the intent is built. If the intent cannot be
    constructed, the generic fallback response is returned instead. Parsed
    queries are passed through ``deduplicate_queries``.

    Args:
        result: Raw LLM response dictionary
        user_input: Original user input for fallback values

    Returns:
        Structured response with intent, queries, configs, etc.
    """
    # dict.get() only applies its default when the key is missing; an explicit
    # null (or any non-object) intent must not crash before the guarded block.
    intent_data = result.get("intent")
    if not isinstance(intent_data, dict):
        intent_data = {}

    # Normalize enum values
    purpose_value = _normalize_purpose(intent_data.get("purpose", "learn"))
    content_output_value = _normalize_content_output(intent_data.get("content_output", "general"))

    # Normalize deliverables list (a bare string is wrapped into a list)
    deliverables_raw = intent_data.get("expected_deliverables", ["key_statistics"])
    if not isinstance(deliverables_raw, list):
        deliverables_raw = [deliverables_raw] if deliverables_raw else ["key_statistics"]
    normalized_deliverables = [_normalize_deliverable(d) for d in deliverables_raw if d]
    if not normalized_deliverables:
        normalized_deliverables = ["key_statistics"]

    # Build ResearchIntent
    try:
        intent = ResearchIntent(
            primary_question=intent_data.get("primary_question", user_input),
            secondary_questions=intent_data.get("secondary_questions", []),
            purpose=purpose_value,
            content_output=content_output_value,
            expected_deliverables=normalized_deliverables,
            depth=intent_data.get("depth", "detailed"),
            focus_areas=intent_data.get("focus_areas", []),
            also_answering=intent_data.get("also_answering", []),
            perspective=intent_data.get("perspective"),
            time_sensitivity=intent_data.get("time_sensitivity"),
            input_type=intent_data.get("input_type", "keywords"),
            original_input=user_input,
            confidence=float(intent_data.get("confidence", 0.7)),
            confidence_reason=intent_data.get("confidence_reason"),
            great_example=intent_data.get("great_example"),
            needs_clarification=intent_data.get("needs_clarification", False),
            clarifying_questions=intent_data.get("clarifying_questions", []),
        )
    except Exception as e:
        logger.error(f"Failed to parse intent: {e}, intent_data: {intent_data}")
        # Return fallback intent
        from .unified_analyzer_utils import create_fallback_response
        return create_fallback_response(user_input, [])

    # Build queries; a malformed entry is logged and skipped. ``or []`` covers
    # an explicit ``"queries": null``.
    queries = []
    for q in result.get("queries") or []:
        try:
            # Normalize query purpose
            query_purpose = _normalize_deliverable(q.get("purpose", "key_statistics"))
            queries.append(ResearchQuery(
                query=q.get("query", ""),
                purpose=query_purpose,
                provider=q.get("provider", "exa"),
                priority=int(q.get("priority", 3)),
                expected_results=q.get("expected_results", ""),
                addresses_primary_question=q.get("addresses_primary_question", False),
                addresses_secondary_questions=q.get("addresses_secondary_questions", []),
                targets_focus_areas=q.get("targets_focus_areas", []),
                covers_also_answering=q.get("covers_also_answering", []),
                justification=q.get("justification"),
            ))
        except Exception as e:
            logger.warning(f"Failed to parse query: {e}, query: {q}")

    # Deduplicate queries to avoid redundant API calls
    queries = deduplicate_queries(queries, intent)

    # Log warning if no queries after parsing
    if not queries:
        logger.warning("No valid queries parsed from LLM response")

    return {
        "success": True,
        "intent": intent,
        "queries": queries,
        "enhanced_keywords": result.get("enhanced_keywords", []),
        "research_angles": result.get("research_angles", []),
        "recommended_provider": result.get("recommended_provider", "exa"),
        "provider_justification": result.get("provider_justification", ""),
        "exa_config": result.get("exa_config", {}),
        "tavily_config": result.get("tavily_config", {}),
        "trends_config": result.get("trends_config", {}),  # Google Trends configuration
        "analysis_summary": intent_data.get("analysis_summary", ""),
    }
|
||||
140
backend/services/research/intent/unified_schema_builder.py
Normal file
140
backend/services/research/intent/unified_schema_builder.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""
|
||||
JSON schema builder for unified research analyzer.
|
||||
|
||||
Defines the structured JSON schema that the LLM must return
|
||||
for intent analysis, query generation, and parameter optimization.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
def build_unified_schema() -> Dict[str, Any]:
    """
    Build the JSON schema for unified response.

    The schema describes the single object expected from the LLM: the
    analysed ``intent``, the generated ``queries``, keyword/angle
    suggestions, the recommended provider, and the ``exa_config``,
    ``tavily_config`` and ``trends_config`` settings with their
    justification strings.
    """

    # Small factories so every property owns an independent dict.
    def _str() -> Dict[str, Any]:
        return {"type": "string"}

    def _bool() -> Dict[str, Any]:
        return {"type": "boolean"}

    def _int() -> Dict[str, Any]:
        return {"type": "integer"}

    def _str_list() -> Dict[str, Any]:
        return {"type": "array", "items": {"type": "string"}}

    def _str_or_bool() -> Dict[str, Any]:
        return {"oneOf": [{"type": "string"}, {"type": "boolean"}]}

    def _str_or_null() -> Dict[str, Any]:
        return {"oneOf": [{"type": "string"}, {"type": "null"}]}

    intent_schema = {
        "type": "object",
        "properties": {
            "input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
            "primary_question": _str(),
            "secondary_questions": _str_list(),
            "purpose": _str(),
            "content_output": _str(),
            "expected_deliverables": _str_list(),
            "depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
            "focus_areas": _str_list(),
            "also_answering": _str_list(),
            "perspective": _str(),
            "time_sensitivity": _str(),
            "confidence": {"type": "number"},
            "confidence_reason": _str(),
            "great_example": _str(),
            "needs_clarification": _bool(),
            "clarifying_questions": _str_list(),
            "analysis_summary": _str(),
        },
        "required": ["primary_question", "purpose", "expected_deliverables", "confidence"],
    }

    query_item_schema = {
        "type": "object",
        "properties": {
            "query": _str(),
            "purpose": _str(),
            "provider": _str(),
            "priority": _int(),
            "expected_results": _str(),
            "justification": _str(),
            "addresses_primary_question": _bool(),
            "addresses_secondary_questions": _str_list(),
            "targets_focus_areas": _str_list(),
            "covers_also_answering": _str_list(),
        },
        "required": ["query", "purpose", "provider", "priority", "addresses_primary_question"],
    }

    exa_schema = {
        "type": "object",
        "properties": {
            "enabled": _bool(),
            "type": _str(),
            "type_justification": _str(),
            "category": _str(),
            "category_justification": _str(),
            "numResults": _int(),
            "numResults_justification": _str(),
            "includeDomains": _str_list(),
            "includeDomains_justification": _str(),
            "startPublishedDate": _str(),
            "date_justification": _str(),
            "highlights": _bool(),
            "highlights_justification": _str(),
            "context": _bool(),
            "context_justification": _str(),
            "additionalQueries": _str_list(),
            "additionalQueries_justification": _str(),
            "livecrawl": _str(),
            "livecrawl_justification": _str(),
        },
    }

    tavily_schema = {
        "type": "object",
        "properties": {
            "enabled": _bool(),
            "topic": _str(),
            "topic_justification": _str(),
            "search_depth": _str(),
            "search_depth_justification": _str(),
            "include_answer": _str_or_bool(),
            "include_answer_justification": _str(),
            "time_range": _str_or_null(),
            "time_range_justification": _str(),
            "start_date": _str_or_null(),
            "start_date_justification": _str(),
            "end_date": _str_or_null(),
            "end_date_justification": _str(),
            "max_results": _int(),
            "max_results_justification": _str(),
            "chunks_per_source": _int(),
            "chunks_per_source_justification": _str(),
            "include_raw_content": _str_or_bool(),
            "include_raw_content_justification": _str(),
            "country": _str_or_null(),
            "country_justification": _str(),
            "include_images": _bool(),
            "include_images_justification": _str(),
            "include_image_descriptions": _bool(),
            "include_image_descriptions_justification": _str(),
            "include_favicon": _bool(),
            "include_favicon_justification": _str(),
            "auto_parameters": _bool(),
            "auto_parameters_justification": _str(),
        },
    }

    trends_schema = {
        "type": "object",
        "properties": {
            "enabled": _bool(),
            "keywords": _str_list(),
            "keywords_justification": _str(),
            "timeframe": _str(),
            "timeframe_justification": _str(),
            "geo": _str(),
            "geo_justification": _str(),
            "expected_insights": _str_list(),
        },
    }

    return {
        "type": "object",
        "properties": {
            "intent": intent_schema,
            "queries": {"type": "array", "items": query_item_schema},
            "enhanced_keywords": _str_list(),
            "research_angles": _str_list(),
            "recommended_provider": _str(),
            "provider_justification": _str(),
            "exa_config": exa_schema,
            "tavily_config": tavily_schema,
            "trends_config": trends_schema,
        },
        "required": ["intent", "queries", "recommended_provider", "exa_config", "tavily_config"],
    }
|
||||
Reference in New Issue
Block a user