AI Researcher and Video Studio implementation complete
This commit is contained in:
@@ -50,6 +50,7 @@ class IntentAwareAnalyzer:
|
||||
raw_results: Dict[str, Any],
|
||||
intent: ResearchIntent,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
user_id: Optional[str] = None,
|
||||
) -> IntentDrivenResearchResult:
|
||||
"""
|
||||
Analyze raw research results based on user intent.
|
||||
@@ -84,7 +85,7 @@ class IntentAwareAnalyzer:
|
||||
result = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=analysis_schema,
|
||||
user_id=None
|
||||
user_id=user_id # Required for subscription checking
|
||||
)
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
|
||||
@@ -151,6 +151,8 @@ Analyze the user's input and infer their research intent. Determine:
|
||||
|
||||
11. **CONFIDENCE**: How confident are you in this inference? (0.0-1.0)
|
||||
- If < 0.7, set needs_clarification to true and provide clarifying_questions
|
||||
- Provide a brief reason for your confidence level
|
||||
- If confidence is low, provide an example of what a great input would look like
|
||||
|
||||
## OUTPUT FORMAT
|
||||
|
||||
@@ -168,6 +170,8 @@ Return a JSON object:
|
||||
"perspective": "target perspective or null",
|
||||
"time_sensitivity": "real_time|recent|historical|evergreen",
|
||||
"confidence": 0.85,
|
||||
"confidence_reason": "Brief explanation of why this confidence level (e.g., 'User provided clear keywords and context' or 'Input is vague, missing specific goals')",
|
||||
"great_example": "Example of what a great input would look like for this research (only if confidence < 0.8)",
|
||||
"needs_clarification": false,
|
||||
"clarifying_questions": [],
|
||||
"analysis_summary": "Brief summary of what the user wants"
|
||||
|
||||
@@ -39,6 +39,7 @@ class IntentQueryGenerator:
|
||||
self,
|
||||
intent: ResearchIntent,
|
||||
research_persona: Optional[ResearchPersona] = None,
|
||||
user_id: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate targeted research queries based on intent.
|
||||
@@ -89,7 +90,7 @@ class IntentQueryGenerator:
|
||||
result = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=query_schema,
|
||||
user_id=None
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
|
||||
@@ -51,6 +51,7 @@ class ResearchIntentInference:
|
||||
competitor_data: Optional[List[Dict]] = None,
|
||||
industry: Optional[str] = None,
|
||||
target_audience: Optional[str] = None,
|
||||
user_id: Optional[str] = None,
|
||||
) -> IntentInferenceResponse:
|
||||
"""
|
||||
Analyze user input and infer their research intent.
|
||||
@@ -96,13 +97,15 @@ class ResearchIntentInference:
|
||||
"perspective": {"type": "string"},
|
||||
"time_sensitivity": {"type": "string"},
|
||||
"confidence": {"type": "number"},
|
||||
"confidence_reason": {"type": "string"},
|
||||
"great_example": {"type": "string"},
|
||||
"needs_clarification": {"type": "boolean"},
|
||||
"clarifying_questions": {"type": "array", "items": {"type": "string"}},
|
||||
"analysis_summary": {"type": "string"}
|
||||
},
|
||||
"required": [
|
||||
"input_type", "primary_question", "purpose", "content_output",
|
||||
"expected_deliverables", "depth", "confidence", "analysis_summary"
|
||||
"expected_deliverables", "depth", "confidence", "confidence_reason", "analysis_summary"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -112,7 +115,7 @@ class ResearchIntentInference:
|
||||
result = llm_text_gen(
|
||||
prompt=prompt,
|
||||
json_struct=intent_schema,
|
||||
user_id=None
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
if isinstance(result, dict) and "error" in result:
|
||||
@@ -134,6 +137,8 @@ class ResearchIntentInference:
|
||||
suggested_keywords=self._extract_keywords_from_input(user_input, keywords),
|
||||
suggested_angles=result.get("focus_areas", []),
|
||||
quick_options=quick_options,
|
||||
confidence_reason=result.get("confidence_reason", ""),
|
||||
great_example=result.get("great_example", ""),
|
||||
)
|
||||
|
||||
logger.info(f"Intent inferred: purpose={intent.purpose}, confidence={intent.confidence}")
|
||||
@@ -166,7 +171,7 @@ class ResearchIntentInference:
|
||||
if not expected_deliverables:
|
||||
expected_deliverables = self._infer_deliverables_from_purpose(purpose)
|
||||
|
||||
return ResearchIntent(
|
||||
intent = ResearchIntent(
|
||||
primary_question=result.get("primary_question", user_input),
|
||||
secondary_questions=result.get("secondary_questions", []),
|
||||
purpose=purpose.value,
|
||||
@@ -179,9 +184,13 @@ class ResearchIntentInference:
|
||||
input_type=input_type.value,
|
||||
original_input=user_input,
|
||||
confidence=float(result.get("confidence", 0.7)),
|
||||
confidence_reason=result.get("confidence_reason"),
|
||||
great_example=result.get("great_example"),
|
||||
needs_clarification=result.get("needs_clarification", False),
|
||||
clarifying_questions=result.get("clarifying_questions", []),
|
||||
)
|
||||
|
||||
return intent
|
||||
|
||||
def _safe_enum(self, enum_class, value: str, default):
|
||||
"""Safely convert string to enum, returning default if invalid."""
|
||||
|
||||
559
backend/services/research/intent/unified_research_analyzer.py
Normal file
559
backend/services/research/intent/unified_research_analyzer.py
Normal file
@@ -0,0 +1,559 @@
|
||||
"""
|
||||
Unified Research Analyzer
|
||||
|
||||
Combines intent inference, query generation, and parameter optimization
|
||||
into a single AI call with justifications for each decision.
|
||||
|
||||
This reduces 2 LLM calls to 1, improves coherence, and provides
|
||||
user-friendly justifications for all settings.
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from loguru import logger
|
||||
|
||||
from models.research_intent_models import (
|
||||
ResearchIntent,
|
||||
ResearchQuery,
|
||||
IntentInferenceResponse,
|
||||
ResearchPurpose,
|
||||
ContentOutput,
|
||||
ExpectedDeliverable,
|
||||
ResearchDepthLevel,
|
||||
InputType,
|
||||
)
|
||||
from models.research_persona_models import ResearchPersona
|
||||
|
||||
|
||||
class UnifiedResearchAnalyzer:
|
||||
"""
|
||||
Unified AI-driven analyzer that performs:
|
||||
1. Intent inference (what user wants)
|
||||
2. Query generation (how to search)
|
||||
3. Parameter optimization (Exa/Tavily settings)
|
||||
|
||||
All in a single LLM call with justifications.
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Create the analyzer and emit a startup log line.

    No attributes are set here; the instance carries no configuration.
    """
    startup_message = "UnifiedResearchAnalyzer initialized"
    logger.info(startup_message)
|
||||
|
||||
async def analyze(
    self,
    user_input: str,
    keywords: Optional[List[str]] = None,
    research_persona: Optional[ResearchPersona] = None,
    competitor_data: Optional[List[Dict]] = None,
    industry: Optional[str] = None,
    target_audience: Optional[str] = None,
    user_id: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Perform unified analysis of a user research request in one LLM call.

    Args:
        user_input: Free-form research request typed by the user.
        keywords: Optional keywords; ``None`` is treated as an empty list.
        research_persona: Optional persona forwarded to the prompt builder.
        competitor_data: Optional competitor records forwarded to the prompt builder.
        industry: Optional industry forwarded to the prompt builder.
        target_audience: Optional audience forwarded to the prompt builder.
        user_id: Forwarded unchanged to ``llm_text_gen``.

    Returns:
        Dict containing:
        - success: bool (False when the fallback plan is returned)
        - intent: ResearchIntent
        - queries: List[ResearchQuery]
        - enhanced_keywords / research_angles
        - exa_config: Dict with settings and justifications
        - tavily_config: Dict with settings and justifications
        - trends_config: Dict with Google Trends settings
        - recommended_provider: str
        - provider_justification: str
        - analysis_summary: str (present when the LLM response was parsed)

    This method never raises for analysis failures: any error results in the
    fallback response from ``_create_fallback_response``.
    """
    # Normalize before the try block so the except handler can reuse the value.
    keywords = keywords or []

    try:
        logger.info(f"Unified analysis for: {user_input[:100]}...")

        # Build the unified prompt
        prompt = self._build_unified_prompt(
            user_input=user_input,
            keywords=keywords,
            research_persona=research_persona,
            competitor_data=competitor_data,
            industry=industry,
            target_audience=target_audience,
        )

        # Define the comprehensive JSON schema
        unified_schema = self._build_unified_schema()

        # Call LLM (single call for everything)
        from services.llm_providers.main_text_generation import llm_text_gen

        # NOTE(review): llm_text_gen is invoked synchronously inside this
        # coroutine, so it blocks the event loop for the duration of the call.
        result = llm_text_gen(
            prompt=prompt,
            json_struct=unified_schema,
            user_id=user_id
        )

        # Some providers hand back the JSON document as text; decode it
        # instead of silently discarding an otherwise usable plan.
        if isinstance(result, str):
            try:
                result = json.loads(result)
            except json.JSONDecodeError as decode_error:
                logger.error(f"Unified analysis returned non-JSON text: {decode_error}")
                return self._create_fallback_response(user_input, keywords)

        if not isinstance(result, dict):
            logger.error(
                f"Unified analysis returned unexpected type: {type(result).__name__}"
            )
            return self._create_fallback_response(user_input, keywords)

        if "error" in result:
            logger.error(f"Unified analysis failed: {result.get('error')}")
            return self._create_fallback_response(user_input, keywords)

        # Parse the unified result
        return self._parse_unified_result(result, user_input)

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a plan.
        # logger.exception keeps the traceback that logger.error dropped.
        logger.exception(f"Error in unified analysis: {e}")
        return self._create_fallback_response(user_input, keywords)
|
||||
|
||||
def _build_unified_prompt(
    self,
    user_input: str,
    keywords: List[str],
    research_persona: Optional[ResearchPersona] = None,
    competitor_data: Optional[List[Dict]] = None,
    industry: Optional[str] = None,
    target_audience: Optional[str] = None,
) -> str:
    """Assemble the single prompt covering intent, queries and provider settings.

    The user's text, an optional ``KEYWORDS:`` line, and the persona and
    competitor context strings produced by the sibling helpers are spliced
    into a fixed instruction template that ends with the expected JSON
    output format and the decision rules.
    """
    # Context sections come from the dedicated helpers.
    persona_context = self._build_persona_context(research_persona, industry, target_audience)
    competitor_context = self._build_competitor_context(competitor_data)

    # The keywords line collapses to an empty line when no keywords were given.
    keywords_line = f"KEYWORDS: {', '.join(keywords)}" if keywords else ""

    return f'''You are an expert AI research strategist. Analyze the user's research request and provide a complete research plan including intent understanding, search queries, and optimal API settings.

## USER INPUT
"{user_input}"
{keywords_line}

## USER CONTEXT
{persona_context}
{competitor_context}

## YOUR TASK: Provide a Complete Research Plan

### PART 1: INTENT ANALYSIS
Understand what the user really wants from their research.

### PART 2: SEARCH QUERIES
Generate 4-8 targeted search queries optimized for semantic search.

### PART 3: PROVIDER SETTINGS
Configure Exa and Tavily API parameters with justifications.

### PART 4: GOOGLE TRENDS KEYWORDS (if trends in deliverables)
If "trends" is in expected_deliverables OR purpose is "explore_trends":
- Suggest 1-3 optimized keywords for Google Trends analysis
- These may differ from research queries (trends need broader, searchable terms)
- Consider: What keywords will show meaningful trends over time?
- Consider: What timeframe will show relevant trends? (1 year, 12 months, etc.)
- Consider: What geographic region is most relevant for the user?
- Explain what insights trends will uncover for content generation:
* Search interest trends over time (optimal publication timing)
* Regional interest distribution (audience targeting)
* Related topics for content expansion
* Related queries for FAQ sections
* Rising topics for timely content opportunities

---

## AVAILABLE PROVIDER OPTIONS

### EXA API OPTIONS (Semantic Search Engine)
| Parameter | Options | Description |
|-----------|---------|-------------|
| type | "auto", "neural", "fast", "deep" | "neural" = semantic understanding, "deep" = comprehensive with query expansion |
| category | "company", "research paper", "news", "github", "tweet", "personal site", "pdf", "financial report", "people" | Focus on specific content types |
| numResults | 5-25 | Number of results (10 recommended) |
| includeDomains | string[] | Domains to include (e.g., ["arxiv.org", "nature.com"]) |
| excludeDomains | string[] | Domains to exclude |
| startPublishedDate | ISO date | Filter by publish date (e.g., "2024-01-01T00:00:00.000Z") |
| text | boolean | Include full text content |
| highlights | boolean | Extract key highlights |
| context | boolean | Return as single context string for RAG |

**WHEN TO USE EXA:**
- Semantic understanding needed (finding similar content)
- Academic/research papers
- Company/competitor research
- Deep, comprehensive results
- Historical content

### TAVILY API OPTIONS (AI-Powered Search)
| Parameter | Options | Description |
|-----------|---------|-------------|
| topic | "general", "news", "finance" | Search topic category |
| search_depth | "basic", "advanced" | "advanced" = multiple semantic snippets per URL |
| include_answer | false, true, "basic", "advanced" | AI-generated answer from results |
| include_raw_content | false, true, "markdown", "text" | Raw page content format |
| time_range | "day", "week", "month", "year" | Filter by recency |
| max_results | 5-20 | Number of results |
| include_domains | string[] | Domains to include |
| exclude_domains | string[] | Domains to exclude |

**WHEN TO USE TAVILY:**
- Real-time/current events
- News and trending topics
- Quick facts with AI answers
- Financial data
- Recent time-sensitive content

---

## OUTPUT FORMAT

Return a JSON object with this exact structure:

```json
{{
"intent": {{
"input_type": "keywords|question|goal|mixed",
"primary_question": "The main question to answer",
"secondary_questions": ["question 1", "question 2"],
"purpose": "learn|create_content|make_decision|compare|solve_problem|find_data|explore_trends|validate|generate_ideas",
"content_output": "blog|podcast|video|social_post|newsletter|presentation|report|whitepaper|email|general",
"expected_deliverables": ["key_statistics", "expert_quotes", "case_studies", "trends", "best_practices"],
"depth": "overview|detailed|expert",
"focus_areas": ["area1", "area2"],
"perspective": "target perspective or null",
"time_sensitivity": "real_time|recent|historical|evergreen",
"confidence": 0.85,
"confidence_reason": "Why this confidence level",
"great_example": "Example of better input if confidence < 0.8",
"needs_clarification": false,
"clarifying_questions": [],
"analysis_summary": "Brief summary of research plan"
}},
"queries": [
{{
"query": "Optimized search query string",
"purpose": "key_statistics|expert_quotes|case_studies|trends|etc",
"provider": "exa|tavily",
"priority": 5,
"expected_results": "What we expect to find",
"justification": "Why this query and provider"
}}
],
"enhanced_keywords": ["expanded", "related", "keywords"],
"research_angles": ["Angle 1: ...", "Angle 2: ..."],
"recommended_provider": "exa|tavily",
"provider_justification": "Why this provider is best for this research",
"exa_config": {{
"enabled": true,
"type": "auto|neural|fast|deep",
"type_justification": "Why this search type",
"category": "news|research paper|company|etc or null",
"category_justification": "Why this category or null",
"numResults": 10,
"numResults_justification": "Why this number",
"includeDomains": [],
"includeDomains_justification": "Why these domains or empty",
"startPublishedDate": "2024-01-01T00:00:00.000Z or null",
"date_justification": "Why this date filter or null",
"highlights": true,
"highlights_justification": "Why enable/disable highlights",
"context": true,
"context_justification": "Why enable/disable context string"
}},
"tavily_config": {{
"enabled": true,
"topic": "general|news|finance",
"topic_justification": "Why this topic",
"search_depth": "basic|advanced",
"search_depth_justification": "Why this depth",
"include_answer": "true|false|basic|advanced",
"include_answer_justification": "Why this answer mode",
"time_range": "day|week|month|year|null",
"time_range_justification": "Why this time range or null",
"max_results": 10,
"max_results_justification": "Why this number",
"include_raw_content": "false|true|markdown|text",
"include_raw_content_justification": "Why this content mode"
}},
"trends_config": {{
"enabled": true|false,
"keywords": ["keyword1", "keyword2"],
"keywords_justification": "Why these keywords for trends analysis",
"timeframe": "today 1-y|today 12-m|all",
"timeframe_justification": "Why this timeframe",
"geo": "US|GB|IN|etc",
"geo_justification": "Why this geographic region",
"expected_insights": [
"Search interest trends over the past year",
"Regional interest distribution",
"Related topics for content expansion",
"Related queries for FAQ sections",
"Optimal publication timing based on interest peaks"
]
}}
}}
```

## DECISION RULES

1. **Provider Selection:**
- Use EXA for: academic research, competitor analysis, deep understanding, finding similar content
- Use TAVILY for: news, current events, quick facts, financial data, real-time info

2. **Query Optimization:**
- Include relevant keywords for semantic matching
- Add context words based on deliverables (e.g., "statistics 2024" for key_statistics)
- Match query style to provider (natural language for Exa, keyword-rich for Tavily)

3. **Parameter Selection:**
- ALWAYS provide justification for each parameter choice
- Consider time sensitivity when setting date filters
- Match category/topic to content type
- Use "advanced" depth when quality matters more than speed

4. **Google Trends Keywords (if trends enabled):**
- Suggest 1-3 keywords optimized for trends analysis
- Keywords should be broader than research queries (e.g., "AI marketing" vs "AI marketing tools for small businesses")
- Consider what will show meaningful search interest trends
- Choose timeframe based on content type (12 months for blogs, 1 year for comprehensive)
- Select geo based on user's target audience or industry
- List specific insights trends will uncover

5. **Justifications:**
- Keep justifications concise (1 sentence)
- Explain the "why" not the "what"
- Reference user's intent when relevant
'''
|
||||
|
||||
def _build_unified_schema(self) -> Dict[str, Any]:
    """Return the JSON schema handed to the LLM for the unified response.

    The top-level object requires ``intent``, ``queries``,
    ``recommended_provider``, ``exa_config`` and ``tavily_config``;
    ``trends_config`` and the keyword/angle lists are optional.
    """
    # Tiny factories so every property gets its own fresh dict.
    def text() -> Dict[str, Any]:
        return {"type": "string"}

    def text_list() -> Dict[str, Any]:
        return {"type": "array", "items": {"type": "string"}}

    def flag() -> Dict[str, Any]:
        return {"type": "boolean"}

    def whole_number() -> Dict[str, Any]:
        return {"type": "integer"}

    def choice(*options: str) -> Dict[str, Any]:
        return {"type": "string", "enum": list(options)}

    intent_schema = {
        "type": "object",
        "properties": {
            "input_type": choice("keywords", "question", "goal", "mixed"),
            "primary_question": text(),
            "secondary_questions": text_list(),
            "purpose": text(),
            "content_output": text(),
            "expected_deliverables": text_list(),
            "depth": choice("overview", "detailed", "expert"),
            "focus_areas": text_list(),
            "perspective": text(),
            "time_sensitivity": text(),
            "confidence": {"type": "number"},
            "confidence_reason": text(),
            "great_example": text(),
            "needs_clarification": flag(),
            "clarifying_questions": text_list(),
            "analysis_summary": text(),
        },
        "required": ["primary_question", "purpose", "expected_deliverables", "confidence"],
    }

    queries_schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "query": text(),
                "purpose": text(),
                "provider": text(),
                "priority": whole_number(),
                "expected_results": text(),
                "justification": text(),
            },
            "required": ["query", "purpose", "provider", "priority"],
        },
    }

    exa_schema = {
        "type": "object",
        "properties": {
            "enabled": flag(),
            "type": text(),
            "type_justification": text(),
            "category": text(),
            "category_justification": text(),
            "numResults": whole_number(),
            "numResults_justification": text(),
            "includeDomains": text_list(),
            "includeDomains_justification": text(),
            "startPublishedDate": text(),
            "date_justification": text(),
            "highlights": flag(),
            "highlights_justification": text(),
            "context": flag(),
            "context_justification": text(),
        },
    }

    tavily_schema = {
        "type": "object",
        "properties": {
            "enabled": flag(),
            "topic": text(),
            "topic_justification": text(),
            "search_depth": text(),
            "search_depth_justification": text(),
            "include_answer": text(),
            "include_answer_justification": text(),
            "time_range": text(),
            "time_range_justification": text(),
            "max_results": whole_number(),
            "max_results_justification": text(),
            "include_raw_content": text(),
            "include_raw_content_justification": text(),
        },
    }

    trends_schema = {
        "type": "object",
        "properties": {
            "enabled": flag(),
            "keywords": text_list(),
            "keywords_justification": text(),
            "timeframe": text(),
            "timeframe_justification": text(),
            "geo": text(),
            "geo_justification": text(),
            "expected_insights": text_list(),
        },
    }

    return {
        "type": "object",
        "properties": {
            "intent": intent_schema,
            "queries": queries_schema,
            "enhanced_keywords": text_list(),
            "research_angles": text_list(),
            "recommended_provider": text(),
            "provider_justification": text(),
            "exa_config": exa_schema,
            "tavily_config": tavily_schema,
            "trends_config": trends_schema,
        },
        "required": ["intent", "queries", "recommended_provider", "exa_config", "tavily_config"],
    }
|
||||
|
||||
def _build_persona_context(
    self,
    research_persona: Optional[ResearchPersona],
    industry: Optional[str],
    target_audience: Optional[str],
) -> str:
    """Build the persona context section of the prompt.

    Persona values take precedence. The explicit ``industry`` and
    ``target_audience`` arguments are used when no persona is supplied, and
    also as a fallback when the persona leaves the corresponding field empty
    (previously they were dropped as soon as any persona object was passed).

    Args:
        research_persona: Optional persona providing default industry,
            audience, research angles and suggested keywords.
        industry: Explicit industry supplied by the caller.
        target_audience: Explicit target audience supplied by the caller.

    Returns:
        Newline-joined context lines, or a generic sentence when nothing
        is known about the user.
    """
    parts = []

    if research_persona:
        effective_industry = research_persona.default_industry or industry
        effective_audience = research_persona.default_target_audience or target_audience
    else:
        effective_industry = industry
        effective_audience = target_audience

    if effective_industry:
        parts.append(f"Industry: {effective_industry}")
    if effective_audience:
        parts.append(f"Target Audience: {effective_audience}")

    if research_persona:
        # Only the first few entries are included to keep the prompt compact.
        if research_persona.research_angles:
            parts.append(f"Preferred Research Angles: {', '.join(research_persona.research_angles[:3])}")
        if research_persona.suggested_keywords:
            parts.append(f"Relevant Keywords: {', '.join(research_persona.suggested_keywords[:5])}")

    if not parts:
        return "No specific user context available. Use general best practices."

    return "\n".join(parts)
|
||||
|
||||
def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
    """Build the competitor context section of the prompt.

    Only the first five competitor records are considered. Each record is
    labelled by its ``name``, falling back to ``url`` when the name is
    missing, ``None`` or blank. Records without a usable label are skipped,
    so the result never contains empty entries.

    Args:
        competitor_data: Optional list of competitor dicts.

    Returns:
        ``"\\nKnown Competitors: a, b"`` when at least one label exists,
        otherwise an empty string.
    """
    if not competitor_data:
        return ""

    competitor_names = []
    for entry in competitor_data[:5]:
        if not isinstance(entry, dict):
            continue
        # `or` (rather than a .get default) also covers None / "" values.
        label = str(entry.get("name") or entry.get("url") or "").strip()
        if label:
            competitor_names.append(label)

    if competitor_names:
        return f"\nKnown Competitors: {', '.join(competitor_names)}"
    return ""
|
||||
|
||||
def _parse_unified_result(self, result: Dict[str, Any], user_input: str) -> Dict[str, Any]:
    """Parse the unified LLM result into the structured response dict.

    LLM output frequently contains explicit ``null`` values; those are
    treated like missing keys here so a single null does not discard an
    otherwise usable plan.

    Args:
        result: Decoded JSON object returned by the LLM.
        user_input: Original user text, used as the primary question when
            the LLM did not provide one and stored as ``original_input``.

    Returns:
        Dict with ``success=True``, the ``ResearchIntent``, the list of
        ``ResearchQuery`` objects and the provider/trends configuration
        dicts taken from ``result``.
    """
    intent_data = result.get("intent") or {}
    if not isinstance(intent_data, dict):
        logger.warning(f"Unexpected intent payload type: {type(intent_data).__name__}")
        intent_data = {}

    # float(None) / float("high") would otherwise abort the whole parse.
    try:
        confidence = float(intent_data.get("confidence", 0.7))
    except (TypeError, ValueError):
        logger.warning(f"Invalid confidence value: {intent_data.get('confidence')!r}, using 0.7")
        confidence = 0.7

    # Build ResearchIntent
    intent = ResearchIntent(
        primary_question=intent_data.get("primary_question") or user_input,
        secondary_questions=intent_data.get("secondary_questions") or [],
        purpose=intent_data.get("purpose") or "learn",
        content_output=intent_data.get("content_output") or "general",
        expected_deliverables=intent_data.get("expected_deliverables") or ["key_statistics"],
        depth=intent_data.get("depth") or "detailed",
        focus_areas=intent_data.get("focus_areas") or [],
        perspective=intent_data.get("perspective"),
        time_sensitivity=intent_data.get("time_sensitivity"),
        input_type=intent_data.get("input_type") or "keywords",
        original_input=user_input,
        confidence=confidence,
        confidence_reason=intent_data.get("confidence_reason"),
        great_example=intent_data.get("great_example"),
        needs_clarification=intent_data.get("needs_clarification") or False,
        clarifying_questions=intent_data.get("clarifying_questions") or [],
    )

    # Build queries
    queries = []
    for q in result.get("queries") or []:
        try:
            query_text = q.get("query") or ""
            if not str(query_text).strip():
                # An empty search string cannot produce results; drop it.
                logger.warning("Skipping query with empty text")
                continue
            raw_priority = q.get("priority")
            queries.append(ResearchQuery(
                query=query_text,
                purpose=q.get("purpose") or "key_statistics",
                provider=q.get("provider") or "exa",
                priority=3 if raw_priority is None else int(raw_priority),
                expected_results=q.get("expected_results") or "",
            ))
        except Exception as e:
            # Deliberately broad: one malformed query (wrong type, model
            # validation failure) must not discard the remaining ones.
            logger.warning(f"Failed to parse query: {e}")

    return {
        "success": True,
        "intent": intent,
        "queries": queries,
        "enhanced_keywords": result.get("enhanced_keywords") or [],
        "research_angles": result.get("research_angles") or [],
        "recommended_provider": result.get("recommended_provider") or "exa",
        "provider_justification": result.get("provider_justification") or "",
        "exa_config": result.get("exa_config") or {},
        "tavily_config": result.get("tavily_config") or {},
        "trends_config": result.get("trends_config") or {},  # Google Trends configuration
        "analysis_summary": intent_data.get("analysis_summary") or "",
    }
|
||||
|
||||
def _create_fallback_response(self, user_input: str, keywords: List[str]) -> Dict[str, Any]:
    """Create the default research plan returned when analysis fails.

    The result has the same keys as the parsed LLM response (including
    ``analysis_summary``) so callers can read either shape uniformly;
    ``success`` is ``False`` to mark it as a fallback.

    Args:
        user_input: Original user text; reused as the single search query.
        keywords: Caller-supplied keywords, echoed back as
            ``enhanced_keywords`` (copied, so the caller's list is not shared).

    Returns:
        Dict with a generic ``ResearchIntent``, one Exa ``ResearchQuery``,
        default Exa/Tavily settings and Google Trends disabled.
    """
    fallback_intent = ResearchIntent(
        primary_question=f"What are the key insights about: {user_input}?",
        purpose="learn",
        content_output="general",
        expected_deliverables=["key_statistics", "best_practices"],
        depth="detailed",
        original_input=user_input,
        confidence=0.5,
    )

    fallback_query = ResearchQuery(
        query=user_input,
        purpose="key_statistics",
        provider="exa",
        priority=5,
        expected_results="General research results",
    )

    return {
        "success": False,
        "intent": fallback_intent,
        "queries": [fallback_query],
        "enhanced_keywords": list(keywords or []),
        "research_angles": [],
        "recommended_provider": "exa",
        "provider_justification": "Default fallback to Exa for semantic search",
        "exa_config": {
            "enabled": True,
            "type": "auto",
            "type_justification": "Auto mode for balanced results",
            "numResults": 10,
            "highlights": True,
        },
        "tavily_config": {
            "enabled": True,
            "topic": "general",
            "search_depth": "advanced",
            "include_answer": True,
        },
        "trends_config": {
            "enabled": False,  # Disabled in fallback
        },
        # Mirrors the key the success path always returns.
        "analysis_summary": "",
    }
|
||||
@@ -34,39 +34,81 @@ class ResearchPersonaService:
|
||||
user_id: str
|
||||
) -> Optional[ResearchPersona]:
|
||||
"""
|
||||
Get research persona for user ONLY if it exists in cache.
|
||||
This method NEVER generates - it only returns cached personas.
|
||||
Get research persona for user if it exists in database (regardless of cache validity).
|
||||
This method NEVER generates - it only returns existing personas.
|
||||
Use this for config endpoints to avoid triggering rate limit checks.
|
||||
|
||||
Note: Returns persona even if cache is expired - cache validity only matters for regeneration.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
|
||||
Returns:
|
||||
ResearchPersona if cached and valid, None otherwise
|
||||
ResearchPersona if exists in database, None otherwise
|
||||
"""
|
||||
try:
|
||||
# Get persona data record
|
||||
persona_data = self._get_persona_data_record(user_id)
|
||||
|
||||
if not persona_data:
|
||||
logger.debug(f"No persona data found for user {user_id}")
|
||||
logger.debug(f"[get_cached_only] No persona data record found for user {user_id}")
|
||||
return None
|
||||
|
||||
# Only return if cache is valid and persona exists
|
||||
if self.is_cache_valid(persona_data) and persona_data.research_persona:
|
||||
# Check if research_persona field exists and is not None/empty
|
||||
# Handle cases where it might be None, empty dict {}, or empty string ""
|
||||
research_persona_raw = persona_data.research_persona
|
||||
has_persona = (
|
||||
research_persona_raw is not None
|
||||
and research_persona_raw != {}
|
||||
and research_persona_raw != ""
|
||||
and (isinstance(research_persona_raw, dict) and len(research_persona_raw) > 0)
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"[get_cached_only] Checking research persona for user {user_id}: "
|
||||
f"persona_data exists=True, research_persona_raw={research_persona_raw is not None}, "
|
||||
f"research_persona type={type(research_persona_raw)}, "
|
||||
f"has_persona={has_persona}, "
|
||||
f"generated_at={persona_data.research_persona_generated_at}"
|
||||
)
|
||||
|
||||
# Return persona if it exists, regardless of cache validity
|
||||
# Cache validity only matters when deciding whether to regenerate
|
||||
if has_persona:
|
||||
try:
|
||||
logger.debug(f"Returning cached research persona for user {user_id}")
|
||||
return ResearchPersona(**persona_data.research_persona)
|
||||
cache_valid = self.is_cache_valid(persona_data)
|
||||
cache_status = "valid" if cache_valid else "expired"
|
||||
logger.info(
|
||||
f"[get_cached_only] ✅ Returning research persona for user {user_id} "
|
||||
f"(cache: {cache_status}, generated_at: {persona_data.research_persona_generated_at})"
|
||||
)
|
||||
# Ensure we're passing a dict to ResearchPersona
|
||||
if not isinstance(research_persona_raw, dict):
|
||||
logger.error(f"[get_cached_only] research_persona_raw is not a dict: {type(research_persona_raw)}")
|
||||
return None
|
||||
parsed_persona = ResearchPersona(**research_persona_raw)
|
||||
logger.info(
|
||||
f"[get_cached_only] ✅ Successfully parsed persona for user {user_id}: "
|
||||
f"industry={parsed_persona.default_industry}, "
|
||||
f"target_audience={parsed_persona.default_target_audience}"
|
||||
)
|
||||
return parsed_persona
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse cached research persona: {e}")
|
||||
logger.error(f"[get_cached_only] ❌ Failed to parse research persona for user {user_id}: {e}", exc_info=True)
|
||||
logger.debug(
|
||||
f"[get_cached_only] Persona data details: "
|
||||
f"type={type(research_persona_raw)}, "
|
||||
f"is_dict={isinstance(research_persona_raw, dict)}, "
|
||||
f"value sample: {str(research_persona_raw)[:500] if research_persona_raw else 'None'}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Cache invalid or persona missing - return None (don't generate)
|
||||
logger.debug(f"No valid cached research persona for user {user_id}")
|
||||
# Persona doesn't exist in database
|
||||
logger.info(f"[get_cached_only] ⚠️ No research persona found in database for user {user_id}")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting cached research persona for user {user_id}: {e}")
|
||||
logger.error(f"[get_cached_only] ❌ Error getting research persona for user {user_id}: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
def get_or_generate(
|
||||
@@ -92,25 +134,40 @@ class ResearchPersonaService:
|
||||
logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
|
||||
return None
|
||||
|
||||
# Check cache if not forcing refresh
|
||||
if not force_refresh and self.is_cache_valid(persona_data):
|
||||
if persona_data.research_persona:
|
||||
# Check if persona exists in database
|
||||
if persona_data.research_persona:
|
||||
# Persona exists - check if we should return it or regenerate
|
||||
cache_valid = self.is_cache_valid(persona_data)
|
||||
|
||||
if not force_refresh and cache_valid:
|
||||
# Cache is valid - return existing persona
|
||||
logger.info(f"Using cached research persona for user {user_id}")
|
||||
try:
|
||||
return ResearchPersona(**persona_data.research_persona)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse cached research persona: {e}, regenerating...")
|
||||
# Fall through to regeneration
|
||||
# Fall through to regeneration if parsing fails
|
||||
elif not force_refresh:
|
||||
# Persona exists but cache expired - return it anyway (don't regenerate unless forced)
|
||||
logger.info(f"Research persona exists for user {user_id} but cache expired - returning existing persona (use force_refresh=true to regenerate)")
|
||||
try:
|
||||
return ResearchPersona(**persona_data.research_persona)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse existing research persona: {e}, regenerating...")
|
||||
# Fall through to regeneration if parsing fails
|
||||
else:
|
||||
logger.info(f"Research persona missing for user {user_id}, generating...")
|
||||
else:
|
||||
if force_refresh:
|
||||
# force_refresh=True - regenerate even though persona exists
|
||||
logger.info(f"Forcing refresh of research persona for user {user_id}")
|
||||
else:
|
||||
logger.info(f"Cache expired for user {user_id}, regenerating...")
|
||||
else:
|
||||
# Persona doesn't exist - generate new one
|
||||
logger.info(f"Research persona missing for user {user_id}, generating...")
|
||||
|
||||
# Generate new research persona
|
||||
# Generate new research persona (only reaches here if:
|
||||
# 1. Persona doesn't exist, OR
|
||||
# 2. force_refresh=True, OR
|
||||
# 3. Parsing of existing persona failed
|
||||
try:
|
||||
logger.info(f"Generating research persona for user {user_id}")
|
||||
research_persona = self.generate_research_persona(user_id)
|
||||
except HTTPException:
|
||||
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
|
||||
|
||||
9
backend/services/research/trends/__init__.py
Normal file
9
backend/services/research/trends/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
Google Trends Research Service
|
||||
|
||||
Provides Google Trends data integration for the Research Engine.
|
||||
"""
|
||||
|
||||
from .google_trends_service import GoogleTrendsService
|
||||
|
||||
__all__ = ['GoogleTrendsService']
|
||||
380
backend/services/research/trends/google_trends_service.py
Normal file
380
backend/services/research/trends/google_trends_service.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
Google Trends Service
|
||||
|
||||
Provides Google Trends data integration for the Research Engine.
|
||||
Handles rate limiting, caching, error handling, and data serialization.
|
||||
|
||||
Author: ALwrity Team
|
||||
Version: 1.0
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
import pandas as pd
|
||||
|
||||
try:
|
||||
from pytrends.request import TrendReq
|
||||
PYTrends_AVAILABLE = True
|
||||
except ImportError:
|
||||
PYTrends_AVAILABLE = False
|
||||
logger.warning("pytrends not installed. Google Trends features will be unavailable.")
|
||||
|
||||
from .rate_limiter import RateLimiter
|
||||
|
||||
|
||||
class GoogleTrendsService:
    """
    Service for fetching and analyzing Google Trends data.

    Features:
    - Interest over time
    - Interest by region
    - Related topics
    - Related queries
    - Rate limiting (1 req/sec)
    - Caching (24-hour TTL, bounded in-memory store)
    - Async support
    - Error handling with empty-result fallbacks
    """

    # Keywords beyond this count are dropped before querying.
    MAX_KEYWORDS = 5
    # Upper bound on in-memory cache entries; the oldest are evicted beyond it.
    MAX_CACHE_ENTRIES = 100

    def __init__(self):
        """
        Initialize the Google Trends service.

        Raises:
            RuntimeError: If the pytrends library is not installed.
        """
        if not PYTrends_AVAILABLE:
            raise RuntimeError("pytrends library is required. Install with: pip install pytrends")

        self.rate_limiter = RateLimiter(max_calls=1, period=1.0)  # 1 request per second
        self.cache: Dict[str, Dict[str, Any]] = {}  # Simple in-memory cache
        self.cache_ttl = timedelta(hours=24)  # 24-hour cache

        logger.info("GoogleTrendsService initialized")

    async def analyze_trends(
        self,
        keywords: List[str],
        timeframe: str = "today 12-m",
        geo: str = "US",
        user_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Comprehensive trends analysis.

        Fetches interest over time, interest by region, related topics and
        related queries for the given keywords, serving from the in-memory
        cache when a fresh entry exists.

        Args:
            keywords: List of keywords to analyze (only the first 5 are used)
            timeframe: Timeframe string (e.g., "today 12-m", "today 1-y", "all")
            geo: Country code (e.g., "US", "GB", "IN")
            user_id: User ID for subscription checks (currently unused here)

        Returns:
            Dict containing all trends data in serializable format. On failure
            a fallback dict with empty data and an "error" key is returned.

        Raises:
            ValueError: If keywords list is empty
        """
        if not keywords:
            raise ValueError("Keywords list cannot be empty")

        if len(keywords) > self.MAX_KEYWORDS:
            logger.warning(f"Too many keywords ({len(keywords)}), using first {self.MAX_KEYWORDS}")
            keywords = keywords[:self.MAX_KEYWORDS]

        # Check cache first
        cache_key = self._build_cache_key(keywords, timeframe, geo)
        cached_data = self._get_from_cache(cache_key)
        if cached_data is not None:
            logger.info(f"Returning cached trends data for: {keywords}")
            return {**cached_data, "cached": True}

        # Rate limit
        await self.rate_limiter.acquire()

        try:
            logger.info(f"Fetching Google Trends data for: {keywords} (timeframe: {timeframe}, geo: {geo})")

            # Initialize pytrends (sync operation, run in thread)
            pytrends = await asyncio.to_thread(
                self._initialize_pytrends,
                keywords,
                timeframe,
                geo,
            )

            # Fetch all data in parallel (pytrends methods are sync, so use to_thread).
            # NOTE(review): the four workers share one TrendReq instance and are
            # issued after a single rate-limiter acquire - confirm this is
            # acceptable for both thread-safety and the 1 req/sec budget.
            interest_over_time, interest_by_region, related_topics, related_queries = await asyncio.gather(
                asyncio.to_thread(self._safe_interest_over_time, pytrends),
                asyncio.to_thread(self._safe_interest_by_region, pytrends),
                asyncio.to_thread(self._safe_related_topics, pytrends, keywords),
                asyncio.to_thread(self._safe_related_queries, pytrends, keywords),
                return_exceptions=True,
            )

            # Handle exceptions (the _safe_* helpers normally swallow their own errors)
            if isinstance(interest_over_time, Exception):
                logger.error(f"Interest over time failed: {interest_over_time}")
                interest_over_time = []
            if isinstance(interest_by_region, Exception):
                logger.error(f"Interest by region failed: {interest_by_region}")
                interest_by_region = []
            if isinstance(related_topics, Exception):
                logger.error(f"Related topics failed: {related_topics}")
                related_topics = {"top": [], "rising": []}
            if isinstance(related_queries, Exception):
                logger.error(f"Related queries failed: {related_queries}")
                related_queries = {"top": [], "rising": []}

            # Build result
            result = {
                "interest_over_time": interest_over_time,
                "interest_by_region": interest_by_region,
                "related_topics": related_topics,
                "related_queries": related_queries,
                "timeframe": timeframe,
                "geo": geo,
                "keywords": keywords,
                "timestamp": datetime.utcnow().isoformat(),
                "cached": False,
            }

            # Cache result only when something came back: the _safe_* helpers turn
            # failures (e.g. throttling) into empty data, and caching that would
            # pin an empty answer for the full TTL.
            has_data = bool(
                interest_over_time
                or interest_by_region
                or related_topics["top"]
                or related_topics["rising"]
                or related_queries["top"]
                or related_queries["rising"]
            )
            if has_data:
                self._save_to_cache(cache_key, result)
            else:
                logger.warning(f"Google Trends returned no data for: {keywords}; result not cached")

            logger.info(f"Google Trends data fetched successfully: {len(interest_over_time)} time points, {len(interest_by_region)} regions")

            return result

        except Exception as e:
            logger.error(f"Google Trends analysis failed: {e}")
            # Return fallback response
            return self._create_fallback_response(keywords, timeframe, geo, str(e))

    def _initialize_pytrends(
        self,
        keywords: List[str],
        timeframe: str,
        geo: str
    ) -> "TrendReq":
        """Initialize pytrends and build payload (sync operation)."""
        # The return annotation is a string so the class can still be defined
        # when the optional pytrends import failed.
        pytrends = TrendReq(hl='en-US', tz=360)
        pytrends.build_payload(kw_list=keywords, timeframe=timeframe, geo=geo)
        return pytrends

    def _safe_interest_over_time(self, pytrends: "TrendReq") -> List[Dict[str, Any]]:
        """Safely fetch interest over time data; returns [] on any error."""
        try:
            df = pytrends.interest_over_time()
            if df.empty:
                return []
            return self._format_dataframe(df.reset_index())
        except Exception as e:
            logger.error(f"Error fetching interest over time: {e}")
            return []

    def _safe_interest_by_region(self, pytrends: "TrendReq") -> List[Dict[str, Any]]:
        """Safely fetch interest by region data; returns [] on any error."""
        try:
            df = pytrends.interest_by_region(resolution='COUNTRY', inc_low_vol=True, inc_geo_code=False)
            if df.empty:
                return []
            return self._format_dataframe(df.reset_index())
        except Exception as e:
            logger.error(f"Error fetching interest by region: {e}")
            return []

    @staticmethod
    def _frame_records(frame: Any, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Convert one top/rising frame to records, tolerating None or empty frames."""
        # pytrends may hand back None instead of a DataFrame for a bucket.
        if not isinstance(frame, pd.DataFrame) or frame.empty:
            return []
        if columns is not None:
            if not set(columns).issubset(frame.columns):
                return []
            frame = frame[list(columns)]
        return frame.to_dict('records')

    def _collect_related(
        self,
        data: Any,
        keywords: List[str],
        columns: Optional[List[str]] = None,
    ) -> Dict[str, List[Dict[str, Any]]]:
        """Merge the per-keyword top/rising frames into one {"top", "rising"} dict."""
        result: Dict[str, List[Dict[str, Any]]] = {"top": [], "rising": []}
        if not isinstance(data, dict):
            return result

        for keyword in keywords:
            per_keyword = data.get(keyword)
            if not isinstance(per_keyword, dict):
                continue
            for bucket in ("top", "rising"):
                result[bucket].extend(self._frame_records(per_keyword.get(bucket), columns))

        return result

    def _safe_related_topics(
        self,
        pytrends: "TrendReq",
        keywords: List[str]
    ) -> Dict[str, List[Dict[str, Any]]]:
        """Safely fetch related topics (topic_title/value pairs) for all keywords."""
        try:
            topics_data = pytrends.related_topics()
            # Select relevant columns only
            return self._collect_related(topics_data, keywords, ["topic_title", "value"])
        except Exception as e:
            logger.error(f"Error fetching related topics: {e}")
            return {"top": [], "rising": []}

    def _safe_related_queries(
        self,
        pytrends: "TrendReq",
        keywords: List[str]
    ) -> Dict[str, List[Dict[str, Any]]]:
        """Safely fetch related queries for all keywords."""
        try:
            queries_data = pytrends.related_queries()
            return self._collect_related(queries_data, keywords)
        except Exception as e:
            logger.error(f"Error fetching related queries: {e}")
            return {"top": [], "rising": []}

    def _format_dataframe(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Convert DataFrame to list of dicts (serializable format).

        Datetime columns are rendered as strings. The input frame is not modified.
        """
        if df.empty:
            return []

        # Convert datetime columns to strings (astype returns a new frame)
        datetime_cols = [
            col for col in df.columns
            if pd.api.types.is_datetime64_any_dtype(df[col])
        ]
        if datetime_cols:
            df = df.astype({col: str for col in datetime_cols})

        # Convert to dict records
        return df.to_dict('records')

    def _build_cache_key(self, keywords: List[str], timeframe: str, geo: str) -> str:
        """Build cache key from parameters (keyword order does not matter)."""
        keywords_str = ":".join(sorted(keywords))
        return f"google_trends:{keywords_str}:{timeframe}:{geo}"

    @staticmethod
    def _entry_time(entry: Dict[str, Any]) -> Optional[datetime]:
        """Return when a cache entry was stored, or None if it has no usable timestamp."""
        raw = entry.get("cached_at") or entry.get("timestamp")
        try:
            return datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            return None

    def _get_from_cache(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get data from cache if present and not expired; otherwise None."""
        cached_entry = self.cache.get(cache_key)
        if cached_entry is None:
            return None

        cached_time = self._entry_time(cached_entry)
        if cached_time is None or datetime.utcnow() - cached_time > self.cache_ttl:
            # Expired or unreadable timestamp: drop it and treat as a miss
            del self.cache[cache_key]
            return None

        # Return cached data (without cached flag; the caller sets it)
        result = {**cached_entry}
        result.pop("cached", None)
        return result

    def _save_to_cache(self, cache_key: str, data: Dict[str, Any]):
        """Save data to cache, stamping it with the storage time."""
        self.cache[cache_key] = {
            **data,
            "cached_at": datetime.utcnow().isoformat(),
        }

        # Clean up when the cache outgrows its limit
        if len(self.cache) > self.MAX_CACHE_ENTRIES:
            self._cleanup_cache()

    def _cleanup_cache(self):
        """Remove expired cache entries, then evict the oldest ones if still over the limit."""
        now = datetime.utcnow()
        expired_keys = []

        for key, entry in self.cache.items():
            cached_time = self._entry_time(entry)
            if cached_time is None or now - cached_time > self.cache_ttl:
                expired_keys.append(key)

        for key in expired_keys:
            del self.cache[key]

        logger.debug(f"Cleaned up {len(expired_keys)} expired cache entries")

        # Every remaining entry has a valid timestamp, so sorting by it is safe.
        overflow = len(self.cache) - self.MAX_CACHE_ENTRIES
        if overflow > 0:
            oldest_first = sorted(self.cache, key=lambda k: self._entry_time(self.cache[k]))
            for key in oldest_first[:overflow]:
                del self.cache[key]
            logger.debug(f"Evicted {overflow} oldest cache entries to enforce size limit")

    def _create_fallback_response(
        self,
        keywords: List[str],
        timeframe: str,
        geo: str,
        error_message: str
    ) -> Dict[str, Any]:
        """Create fallback response (empty data plus "error") when trends analysis fails."""
        return {
            "interest_over_time": [],
            "interest_by_region": [],
            "related_topics": {"top": [], "rising": []},
            "related_queries": {"top": [], "rising": []},
            "timeframe": timeframe,
            "geo": geo,
            "keywords": keywords,
            "timestamp": datetime.utcnow().isoformat(),
            "cached": False,
            "error": error_message,
        }

    async def get_trending_searches(
        self,
        country: str = "united_states",
        user_id: Optional[str] = None
    ) -> List[str]:
        """
        Get current trending searches for a country.

        Args:
            country: Country name (e.g., "united_states", "united_kingdom")
            user_id: User ID for subscription checks (currently unused here)

        Returns:
            List of trending search terms; empty on error or when no data is returned
        """
        await self.rate_limiter.acquire()

        def _fetch():
            # Build the client inside the worker thread as well, matching
            # analyze_trends, so no synchronous pytrends call runs on the event loop.
            return TrendReq(hl='en-US', tz=360).trending_searches(pn=country)

        try:
            trending_df = await asyncio.to_thread(_fetch)

            if trending_df.empty:
                return []

            # Return as list of strings
            return trending_df[0].tolist() if len(trending_df.columns) > 0 else []

        except Exception as e:
            logger.error(f"Error fetching trending searches: {e}")
            return []
|
||||
57
backend/services/research/trends/rate_limiter.py
Normal file
57
backend/services/research/trends/rate_limiter.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""
|
||||
Rate Limiter for Google Trends API
|
||||
|
||||
Ensures we don't exceed Google Trends rate limits (1 request per second).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from time import time
|
||||
from collections import deque
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class RateLimiter:
    """
    Simple rate limiter for Google Trends API.

    Limits requests to max_calls per period (in seconds) using a sliding
    window of recent call timestamps.
    """

    def __init__(self, max_calls: int = 1, period: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            max_calls: Maximum number of calls allowed
            period: Time period in seconds
        """
        self.max_calls = max_calls
        self.period = period
        self.calls = deque()  # timestamps of recent calls, oldest first
        self._lock = asyncio.Lock()

    def _seconds_until_available(self, now: float) -> float:
        """
        Drop timestamps older than the window and report the required wait.

        Args:
            now: Current time in seconds (same clock as the stored timestamps)

        Returns:
            0.0 if fewer than max_calls remain in the window, otherwise the
            time until the oldest recorded call leaves the window (may be <= 0
            when that call sits exactly on the window boundary).
        """
        cutoff = now - self.period
        while self.calls and self.calls[0] < cutoff:
            self.calls.popleft()

        if len(self.calls) < self.max_calls:
            return 0.0
        return self.period - (now - self.calls[0])

    async def acquire(self):
        """
        Acquire permission to make a request.

        Will wait if rate limit would be exceeded. Waiters are served one at a
        time because the lock is held while sleeping.
        """
        async with self._lock:
            # Loop rather than recurse: asyncio.Lock is not reentrant, so calling
            # acquire() again while holding the lock would never return.
            while True:
                sleep_time = self._seconds_until_available(time())
                if sleep_time <= 0:
                    break
                logger.debug(f"Rate limit reached, waiting {sleep_time:.2f}s")
                await asyncio.sleep(sleep_time)

            # Record this call
            self.calls.append(time())
            logger.debug(f"Rate limit check passed, {len(self.calls)}/{self.max_calls} calls in period")
|
||||
Reference in New Issue
Block a user