AI Researcher and Video Studio implementation complete

This commit is contained in:
ajaysi
2026-01-05 15:49:51 +05:30
parent b134e9dc7e
commit 0b63ae7fc1
200 changed files with 39535 additions and 1375 deletions

View File

@@ -50,6 +50,7 @@ class IntentAwareAnalyzer:
raw_results: Dict[str, Any],
intent: ResearchIntent,
research_persona: Optional[ResearchPersona] = None,
user_id: Optional[str] = None,
) -> IntentDrivenResearchResult:
"""
Analyze raw research results based on user intent.
@@ -84,7 +85,7 @@ class IntentAwareAnalyzer:
result = llm_text_gen(
prompt=prompt,
json_struct=analysis_schema,
user_id=None
user_id=user_id # Required for subscription checking
)
if isinstance(result, dict) and "error" in result:

View File

@@ -151,6 +151,8 @@ Analyze the user's input and infer their research intent. Determine:
11. **CONFIDENCE**: How confident are you in this inference? (0.0-1.0)
- If < 0.7, set needs_clarification to true and provide clarifying_questions
- Provide a brief reason for your confidence level
- If confidence is low, provide an example of what a great input would look like
## OUTPUT FORMAT
@@ -168,6 +170,8 @@ Return a JSON object:
"perspective": "target perspective or null",
"time_sensitivity": "real_time|recent|historical|evergreen",
"confidence": 0.85,
"confidence_reason": "Brief explanation of why this confidence level (e.g., 'User provided clear keywords and context' or 'Input is vague, missing specific goals')",
"great_example": "Example of what a great input would look like for this research (only if confidence < 0.8)",
"needs_clarification": false,
"clarifying_questions": [],
"analysis_summary": "Brief summary of what the user wants"

View File

@@ -39,6 +39,7 @@ class IntentQueryGenerator:
self,
intent: ResearchIntent,
research_persona: Optional[ResearchPersona] = None,
user_id: Optional[str] = None,
) -> Dict[str, Any]:
"""
Generate targeted research queries based on intent.
@@ -89,7 +90,7 @@ class IntentQueryGenerator:
result = llm_text_gen(
prompt=prompt,
json_struct=query_schema,
user_id=None
user_id=user_id
)
if isinstance(result, dict) and "error" in result:

View File

@@ -51,6 +51,7 @@ class ResearchIntentInference:
competitor_data: Optional[List[Dict]] = None,
industry: Optional[str] = None,
target_audience: Optional[str] = None,
user_id: Optional[str] = None,
) -> IntentInferenceResponse:
"""
Analyze user input and infer their research intent.
@@ -96,13 +97,15 @@ class ResearchIntentInference:
"perspective": {"type": "string"},
"time_sensitivity": {"type": "string"},
"confidence": {"type": "number"},
"confidence_reason": {"type": "string"},
"great_example": {"type": "string"},
"needs_clarification": {"type": "boolean"},
"clarifying_questions": {"type": "array", "items": {"type": "string"}},
"analysis_summary": {"type": "string"}
},
"required": [
"input_type", "primary_question", "purpose", "content_output",
"expected_deliverables", "depth", "confidence", "analysis_summary"
"expected_deliverables", "depth", "confidence", "confidence_reason", "analysis_summary"
]
}
@@ -112,7 +115,7 @@ class ResearchIntentInference:
result = llm_text_gen(
prompt=prompt,
json_struct=intent_schema,
user_id=None
user_id=user_id
)
if isinstance(result, dict) and "error" in result:
@@ -134,6 +137,8 @@ class ResearchIntentInference:
suggested_keywords=self._extract_keywords_from_input(user_input, keywords),
suggested_angles=result.get("focus_areas", []),
quick_options=quick_options,
confidence_reason=result.get("confidence_reason", ""),
great_example=result.get("great_example", ""),
)
logger.info(f"Intent inferred: purpose={intent.purpose}, confidence={intent.confidence}")
@@ -166,7 +171,7 @@ class ResearchIntentInference:
if not expected_deliverables:
expected_deliverables = self._infer_deliverables_from_purpose(purpose)
return ResearchIntent(
intent = ResearchIntent(
primary_question=result.get("primary_question", user_input),
secondary_questions=result.get("secondary_questions", []),
purpose=purpose.value,
@@ -179,9 +184,13 @@ class ResearchIntentInference:
input_type=input_type.value,
original_input=user_input,
confidence=float(result.get("confidence", 0.7)),
confidence_reason=result.get("confidence_reason"),
great_example=result.get("great_example"),
needs_clarification=result.get("needs_clarification", False),
clarifying_questions=result.get("clarifying_questions", []),
)
return intent
def _safe_enum(self, enum_class, value: str, default):
"""Safely convert string to enum, returning default if invalid."""

View File

@@ -0,0 +1,559 @@
"""
Unified Research Analyzer
Combines intent inference, query generation, and parameter optimization
into a single AI call with justifications for each decision.
This reduces 2 LLM calls to 1, improves coherence, and provides
user-friendly justifications for all settings.
Author: ALwrity Team
Version: 1.0
"""
import json
from typing import Dict, Any, List, Optional, Tuple
from loguru import logger
from models.research_intent_models import (
ResearchIntent,
ResearchQuery,
IntentInferenceResponse,
ResearchPurpose,
ContentOutput,
ExpectedDeliverable,
ResearchDepthLevel,
InputType,
)
from models.research_persona_models import ResearchPersona
class UnifiedResearchAnalyzer:
    """
    Unified AI-driven analyzer that performs:
    1. Intent inference (what user wants)
    2. Query generation (how to search)
    3. Parameter optimization (Exa/Tavily settings)
    4. Google Trends configuration (when trends are relevant)

    All in a single LLM call with justifications.
    """

    # Values used when the LLM omits a field or returns something unusable.
    _DEFAULT_CONFIDENCE = 0.7
    _DEFAULT_QUERY_PRIORITY = 3
    # Upper bound on competitor names injected into the prompt.
    _MAX_COMPETITORS = 5

    def __init__(self):
        """Initialize the unified analyzer."""
        logger.info("UnifiedResearchAnalyzer initialized")

    async def analyze(
        self,
        user_input: str,
        keywords: Optional[List[str]] = None,
        # Quoted so the annotation is resolved lazily.
        research_persona: Optional["ResearchPersona"] = None,
        competitor_data: Optional[List[Dict]] = None,
        industry: Optional[str] = None,
        target_audience: Optional[str] = None,
        user_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Perform unified analysis of user research request.

        Never raises: any failure (LLM error payload, unexpected result type,
        exception while building or parsing) yields the fallback response
        with ``success=False``.

        Args:
            user_input: Raw research request typed by the user.
            keywords: Optional keywords supplied alongside the request.
            research_persona: Optional persona used to enrich the prompt.
            competitor_data: Optional list of competitor dicts (name/url).
            industry: Industry used when the persona does not provide one.
            target_audience: Audience used when the persona does not provide one.
            user_id: Forwarded to ``llm_text_gen``.

        Returns:
            Dict containing:
            - success: bool
            - intent: ResearchIntent
            - queries: List[ResearchQuery]
            - enhanced_keywords / research_angles: List[str]
            - exa_config: Dict with settings and justifications
            - tavily_config: Dict with settings and justifications
            - trends_config: Dict with Google Trends settings
            - recommended_provider: str
            - provider_justification: str
        """
        keywords = keywords or []
        try:
            logger.info(f"Unified analysis for: {user_input[:100]}...")

            # Build the unified prompt
            prompt = self._build_unified_prompt(
                user_input=user_input,
                keywords=keywords,
                research_persona=research_persona,
                competitor_data=competitor_data,
                industry=industry,
                target_audience=target_audience,
            )

            # Define the comprehensive JSON schema
            unified_schema = self._build_unified_schema()

            # Call LLM (single call for everything)
            from services.llm_providers.main_text_generation import llm_text_gen

            # NOTE(review): this call is synchronous inside an async method, so
            # it presumably blocks the event loop while the LLM responds -
            # confirm whether it should be moved to a worker thread.
            result = llm_text_gen(
                prompt=prompt,
                json_struct=unified_schema,
                user_id=user_id
            )

            # Anything other than a dict cannot be parsed; fall back explicitly
            # instead of relying on an AttributeError further down.
            if not isinstance(result, dict):
                logger.error(
                    f"Unified analysis failed: unexpected result type {type(result).__name__}"
                )
                return self._create_fallback_response(user_input, keywords)

            if "error" in result:
                logger.error(f"Unified analysis failed: {result.get('error')}")
                return self._create_fallback_response(user_input, keywords)

            # Parse the unified result
            return self._parse_unified_result(result, user_input)

        except Exception as e:
            # NOTE(review): this also swallows any subscription-limit exception
            # raised by llm_text_gen - verify that is intended for this path.
            logger.error(f"Error in unified analysis: {e}")
            return self._create_fallback_response(user_input, keywords)

    def _build_unified_prompt(
        self,
        user_input: str,
        keywords: List[str],
        research_persona: Optional["ResearchPersona"] = None,
        competitor_data: Optional[List[Dict]] = None,
        industry: Optional[str] = None,
        target_audience: Optional[str] = None,
    ) -> str:
        """
        Build the unified prompt for intent + queries + parameters.

        The Google Trends timeframe options offered to the LLM are limited to
        the formats documented by pytrends (``today #-m`` for 1/3/12 months,
        ``today 5-y`` and ``all``).
        """
        # Build persona context
        persona_context = self._build_persona_context(research_persona, industry, target_audience)

        # Build competitor context
        competitor_context = self._build_competitor_context(competitor_data)

        # Optional keywords line; left empty when no keywords were supplied.
        keywords_line = f"KEYWORDS: {', '.join(str(k) for k in keywords)}" if keywords else ""

        prompt = f'''You are an expert AI research strategist. Analyze the user's research request and provide a complete research plan including intent understanding, search queries, and optimal API settings.
## USER INPUT
"{user_input}"
{keywords_line}
## USER CONTEXT
{persona_context}
{competitor_context}
## YOUR TASK: Provide a Complete Research Plan
### PART 1: INTENT ANALYSIS
Understand what the user really wants from their research.
### PART 2: SEARCH QUERIES
Generate 4-8 targeted search queries optimized for semantic search.
### PART 3: PROVIDER SETTINGS
Configure Exa and Tavily API parameters with justifications.
### PART 4: GOOGLE TRENDS KEYWORDS (if trends in deliverables)
If "trends" is in expected_deliverables OR purpose is "explore_trends":
- Suggest 1-3 optimized keywords for Google Trends analysis
- These may differ from research queries (trends need broader, searchable terms)
- Consider: What keywords will show meaningful trends over time?
- Consider: What timeframe will show relevant trends? (3 months, 12 months, 5 years, etc.)
- Consider: What geographic region is most relevant for the user?
- Explain what insights trends will uncover for content generation:
* Search interest trends over time (optimal publication timing)
* Regional interest distribution (audience targeting)
* Related topics for content expansion
* Related queries for FAQ sections
* Rising topics for timely content opportunities
---
## AVAILABLE PROVIDER OPTIONS
### EXA API OPTIONS (Semantic Search Engine)
| Parameter | Options | Description |
|-----------|---------|-------------|
| type | "auto", "neural", "fast", "deep" | "neural" = semantic understanding, "deep" = comprehensive with query expansion |
| category | "company", "research paper", "news", "github", "tweet", "personal site", "pdf", "financial report", "people" | Focus on specific content types |
| numResults | 5-25 | Number of results (10 recommended) |
| includeDomains | string[] | Domains to include (e.g., ["arxiv.org", "nature.com"]) |
| excludeDomains | string[] | Domains to exclude |
| startPublishedDate | ISO date | Filter by publish date (e.g., "2024-01-01T00:00:00.000Z") |
| text | boolean | Include full text content |
| highlights | boolean | Extract key highlights |
| context | boolean | Return as single context string for RAG |
**WHEN TO USE EXA:**
- Semantic understanding needed (finding similar content)
- Academic/research papers
- Company/competitor research
- Deep, comprehensive results
- Historical content
### TAVILY API OPTIONS (AI-Powered Search)
| Parameter | Options | Description |
|-----------|---------|-------------|
| topic | "general", "news", "finance" | Search topic category |
| search_depth | "basic", "advanced" | "advanced" = multiple semantic snippets per URL |
| include_answer | false, true, "basic", "advanced" | AI-generated answer from results |
| include_raw_content | false, true, "markdown", "text" | Raw page content format |
| time_range | "day", "week", "month", "year" | Filter by recency |
| max_results | 5-20 | Number of results |
| include_domains | string[] | Domains to include |
| exclude_domains | string[] | Domains to exclude |
**WHEN TO USE TAVILY:**
- Real-time/current events
- News and trending topics
- Quick facts with AI answers
- Financial data
- Recent time-sensitive content
---
## OUTPUT FORMAT
Return a JSON object with this exact structure:
```json
{{
"intent": {{
"input_type": "keywords|question|goal|mixed",
"primary_question": "The main question to answer",
"secondary_questions": ["question 1", "question 2"],
"purpose": "learn|create_content|make_decision|compare|solve_problem|find_data|explore_trends|validate|generate_ideas",
"content_output": "blog|podcast|video|social_post|newsletter|presentation|report|whitepaper|email|general",
"expected_deliverables": ["key_statistics", "expert_quotes", "case_studies", "trends", "best_practices"],
"depth": "overview|detailed|expert",
"focus_areas": ["area1", "area2"],
"perspective": "target perspective or null",
"time_sensitivity": "real_time|recent|historical|evergreen",
"confidence": 0.85,
"confidence_reason": "Why this confidence level",
"great_example": "Example of better input if confidence < 0.8",
"needs_clarification": false,
"clarifying_questions": [],
"analysis_summary": "Brief summary of research plan"
}},
"queries": [
{{
"query": "Optimized search query string",
"purpose": "key_statistics|expert_quotes|case_studies|trends|etc",
"provider": "exa|tavily",
"priority": 5,
"expected_results": "What we expect to find",
"justification": "Why this query and provider"
}}
],
"enhanced_keywords": ["expanded", "related", "keywords"],
"research_angles": ["Angle 1: ...", "Angle 2: ..."],
"recommended_provider": "exa|tavily",
"provider_justification": "Why this provider is best for this research",
"exa_config": {{
"enabled": true,
"type": "auto|neural|fast|deep",
"type_justification": "Why this search type",
"category": "news|research paper|company|etc or null",
"category_justification": "Why this category or null",
"numResults": 10,
"numResults_justification": "Why this number",
"includeDomains": [],
"includeDomains_justification": "Why these domains or empty",
"startPublishedDate": "2024-01-01T00:00:00.000Z or null",
"date_justification": "Why this date filter or null",
"highlights": true,
"highlights_justification": "Why enable/disable highlights",
"context": true,
"context_justification": "Why enable/disable context string"
}},
"tavily_config": {{
"enabled": true,
"topic": "general|news|finance",
"topic_justification": "Why this topic",
"search_depth": "basic|advanced",
"search_depth_justification": "Why this depth",
"include_answer": "true|false|basic|advanced",
"include_answer_justification": "Why this answer mode",
"time_range": "day|week|month|year|null",
"time_range_justification": "Why this time range or null",
"max_results": 10,
"max_results_justification": "Why this number",
"include_raw_content": "false|true|markdown|text",
"include_raw_content_justification": "Why this content mode"
}},
"trends_config": {{
"enabled": true|false,
"keywords": ["keyword1", "keyword2"],
"keywords_justification": "Why these keywords for trends analysis",
"timeframe": "today 3-m|today 12-m|today 5-y|all",
"timeframe_justification": "Why this timeframe",
"geo": "US|GB|IN|etc",
"geo_justification": "Why this geographic region",
"expected_insights": [
"Search interest trends over the past year",
"Regional interest distribution",
"Related topics for content expansion",
"Related queries for FAQ sections",
"Optimal publication timing based on interest peaks"
]
}}
}}
```
## DECISION RULES
1. **Provider Selection:**
- Use EXA for: academic research, competitor analysis, deep understanding, finding similar content
- Use TAVILY for: news, current events, quick facts, financial data, real-time info
2. **Query Optimization:**
- Include relevant keywords for semantic matching
- Add context words based on deliverables (e.g., "statistics 2024" for key_statistics)
- Match query style to provider (natural language for Exa, keyword-rich for Tavily)
3. **Parameter Selection:**
- ALWAYS provide justification for each parameter choice
- Consider time sensitivity when setting date filters
- Match category/topic to content type
- Use "advanced" depth when quality matters more than speed
4. **Google Trends Keywords (if trends enabled):**
- Suggest 1-3 keywords optimized for trends analysis
- Keywords should be broader than research queries (e.g., "AI marketing" vs "AI marketing tools for small businesses")
- Consider what will show meaningful search interest trends
- Choose timeframe based on content type (12 months for blogs, 5 years for comprehensive)
- Select geo based on user's target audience or industry
- List specific insights trends will uncover
5. **Justifications:**
- Keep justifications concise (1 sentence)
- Explain the "why" not the "what"
- Reference user's intent when relevant
'''
        return prompt

    def _build_unified_schema(self) -> Dict[str, Any]:
        """Build the JSON schema for unified response."""
        return {
            "type": "object",
            "properties": {
                "intent": {
                    "type": "object",
                    "properties": {
                        "input_type": {"type": "string", "enum": ["keywords", "question", "goal", "mixed"]},
                        "primary_question": {"type": "string"},
                        "secondary_questions": {"type": "array", "items": {"type": "string"}},
                        "purpose": {"type": "string"},
                        "content_output": {"type": "string"},
                        "expected_deliverables": {"type": "array", "items": {"type": "string"}},
                        "depth": {"type": "string", "enum": ["overview", "detailed", "expert"]},
                        "focus_areas": {"type": "array", "items": {"type": "string"}},
                        "perspective": {"type": "string"},
                        "time_sensitivity": {"type": "string"},
                        "confidence": {"type": "number"},
                        "confidence_reason": {"type": "string"},
                        "great_example": {"type": "string"},
                        "needs_clarification": {"type": "boolean"},
                        "clarifying_questions": {"type": "array", "items": {"type": "string"}},
                        "analysis_summary": {"type": "string"}
                    },
                    "required": ["primary_question", "purpose", "expected_deliverables", "confidence"]
                },
                "queries": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "query": {"type": "string"},
                            "purpose": {"type": "string"},
                            "provider": {"type": "string"},
                            "priority": {"type": "integer"},
                            "expected_results": {"type": "string"},
                            "justification": {"type": "string"}
                        },
                        "required": ["query", "purpose", "provider", "priority"]
                    }
                },
                "enhanced_keywords": {"type": "array", "items": {"type": "string"}},
                "research_angles": {"type": "array", "items": {"type": "string"}},
                "recommended_provider": {"type": "string"},
                "provider_justification": {"type": "string"},
                "exa_config": {
                    "type": "object",
                    "properties": {
                        "enabled": {"type": "boolean"},
                        "type": {"type": "string"},
                        "type_justification": {"type": "string"},
                        "category": {"type": "string"},
                        "category_justification": {"type": "string"},
                        "numResults": {"type": "integer"},
                        "numResults_justification": {"type": "string"},
                        "includeDomains": {"type": "array", "items": {"type": "string"}},
                        "includeDomains_justification": {"type": "string"},
                        "startPublishedDate": {"type": "string"},
                        "date_justification": {"type": "string"},
                        "highlights": {"type": "boolean"},
                        "highlights_justification": {"type": "string"},
                        "context": {"type": "boolean"},
                        "context_justification": {"type": "string"}
                    }
                },
                "tavily_config": {
                    "type": "object",
                    "properties": {
                        "enabled": {"type": "boolean"},
                        "topic": {"type": "string"},
                        "topic_justification": {"type": "string"},
                        "search_depth": {"type": "string"},
                        "search_depth_justification": {"type": "string"},
                        "include_answer": {"type": "string"},
                        "include_answer_justification": {"type": "string"},
                        "time_range": {"type": "string"},
                        "time_range_justification": {"type": "string"},
                        "max_results": {"type": "integer"},
                        "max_results_justification": {"type": "string"},
                        "include_raw_content": {"type": "string"},
                        "include_raw_content_justification": {"type": "string"}
                    }
                },
                "trends_config": {
                    "type": "object",
                    "properties": {
                        "enabled": {"type": "boolean"},
                        "keywords": {"type": "array", "items": {"type": "string"}},
                        "keywords_justification": {"type": "string"},
                        "timeframe": {"type": "string"},
                        "timeframe_justification": {"type": "string"},
                        "geo": {"type": "string"},
                        "geo_justification": {"type": "string"},
                        "expected_insights": {"type": "array", "items": {"type": "string"}}
                    }
                }
            },
            "required": ["intent", "queries", "recommended_provider", "exa_config", "tavily_config"]
        }

    def _build_persona_context(
        self,
        research_persona: Optional["ResearchPersona"],
        industry: Optional[str],
        target_audience: Optional[str],
    ) -> str:
        """
        Build persona context section.

        Persona values take precedence. The explicit ``industry`` and
        ``target_audience`` arguments are used when no persona is given or
        when the persona leaves the corresponding field empty, so caller
        supplied context is never silently dropped.
        """
        industry_value = industry
        audience_value = target_audience
        angles: List[str] = []
        persona_keywords: List[str] = []

        if research_persona:
            industry_value = research_persona.default_industry or industry
            audience_value = research_persona.default_target_audience or target_audience
            angles = research_persona.research_angles or []
            persona_keywords = research_persona.suggested_keywords or []

        parts = []
        if industry_value:
            parts.append(f"Industry: {industry_value}")
        if audience_value:
            parts.append(f"Target Audience: {audience_value}")
        if angles:
            parts.append(f"Preferred Research Angles: {', '.join(angles[:3])}")
        if persona_keywords:
            parts.append(f"Relevant Keywords: {', '.join(persona_keywords[:5])}")

        if not parts:
            return "No specific user context available. Use general best practices."
        return "\n".join(parts)

    def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str:
        """
        Build competitor context section.

        Uses each competitor's ``name`` and falls back to ``url`` when the
        name is missing or empty. Entries that are not dicts or have neither
        value are ignored; at most five names are listed. Returns an empty
        string when nothing usable is left.
        """
        if not competitor_data:
            return ""

        competitor_names = []
        for competitor in competitor_data:
            if not isinstance(competitor, dict):
                continue
            # `or` (not a .get default) so a present-but-null name falls back to url.
            label = competitor.get("name") or competitor.get("url")
            if label:
                competitor_names.append(str(label))
            if len(competitor_names) == self._MAX_COMPETITORS:
                break

        if competitor_names:
            return f"\nKnown Competitors: {', '.join(competitor_names)}"
        return ""

    @staticmethod
    def _coerce_confidence(raw: Any, default: float = 0.7) -> float:
        """Convert an LLM-provided confidence to a float in [0.0, 1.0], or ``default`` if unusable."""
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return default
        if value != value:  # NaN never equals itself
            return default
        return min(1.0, max(0.0, value))

    @staticmethod
    def _coerce_priority(raw: Any, default: int = 3) -> int:
        """Convert an LLM-provided priority to int, or ``default`` if missing/invalid."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return default

    def _parse_unified_result(self, result: Dict[str, Any], user_input: str) -> Dict[str, Any]:
        """
        Parse the unified LLM result into structured response.

        Missing keys and explicit JSON nulls are both replaced by defaults, so
        one null field does not discard an otherwise usable plan. Queries that
        are malformed or blank are skipped with a warning.
        """
        intent_data = result.get("intent")
        if not isinstance(intent_data, dict):
            intent_data = {}

        # Build ResearchIntent
        intent = ResearchIntent(
            primary_question=intent_data.get("primary_question") or user_input,
            secondary_questions=intent_data.get("secondary_questions") or [],
            purpose=intent_data.get("purpose") or "learn",
            content_output=intent_data.get("content_output") or "general",
            expected_deliverables=intent_data.get("expected_deliverables") or ["key_statistics"],
            depth=intent_data.get("depth") or "detailed",
            focus_areas=intent_data.get("focus_areas") or [],
            perspective=intent_data.get("perspective"),
            time_sensitivity=intent_data.get("time_sensitivity"),
            input_type=intent_data.get("input_type") or "keywords",
            original_input=user_input,
            confidence=self._coerce_confidence(
                intent_data.get("confidence"), self._DEFAULT_CONFIDENCE
            ),
            confidence_reason=intent_data.get("confidence_reason"),
            great_example=intent_data.get("great_example"),
            needs_clarification=bool(intent_data.get("needs_clarification")),
            clarifying_questions=intent_data.get("clarifying_questions") or [],
        )

        # Build queries
        queries = []
        for raw_query in result.get("queries") or []:
            if not isinstance(raw_query, dict):
                logger.warning(
                    f"Failed to parse query: expected object, got {type(raw_query).__name__}"
                )
                continue

            query_text = str(raw_query.get("query") or "").strip()
            if not query_text:
                # A blank query cannot be sent to any search provider.
                logger.warning("Failed to parse query: empty query text")
                continue

            try:
                queries.append(ResearchQuery(
                    query=query_text,
                    purpose=raw_query.get("purpose") or "key_statistics",
                    provider=raw_query.get("provider") or "exa",
                    priority=self._coerce_priority(
                        raw_query.get("priority"), self._DEFAULT_QUERY_PRIORITY
                    ),
                    expected_results=raw_query.get("expected_results") or "",
                ))
            except Exception as e:
                logger.warning(f"Failed to parse query: {e}")

        return {
            "success": True,
            "intent": intent,
            "queries": queries,
            "enhanced_keywords": result.get("enhanced_keywords") or [],
            "research_angles": result.get("research_angles") or [],
            "recommended_provider": result.get("recommended_provider") or "exa",
            "provider_justification": result.get("provider_justification") or "",
            "exa_config": result.get("exa_config") or {},
            "tavily_config": result.get("tavily_config") or {},
            "trends_config": result.get("trends_config") or {},  # Google Trends configuration
            "analysis_summary": intent_data.get("analysis_summary") or "",
        }

    def _create_fallback_response(self, user_input: str, keywords: List[str]) -> Dict[str, Any]:
        """
        Create fallback response when analysis fails.

        Returns the same keys as a successful parse, with ``success=False``,
        a single Exa query built from the raw input, and trends disabled.
        """
        return {
            "success": False,
            "intent": ResearchIntent(
                primary_question=f"What are the key insights about: {user_input}?",
                purpose="learn",
                content_output="general",
                expected_deliverables=["key_statistics", "best_practices"],
                depth="detailed",
                original_input=user_input,
                confidence=0.5,
            ),
            "queries": [
                ResearchQuery(
                    query=user_input,
                    purpose="key_statistics",
                    provider="exa",
                    priority=5,
                    expected_results="General research results",
                )
            ],
            "enhanced_keywords": keywords,
            "research_angles": [],
            "recommended_provider": "exa",
            "provider_justification": "Default fallback to Exa for semantic search",
            "exa_config": {
                "enabled": True,
                "type": "auto",
                "type_justification": "Auto mode for balanced results",
                "numResults": 10,
                "highlights": True,
            },
            "tavily_config": {
                "enabled": True,
                "topic": "general",
                "search_depth": "advanced",
                "include_answer": True,
            },
            "trends_config": {
                "enabled": False,  # Disabled in fallback
            },
        }

View File

@@ -34,39 +34,81 @@ class ResearchPersonaService:
user_id: str
) -> Optional[ResearchPersona]:
"""
Get research persona for user ONLY if it exists in cache.
This method NEVER generates - it only returns cached personas.
Get research persona for user if it exists in database (regardless of cache validity).
This method NEVER generates - it only returns existing personas.
Use this for config endpoints to avoid triggering rate limit checks.
Note: Returns persona even if cache is expired - cache validity only matters for regeneration.
Args:
user_id: User ID (Clerk string)
Returns:
ResearchPersona if cached and valid, None otherwise
ResearchPersona if exists in database, None otherwise
"""
try:
# Get persona data record
persona_data = self._get_persona_data_record(user_id)
if not persona_data:
logger.debug(f"No persona data found for user {user_id}")
logger.debug(f"[get_cached_only] No persona data record found for user {user_id}")
return None
# Only return if cache is valid and persona exists
if self.is_cache_valid(persona_data) and persona_data.research_persona:
# Check if research_persona field exists and is not None/empty
# Handle cases where it might be None, empty dict {}, or empty string ""
research_persona_raw = persona_data.research_persona
has_persona = (
research_persona_raw is not None
and research_persona_raw != {}
and research_persona_raw != ""
and (isinstance(research_persona_raw, dict) and len(research_persona_raw) > 0)
)
logger.info(
f"[get_cached_only] Checking research persona for user {user_id}: "
f"persona_data exists=True, research_persona_raw={research_persona_raw is not None}, "
f"research_persona type={type(research_persona_raw)}, "
f"has_persona={has_persona}, "
f"generated_at={persona_data.research_persona_generated_at}"
)
# Return persona if it exists, regardless of cache validity
# Cache validity only matters when deciding whether to regenerate
if has_persona:
try:
logger.debug(f"Returning cached research persona for user {user_id}")
return ResearchPersona(**persona_data.research_persona)
cache_valid = self.is_cache_valid(persona_data)
cache_status = "valid" if cache_valid else "expired"
logger.info(
f"[get_cached_only] ✅ Returning research persona for user {user_id} "
f"(cache: {cache_status}, generated_at: {persona_data.research_persona_generated_at})"
)
# Ensure we're passing a dict to ResearchPersona
if not isinstance(research_persona_raw, dict):
logger.error(f"[get_cached_only] research_persona_raw is not a dict: {type(research_persona_raw)}")
return None
parsed_persona = ResearchPersona(**research_persona_raw)
logger.info(
f"[get_cached_only] ✅ Successfully parsed persona for user {user_id}: "
f"industry={parsed_persona.default_industry}, "
f"target_audience={parsed_persona.default_target_audience}"
)
return parsed_persona
except Exception as e:
logger.warning(f"Failed to parse cached research persona: {e}")
logger.error(f"[get_cached_only] ❌ Failed to parse research persona for user {user_id}: {e}", exc_info=True)
logger.debug(
f"[get_cached_only] Persona data details: "
f"type={type(research_persona_raw)}, "
f"is_dict={isinstance(research_persona_raw, dict)}, "
f"value sample: {str(research_persona_raw)[:500] if research_persona_raw else 'None'}"
)
return None
# Cache invalid or persona missing - return None (don't generate)
logger.debug(f"No valid cached research persona for user {user_id}")
# Persona doesn't exist in database
logger.info(f"[get_cached_only] ⚠️ No research persona found in database for user {user_id}")
return None
except Exception as e:
logger.error(f"Error getting cached research persona for user {user_id}: {e}")
logger.error(f"[get_cached_only] ❌ Error getting research persona for user {user_id}: {e}", exc_info=True)
return None
def get_or_generate(
@@ -92,25 +134,40 @@ class ResearchPersonaService:
logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
return None
# Check cache if not forcing refresh
if not force_refresh and self.is_cache_valid(persona_data):
if persona_data.research_persona:
# Check if persona exists in database
if persona_data.research_persona:
# Persona exists - check if we should return it or regenerate
cache_valid = self.is_cache_valid(persona_data)
if not force_refresh and cache_valid:
# Cache is valid - return existing persona
logger.info(f"Using cached research persona for user {user_id}")
try:
return ResearchPersona(**persona_data.research_persona)
except Exception as e:
logger.warning(f"Failed to parse cached research persona: {e}, regenerating...")
# Fall through to regeneration
# Fall through to regeneration if parsing fails
elif not force_refresh:
# Persona exists but cache expired - return it anyway (don't regenerate unless forced)
logger.info(f"Research persona exists for user {user_id} but cache expired - returning existing persona (use force_refresh=true to regenerate)")
try:
return ResearchPersona(**persona_data.research_persona)
except Exception as e:
logger.warning(f"Failed to parse existing research persona: {e}, regenerating...")
# Fall through to regeneration if parsing fails
else:
logger.info(f"Research persona missing for user {user_id}, generating...")
else:
if force_refresh:
# force_refresh=True - regenerate even though persona exists
logger.info(f"Forcing refresh of research persona for user {user_id}")
else:
logger.info(f"Cache expired for user {user_id}, regenerating...")
else:
# Persona doesn't exist - generate new one
logger.info(f"Research persona missing for user {user_id}, generating...")
# Generate new research persona
# Generate new research persona (only reaches here if:
# 1. Persona doesn't exist, OR
# 2. force_refresh=True, OR
# 3. Parsing of existing persona failed
try:
logger.info(f"Generating research persona for user {user_id}")
research_persona = self.generate_research_persona(user_id)
except HTTPException:
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API

View File

@@ -0,0 +1,9 @@
"""
Google Trends Research Service
Provides Google Trends data integration for the Research Engine.
"""
from .google_trends_service import GoogleTrendsService
__all__ = ['GoogleTrendsService']

View File

@@ -0,0 +1,380 @@
"""
Google Trends Service
Provides Google Trends data integration for the Research Engine.
Handles rate limiting, caching, error handling, and data serialization.
Author: ALwrity Team
Version: 1.0
"""
import asyncio
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from loguru import logger
import pandas as pd
try:
from pytrends.request import TrendReq
PYTrends_AVAILABLE = True
except ImportError:
PYTrends_AVAILABLE = False
logger.warning("pytrends not installed. Google Trends features will be unavailable.")
from .rate_limiter import RateLimiter
class GoogleTrendsService:
"""
Service for fetching and analyzing Google Trends data.
Features:
- Interest over time
- Interest by region
- Related topics
- Related queries
- Rate limiting (1 req/sec)
- Caching (24-hour TTL)
- Async support
- Error handling with retry logic
"""
def __init__(self):
    """
    Initialize the Google Trends service.

    Sets up the rate limiter, an empty in-memory cache and the cache TTL.

    Raises:
        RuntimeError: If the module-level ``pytrends`` import failed
            (``PYTrends_AVAILABLE`` is False).
    """
    # Fail fast at construction time rather than on the first trends request.
    if not PYTrends_AVAILABLE:
        raise RuntimeError("pytrends library is required. Install with: pip install pytrends")
    self.rate_limiter = RateLimiter(max_calls=1, period=1.0) # 1 request per second
    # NOTE(review): plain per-instance dict; nothing in this method bounds its
    # size or evicts entries - confirm expiry is handled where the cache is read.
    self.cache: Dict[str, Dict[str, Any]] = {} # Simple in-memory cache
    self.cache_ttl = timedelta(hours=24) # 24-hour cache
    logger.info("GoogleTrendsService initialized")
async def analyze_trends(
self,
keywords: List[str],
timeframe: str = "today 12-m",
geo: str = "US",
user_id: Optional[str] = None,
) -> Dict[str, Any]:
"""
Comprehensive trends analysis.
Fetches all trends data in a single optimized call:
- Interest over time
- Interest by region
- Related topics (top & rising)
- Related queries (top & rising)
Args:
keywords: List of keywords to analyze (1-5 keywords recommended)
timeframe: Timeframe string (e.g., "today 12-m", "today 1-y", "all")
geo: Country code (e.g., "US", "GB", "IN")
user_id: User ID for subscription checks (optional for now)
Returns:
Dict containing all trends data in serializable format
Raises:
ValueError: If keywords list is empty or too long
RuntimeError: If pytrends is not available or API fails
"""
if not keywords:
raise ValueError("Keywords list cannot be empty")
if len(keywords) > 5:
logger.warning(f"Too many keywords ({len(keywords)}), using first 5")
keywords = keywords[:5]
# Check cache first
cache_key = self._build_cache_key(keywords, timeframe, geo)
cached_data = self._get_from_cache(cache_key)
if cached_data:
logger.info(f"Returning cached trends data for: {keywords}")
return {**cached_data, "cached": True}
# Rate limit
await self.rate_limiter.acquire()
try:
logger.info(f"Fetching Google Trends data for: {keywords} (timeframe: {timeframe}, geo: {geo})")
# Initialize pytrends (sync operation, run in thread)
pytrends = await asyncio.to_thread(
self._initialize_pytrends,
keywords,
timeframe,
geo
)
# Fetch all data in parallel (pytrends methods are sync, so use to_thread)
interest_over_time_task = asyncio.to_thread(
lambda: self._safe_interest_over_time(pytrends)
)
interest_by_region_task = asyncio.to_thread(
lambda: self._safe_interest_by_region(pytrends)
)
related_topics_task = asyncio.to_thread(
lambda: self._safe_related_topics(pytrends, keywords)
)
related_queries_task = asyncio.to_thread(
lambda: self._safe_related_queries(pytrends, keywords)
)
# Wait for all tasks
interest_over_time, interest_by_region, related_topics, related_queries = await asyncio.gather(
interest_over_time_task,
interest_by_region_task,
related_topics_task,
related_queries_task,
return_exceptions=True
)
# Handle exceptions
if isinstance(interest_over_time, Exception):
logger.error(f"Interest over time failed: {interest_over_time}")
interest_over_time = []
if isinstance(interest_by_region, Exception):
logger.error(f"Interest by region failed: {interest_by_region}")
interest_by_region = []
if isinstance(related_topics, Exception):
logger.error(f"Related topics failed: {related_topics}")
related_topics = {"top": [], "rising": []}
if isinstance(related_queries, Exception):
logger.error(f"Related queries failed: {related_queries}")
related_queries = {"top": [], "rising": []}
# Build result
result = {
"interest_over_time": interest_over_time,
"interest_by_region": interest_by_region,
"related_topics": related_topics,
"related_queries": related_queries,
"timeframe": timeframe,
"geo": geo,
"keywords": keywords,
"timestamp": datetime.utcnow().isoformat(),
"cached": False
}
# Cache result
self._save_to_cache(cache_key, result)
logger.info(f"Google Trends data fetched successfully: {len(interest_over_time)} time points, {len(interest_by_region)} regions")
return result
except Exception as e:
logger.error(f"Google Trends analysis failed: {e}")
# Return fallback response
return self._create_fallback_response(keywords, timeframe, geo, str(e))
def _initialize_pytrends(
self,
keywords: List[str],
timeframe: str,
geo: str
) -> TrendReq:
"""Initialize pytrends and build payload (sync operation)."""
pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(kw_list=keywords, timeframe=timeframe, geo=geo)
return pytrends
def _safe_interest_over_time(self, pytrends: TrendReq) -> List[Dict[str, Any]]:
"""Safely fetch interest over time data."""
try:
df = pytrends.interest_over_time()
if df.empty:
return []
return self._format_dataframe(df.reset_index())
except Exception as e:
logger.error(f"Error fetching interest over time: {e}")
return []
def _safe_interest_by_region(self, pytrends: TrendReq) -> List[Dict[str, Any]]:
"""Safely fetch interest by region data."""
try:
df = pytrends.interest_by_region(resolution='COUNTRY', inc_low_vol=True, inc_geo_code=False)
if df.empty:
return []
return self._format_dataframe(df.reset_index())
except Exception as e:
logger.error(f"Error fetching interest by region: {e}")
return []
def _safe_related_topics(
self,
pytrends: TrendReq,
keywords: List[str]
) -> Dict[str, List[Dict[str, Any]]]:
"""Safely fetch related topics."""
try:
topics_data = pytrends.related_topics()
result = {"top": [], "rising": []}
for keyword in keywords:
if keyword in topics_data and isinstance(topics_data[keyword], dict):
keyword_topics = topics_data[keyword]
if "top" in keyword_topics and not keyword_topics["top"].empty:
top_df = keyword_topics["top"]
# Select relevant columns
if "topic_title" in top_df.columns and "value" in top_df.columns:
top_data = top_df[["topic_title", "value"]].to_dict('records')
result["top"].extend(top_data)
if "rising" in keyword_topics and not keyword_topics["rising"].empty:
rising_df = keyword_topics["rising"]
if "topic_title" in rising_df.columns and "value" in rising_df.columns:
rising_data = rising_df[["topic_title", "value"]].to_dict('records')
result["rising"].extend(rising_data)
return result
except Exception as e:
logger.error(f"Error fetching related topics: {e}")
return {"top": [], "rising": []}
def _safe_related_queries(
self,
pytrends: TrendReq,
keywords: List[str]
) -> Dict[str, List[Dict[str, Any]]]:
"""Safely fetch related queries."""
try:
queries_data = pytrends.related_queries()
result = {"top": [], "rising": []}
for keyword in keywords:
if keyword in queries_data and isinstance(queries_data[keyword], dict):
keyword_queries = queries_data[keyword]
if "top" in keyword_queries and not keyword_queries["top"].empty:
top_df = keyword_queries["top"]
result["top"].extend(top_df.to_dict('records'))
if "rising" in keyword_queries and not keyword_queries["rising"].empty:
rising_df = keyword_queries["rising"]
result["rising"].extend(rising_df.to_dict('records'))
return result
except Exception as e:
logger.error(f"Error fetching related queries: {e}")
return {"top": [], "rising": []}
def _format_dataframe(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
"""Convert DataFrame to list of dicts (serializable format)."""
if df.empty:
return []
# Convert datetime columns to strings
for col in df.columns:
if pd.api.types.is_datetime64_any_dtype(df[col]):
df[col] = df[col].astype(str)
# Convert to dict records
return df.to_dict('records')
def _build_cache_key(self, keywords: List[str], timeframe: str, geo: str) -> str:
"""Build cache key from parameters."""
keywords_str = ":".join(sorted(keywords))
return f"google_trends:{keywords_str}:{timeframe}:{geo}"
def _get_from_cache(self, cache_key: str) -> Optional[Dict[str, Any]]:
"""Get data from cache if not expired."""
if cache_key not in self.cache:
return None
cached_entry = self.cache[cache_key]
cached_time = datetime.fromisoformat(cached_entry.get("timestamp", ""))
if datetime.utcnow() - cached_time > self.cache_ttl:
# Expired, remove from cache
del self.cache[cache_key]
return None
# Return cached data (without cached flag)
result = {**cached_entry}
result.pop("cached", None)
return result
def _save_to_cache(self, cache_key: str, data: Dict[str, Any]):
"""Save data to cache."""
# Store with timestamp
cache_entry = {
**data,
"cached_at": datetime.utcnow().isoformat()
}
self.cache[cache_key] = cache_entry
# Clean up old cache entries periodically
if len(self.cache) > 100: # Limit cache size
self._cleanup_cache()
def _cleanup_cache(self):
"""Remove expired cache entries."""
now = datetime.utcnow()
expired_keys = []
for key, entry in self.cache.items():
cached_time = datetime.fromisoformat(entry.get("cached_at", entry.get("timestamp", "")))
if now - cached_time > self.cache_ttl:
expired_keys.append(key)
for key in expired_keys:
del self.cache[key]
logger.debug(f"Cleaned up {len(expired_keys)} expired cache entries")
def _create_fallback_response(
self,
keywords: List[str],
timeframe: str,
geo: str,
error_message: str
) -> Dict[str, Any]:
"""Create fallback response when trends analysis fails."""
return {
"interest_over_time": [],
"interest_by_region": [],
"related_topics": {"top": [], "rising": []},
"related_queries": {"top": [], "rising": []},
"timeframe": timeframe,
"geo": geo,
"keywords": keywords,
"timestamp": datetime.utcnow().isoformat(),
"cached": False,
"error": error_message
}
async def get_trending_searches(
self,
country: str = "united_states",
user_id: Optional[str] = None
) -> List[str]:
"""
Get current trending searches for a country.
Args:
country: Country name (e.g., "united_states", "united_kingdom")
user_id: User ID for subscription checks
Returns:
List of trending search terms
"""
await self.rate_limiter.acquire()
try:
pytrends = TrendReq(hl='en-US', tz=360)
trending_df = await asyncio.to_thread(
lambda: pytrends.trending_searches(pn=country)
)
if trending_df.empty:
return []
# Return as list of strings
return trending_df[0].tolist() if len(trending_df.columns) > 0 else []
except Exception as e:
logger.error(f"Error fetching trending searches: {e}")
return []

View File

@@ -0,0 +1,57 @@
"""
Rate Limiter for Google Trends API
Ensures we don't exceed Google Trends rate limits (1 request per second).
"""
import asyncio
from time import time
from collections import deque
from loguru import logger
class RateLimiter:
    """
    Simple sliding-window rate limiter for Google Trends API.

    Limits requests to max_calls per period (in seconds). Timestamps of the
    calls made within the current window are kept in ``calls``.
    """

    def __init__(self, max_calls: int = 1, period: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            max_calls: Maximum number of calls allowed per period (at least 1)
            period: Time period in seconds

        Raises:
            ValueError: If max_calls is smaller than 1
        """
        if max_calls < 1:
            # With no allowed calls, acquire() could never succeed.
            raise ValueError("max_calls must be at least 1")

        self.max_calls = max_calls
        self.period = period
        self.calls = deque()
        self._lock = asyncio.Lock()

    async def acquire(self):
        """
        Acquire permission to make a request.

        Will wait if rate limit would be exceeded. Concurrent callers are
        serialized by an internal lock, which is held while waiting.
        """
        async with self._lock:
            # Loop instead of recursing: asyncio.Lock is not reentrant, so calling
            # acquire() again while holding the lock would wait on itself forever.
            while True:
                now = time()
                cutoff = now - self.period

                # Remove old calls outside the period
                while self.calls and self.calls[0] <= cutoff:
                    self.calls.popleft()

                if len(self.calls) < self.max_calls:
                    break

                # At limit: wait until the oldest call leaves the window.
                # calls[0] > cutoff here, so the wait is always positive.
                sleep_time = self.calls[0] - cutoff
                logger.debug(f"Rate limit reached, waiting {sleep_time:.2f}s")
                await asyncio.sleep(sleep_time)

            # Record this call
            self.calls.append(time())
            logger.debug(f"Rate limit check passed, {len(self.calls)}/{self.max_calls} calls in period")