Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts
This commit is contained in:
603
backend/services/research/deep_competitor_analysis.py
Normal file
603
backend/services/research/deep_competitor_analysis.py
Normal file
@@ -0,0 +1,603 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from services.component_logic.web_crawler_logic import WebCrawlerLogic
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
from services.ai_service_manager import AIServiceManager, AIServiceType
|
||||
from services.seo_tools.sitemap_service import SitemapService
|
||||
from services.seo.advertools_service import AdvertoolsService
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
# Module-level logger for this service, obtained from the project's logging helper.
logger = get_service_logger("deep_competitor_analysis")
|
||||
|
||||
|
||||
class DeepCompetitorAnalysisService:
|
||||
def __init__(self):
    """Create the collaborators this service holds for its lifetime."""
    # Crawler used by `_crawl_competitors` to fetch competitor pages.
    self.crawler = WebCrawlerLogic()
    # Advertools wrapper used by the weekly brief for sitemap/content audits.
    self.advertools = AdvertoolsService()
|
||||
|
||||
async def run(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]],
    max_competitors: int = 25,
    crawl_concurrency: int = 4
) -> Dict[str, Any]:
    """
    Run the full competitor analysis pipeline.

    The user's website analysis is reduced to a baseline, the competitor
    list is normalized and capped, every competitor is crawled, each crawl
    is turned into an extraction artifact and analyzed by the LLM, and the
    per-competitor results are finally aggregated into a market view.

    Args:
        user_id: Identifier forwarded to the AI calls.
        website_analysis: Analysis of the user's own site.
        competitors: Raw competitor entries.
        max_competitors: Upper bound on competitors kept after normalization.
        crawl_concurrency: Maximum number of simultaneous crawls.

    Returns:
        A dict with "baseline", "competitors", "aggregation" and "metadata".
    """
    baseline = self._build_baseline(website_analysis)
    targets = self._normalize_competitors(competitors, max_competitors=max_competitors)
    crawled = await self._crawl_competitors(targets, crawl_concurrency=crawl_concurrency)

    analyzed: List[Dict[str, Any]] = []
    for target, crawl_outcome in crawled:
        artifact = self._build_extraction_artifact(target, crawl_outcome)
        # AI analysis runs one competitor at a time, in crawl order.
        verdict = await self._analyze_competitor_with_ai(
            user_id=user_id,
            baseline=baseline,
            competitor_input=target,
            extraction=artifact
        )
        analyzed.append({"input": target, "extraction": artifact, "ai_analysis": verdict})

    market_view = await self._aggregate_with_ai(
        user_id=user_id,
        baseline=baseline,
        competitors=analyzed
    )

    metadata = {
        "generated_at": datetime.utcnow().isoformat(),
        "competitors_requested": len(targets),
        "competitors_analyzed": len(analyzed),
        "crawl_concurrency": crawl_concurrency
    }
    return {
        "baseline": baseline,
        "competitors": analyzed,
        "aggregation": market_view,
        "metadata": metadata
    }
|
||||
|
||||
async def generate_weekly_strategy_brief(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Generate a weekly strategic intelligence brief by analyzing
    recent competitor changes and market shifts.

    Stages:
        1. For up to 10 normalized competitors, discover and analyze the
           sitemap and derive publishing metrics for the last 7 and 90 days.
        2. Aggregate velocity and keyword clusters across competitors.
        3. Ask the AI service manager for strategic intelligence and a
           content gap analysis.
        4. Assemble the report.

    A competitor whose sitemap processing raises is logged and left out of
    the brief; the remaining competitors are still processed.

    Args:
        user_id: Identifier forwarded to the sitemap and AI services.
        website_analysis: Analysis of the user's own site.
        competitors: Raw competitor entries.

    Returns:
        A dict with "week_commencing", "generated_at", "metrics",
        "insights" and "raw_data".
    """
    sitemap_service = SitemapService()
    ai_manager = AIServiceManager()

    # Stage 1: Data Collection (User + Competitors)
    baseline = self._build_baseline(website_analysis)
    normalized_competitors = self._normalize_competitors(competitors, max_competitors=10)

    competitor_changes = []
    now = datetime.utcnow()
    seven_days_ago = now - timedelta(days=7)
    ninety_days_ago = now - timedelta(days=90)

    for comp in normalized_competitors:
        try:
            # Discover the exact sitemap URL first; fall back to the site root.
            discovered_sitemap = await sitemap_service.discover_sitemap_url(comp['url'])
            effective_url = discovered_sitemap if discovered_sitemap else comp['url']

            adv_result = await self.advertools.analyze_sitemap(effective_url)
            # Advertools metrics are only trusted when the call reports success;
            # a missing or non-dict result simply yields no enrichment.
            adv_metrics: Dict[str, Any] = {}
            if isinstance(adv_result, dict) and adv_result.get('success'):
                adv_metrics = adv_result.get('metrics') or {}

            analysis_result = await sitemap_service.analyze_sitemap(
                sitemap_url=effective_url,
                analyze_content_trends=True,
                analyze_publishing_patterns=True,
                include_ai_insights=False,
                user_id=user_id
            )

            if not analysis_result or not analysis_result.get('urls'):
                continue

            urls = analysis_result['urls']
            # `or {}` also covers keys that are present but None.
            structure = analysis_result.get('structure_analysis') or {}
            keyword_clusters = structure.get('keyword_clusters') or {}
            pillars = structure.get('strategic_pillars') or {}
            site_hierarchy = adv_metrics.get('top_pillars', {})

            # Content cadence trend (last 7 days vs last 90 days).
            # NOTE(review): assumes each entry is a dict with 'loc'/'lastmod' — confirm
            # against SitemapService; other entry shapes are skipped.
            recent_urls = [
                u for u in urls
                if isinstance(u, dict) and self._is_newer_than(u.get('lastmod'), seven_days_ago)
            ]
            historical_urls = [
                u for u in urls
                if isinstance(u, dict) and self._is_newer_than(u.get('lastmod'), ninety_days_ago)
            ]

            recent_velocity = len(recent_urls) / 7
            historical_velocity = len(historical_urls) / 90
            # The 0.01 floor avoids division by zero when nothing was published in 90 days.
            cadence_shift = ((recent_velocity - historical_velocity) / max(historical_velocity, 0.01)) * 100

            # Entries without a 'loc' cannot be audited or turned into topics.
            recent_locs = [u['loc'] for u in recent_urls if u.get('loc')]

            # Advertools word frequency audit over the first 5 recent URLs.
            top_themes = []
            if recent_locs:
                audit_result = await self.advertools.audit_content(recent_locs[:5])
                if isinstance(audit_result, dict) and audit_result.get('success'):
                    top_themes = audit_result.get('themes', [])

            competitor_changes.append({
                "domain": comp['domain'],
                "name": comp['name'],
                "new_content_count": len(recent_urls),
                "recent_topics": [self._extract_topic_from_url(loc) for loc in recent_locs[:10]],
                "total_pages": len(urls),
                "keyword_clusters": keyword_clusters,
                "strategic_pillars": pillars,
                "site_hierarchy": site_hierarchy,
                "top_themes": top_themes,
                "cadence_shift_percent": round(cadence_shift, 1),
                "publishing_velocity": round(recent_velocity, 2),
                "stale_content_pct": adv_metrics.get('stale_content_percentage', 0)
            })
        except Exception as e:
            # Best effort per competitor: one failing site must not abort the brief.
            logger.warning(f"Failed to fetch sitemap for {comp['domain']}: {e}")

    # Stage 2: Differential Analysis (Non-AI Aggregation)
    if competitor_changes:
        avg_competitor_velocity = (
            sum(c['publishing_velocity'] for c in competitor_changes) / len(competitor_changes)
        )
    else:
        avg_competitor_velocity = 0
    market_clusters = self._aggregate_clusters([c['keyword_clusters'] for c in competitor_changes])

    # Stage 3: AI Strategic Intelligence
    brand_analysis = baseline.get("brand_analysis", {})
    seo_audit = baseline.get("seo_audit", {})

    user_niche = brand_analysis.get("industry") or "General Business"
    user_topics = brand_analysis.get("topics") or []
    seo_keywords = seo_audit.get("keywords")
    # Only slice real sequences; a dict or scalar here would otherwise raise.
    if not user_topics and isinstance(seo_keywords, (list, tuple)) and seo_keywords:
        user_topics = seo_keywords[:5]

    analysis_context = {
        "user_profile": {
            "website_url": baseline.get("website_url"),
            "industry": user_niche,
            "niche_description": brand_analysis.get("description") or brand_analysis.get("summary") or "",
            "core_topics": user_topics,
            "target_audience": baseline.get("target_audience") or {},
            "business_objectives": brand_analysis.get("objectives") or "Growth",
            "brand_voice": brand_analysis.get("voice") or "Professional",
            "augmented_themes": brand_analysis.get("augmented_themes", [])
        },
        "market_intelligence": {
            "market_clusters": market_clusters,
            "competitors_analyzed_count": len(competitor_changes),
            "market_opportunities_detected": ["Content Velocity Gap", "Topic Authority Shift", "Stale Content Replacement"],
            "competitor_hierarchies": {c['name']: c['site_hierarchy'] for c in competitor_changes},
            "competitor_content_themes": {c['name']: c['top_themes'] for c in competitor_changes}
        },
        "competitive_landscape_detailed": competitor_changes,
    }

    strategic_intelligence = await ai_manager.generate_strategic_intelligence(analysis_context, user_id=user_id)
    content_gaps = await ai_manager.generate_content_gap_analysis(analysis_context, user_id=user_id)

    # Stage 4: Result Assembly
    # An empty insights list is valid AI output; indexing it blindly would raise.
    insights: List[Any] = []
    if isinstance(strategic_intelligence, dict) and strategic_intelligence.get("success"):
        candidate = (strategic_intelligence.get("data") or {}).get("strategic_insights")
        if isinstance(candidate, list):
            insights = candidate

    recommendations: Any = []
    if isinstance(content_gaps, dict) and content_gaps.get("success"):
        recommendations = (content_gaps.get("data") or {}).get("content_recommendations", [])

    report = {
        "week_commencing": seven_days_ago.date().isoformat(),
        "generated_at": datetime.utcnow().isoformat(),
        "metrics": {
            "market_velocity": round(avg_competitor_velocity, 2),
            "market_clusters": market_clusters[:5],
            "aggressive_competitors": [c['name'] for c in competitor_changes if c['cadence_shift_percent'] > 50]
        },
        "insights": {
            "the_big_move": insights[0] if insights else {},
            "low_hanging_fruit": recommendations,
            "threat_alerts": insights[1:]
        },
        "raw_data": {
            "competitor_changes": competitor_changes
        }
    }

    return report
|
||||
|
||||
def _is_newer_than(self, lastmod: Optional[str], threshold: datetime) -> bool:
|
||||
if not lastmod:
|
||||
return False
|
||||
try:
|
||||
# Handle various ISO formats
|
||||
dt_str = lastmod.replace('Z', '+00:00')
|
||||
return datetime.fromisoformat(dt_str).replace(tzinfo=None) > threshold
|
||||
except:
|
||||
return False
|
||||
|
||||
def _aggregate_clusters(self, clusters_list: List[Dict[str, int]]) -> List[str]:
|
||||
"""Aggregate clusters across competitors to find market-wide themes."""
|
||||
master: Dict[str, int] = {}
|
||||
for cluster in clusters_list:
|
||||
for k, v in cluster.items():
|
||||
master[k] = master.get(k, 0) + 1 # Count competitor occurrences
|
||||
return sorted(master, key=lambda x: master[x], reverse=True)[:10]
|
||||
|
||||
def _extract_topic_from_url(self, url: str) -> str:
|
||||
"""Helper to get a readable topic from a URL slug."""
|
||||
try:
|
||||
path = urlparse(url).path
|
||||
slug = path.strip('/').split('/')[-1]
|
||||
return slug.replace('-', ' ').replace('_', ' ').capitalize()
|
||||
except:
|
||||
return "New Content"
|
||||
|
||||
def _build_baseline(self, website_analysis: Dict[str, Any]) -> Dict[str, Any]:
|
||||
if not isinstance(website_analysis, dict):
|
||||
website_analysis = {}
|
||||
|
||||
baseline = {
|
||||
"website_url": website_analysis.get("website_url"),
|
||||
"brand_analysis": website_analysis.get("brand_analysis") or {},
|
||||
"content_strategy_insights": website_analysis.get("content_strategy_insights") or {},
|
||||
"seo_audit": website_analysis.get("seo_audit") or {},
|
||||
"style_guidelines": website_analysis.get("style_guidelines") or {},
|
||||
"style_patterns": website_analysis.get("style_patterns") or {}
|
||||
}
|
||||
|
||||
return baseline
|
||||
|
||||
def _normalize_competitors(self, competitors: List[Dict[str, Any]], *, max_competitors: int) -> List[Dict[str, Any]]:
|
||||
if not isinstance(competitors, list):
|
||||
return []
|
||||
|
||||
seen_domains = set()
|
||||
normalized: List[Dict[str, Any]] = []
|
||||
|
||||
for comp in competitors:
|
||||
if not isinstance(comp, dict):
|
||||
continue
|
||||
|
||||
raw_url = comp.get("url") or comp.get("website_url") or comp.get("domain") or ""
|
||||
url = self._normalize_url(raw_url)
|
||||
if not url:
|
||||
continue
|
||||
|
||||
domain = self._extract_domain(url)
|
||||
if not domain or domain in seen_domains:
|
||||
continue
|
||||
|
||||
seen_domains.add(domain)
|
||||
normalized.append({
|
||||
"url": url,
|
||||
"domain": domain,
|
||||
"name": comp.get("name") or comp.get("title") or domain,
|
||||
"summary": comp.get("summary") or comp.get("description") or ""
|
||||
})
|
||||
|
||||
if len(normalized) >= max_competitors:
|
||||
break
|
||||
|
||||
return normalized
|
||||
|
||||
def _normalize_url(self, raw: str) -> Optional[str]:
|
||||
if not raw or not isinstance(raw, str):
|
||||
return None
|
||||
|
||||
raw = raw.strip()
|
||||
if not raw:
|
||||
return None
|
||||
|
||||
if not raw.startswith(("http://", "https://")):
|
||||
raw = "https://" + raw
|
||||
|
||||
try:
|
||||
parsed = urlparse(raw)
|
||||
if not parsed.scheme or not parsed.netloc:
|
||||
return None
|
||||
return f"{parsed.scheme}://{parsed.netloc}"
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _extract_domain(self, url: str) -> Optional[str]:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
domain = (parsed.netloc or "").lower()
|
||||
if domain.startswith("www."):
|
||||
domain = domain[4:]
|
||||
return domain or None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
async def _crawl_competitors(
|
||||
self,
|
||||
competitors: List[Dict[str, Any]],
|
||||
*,
|
||||
crawl_concurrency: int
|
||||
) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
|
||||
semaphore = asyncio.Semaphore(max(1, int(crawl_concurrency)))
|
||||
|
||||
async def crawl_one(comp: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
||||
async with semaphore:
|
||||
url = comp.get("url")
|
||||
if not url:
|
||||
return comp, {"success": False, "error": "missing_url"}
|
||||
try:
|
||||
return comp, await self.crawler.crawl_website(url)
|
||||
except Exception as e:
|
||||
return comp, {"success": False, "error": str(e)}
|
||||
|
||||
tasks = [crawl_one(c) for c in competitors]
|
||||
return await asyncio.gather(*tasks)
|
||||
|
||||
def _build_extraction_artifact(self, competitor_input: Dict[str, Any], crawl_result: Dict[str, Any]) -> Dict[str, Any]:
|
||||
if not isinstance(crawl_result, dict) or not crawl_result.get("success"):
|
||||
return {
|
||||
"fetch_status": {
|
||||
"status": "failed",
|
||||
"error": crawl_result.get("error") if isinstance(crawl_result, dict) else "unknown_error"
|
||||
}
|
||||
}
|
||||
|
||||
content = crawl_result.get("content") if isinstance(crawl_result.get("content"), dict) else {}
|
||||
title = content.get("title") or ""
|
||||
description = content.get("description") or ""
|
||||
headings = content.get("headings") if isinstance(content.get("headings"), list) else []
|
||||
links = content.get("links") if isinstance(content.get("links"), list) else []
|
||||
meta_tags = content.get("meta_tags") if isinstance(content.get("meta_tags"), dict) else {}
|
||||
main_content = content.get("main_content") or ""
|
||||
content_structure = content.get("content_structure") if isinstance(content.get("content_structure"), dict) else {}
|
||||
|
||||
nav_labels = self._extract_nav_labels(links)
|
||||
h1_h2 = [h for h in headings if isinstance(h, str)][:25]
|
||||
cta_signals = self._extract_cta_signals(main_content, links)
|
||||
proof_signals = self._extract_proof_signals(main_content, links)
|
||||
|
||||
excerpt = main_content.strip()
|
||||
if len(excerpt) > 2000:
|
||||
excerpt = excerpt[:2000]
|
||||
|
||||
return {
|
||||
"fetch_status": {
|
||||
"status": "ok",
|
||||
"fetched_url": crawl_result.get("url"),
|
||||
"timestamp": crawl_result.get("timestamp")
|
||||
},
|
||||
"page_meta": {
|
||||
"title": title,
|
||||
"meta_description": description,
|
||||
"og_title": meta_tags.get("og:title"),
|
||||
"og_description": meta_tags.get("og:description")
|
||||
},
|
||||
"structure": {
|
||||
"headings": h1_h2,
|
||||
"nav_labels": nav_labels,
|
||||
"content_structure": content_structure
|
||||
},
|
||||
"signals": {
|
||||
"cta_signals": cta_signals,
|
||||
"proof_signals": proof_signals
|
||||
},
|
||||
"content_excerpt": excerpt
|
||||
}
|
||||
|
||||
def _extract_nav_labels(self, links: List[Dict[str, Any]]) -> List[str]:
|
||||
labels: List[str] = []
|
||||
for link in links[:200]:
|
||||
if not isinstance(link, dict):
|
||||
continue
|
||||
text = (link.get("text") or "").strip()
|
||||
if not text or len(text) > 50:
|
||||
continue
|
||||
labels.append(text)
|
||||
deduped: List[str] = []
|
||||
seen = set()
|
||||
for label in labels:
|
||||
key = label.lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
deduped.append(label)
|
||||
if len(deduped) >= 25:
|
||||
break
|
||||
return deduped
|
||||
|
||||
def _extract_cta_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
text = (main_content or "").lower()
|
||||
keywords = ["get started", "start", "book", "demo", "trial", "pricing", "contact", "signup", "sign up", "subscribe"]
|
||||
keyword_hits = [k for k in keywords if k in text]
|
||||
|
||||
link_texts = []
|
||||
for link in links[:200]:
|
||||
if isinstance(link, dict):
|
||||
t = (link.get("text") or "").strip()
|
||||
if t:
|
||||
link_texts.append(t.lower())
|
||||
|
||||
cta_link_hits = [k for k in keywords if any(k in lt for lt in link_texts)]
|
||||
return {
|
||||
"keyword_hits": keyword_hits[:10],
|
||||
"link_cta_hits": list(dict.fromkeys(cta_link_hits))[:10]
|
||||
}
|
||||
|
||||
def _extract_proof_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
text = (main_content or "").lower()
|
||||
proof_keywords = ["case study", "testimonials", "customers", "trusted by", "reviews", "awards", "partners"]
|
||||
hits = [k for k in proof_keywords if k in text]
|
||||
|
||||
link_hits = []
|
||||
for link in links[:200]:
|
||||
if not isinstance(link, dict):
|
||||
continue
|
||||
href = (link.get("href") or "").lower()
|
||||
if any(k.replace(" ", "") in href.replace("-", "").replace("_", "") for k in ["case study", "testimonials", "customers"]):
|
||||
link_hits.append(href)
|
||||
return {
|
||||
"keyword_hits": hits[:10],
|
||||
"supporting_links": link_hits[:10]
|
||||
}
|
||||
|
||||
async def _analyze_competitor_with_ai(
|
||||
self,
|
||||
*,
|
||||
user_id: str,
|
||||
baseline: Dict[str, Any],
|
||||
competitor_input: Dict[str, Any],
|
||||
extraction: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
if not isinstance(extraction, dict) or extraction.get("fetch_status", {}).get("status") != "ok":
|
||||
return {
|
||||
"status": "skipped",
|
||||
"reason": "crawl_failed"
|
||||
}
|
||||
|
||||
json_struct = {
|
||||
"positioning": {
|
||||
"value_prop": "string",
|
||||
"target_audience": "string",
|
||||
"market_tier": "string",
|
||||
"primary_offer": "string"
|
||||
},
|
||||
"content_strategy": {
|
||||
"themes": ["string"],
|
||||
"messaging_angles": ["string"],
|
||||
"cta_patterns": ["string"],
|
||||
"tone_markers": ["string"]
|
||||
},
|
||||
"competitive_advantages": ["string"],
|
||||
"weaknesses_or_risks": ["string"],
|
||||
"comparison_to_user_baseline": {
|
||||
"overlaps": ["string"],
|
||||
"deltas": ["string"],
|
||||
"opportunities": ["string"]
|
||||
},
|
||||
"confidence": {
|
||||
"overall": "number",
|
||||
"notes": ["string"]
|
||||
}
|
||||
}
|
||||
|
||||
prompt = (
|
||||
"You are a competitive intelligence analyst.\n"
|
||||
"Analyze the competitor homepage extraction and compare it to the user's Step 2 baseline insights.\n"
|
||||
"Return strictly the requested JSON.\n\n"
|
||||
f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n"
|
||||
f"Competitor input: {json.dumps(competitor_input, ensure_ascii=False)}\n\n"
|
||||
f"Homepage extraction: {json.dumps(extraction, ensure_ascii=False)}\n"
|
||||
)
|
||||
|
||||
try:
|
||||
raw = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
|
||||
parsed = self._safe_json_parse(raw)
|
||||
if isinstance(parsed, dict):
|
||||
return parsed
|
||||
return {"status": "failed", "error": "invalid_ai_json"}
|
||||
except Exception as e:
|
||||
logger.warning(f"AI competitor analysis failed for {competitor_input.get('domain')}: {e}")
|
||||
return {"status": "failed", "error": str(e)}
|
||||
|
||||
async def _aggregate_with_ai(
    self,
    *,
    user_id: str,
    baseline: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Ask the LLM for an aggregated market view across all analyzed competitors.

    Args:
        user_id: Identifier forwarded to the LLM call.
        baseline: The user's baseline insights.
        competitors: Per-competitor outputs, each holding "input" and
            "ai_analysis"; entries missing either dict are left out.

    Returns:
        The parsed aggregation dict on success, otherwise
        {"warnings": [...]} describing why no aggregation is available.
    """
    json_struct = {
        "market_map": {
            "clusters": [
                {
                    "cluster_name": "string",
                    "description": "string",
                    "competitors": ["string"]
                }
            ]
        },
        "common_patterns": {
            "common_themes": ["string"],
            "common_ctas": ["string"],
            "common_proof_signals": ["string"]
        },
        "content_gaps_and_opportunities": [
            {
                "gap": "string",
                "why_it_matters": "string",
                "recommended_content_types": ["string"],
                "impact": "string",
                "effort": "string"
            }
        ],
        "strategic_recommendations": [
            {
                "action": "string",
                "expected_impact": "string",
                "effort": "string",
                "first_steps": ["string"]
            }
        ],
        "warnings": ["string"]
    }

    # Only identity and AI analysis are forwarded, to keep the prompt compact.
    compact = []
    for item in competitors:
        if not isinstance(item, dict):
            continue
        comp = item.get("input")
        ai = item.get("ai_analysis")
        if isinstance(comp, dict) and isinstance(ai, dict):
            compact.append({
                "domain": comp.get("domain"),
                "name": comp.get("name"),
                "ai_analysis": ai
            })

    try:
        # Built inside the try so an unserializable value becomes a warning
        # instead of aborting the caller; default=str is deliberate because
        # the text is only prompt context.
        prompt = (
            "You are a senior strategy consultant.\n"
            "Using the user's Step 2 baseline insights and per-competitor analyses, produce an aggregated market view.\n"
            "Return strictly the requested JSON.\n\n"
            f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False, default=str)}\n\n"
            f"Per-competitor analyses: {json.dumps(compact, ensure_ascii=False, default=str)}\n"
        )

        raw = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
        # NOTE(review): llm_text_gen may already return a parsed dict when
        # json_struct is given — confirm; accepting one here is harmless.
        parsed = raw if isinstance(raw, dict) else self._safe_json_parse(raw)
        if isinstance(parsed, dict):
            return parsed
        return {"warnings": ["invalid_ai_json"]}
    except Exception as e:
        logger.warning(f"AI aggregation failed: {e}")
        return {"warnings": [str(e)]}
|
||||
|
||||
def _safe_json_parse(self, text: str) -> Any:
|
||||
if not isinstance(text, str):
|
||||
return None
|
||||
cleaned = text.strip()
|
||||
cleaned = re.sub(r"^```json\\s*", "", cleaned)
|
||||
cleaned = re.sub(r"^```\\s*", "", cleaned)
|
||||
cleaned = re.sub(r"```\\s*$", "", cleaned)
|
||||
cleaned = cleaned.strip()
|
||||
try:
|
||||
return json.loads(cleaned)
|
||||
except Exception:
|
||||
match = re.search(r"\\{[\\s\\S]*\\}", cleaned)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group(0))
|
||||
except Exception:
|
||||
return None
|
||||
return None
|
||||
|
||||
270
backend/services/research/deep_crawl_service.py
Normal file
270
backend/services/research/deep_crawl_service.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""
|
||||
Deep Crawl Service for Onboarding Step 3
|
||||
Handles deep crawling of user's website, combining Sitemap and Tavily data.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import httpx
|
||||
from typing import Dict, List, Any, Optional
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from services.seo_tools.sitemap_service import SitemapService
|
||||
from services.research.tavily_service import TavilyService
|
||||
from services.database import get_session_for_user
|
||||
from models.crawled_content import EndUserWebsiteContent
|
||||
from models.website_analysis_monitoring_models import DeepWebsiteCrawlTask, DeepWebsiteCrawlExecutionLog
|
||||
|
||||
class DeepCrawlService:
|
||||
def __init__(self):
    """Create the sitemap and Tavily clients used by the deep crawl."""
    # Source of sitemap discovery and URL listing.
    self.sitemap_service = SitemapService()
    # Source of crawled pages and their extracted content.
    self.tavily_service = TavilyService()
|
||||
|
||||
async def execute_deep_crawl(self, user_id: str, website_url: str, task_id: Optional[int] = None) -> Dict[str, Any]:
    """
    Execute deep crawl for a user's website.

    1. Fetch URLs from Sitemap.
    2. Crawl using Tavily.
    3. Merge and deduplicate URLs.
    4. Check liveness (status code) and capture content for each URL.
    5. Save content to DB and file, and record the outcome on the task.

    Sitemap and Tavily failures are logged and tolerated; the crawl
    continues with whatever URLs were found.

    Args:
        user_id: Owner of the crawl; selects the DB session and workspace.
        website_url: Root URL of the site to crawl.
        task_id: Optional monitoring task to update with an execution log.

    Returns:
        A dict with "success", "total_urls", "sitemap_urls", "tavily_urls"
        and "processed_urls".

    Raises:
        Exception: When no DB session is available, or re-raised from any
            unexpected failure after it has been recorded on the task.
    """
    logger.info(f"Starting deep crawl for {website_url} (User: {user_id})")

    execution_start = datetime.utcnow()
    db = get_session_for_user(user_id)
    if not db:
        raise Exception("Database connection failed")

    try:
        # 1. Sitemap Discovery
        sitemap_urls = set()
        try:
            sitemap_url = await self.sitemap_service.discover_sitemap_url(website_url)
            if not sitemap_url:
                # Fall back to the conventional location.
                sitemap_url = f"{website_url.rstrip('/')}/sitemap.xml"

            sitemap_data = await self.sitemap_service.analyze_sitemap(sitemap_url)

            for url_entry in sitemap_data.get("urls", []):
                if isinstance(url_entry, dict) and "loc" in url_entry:
                    sitemap_urls.add(url_entry["loc"])

            logger.info(f"Found {len(sitemap_urls)} URLs from sitemap")
        except Exception as e:
            logger.warning(f"Sitemap analysis failed: {e}")

        # 2. Tavily Crawl
        tavily_urls = set()
        tavily_results = []
        try:
            instructions = "Find all blog posts, articles, and main content pages. Ignore login, signup, and admin pages."

            crawl_result = await self.tavily_service.crawl(
                url=website_url,
                limit=50,  # Limit to avoid excessive costs/time
                max_depth=2,
                extract_depth="basic",
                instructions=instructions
            )

            if crawl_result.get("success"):
                for res in crawl_result.get("results", []):
                    url = res.get("url")
                    if url:
                        tavily_urls.add(url)
                        tavily_results.append(res)

            logger.info(f"Found {len(tavily_urls)} URLs from Tavily")
        except Exception as e:
            logger.warning(f"Tavily crawl failed: {e}")

        # 3. Merge and Deduplicate
        unique_urls = list(sitemap_urls.union(tavily_urls))
        logger.info(f"Total unique URLs to process: {len(unique_urls)}")

        # 4. Process URLs (Liveness & Save)
        success_count = 0

        # Documents are written under the per-user workspace, relative to the
        # process working directory.
        workspace_dir = f"workspace/workspace_{user_id}/crawled_content"
        os.makedirs(workspace_dir, exist_ok=True)

        # Limit concurrent checks
        sem = asyncio.Semaphore(10)

        async def process_url(url):
            async with sem:
                return await self._process_single_url(url, user_id, website_url, workspace_dir, tavily_results)

        tasks = [process_url(url) for url in unique_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        processed_data = []

        for res in results:
            if isinstance(res, BaseException):
                # gather() hands exceptions back as values; surface them in the log.
                logger.warning(f"URL processing failed: {res}")
                continue
            if not isinstance(res, dict):
                continue

            processed_data.append(res)
            if res.get("status_code") and 200 <= res.get("status_code") < 300:
                success_count += 1

            # Upsert the page by (user_id, url).
            try:
                existing = db.query(EndUserWebsiteContent).filter(
                    EndUserWebsiteContent.user_id == user_id,
                    EndUserWebsiteContent.url == res["url"]
                ).first()

                if existing:
                    existing.content = res.get("content")
                    existing.title = res.get("title")
                    existing.status_code = res.get("status_code")
                    existing.crawled_at = datetime.utcnow()
                else:
                    new_content = EndUserWebsiteContent(
                        user_id=user_id,
                        website_url=website_url,
                        url=res["url"],
                        title=res.get("title"),
                        content=res.get("content"),
                        status_code=res.get("status_code"),
                        crawled_at=datetime.utcnow()
                    )
                    db.add(new_content)
            except Exception as e:
                logger.error(f"Failed to save content to DB for {res['url']}: {e}")

        db.commit()

        # 5. Update Task Log if task_id provided
        if task_id:
            log = DeepWebsiteCrawlExecutionLog(
                task_id=task_id,
                status="success",
                result_data={
                    "total_urls": len(unique_urls),
                    "sitemap_urls": len(sitemap_urls),
                    "tavily_urls": len(tavily_urls),
                    "success_count": success_count,
                    "processed_urls": processed_data[:100]  # Store only a subset to avoid huge JSON
                },
                execution_time_ms=int((datetime.utcnow() - execution_start).total_seconds() * 1000)
            )
            db.add(log)

            task = db.query(DeepWebsiteCrawlTask).filter(DeepWebsiteCrawlTask.id == task_id).first()
            if task:
                task.last_executed = datetime.utcnow()
                task.last_success = datetime.utcnow()
                task.status = "active"
                task.consecutive_failures = 0

            db.commit()

        return {
            "success": True,
            "total_urls": len(unique_urls),
            "sitemap_urls": len(sitemap_urls),
            "tavily_urls": len(tavily_urls),
            "processed_urls": processed_data
        }

    except Exception as e:
        logger.error(f"Deep crawl failed: {e}")
        # Recording the failure is best effort and must never replace the original
        # error. Roll back first: a session whose flush/commit failed rejects new
        # work until it is rolled back.
        try:
            db.rollback()
            if task_id:
                log = DeepWebsiteCrawlExecutionLog(
                    task_id=task_id,
                    status="failed",
                    error_message=str(e),
                    execution_time_ms=int((datetime.utcnow() - execution_start).total_seconds() * 1000)
                )
                db.add(log)
                task = db.query(DeepWebsiteCrawlTask).filter(DeepWebsiteCrawlTask.id == task_id).first()
                if task:
                    task.last_executed = datetime.utcnow()
                    task.last_failure = datetime.utcnow()
                    task.failure_reason = str(e)
                    # The counter may still be NULL on a task that never failed.
                    task.consecutive_failures = (task.consecutive_failures or 0) + 1
                db.commit()
        except Exception as log_error:
            logger.error(f"Failed to record deep crawl failure for task {task_id}: {log_error}")
        # Bare raise keeps the original traceback.
        raise
    finally:
        db.close()
|
||||
|
||||
async def _process_single_url(self, url: str, user_id: str, website_url: str, workspace_dir: str, tavily_results: List[Dict]):
    """
    Check liveness, extract content, and save.

    The page is requested once; the response supplies both the status code
    and, when Tavily has no content for the URL, the page text.

    Args:
        url: Page to process.
        user_id: Owner of the crawl (not read here).
        website_url: Root site URL (not read here).
        workspace_dir: Directory receiving the text document.
        tavily_results: Tavily result dicts, matched by their "url".

    Returns:
        A dict with "url", "status_code" (0 when the request failed),
        "error", "title" and "content".
    """
    # Local imports: this module does not import these at file level.
    import hashlib
    import re

    status_code = None
    error = None
    content = None
    title = None
    response = None

    # 1. Liveness check
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            response = await client.get(url)
        status_code = response.status_code
    except Exception as e:
        error = str(e)
        status_code = 0  # Failed

    # 2. Content: prefer Tavily's extraction, otherwise reuse the liveness response.
    tavily_match = next((r for r in tavily_results if r.get("url") == url), None)

    if tavily_match:
        content = tavily_match.get("raw_content") or tavily_match.get("content")
        title = tavily_match.get("title")
    elif response is not None and status_code and 200 <= status_code < 300:
        try:
            content = response.text
        except Exception as e:
            logger.warning(f"Failed to read response body for {url}: {e}")
        if content:
            # Tolerates upper-case tags, attributes on <title> and multi-line titles.
            title_match = re.search(r"<title[^>]*>(.*?)</title>", content, re.IGNORECASE | re.DOTALL)
            if title_match:
                title = title_match.group(1).strip() or None

    # 3. Save to document
    if content and title:
        safe_title = "".join([c for c in title if c.isalnum() or c in (' ', '-', '_')]).strip()[:50]
        if not safe_title:
            safe_title = "untitled"
        # The URL digest keeps same-title pages written in the same second
        # from overwriting each other.
        url_digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:8]
        filename = f"{safe_title}_{url_digest}_{int(datetime.utcnow().timestamp())}.txt"
        filepath = os.path.join(workspace_dir, filename)
        try:
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {url}\n")
                f.write(f"Title: {title}\n")
                f.write(f"Date: {datetime.utcnow()}\n\n")
                f.write(content)
        except Exception as e:
            logger.warning(f"Failed to write file for {url}: {e}")

    return {
        "url": url,
        "status_code": status_code,
        "error": error,
        "title": title,
        "content": content
    }
|
||||
@@ -214,25 +214,71 @@ class ExaService:
|
||||
List of processed competitor data
|
||||
"""
|
||||
competitors = []
|
||||
user_domain = urlparse(user_url).netloc
|
||||
try:
|
||||
user_domain = urlparse(user_url).netloc
|
||||
except Exception:
|
||||
user_domain = ""
|
||||
|
||||
# Extract results from the SDK response
|
||||
results = getattr(search_result, 'results', [])
|
||||
# Handle case where search_result might be a dict or an object
|
||||
if isinstance(search_result, dict):
|
||||
results = search_result.get('results', [])
|
||||
else:
|
||||
results = getattr(search_result, 'results', [])
|
||||
|
||||
for result in results:
|
||||
try:
|
||||
# Extract basic information from the result object
|
||||
competitor_url = getattr(result, 'url', '')
|
||||
competitor_domain = urlparse(competitor_url).netloc
|
||||
# Helper to safely get attribute or dict key
|
||||
def get_val(obj, key, default=None):
|
||||
if isinstance(obj, dict):
|
||||
return obj.get(key, default)
|
||||
return getattr(obj, key, default)
|
||||
|
||||
# Extract basic information
|
||||
raw_url = get_val(result, 'url', '')
|
||||
# Clean URL (remove backticks and whitespace that might be in the response)
|
||||
competitor_url = raw_url.strip().strip('`').strip() if raw_url else ''
|
||||
|
||||
# Skip if it's the same domain as the user
|
||||
if competitor_domain == user_domain:
|
||||
# Fallback to ID if URL is missing/empty but ID looks like a URL
|
||||
if not competitor_url:
|
||||
raw_id = get_val(result, 'id', '')
|
||||
cleaned_id = raw_id.strip().strip('`').strip() if raw_id else ''
|
||||
if cleaned_id and (cleaned_id.startswith('http://') or cleaned_id.startswith('https://')):
|
||||
competitor_url = cleaned_id
|
||||
|
||||
if not competitor_url:
|
||||
continue
|
||||
|
||||
try:
|
||||
competitor_domain = urlparse(competitor_url).netloc
|
||||
except Exception:
|
||||
competitor_domain = ""
|
||||
|
||||
# Skip if it's the same domain as the user (fuzzy match)
|
||||
if user_domain and competitor_domain and (user_domain in competitor_domain or competitor_domain in user_domain):
|
||||
continue
|
||||
|
||||
# Extract content insights
|
||||
summary = getattr(result, 'summary', '')
|
||||
highlights = getattr(result, 'highlights', [])
|
||||
highlight_scores = getattr(result, 'highlight_scores', [])
|
||||
summary = get_val(result, 'summary', '')
|
||||
highlights = get_val(result, 'highlights', [])
|
||||
highlight_scores = get_val(result, 'highlight_scores', [])
|
||||
subpages = get_val(result, 'subpages', [])
|
||||
|
||||
# Ensure subpages are dicts
|
||||
processed_subpages = []
|
||||
if subpages:
|
||||
for sp in subpages:
|
||||
if isinstance(sp, dict):
|
||||
processed_subpages.append(sp)
|
||||
elif hasattr(sp, '__dict__'):
|
||||
processed_subpages.append(sp.__dict__)
|
||||
else:
|
||||
processed_subpages.append({
|
||||
"id": getattr(sp, 'id', ''),
|
||||
"url": getattr(sp, 'url', ''),
|
||||
"title": getattr(sp, 'title', '')
|
||||
})
|
||||
subpages = processed_subpages
|
||||
|
||||
# Calculate competitive relevance score
|
||||
relevance_score = self._calculate_relevance_score(result, user_url)
|
||||
@@ -240,14 +286,15 @@ class ExaService:
|
||||
competitor_data = {
|
||||
"url": competitor_url,
|
||||
"domain": competitor_domain,
|
||||
"title": getattr(result, 'title', ''),
|
||||
"published_date": getattr(result, 'published_date', None),
|
||||
"author": getattr(result, 'author', None),
|
||||
"favicon": getattr(result, 'favicon', None),
|
||||
"image": getattr(result, 'image', None),
|
||||
"title": get_val(result, 'title', ''),
|
||||
"published_date": get_val(result, 'published_date', None),
|
||||
"author": get_val(result, 'author', None),
|
||||
"favicon": get_val(result, 'favicon', None),
|
||||
"image": get_val(result, 'image', None),
|
||||
"summary": summary,
|
||||
"highlights": highlights,
|
||||
"highlight_scores": highlight_scores,
|
||||
"subpages": subpages,
|
||||
"relevance_score": relevance_score,
|
||||
"competitive_insights": self._extract_competitive_insights(summary, highlights),
|
||||
"content_analysis": self._analyze_content_quality(result)
|
||||
@@ -439,6 +486,11 @@ class ExaService:
|
||||
|
||||
# Log the raw Exa API response for debugging
|
||||
logger.info(f"Raw Exa social media response for {user_url}:")
|
||||
if hasattr(result, 'to_json'):
|
||||
logger.info(result.to_json())
|
||||
else:
|
||||
logger.info(str(result))
|
||||
|
||||
logger.info(f" - Request ID: {getattr(result, 'request_id', 'N/A')}")
|
||||
logger.info(f" └─ Cost: ${getattr(getattr(result, 'cost_dollars', None), 'total', 0)}")
|
||||
# Note: Full raw response contains verbose content - logging only summary
|
||||
@@ -477,9 +529,22 @@ class ExaService:
|
||||
import json
|
||||
import re
|
||||
|
||||
if answer_text.strip().startswith('{'):
|
||||
logger.warning(f"Parsing Exa answer text: {answer_text[:200]}...")
|
||||
|
||||
# Clean markdown code blocks if present
|
||||
clean_text = answer_text.strip()
|
||||
if clean_text.startswith('```json'):
|
||||
clean_text = clean_text[7:]
|
||||
if clean_text.startswith('```'):
|
||||
clean_text = clean_text[3:]
|
||||
if clean_text.endswith('```'):
|
||||
clean_text = clean_text[:-3]
|
||||
|
||||
clean_text = clean_text.strip()
|
||||
|
||||
if clean_text.startswith('{'):
|
||||
# Direct JSON format
|
||||
answer_data = json.loads(answer_text.strip())
|
||||
answer_data = json.loads(clean_text)
|
||||
else:
|
||||
# Parse markdown format with URLs
|
||||
answer_data = {
|
||||
|
||||
@@ -26,7 +26,7 @@ async def generate_research_persona_task(user_id: str):
|
||||
logger.info(f"Scheduled research persona generation started for user {user_id}")
|
||||
|
||||
# Get database session
|
||||
db = get_db_session()
|
||||
db = get_db_session(user_id)
|
||||
if not db:
|
||||
logger.error(f"Failed to get database session for research persona generation (user: {user_id})")
|
||||
return
|
||||
|
||||
@@ -9,13 +9,14 @@ from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
from fastapi import HTTPException
|
||||
|
||||
from sqlalchemy import text
|
||||
from services.database import get_db_session
|
||||
from models.onboarding import PersonaData, OnboardingSession
|
||||
from models.research_persona_models import ResearchPersona
|
||||
from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
from services.onboarding.database_service import OnboardingDatabaseService
|
||||
from services.persona_data_service import PersonaDataService
|
||||
from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService
|
||||
|
||||
|
||||
class ResearchPersonaService:
|
||||
@@ -24,10 +25,62 @@ class ResearchPersonaService:
|
||||
CACHE_TTL_DAYS = 7 # 7-day cache TTL
|
||||
|
||||
def __init__(self, db_session=None):
|
||||
self.db = db_session or get_db_session()
|
||||
self.db = db_session
|
||||
self.prompt_builder = ResearchPersonaPromptBuilder()
|
||||
self.onboarding_service = OnboardingDatabaseService(db=self.db)
|
||||
self.persona_data_service = PersonaDataService(db_session=self.db)
|
||||
# self.persona_data_service was initialized here but unused in this service
|
||||
self.integration_service = OnboardingDataIntegrationService()
|
||||
self._research_persona_cols_checked = False
|
||||
|
||||
def _get_session(self, user_id: str):
    """Return ``(session, should_close)`` for database work on behalf of ``user_id``.

    When the service was constructed with a session, that session is handed
    back with ``should_close=False``. Otherwise a new session is requested
    from ``get_db_session(user_id)`` and ``should_close=True`` is returned.
    """
    injected = self.db
    if not injected:
        return get_db_session(user_id), True
    return injected, False
|
||||
|
||||
def _ensure_research_persona_columns(self, session_db) -> None:
    """Ensure research_persona columns exist in persona_data table (runtime migration).

    Adds ``research_persona`` and ``research_persona_generated_at`` to
    ``persona_data`` when they are missing. The check runs at most once per
    service instance: ``_research_persona_cols_checked`` is set afterwards
    whether or not the migration succeeded.

    Args:
        session_db: Database session used for the check and the ``ALTER TABLE``
            statements. Its ``bind.url`` decides between the SQLite path and
            the PostgreSQL-style path.

    Raises:
        Exception: Any error raised while inspecting or altering the table is
            logged, the session is rolled back, and the error is re-raised.
    """
    if self._research_persona_cols_checked:
        return

    try:
        # Check if columns exist using PRAGMA (SQLite) or a probe query (other backends)
        db_url = str(session_db.bind.url) if session_db.bind else ""

        if 'sqlite' in db_url.lower():
            # SQLite: Use PRAGMA to check columns
            result = session_db.execute(text("PRAGMA table_info(persona_data)"))
            cols = {row[1] for row in result}  # Column name is at index 1

            if 'research_persona' not in cols:
                logger.info("Adding missing column research_persona to persona_data table")
                session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona JSON"))
                session_db.commit()

            if 'research_persona_generated_at' not in cols:
                logger.info("Adding missing column research_persona_generated_at to persona_data table")
                session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona_generated_at TIMESTAMP"))
                session_db.commit()
        else:
            # Every non-SQLite backend is treated as PostgreSQL (JSONB below).
            # Try to query the columns (will fail if they don't exist).
            try:
                session_db.execute(text("SELECT research_persona, research_persona_generated_at FROM persona_data LIMIT 0"))
            except Exception:
                # The failed probe leaves the PostgreSQL transaction aborted;
                # it has to be rolled back before any further statement can run.
                session_db.rollback()
                logger.info("Adding missing columns research_persona and research_persona_generated_at to persona_data table")
                try:
                    # IF NOT EXISTS covers the case where only one of the two
                    # columns is missing (the probe fails for either).
                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN IF NOT EXISTS research_persona JSONB"))
                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN IF NOT EXISTS research_persona_generated_at TIMESTAMP"))
                    session_db.commit()
                except Exception as alter_err:
                    logger.error(f"Failed to add research_persona columns: {alter_err}")
                    session_db.rollback()
                    raise
    except Exception as e:
        logger.error(f"Error ensuring research_persona columns: {e}")
        session_db.rollback()
        raise
    finally:
        # NOTE(review): the flag is set even when the migration failed, so a
        # failed attempt is never retried by this instance - confirm intended.
        self._research_persona_cols_checked = True
|
||||
|
||||
def get_cached_only(
|
||||
self,
|
||||
@@ -46,9 +99,16 @@ class ResearchPersonaService:
|
||||
Returns:
|
||||
ResearchPersona if exists in database, None otherwise
|
||||
"""
|
||||
db = None
|
||||
should_close = False
|
||||
try:
|
||||
db, should_close = self._get_session(user_id)
|
||||
if not db:
|
||||
logger.error(f"Could not get database session for user {user_id}")
|
||||
return None
|
||||
|
||||
# Get persona data record
|
||||
persona_data = self._get_persona_data_record(user_id)
|
||||
persona_data = self._get_persona_data_record(user_id, db)
|
||||
|
||||
if not persona_data:
|
||||
logger.debug(f"[get_cached_only] No persona data record found for user {user_id}")
|
||||
@@ -110,6 +170,9 @@ class ResearchPersonaService:
|
||||
except Exception as e:
|
||||
logger.error(f"[get_cached_only] ❌ Error getting research persona for user {user_id}: {e}", exc_info=True)
|
||||
return None
|
||||
finally:
|
||||
if should_close and db:
|
||||
db.close()
|
||||
|
||||
def get_or_generate(
|
||||
self,
|
||||
@@ -126,9 +189,16 @@ class ResearchPersonaService:
|
||||
Returns:
|
||||
ResearchPersona if successful, None otherwise
|
||||
"""
|
||||
db = None
|
||||
should_close = False
|
||||
try:
|
||||
db, should_close = self._get_session(user_id)
|
||||
if not db:
|
||||
logger.error(f"Could not get database session for get_or_generate (user {user_id})")
|
||||
return None
|
||||
|
||||
# Get persona data record
|
||||
persona_data = self._get_persona_data_record(user_id)
|
||||
persona_data = self._get_persona_data_record(user_id, db)
|
||||
|
||||
if not persona_data:
|
||||
logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
|
||||
@@ -168,18 +238,14 @@ class ResearchPersonaService:
|
||||
# 3. Parsing of existing persona failed
|
||||
try:
|
||||
logger.info(f"Generating research persona for user {user_id}")
|
||||
research_persona = self.generate_research_persona(user_id)
|
||||
research_persona = self.generate_research_persona(user_id, db)
|
||||
except HTTPException:
|
||||
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
|
||||
raise
|
||||
|
||||
if research_persona:
|
||||
# Save to database
|
||||
if self.save_research_persona(user_id, research_persona):
|
||||
logger.info(f"✅ Research persona generated and saved for user {user_id}")
|
||||
else:
|
||||
logger.warning(f"Failed to save research persona for user {user_id}")
|
||||
|
||||
# generate_research_persona saves it automatically now
|
||||
logger.info(f"✅ Research persona generated and saved for user {user_id}")
|
||||
return research_persona
|
||||
else:
|
||||
# Log detailed error for debugging expensive failures
|
||||
@@ -196,22 +262,36 @@ class ResearchPersonaService:
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting/generating research persona for user {user_id}: {e}")
|
||||
return None
|
||||
finally:
|
||||
if should_close and db:
|
||||
db.close()
|
||||
|
||||
def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
|
||||
def generate_research_persona(self, user_id: str, db=None) -> Optional[ResearchPersona]:
|
||||
"""
|
||||
Generate a new research persona for the user.
|
||||
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
db: Optional database session
|
||||
|
||||
Returns:
|
||||
ResearchPersona if successful, None otherwise
|
||||
"""
|
||||
session_db = None
|
||||
should_close = False
|
||||
try:
|
||||
session_db = db
|
||||
if not session_db:
|
||||
session_db, should_close = self._get_session(user_id)
|
||||
|
||||
if not session_db:
|
||||
logger.error(f"Could not get database session for generate_research_persona (user {user_id})")
|
||||
return None
|
||||
|
||||
logger.info(f"Generating research persona for user {user_id}")
|
||||
|
||||
# Collect onboarding data
|
||||
onboarding_data = self._collect_onboarding_data(user_id)
|
||||
onboarding_data = self._collect_onboarding_data(user_id, session_db)
|
||||
|
||||
if not onboarding_data:
|
||||
logger.warning(f"Insufficient onboarding data for user {user_id}")
|
||||
@@ -275,6 +355,12 @@ class ResearchPersonaService:
|
||||
try:
|
||||
research_persona = ResearchPersona(**persona_dict)
|
||||
logger.info(f"✅ Research persona generated successfully for user {user_id}")
|
||||
|
||||
# Save the generated persona
|
||||
save_success = self.save_research_persona(user_id, research_persona, session_db)
|
||||
if not save_success:
|
||||
logger.warning(f"Failed to save generated persona for user {user_id}")
|
||||
|
||||
return research_persona
|
||||
except Exception as validation_error:
|
||||
logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
|
||||
@@ -297,6 +383,9 @@ class ResearchPersonaService:
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating research persona for user {user_id}: {e}")
|
||||
return None
|
||||
finally:
|
||||
if should_close and session_db:
|
||||
session_db.close()
|
||||
|
||||
def is_cache_valid(self, persona_data: PersonaData) -> bool:
|
||||
"""
|
||||
@@ -323,7 +412,8 @@ class ResearchPersonaService:
|
||||
def save_research_persona(
|
||||
self,
|
||||
user_id: str,
|
||||
research_persona: ResearchPersona
|
||||
research_persona: ResearchPersona,
|
||||
db=None
|
||||
) -> bool:
|
||||
"""
|
||||
Save research persona to database.
|
||||
@@ -331,12 +421,23 @@ class ResearchPersonaService:
|
||||
Args:
|
||||
user_id: User ID (Clerk string)
|
||||
research_persona: ResearchPersona to save
|
||||
db: Optional database session
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
"""
|
||||
session_db = None
|
||||
should_close = False
|
||||
try:
|
||||
persona_data = self._get_persona_data_record(user_id)
|
||||
session_db = db
|
||||
if not session_db:
|
||||
session_db, should_close = self._get_session(user_id)
|
||||
|
||||
if not session_db:
|
||||
logger.error(f"Could not get database session for save_research_persona (user {user_id})")
|
||||
return False
|
||||
|
||||
persona_data = self._get_persona_data_record(user_id, session_db)
|
||||
|
||||
if not persona_data:
|
||||
logger.error(f"No persona data record found for user {user_id}")
|
||||
@@ -349,24 +450,33 @@ class ResearchPersonaService:
|
||||
persona_data.research_persona = persona_dict
|
||||
persona_data.research_persona_generated_at = datetime.utcnow()
|
||||
|
||||
self.db.commit()
|
||||
session_db.commit()
|
||||
|
||||
logger.info(f"✅ Research persona saved for user {user_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving research persona for user {user_id}: {e}")
|
||||
self.db.rollback()
|
||||
if session_db:
|
||||
session_db.rollback()
|
||||
return False
|
||||
finally:
|
||||
if should_close and session_db:
|
||||
session_db.close()
|
||||
|
||||
def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
|
||||
def _get_persona_data_record(self, user_id: str, db=None) -> Optional[PersonaData]:
|
||||
"""Get PersonaData database record for user."""
|
||||
try:
|
||||
session_db = db or self.db
|
||||
if not session_db:
|
||||
logger.error(f"No database session provided for _get_persona_data_record (user {user_id})")
|
||||
return None
|
||||
|
||||
# Ensure research_persona columns exist before querying
|
||||
self.onboarding_service._ensure_research_persona_columns(self.db)
|
||||
self._ensure_research_persona_columns(session_db)
|
||||
|
||||
# Get onboarding session
|
||||
session = self.db.query(OnboardingSession).filter(
|
||||
session = session_db.query(OnboardingSession).filter(
|
||||
OnboardingSession.user_id == user_id
|
||||
).first()
|
||||
|
||||
@@ -374,7 +484,7 @@ class ResearchPersonaService:
|
||||
return None
|
||||
|
||||
# Get persona data
|
||||
persona_data = self.db.query(PersonaData).filter(
|
||||
persona_data = session_db.query(PersonaData).filter(
|
||||
PersonaData.session_id == session.id
|
||||
).first()
|
||||
|
||||
@@ -384,7 +494,7 @@ class ResearchPersonaService:
|
||||
logger.error(f"Error getting persona data record for user {user_id}: {e}")
|
||||
return None
|
||||
|
||||
def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
|
||||
def _collect_onboarding_data(self, user_id: str, db=None) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Collect all onboarding data needed for research persona generation.
|
||||
|
||||
@@ -392,40 +502,44 @@ class ResearchPersonaService:
|
||||
Dictionary with website_analysis, persona_data, research_preferences, business_info
|
||||
"""
|
||||
try:
|
||||
# Get website analysis
|
||||
website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}
|
||||
session_db = db or self.db
|
||||
if not session_db:
|
||||
logger.error(f"No database session provided for _collect_onboarding_data (user {user_id})")
|
||||
return None
|
||||
|
||||
# Get integrated data via SSOT
|
||||
integrated_data = self.integration_service.get_integrated_data_sync(user_id, session_db)
|
||||
|
||||
# Get persona data
|
||||
persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}
|
||||
if not integrated_data:
|
||||
logger.warning(f"No integrated data found for user {user_id}")
|
||||
return None
|
||||
|
||||
website_analysis = integrated_data.get('website_analysis', {})
|
||||
persona_data_dict = integrated_data.get('persona_data', {})
|
||||
research_prefs = integrated_data.get('research_preferences', {})
|
||||
canonical_profile = integrated_data.get('canonical_profile', {})
|
||||
|
||||
# Get research preferences
|
||||
research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}
|
||||
|
||||
# Get business info - construct from persona data and website analysis
|
||||
business_info = {}
|
||||
canonical_business = canonical_profile.get('business_info')
|
||||
if isinstance(canonical_business, dict):
|
||||
business_info.update(canonical_business)
|
||||
|
||||
# Use canonical profile data (SSOT) instead of manual logic if possible
|
||||
# The canonical profile already handles logic for industry/target_audience from various sources
|
||||
if not business_info.get('industry') and canonical_profile.get('industry'):
|
||||
business_info['industry'] = canonical_profile.get('industry')
|
||||
|
||||
# Try to extract from persona data
|
||||
if persona_data_dict:
|
||||
core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
|
||||
if core_persona:
|
||||
if core_persona.get('industry'):
|
||||
business_info['industry'] = core_persona['industry']
|
||||
if core_persona.get('target_audience'):
|
||||
business_info['target_audience'] = core_persona['target_audience']
|
||||
if not business_info.get('target_audience') and canonical_profile.get('target_audience'):
|
||||
business_info['target_audience'] = canonical_profile.get('target_audience')
|
||||
|
||||
# Fallback to website analysis if not in persona
|
||||
# Fallback logic if canonical profile is missing these (though it should have them)
|
||||
if not business_info.get('industry') and website_analysis:
|
||||
target_audience_data = website_analysis.get('target_audience', {})
|
||||
if isinstance(target_audience_data, dict):
|
||||
industry_focus = target_audience_data.get('industry_focus')
|
||||
if industry_focus:
|
||||
business_info['industry'] = industry_focus
|
||||
demographics = target_audience_data.get('demographics')
|
||||
if demographics:
|
||||
business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)
|
||||
|
||||
# Check if we have enough data - be more lenient since we can infer from minimal data
|
||||
# We need at least some basic information to generate a meaningful persona
|
||||
has_basic_data = bool(
|
||||
website_analysis or
|
||||
persona_data_dict or
|
||||
@@ -457,20 +571,17 @@ class ResearchPersonaService:
|
||||
business_info['inferred'] = True
|
||||
|
||||
# Get competitor analysis data (if available)
|
||||
competitor_analysis = None
|
||||
try:
|
||||
competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
|
||||
if competitor_analysis:
|
||||
logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")
|
||||
# Use SSOT (Integrated data contains competitor info)
|
||||
competitor_analysis = integrated_data.get('competitor_analysis')
|
||||
if not competitor_analysis:
|
||||
competitor_analysis = []
|
||||
|
||||
return {
|
||||
"website_analysis": website_analysis,
|
||||
"persona_data": persona_data_dict,
|
||||
"research_preferences": research_prefs,
|
||||
"business_info": business_info,
|
||||
"competitor_analysis": competitor_analysis # Add competitor data for better preset generation
|
||||
"competitor_analysis": competitor_analysis
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -258,6 +258,112 @@ class TavilyService:
|
||||
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
|
||||
|
||||
return results
|
||||
|
||||
async def crawl(
    self,
    url: str,
    limit: int = 50,
    max_depth: int = 1,
    max_breadth: int = 20,
    extract_depth: str = "basic",
    include_favicon: bool = False,
    instructions: str = "",
    allow_external: bool = True
) -> Dict[str, Any]:
    """
    Crawl a website using Tavily API.

    Sends a single POST request to ``{self.base_url}/crawl`` and never raises:
    every failure is reported through the returned dict.

    Args:
        url: The root URL to begin the crawl
        limit: Total number of links the crawler will process
        max_depth: Max depth of the crawl
        max_breadth: Max number of links to follow per level
        extract_depth: 'basic' or 'advanced'
        include_favicon: Whether to include favicon
        instructions: Natural language instructions for the crawler
            (omitted from the request when empty)
        allow_external: Whether to return external links

    Returns:
        On HTTP 200: ``{"success": True, "results": [...], "timestamp": ...}``.
        Otherwise: ``{"success": False, "error": ..., "details": ...}``.
    """
    try:
        self._try_initialize()
        if not self.enabled:
            raise ValueError("Tavily Service is not enabled - API key missing")

        logger.info(f"Starting Tavily crawl for: {url}")

        request_payload = {
            # Kept in the body for backward compatibility; the key is also
            # sent as a Bearer token below.
            "api_key": self.api_key,
            "url": url,
            "limit": limit,
            "max_depth": max_depth,
            "max_breadth": max_breadth,
            "extract_depth": extract_depth,
            "include_favicon": include_favicon,
            "allow_external": allow_external
        }
        # Only send instructions when the caller actually provided some.
        if instructions:
            request_payload["instructions"] = instructions

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        endpoint = f"{self.base_url}/crawl"

        async with aiohttp.ClientSession() as session:
            async with session.post(
                endpoint,
                json=request_payload,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=300)  # Crawling takes longer
            ) as response:
                if response.status == 200:
                    result = await response.json()
                    logger.info("Tavily crawl completed successfully.")
                    return {
                        "success": True,
                        # NOTE(review): assumes the response carries a "results" list - confirm against the API.
                        "results": result.get("results", []),
                        "timestamp": datetime.utcnow().isoformat()
                    }
                else:
                    error_text = await response.text()
                    logger.error(f"Tavily Crawl API error: {response.status} - {error_text}")
                    return {
                        "success": False,
                        "error": f"Tavily API error: {response.status}",
                        "details": error_text
                    }

    except Exception as e:
        logger.error(f"Error in Tavily crawl: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "details": "An unexpected error occurred during crawl"
        }
|
||||
|
||||
async def search_industry_trends(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user