Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

This commit is contained in:
ajaysi
2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions

View File

@@ -0,0 +1,603 @@
from __future__ import annotations
import asyncio
import json
import re
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse
from services.component_logic.web_crawler_logic import WebCrawlerLogic
from services.llm_providers.main_text_generation import llm_text_gen
from services.ai_service_manager import AIServiceManager, AIServiceType
from services.seo_tools.sitemap_service import SitemapService
from services.seo.advertools_service import AdvertoolsService
from utils.logger_utils import get_service_logger
logger = get_service_logger("deep_competitor_analysis")
class DeepCompetitorAnalysisService:
def __init__(self):
    """Instantiate the crawler and Advertools helpers used by this service."""
    # Used by _crawl_competitors to fetch each competitor URL.
    self.crawler = WebCrawlerLogic()
    # Used by generate_weekly_strategy_brief for sitemap and content audits.
    self.advertools = AdvertoolsService()
async def run(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]],
    max_competitors: int = 25,
    crawl_concurrency: int = 4
) -> Dict[str, Any]:
    """Run the full competitor analysis pipeline.

    Builds the user's baseline, normalizes and crawls the competitor list,
    runs one AI analysis per crawled competitor and finally asks the AI for
    an aggregated market view.

    Args:
        user_id: Forwarded to the AI helper calls.
        website_analysis: The user's own website analysis (baseline source).
        competitors: Raw competitor records to normalize and crawl.
        max_competitors: Upper bound handed to the normalizer.
        crawl_concurrency: Upper bound on simultaneous crawls.

    Returns:
        A dict with ``baseline``, ``competitors`` (input, extraction and AI
        analysis per competitor), ``aggregation`` and ``metadata``.
    """
    user_baseline = self._build_baseline(website_analysis)
    targets = self._normalize_competitors(competitors, max_competitors=max_competitors)
    crawled = await self._crawl_competitors(targets, crawl_concurrency=crawl_concurrency)

    # Competitors are analyzed one after another, in crawl-result order.
    analyzed: List[Dict[str, Any]] = []
    for target, crawl_result in crawled:
        artifact = self._build_extraction_artifact(target, crawl_result)
        verdict = await self._analyze_competitor_with_ai(
            user_id=user_id,
            baseline=user_baseline,
            competitor_input=target,
            extraction=artifact,
        )
        analyzed.append({
            "input": target,
            "extraction": artifact,
            "ai_analysis": verdict,
        })

    market_view = await self._aggregate_with_ai(
        user_id=user_id,
        baseline=user_baseline,
        competitors=analyzed,
    )

    run_metadata = {
        "generated_at": datetime.utcnow().isoformat(),
        "competitors_requested": len(targets),
        "competitors_analyzed": len(analyzed),
        "crawl_concurrency": crawl_concurrency,
    }
    return {
        "baseline": user_baseline,
        "competitors": analyzed,
        "aggregation": market_view,
        "metadata": run_metadata,
    }
async def generate_weekly_strategy_brief(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Generates a weekly strategic intelligence brief by analyzing
    recent competitor changes and market shifts.

    At most 10 normalized competitors are inspected. For each one the
    sitemap is analyzed (SitemapService + Advertools) to derive publishing
    velocity, recent topics and theme data. A competitor whose sitemap
    processing raises is logged and left out of the brief. The collected
    data is then handed to the AI manager for strategic insights and
    content-gap recommendations.

    Args:
        user_id: Forwarded to the sitemap and AI services.
        website_analysis: The user's own website analysis (baseline source).
        competitors: Raw competitor records; normalized before use.

    Returns:
        A report dict with ``week_commencing``, ``generated_at``,
        ``metrics``, ``insights`` and ``raw_data`` keys.
    """
    sitemap_service = SitemapService()
    ai_manager = AIServiceManager()

    # Stage 1: Data Collection (User + Competitors)
    baseline = self._build_baseline(website_analysis)
    normalized_competitors = self._normalize_competitors(competitors, max_competitors=10)

    # Fetch competitor sitemaps for recent changes
    competitor_changes = []
    seven_days_ago = datetime.utcnow() - timedelta(days=7)
    ninety_days_ago = datetime.utcnow() - timedelta(days=90)

    for comp in normalized_competitors:
        try:
            # Stage 1: Advertools Deep Intelligence
            # Discover exact sitemap URL first (essential for Advertools)
            discovered_sitemap = await sitemap_service.discover_sitemap_url(comp['url'])
            effective_url = discovered_sitemap if discovered_sitemap else comp['url']
            adv_result = await self.advertools.analyze_sitemap(effective_url)

            # Advertools metrics are optional enrichment: a missing or failed
            # result must not throw away the sitemap analysis below.
            adv_metrics: Dict[str, Any] = {}
            if isinstance(adv_result, dict) and adv_result.get('success'):
                adv_metrics = adv_result.get('metrics') or {}

            # REUSE: Use existing SitemapService.analyze_sitemap for robust Stage 1 & 2
            analysis_result = await sitemap_service.analyze_sitemap(
                sitemap_url=effective_url,
                analyze_content_trends=True,
                analyze_publishing_patterns=True,
                include_ai_insights=False,
                user_id=user_id
            )
            if not analysis_result or not analysis_result.get('urls'):
                continue

            urls = analysis_result['urls']
            # Only dict entries carry the lastmod/loc fields read below.
            url_entries = [u for u in urls if isinstance(u, dict)]
            structure = analysis_result.get('structure_analysis') or {}

            # Enhancement 1: Keyword Clustering (NLP from URLs) - REUSE from SitemapService
            keyword_clusters = structure.get('keyword_clusters') or {}
            # Enhancement 2: Strategic Pillar Mapping - REUSE from SitemapService
            pillars = structure.get('strategic_pillars') or {}
            # Enhancement 3: Advertools Site Hierarchy (from folders)
            site_hierarchy = adv_metrics.get('top_pillars', {})

            # Enhancement 4: Content Cadence Trend (Last 7 days vs 90 days)
            recent_urls = [u for u in url_entries if self._is_newer_than(u.get('lastmod'), seven_days_ago)]
            historical_urls = [u for u in url_entries if self._is_newer_than(u.get('lastmod'), ninety_days_ago)]
            recent_velocity = len(recent_urls) / 7
            historical_velocity = len(historical_urls) / 90
            # The 0.01 floor avoids division by zero when nothing was published in 90 days.
            cadence_shift = ((recent_velocity - historical_velocity) / max(historical_velocity, 0.01)) * 100

            # Entries without a location cannot be audited or turned into topics.
            recent_locs = [u['loc'] for u in recent_urls if u.get('loc')]

            # Advertools Word Frequency (Audit top 5 recent URLs)
            top_themes = []
            audit_urls = recent_locs[:5]
            if audit_urls:
                # Use thread-safe audit_content from AdvertoolsService
                audit_result = await self.advertools.audit_content(audit_urls)
                if isinstance(audit_result, dict) and audit_result.get('success'):
                    top_themes = audit_result.get('themes', [])

            competitor_changes.append({
                "domain": comp['domain'],
                "name": comp['name'],
                "new_content_count": len(recent_urls),
                "recent_topics": [self._extract_topic_from_url(loc) for loc in recent_locs[:10]],
                "total_pages": len(urls),
                "keyword_clusters": keyword_clusters,
                "strategic_pillars": pillars,
                "site_hierarchy": site_hierarchy,
                "top_themes": top_themes,
                "cadence_shift_percent": round(cadence_shift, 1),
                "publishing_velocity": round(recent_velocity, 2),
                "stale_content_pct": adv_metrics.get('stale_content_percentage', 0)
            })
        except Exception as e:
            # Best effort: one unreachable competitor must not abort the brief.
            logger.warning(f"Failed to fetch sitemap for {comp['domain']}: {e}")

    # Stage 2: Differential Analysis (Non-AI Aggregation)
    if competitor_changes:
        avg_competitor_velocity = (
            sum(c['publishing_velocity'] for c in competitor_changes) / len(competitor_changes)
        )
    else:
        avg_competitor_velocity = 0
    market_clusters = self._aggregate_clusters([c['keyword_clusters'] for c in competitor_changes])

    # Stage 3: AI Strategic Intelligence
    # Extract rich user context from baseline
    brand_analysis = baseline.get("brand_analysis", {})
    seo_audit = baseline.get("seo_audit", {})
    user_niche = brand_analysis.get("industry") or "General Business"
    user_topics = brand_analysis.get("topics") or []
    seo_keywords = seo_audit.get("keywords")
    # Only fall back to SEO keywords when they are a real sequence; slicing a
    # dict would raise and slicing a string would yield single characters.
    if not user_topics and isinstance(seo_keywords, (list, tuple)) and seo_keywords:
        user_topics = list(seo_keywords[:5])

    analysis_context = {
        "user_profile": {
            "website_url": baseline.get("website_url"),
            "industry": user_niche,
            "niche_description": brand_analysis.get("description") or brand_analysis.get("summary") or "",
            "core_topics": user_topics,
            "target_audience": baseline.get("target_audience") or {},
            "business_objectives": brand_analysis.get("objectives") or "Growth",
            "brand_voice": brand_analysis.get("voice") or "Professional",
            "augmented_themes": brand_analysis.get("augmented_themes", [])  # Added from Advertools
        },
        "market_intelligence": {
            "market_clusters": market_clusters,
            "competitors_analyzed_count": len(competitor_changes),
            "market_opportunities_detected": ["Content Velocity Gap", "Topic Authority Shift", "Stale Content Replacement"],
            "competitor_hierarchies": {c['name']: c['site_hierarchy'] for c in competitor_changes},
            "competitor_content_themes": {c['name']: c['top_themes'] for c in competitor_changes}
        },
        "competitive_landscape_detailed": competitor_changes,
    }

    # Call AI for strategic intelligence
    strategic_intelligence = await ai_manager.generate_strategic_intelligence(analysis_context, user_id=user_id)
    content_gaps = await ai_manager.generate_content_gap_analysis(analysis_context, user_id=user_id)

    # A successful response may still carry an empty or missing insight list;
    # indexing it directly would raise IndexError.
    strategic_insights: List[Any] = []
    if isinstance(strategic_intelligence, dict) and strategic_intelligence.get("success"):
        candidate = (strategic_intelligence.get("data") or {}).get("strategic_insights")
        if isinstance(candidate, list):
            strategic_insights = candidate

    content_recommendations: Any = []
    if isinstance(content_gaps, dict) and content_gaps.get("success"):
        content_recommendations = (content_gaps.get("data") or {}).get("content_recommendations", [])

    # Stage 4: Result Assembly
    report = {
        "week_commencing": seven_days_ago.date().isoformat(),
        "generated_at": datetime.utcnow().isoformat(),
        "metrics": {
            "market_velocity": round(avg_competitor_velocity, 2),
            "market_clusters": market_clusters[:5],
            "aggressive_competitors": [c['name'] for c in competitor_changes if c['cadence_shift_percent'] > 50]
        },
        "insights": {
            "the_big_move": strategic_insights[0] if strategic_insights else {},
            "low_hanging_fruit": content_recommendations,
            "threat_alerts": strategic_insights[1:]
        },
        "raw_data": {
            "competitor_changes": competitor_changes
        }
    }
    return report
def _is_newer_than(self, lastmod: Optional[str], threshold: datetime) -> bool:
if not lastmod:
return False
try:
# Handle various ISO formats
dt_str = lastmod.replace('Z', '+00:00')
return datetime.fromisoformat(dt_str).replace(tzinfo=None) > threshold
except:
return False
def _aggregate_clusters(self, clusters_list: List[Dict[str, int]]) -> List[str]:
"""Aggregate clusters across competitors to find market-wide themes."""
master: Dict[str, int] = {}
for cluster in clusters_list:
for k, v in cluster.items():
master[k] = master.get(k, 0) + 1 # Count competitor occurrences
return sorted(master, key=lambda x: master[x], reverse=True)[:10]
def _extract_topic_from_url(self, url: str) -> str:
"""Helper to get a readable topic from a URL slug."""
try:
path = urlparse(url).path
slug = path.strip('/').split('/')[-1]
return slug.replace('-', ' ').replace('_', ' ').capitalize()
except:
return "New Content"
def _build_baseline(self, website_analysis: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(website_analysis, dict):
website_analysis = {}
baseline = {
"website_url": website_analysis.get("website_url"),
"brand_analysis": website_analysis.get("brand_analysis") or {},
"content_strategy_insights": website_analysis.get("content_strategy_insights") or {},
"seo_audit": website_analysis.get("seo_audit") or {},
"style_guidelines": website_analysis.get("style_guidelines") or {},
"style_patterns": website_analysis.get("style_patterns") or {}
}
return baseline
def _normalize_competitors(self, competitors: List[Dict[str, Any]], *, max_competitors: int) -> List[Dict[str, Any]]:
if not isinstance(competitors, list):
return []
seen_domains = set()
normalized: List[Dict[str, Any]] = []
for comp in competitors:
if not isinstance(comp, dict):
continue
raw_url = comp.get("url") or comp.get("website_url") or comp.get("domain") or ""
url = self._normalize_url(raw_url)
if not url:
continue
domain = self._extract_domain(url)
if not domain or domain in seen_domains:
continue
seen_domains.add(domain)
normalized.append({
"url": url,
"domain": domain,
"name": comp.get("name") or comp.get("title") or domain,
"summary": comp.get("summary") or comp.get("description") or ""
})
if len(normalized) >= max_competitors:
break
return normalized
def _normalize_url(self, raw: str) -> Optional[str]:
if not raw or not isinstance(raw, str):
return None
raw = raw.strip()
if not raw:
return None
if not raw.startswith(("http://", "https://")):
raw = "https://" + raw
try:
parsed = urlparse(raw)
if not parsed.scheme or not parsed.netloc:
return None
return f"{parsed.scheme}://{parsed.netloc}"
except Exception:
return None
def _extract_domain(self, url: str) -> Optional[str]:
try:
parsed = urlparse(url)
domain = (parsed.netloc or "").lower()
if domain.startswith("www."):
domain = domain[4:]
return domain or None
except Exception:
return None
async def _crawl_competitors(
self,
competitors: List[Dict[str, Any]],
*,
crawl_concurrency: int
) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
semaphore = asyncio.Semaphore(max(1, int(crawl_concurrency)))
async def crawl_one(comp: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
async with semaphore:
url = comp.get("url")
if not url:
return comp, {"success": False, "error": "missing_url"}
try:
return comp, await self.crawler.crawl_website(url)
except Exception as e:
return comp, {"success": False, "error": str(e)}
tasks = [crawl_one(c) for c in competitors]
return await asyncio.gather(*tasks)
def _build_extraction_artifact(self, competitor_input: Dict[str, Any], crawl_result: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(crawl_result, dict) or not crawl_result.get("success"):
return {
"fetch_status": {
"status": "failed",
"error": crawl_result.get("error") if isinstance(crawl_result, dict) else "unknown_error"
}
}
content = crawl_result.get("content") if isinstance(crawl_result.get("content"), dict) else {}
title = content.get("title") or ""
description = content.get("description") or ""
headings = content.get("headings") if isinstance(content.get("headings"), list) else []
links = content.get("links") if isinstance(content.get("links"), list) else []
meta_tags = content.get("meta_tags") if isinstance(content.get("meta_tags"), dict) else {}
main_content = content.get("main_content") or ""
content_structure = content.get("content_structure") if isinstance(content.get("content_structure"), dict) else {}
nav_labels = self._extract_nav_labels(links)
h1_h2 = [h for h in headings if isinstance(h, str)][:25]
cta_signals = self._extract_cta_signals(main_content, links)
proof_signals = self._extract_proof_signals(main_content, links)
excerpt = main_content.strip()
if len(excerpt) > 2000:
excerpt = excerpt[:2000]
return {
"fetch_status": {
"status": "ok",
"fetched_url": crawl_result.get("url"),
"timestamp": crawl_result.get("timestamp")
},
"page_meta": {
"title": title,
"meta_description": description,
"og_title": meta_tags.get("og:title"),
"og_description": meta_tags.get("og:description")
},
"structure": {
"headings": h1_h2,
"nav_labels": nav_labels,
"content_structure": content_structure
},
"signals": {
"cta_signals": cta_signals,
"proof_signals": proof_signals
},
"content_excerpt": excerpt
}
def _extract_nav_labels(self, links: List[Dict[str, Any]]) -> List[str]:
labels: List[str] = []
for link in links[:200]:
if not isinstance(link, dict):
continue
text = (link.get("text") or "").strip()
if not text or len(text) > 50:
continue
labels.append(text)
deduped: List[str] = []
seen = set()
for label in labels:
key = label.lower()
if key in seen:
continue
seen.add(key)
deduped.append(label)
if len(deduped) >= 25:
break
return deduped
def _extract_cta_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
text = (main_content or "").lower()
keywords = ["get started", "start", "book", "demo", "trial", "pricing", "contact", "signup", "sign up", "subscribe"]
keyword_hits = [k for k in keywords if k in text]
link_texts = []
for link in links[:200]:
if isinstance(link, dict):
t = (link.get("text") or "").strip()
if t:
link_texts.append(t.lower())
cta_link_hits = [k for k in keywords if any(k in lt for lt in link_texts)]
return {
"keyword_hits": keyword_hits[:10],
"link_cta_hits": list(dict.fromkeys(cta_link_hits))[:10]
}
def _extract_proof_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
text = (main_content or "").lower()
proof_keywords = ["case study", "testimonials", "customers", "trusted by", "reviews", "awards", "partners"]
hits = [k for k in proof_keywords if k in text]
link_hits = []
for link in links[:200]:
if not isinstance(link, dict):
continue
href = (link.get("href") or "").lower()
if any(k.replace(" ", "") in href.replace("-", "").replace("_", "") for k in ["case study", "testimonials", "customers"]):
link_hits.append(href)
return {
"keyword_hits": hits[:10],
"supporting_links": link_hits[:10]
}
async def _analyze_competitor_with_ai(
self,
*,
user_id: str,
baseline: Dict[str, Any],
competitor_input: Dict[str, Any],
extraction: Dict[str, Any]
) -> Dict[str, Any]:
if not isinstance(extraction, dict) or extraction.get("fetch_status", {}).get("status") != "ok":
return {
"status": "skipped",
"reason": "crawl_failed"
}
json_struct = {
"positioning": {
"value_prop": "string",
"target_audience": "string",
"market_tier": "string",
"primary_offer": "string"
},
"content_strategy": {
"themes": ["string"],
"messaging_angles": ["string"],
"cta_patterns": ["string"],
"tone_markers": ["string"]
},
"competitive_advantages": ["string"],
"weaknesses_or_risks": ["string"],
"comparison_to_user_baseline": {
"overlaps": ["string"],
"deltas": ["string"],
"opportunities": ["string"]
},
"confidence": {
"overall": "number",
"notes": ["string"]
}
}
prompt = (
"You are a competitive intelligence analyst.\n"
"Analyze the competitor homepage extraction and compare it to the user's Step 2 baseline insights.\n"
"Return strictly the requested JSON.\n\n"
f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n"
f"Competitor input: {json.dumps(competitor_input, ensure_ascii=False)}\n\n"
f"Homepage extraction: {json.dumps(extraction, ensure_ascii=False)}\n"
)
try:
raw = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
parsed = self._safe_json_parse(raw)
if isinstance(parsed, dict):
return parsed
return {"status": "failed", "error": "invalid_ai_json"}
except Exception as e:
logger.warning(f"AI competitor analysis failed for {competitor_input.get('domain')}: {e}")
return {"status": "failed", "error": str(e)}
async def _aggregate_with_ai(
    self,
    *,
    user_id: str,
    baseline: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """Ask the LLM for a market-level view across all analyzed competitors.

    Only the domain, name and AI analysis of each competitor are sent to
    the model. Returns the parsed JSON object, or ``{"warnings": [...]}``
    when the call raises or the reply is not a JSON object.
    """
    # Shape the model is asked to fill in; the leaf values are type placeholders.
    cluster_shape = {
        "cluster_name": "string",
        "description": "string",
        "competitors": ["string"]
    }
    gap_shape = {
        "gap": "string",
        "why_it_matters": "string",
        "recommended_content_types": ["string"],
        "impact": "string",
        "effort": "string"
    }
    recommendation_shape = {
        "action": "string",
        "expected_impact": "string",
        "effort": "string",
        "first_steps": ["string"]
    }
    json_struct = {
        "market_map": {
            "clusters": [cluster_shape]
        },
        "common_patterns": {
            "common_themes": ["string"],
            "common_ctas": ["string"],
            "common_proof_signals": ["string"]
        },
        "content_gaps_and_opportunities": [gap_shape],
        "strategic_recommendations": [recommendation_shape],
        "warnings": ["string"]
    }

    # Keep the prompt small: drop the raw extraction, keep identity + analysis.
    summaries = []
    for entry in competitors:
        if not isinstance(entry, dict):
            continue
        profile = entry.get("input")
        analysis = entry.get("ai_analysis")
        if isinstance(profile, dict) and isinstance(analysis, dict):
            summaries.append({
                "domain": profile.get("domain"),
                "name": profile.get("name"),
                "ai_analysis": analysis
            })

    baseline_json = json.dumps(baseline, ensure_ascii=False)
    summaries_json = json.dumps(summaries, ensure_ascii=False)
    prompt = "".join([
        "You are a senior strategy consultant.\n",
        "Using the user's Step 2 baseline insights and per-competitor analyses, produce an aggregated market view.\n",
        "Return strictly the requested JSON.\n\n",
        f"User baseline (Step 2 insights): {baseline_json}\n\n",
        f"Per-competitor analyses: {summaries_json}\n",
    ])

    try:
        reply = llm_text_gen(prompt, json_struct=json_struct, user_id=user_id)
        decoded = self._safe_json_parse(reply)
    except Exception as e:
        logger.warning(f"AI aggregation failed: {e}")
        return {"warnings": [str(e)]}

    if not isinstance(decoded, dict):
        return {"warnings": ["invalid_ai_json"]}
    return decoded
def _safe_json_parse(self, text: str) -> Any:
if not isinstance(text, str):
return None
cleaned = text.strip()
cleaned = re.sub(r"^```json\\s*", "", cleaned)
cleaned = re.sub(r"^```\\s*", "", cleaned)
cleaned = re.sub(r"```\\s*$", "", cleaned)
cleaned = cleaned.strip()
try:
return json.loads(cleaned)
except Exception:
match = re.search(r"\\{[\\s\\S]*\\}", cleaned)
if match:
try:
return json.loads(match.group(0))
except Exception:
return None
return None

View File

@@ -0,0 +1,270 @@
"""
Deep Crawl Service for Onboarding Step 3
Handles deep crawling of user's website, combining Sitemap and Tavily data.
"""
import os
import asyncio
import httpx
from typing import Dict, List, Any, Optional
from datetime import datetime
from loguru import logger
from urllib.parse import urlparse
from services.seo_tools.sitemap_service import SitemapService
from services.research.tavily_service import TavilyService
from services.database import get_session_for_user
from models.crawled_content import EndUserWebsiteContent
from models.website_analysis_monitoring_models import DeepWebsiteCrawlTask, DeepWebsiteCrawlExecutionLog
class DeepCrawlService:
def __init__(self):
    """Instantiate the sitemap and Tavily helpers used by the deep crawl."""
    # Used to discover and read the site's sitemap.
    self.sitemap_service = SitemapService()
    # Used for the Tavily crawl step.
    self.tavily_service = TavilyService()
async def execute_deep_crawl(self, user_id: str, website_url: str, task_id: Optional[int] = None) -> Dict[str, Any]:
    """
    Execute deep crawl for a user's website.
    1. Fetch URLs from Sitemap.
    2. Crawl using Tavily.
    3. Deduplicate URLs.
    4. Check liveness (status code).
    5. Save content to DB and File.

    Sitemap and Tavily failures are logged and tolerated (the crawl
    continues with whatever URLs were found). When ``task_id`` is given,
    an execution log row is written and the task's bookkeeping fields are
    updated on both success and failure.

    Returns:
        A dict with ``success``, ``total_urls``, ``sitemap_urls``,
        ``tavily_urls`` and ``processed_urls``.

    Raises:
        Exception: if no database session is available, or re-raised from
            any unexpected failure during the crawl.
    """
    logger.info(f"Starting deep crawl for {website_url} (User: {user_id})")
    execution_start = datetime.utcnow()
    db = get_session_for_user(user_id)
    if not db:
        raise Exception("Database connection failed")
    try:
        # 1. Sitemap Discovery
        sitemap_urls = set()
        try:
            # Discover sitemap URL
            sitemap_url = await self.sitemap_service.discover_sitemap_url(website_url)
            if not sitemap_url:
                sitemap_url = f"{website_url.rstrip('/')}/sitemap.xml"
            # Analyze sitemap to get URLs
            # We use analyze_sitemap directly to get raw URLs
            sitemap_data = await self.sitemap_service.analyze_sitemap(sitemap_url)
            for url_entry in sitemap_data.get("urls", []):
                if isinstance(url_entry, dict) and "loc" in url_entry:
                    sitemap_urls.add(url_entry["loc"])
            logger.info(f"Found {len(sitemap_urls)} URLs from sitemap")
        except Exception as e:
            logger.warning(f"Sitemap analysis failed: {e}")

        # 2. Tavily Crawl
        tavily_urls = set()
        tavily_results = []
        try:
            # Use intelligent instructions
            instructions = "Find all blog posts, articles, and main content pages. Ignore login, signup, and admin pages."
            crawl_result = await self.tavily_service.crawl(
                url=website_url,
                limit=50,  # Limit to avoid excessive costs/time
                max_depth=2,
                extract_depth="basic",
                instructions=instructions
            )
            if crawl_result.get("success"):
                for res in crawl_result.get("results", []):
                    url = res.get("url")
                    if url:
                        tavily_urls.add(url)
                        tavily_results.append(res)
            logger.info(f"Found {len(tavily_urls)} URLs from Tavily")
        except Exception as e:
            logger.warning(f"Tavily crawl failed: {e}")

        # 3. Merge and Deduplicate
        unique_urls = list(sitemap_urls.union(tavily_urls))
        logger.info(f"Total unique URLs to process: {len(unique_urls)}")

        # 4. Process URLs (Liveness & Save)
        success_count = 0
        # Documents are written below workspace/workspace_{user_id}/, relative
        # to the process working directory.
        workspace_dir = f"workspace/workspace_{user_id}/crawled_content"
        os.makedirs(workspace_dir, exist_ok=True)

        # Limit concurrent checks
        sem = asyncio.Semaphore(10)

        async def process_url(url):
            async with sem:
                return await self._process_single_url(url, user_id, website_url, workspace_dir, tavily_results)

        tasks = [process_url(url) for url in unique_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        processed_data = []
        # Save results to DB
        for res in results:
            if not isinstance(res, dict):
                # gather(return_exceptions=True) hands back the exception object;
                # surface it instead of dropping the URL silently.
                logger.warning(f"URL processing failed: {res}")
                continue
            processed_data.append(res)
            if res.get("status_code") and 200 <= res.get("status_code") < 300:
                success_count += 1
            # Save to DB
            try:
                existing = db.query(EndUserWebsiteContent).filter(
                    EndUserWebsiteContent.user_id == user_id,
                    EndUserWebsiteContent.url == res["url"]
                ).first()
                if existing:
                    existing.content = res.get("content")
                    existing.title = res.get("title")
                    existing.status_code = res.get("status_code")
                    existing.crawled_at = datetime.utcnow()
                else:
                    new_content = EndUserWebsiteContent(
                        user_id=user_id,
                        website_url=website_url,
                        url=res["url"],
                        title=res.get("title"),
                        content=res.get("content"),
                        status_code=res.get("status_code"),
                        crawled_at=datetime.utcnow()
                    )
                    db.add(new_content)
            except Exception as e:
                logger.error(f"Failed to save content to DB for {res['url']}: {e}")
        db.commit()

        # 5. Update Task Log if task_id provided
        if task_id:
            log = DeepWebsiteCrawlExecutionLog(
                task_id=task_id,
                status="success",
                result_data={
                    "total_urls": len(unique_urls),
                    "sitemap_urls": len(sitemap_urls),
                    "tavily_urls": len(tavily_urls),
                    "success_count": success_count,
                    "processed_urls": processed_data[:100]  # Store only a subset to avoid huge JSON
                },
                execution_time_ms=int((datetime.utcnow() - execution_start).total_seconds() * 1000)
            )
            db.add(log)
            # Update task
            task = db.query(DeepWebsiteCrawlTask).filter(DeepWebsiteCrawlTask.id == task_id).first()
            if task:
                task.last_executed = datetime.utcnow()
                task.last_success = datetime.utcnow()
                task.status = "active"
                task.consecutive_failures = 0
            db.commit()

        return {
            "success": True,
            "total_urls": len(unique_urls),
            "sitemap_urls": len(sitemap_urls),
            "tavily_urls": len(tavily_urls),
            "processed_urls": processed_data
        }
    except Exception as e:
        logger.error(f"Deep crawl failed: {e}")
        try:
            # Discard any half-finished transaction first: if the failure came
            # from the database, the session cannot be written to until it is
            # rolled back.
            db.rollback()
            if task_id:
                log = DeepWebsiteCrawlExecutionLog(
                    task_id=task_id,
                    status="failed",
                    error_message=str(e),
                    execution_time_ms=int((datetime.utcnow() - execution_start).total_seconds() * 1000)
                )
                db.add(log)
                task = db.query(DeepWebsiteCrawlTask).filter(DeepWebsiteCrawlTask.id == task_id).first()
                if task:
                    task.last_executed = datetime.utcnow()
                    task.last_failure = datetime.utcnow()
                    task.failure_reason = str(e)
                    # The counter may still be unset on a task that never failed before.
                    task.consecutive_failures = (task.consecutive_failures or 0) + 1
                db.commit()
        except Exception as log_error:
            # Never let failure bookkeeping replace the original error.
            logger.error(f"Failed to record deep crawl failure for task {task_id}: {log_error}")
        # Bare raise keeps the original traceback.
        raise
    finally:
        db.close()
async def _process_single_url(self, url: str, user_id: str, website_url: str, workspace_dir: str, tavily_results: List[Dict]):
    """Check liveness, extract content, and save.

    The URL is requested once. Its status code is the liveness result, and
    when Tavily supplied no content for the URL the same response body is
    used as the page content (no second download). When both content and a
    title are available, a text document is written to ``workspace_dir``.

    Returns:
        A dict with ``url``, ``status_code`` (0 when the request failed),
        ``error``, ``title`` and ``content``.
    """
    status_code = None
    error = None
    content = None
    title = None
    response = None

    # 1. Liveness Check
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            response = await client.get(url)
            status_code = response.status_code
    except Exception as e:
        error = str(e)
        status_code = 0  # Failed

    # 2. Get content (from Tavily results or generic extraction if needed)
    # Check if we have content from Tavily
    tavily_match = next((r for r in tavily_results if r.get("url") == url), None)
    if tavily_match:
        content = tavily_match.get("raw_content") or tavily_match.get("content")
        title = tavily_match.get("title")
    elif response is not None and status_code and 200 <= status_code < 300:
        # Reuse the body already downloaded by the liveness request.
        try:
            content = response.text
        except Exception as e:
            logger.warning(f"Failed to decode response body for {url}: {e}")
            content = None
        # Naive title extraction
        if content and "<title>" in content:
            start = content.find("<title>") + 7
            end = content.find("</title>")
            if start > 6 and end > start:
                title = content[start:end]

    # 3. Save to Document
    if content and title:
        safe_title = "".join([c for c in title if c.isalnum() or c in (' ', '-', '_')]).strip()[:50]
        if not safe_title:
            safe_title = "untitled"
        filename = f"{safe_title}_{int(datetime.utcnow().timestamp())}.txt"
        filepath = os.path.join(workspace_dir, filename)
        try:
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {url}\n")
                f.write(f"Title: {title}\n")
                f.write(f"Date: {datetime.utcnow()}\n\n")
                f.write(content)
        except Exception as e:
            logger.warning(f"Failed to write file for {url}: {e}")

    return {
        "url": url,
        "status_code": status_code,
        "error": error,
        "title": title,
        "content": content
    }

View File

@@ -214,25 +214,71 @@ class ExaService:
List of processed competitor data
"""
competitors = []
user_domain = urlparse(user_url).netloc
try:
user_domain = urlparse(user_url).netloc
except Exception:
user_domain = ""
# Extract results from the SDK response
results = getattr(search_result, 'results', [])
# Handle case where search_result might be a dict or an object
if isinstance(search_result, dict):
results = search_result.get('results', [])
else:
results = getattr(search_result, 'results', [])
for result in results:
try:
# Extract basic information from the result object
competitor_url = getattr(result, 'url', '')
competitor_domain = urlparse(competitor_url).netloc
# Helper to safely get attribute or dict key
def get_val(obj, key, default=None):
if isinstance(obj, dict):
return obj.get(key, default)
return getattr(obj, key, default)
# Extract basic information
raw_url = get_val(result, 'url', '')
# Clean URL (remove backticks and whitespace that might be in the response)
competitor_url = raw_url.strip().strip('`').strip() if raw_url else ''
# Skip if it's the same domain as the user
if competitor_domain == user_domain:
# Fallback to ID if URL is missing/empty but ID looks like a URL
if not competitor_url:
raw_id = get_val(result, 'id', '')
cleaned_id = raw_id.strip().strip('`').strip() if raw_id else ''
if cleaned_id and (cleaned_id.startswith('http://') or cleaned_id.startswith('https://')):
competitor_url = cleaned_id
if not competitor_url:
continue
try:
competitor_domain = urlparse(competitor_url).netloc
except Exception:
competitor_domain = ""
# Skip if it's the same domain as the user (fuzzy match)
if user_domain and competitor_domain and (user_domain in competitor_domain or competitor_domain in user_domain):
continue
# Extract content insights
summary = getattr(result, 'summary', '')
highlights = getattr(result, 'highlights', [])
highlight_scores = getattr(result, 'highlight_scores', [])
summary = get_val(result, 'summary', '')
highlights = get_val(result, 'highlights', [])
highlight_scores = get_val(result, 'highlight_scores', [])
subpages = get_val(result, 'subpages', [])
# Ensure subpages are dicts
processed_subpages = []
if subpages:
for sp in subpages:
if isinstance(sp, dict):
processed_subpages.append(sp)
elif hasattr(sp, '__dict__'):
processed_subpages.append(sp.__dict__)
else:
processed_subpages.append({
"id": getattr(sp, 'id', ''),
"url": getattr(sp, 'url', ''),
"title": getattr(sp, 'title', '')
})
subpages = processed_subpages
# Calculate competitive relevance score
relevance_score = self._calculate_relevance_score(result, user_url)
@@ -240,14 +286,15 @@ class ExaService:
competitor_data = {
"url": competitor_url,
"domain": competitor_domain,
"title": getattr(result, 'title', ''),
"published_date": getattr(result, 'published_date', None),
"author": getattr(result, 'author', None),
"favicon": getattr(result, 'favicon', None),
"image": getattr(result, 'image', None),
"title": get_val(result, 'title', ''),
"published_date": get_val(result, 'published_date', None),
"author": get_val(result, 'author', None),
"favicon": get_val(result, 'favicon', None),
"image": get_val(result, 'image', None),
"summary": summary,
"highlights": highlights,
"highlight_scores": highlight_scores,
"subpages": subpages,
"relevance_score": relevance_score,
"competitive_insights": self._extract_competitive_insights(summary, highlights),
"content_analysis": self._analyze_content_quality(result)
@@ -439,6 +486,11 @@ class ExaService:
# Log the raw Exa API response for debugging
logger.info(f"Raw Exa social media response for {user_url}:")
if hasattr(result, 'to_json'):
logger.info(result.to_json())
else:
logger.info(str(result))
logger.info(f" - Request ID: {getattr(result, 'request_id', 'N/A')}")
logger.info(f" └─ Cost: ${getattr(getattr(result, 'cost_dollars', None), 'total', 0)}")
# Note: Full raw response contains verbose content - logging only summary
@@ -477,9 +529,22 @@ class ExaService:
import json
import re
if answer_text.strip().startswith('{'):
logger.warning(f"Parsing Exa answer text: {answer_text[:200]}...")
# Clean markdown code blocks if present
clean_text = answer_text.strip()
if clean_text.startswith('```json'):
clean_text = clean_text[7:]
if clean_text.startswith('```'):
clean_text = clean_text[3:]
if clean_text.endswith('```'):
clean_text = clean_text[:-3]
clean_text = clean_text.strip()
if clean_text.startswith('{'):
# Direct JSON format
answer_data = json.loads(answer_text.strip())
answer_data = json.loads(clean_text)
else:
# Parse markdown format with URLs
answer_data = {

View File

@@ -26,7 +26,7 @@ async def generate_research_persona_task(user_id: str):
logger.info(f"Scheduled research persona generation started for user {user_id}")
# Get database session
db = get_db_session()
db = get_db_session(user_id)
if not db:
logger.error(f"Failed to get database session for research persona generation (user: {user_id})")
return

View File

@@ -9,13 +9,14 @@ from datetime import datetime, timedelta
from loguru import logger
from fastapi import HTTPException
from sqlalchemy import text
from services.database import get_db_session
from models.onboarding import PersonaData, OnboardingSession
from models.research_persona_models import ResearchPersona
from .research_persona_prompt_builder import ResearchPersonaPromptBuilder
from services.llm_providers.main_text_generation import llm_text_gen
from services.onboarding.database_service import OnboardingDatabaseService
from services.persona_data_service import PersonaDataService
from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService
class ResearchPersonaService:
@@ -24,10 +25,62 @@ class ResearchPersonaService:
CACHE_TTL_DAYS = 7 # 7-day cache TTL
def __init__(self, db_session=None):
    """Set up the service and the collaborators it delegates to.

    Args:
        db_session: Optional database session reused for every call. When
            omitted, ``self.db`` stays ``None`` and ``_get_session`` obtains
            a session for the given user on demand.
    """
    # No eager ``get_db_session()`` fallback: its result was overwritten by
    # this assignment straight away, so it could only open a session that was
    # never used or closed.
    self.db = db_session
    self.prompt_builder = ResearchPersonaPromptBuilder()
    self.onboarding_service = OnboardingDatabaseService(db=self.db)
    # NOTE(review): this attribute is reportedly unused inside this service;
    # it is kept so external readers of the attribute keep working - confirm
    # before removing.
    self.persona_data_service = PersonaDataService(db_session=self.db)
    self.integration_service = OnboardingDataIntegrationService()
    # Set once the runtime column check has run for this instance.
    self._research_persona_cols_checked = False
def _get_session(self, user_id: str):
"""Helper to get a database session."""
if self.db:
return self.db, False
return get_db_session(user_id), True
def _ensure_research_persona_columns(self, session_db) -> None:
    """Ensure research_persona columns exist in persona_data table (runtime migration).

    Runs at most once per service instance: the ``_research_persona_cols_checked``
    flag is set in ``finally``, so a failed attempt is not retried either.

    Args:
        session_db: Database session used to inspect and, if needed, alter
            the ``persona_data`` table.

    Raises:
        Exception: Any error raised while inspecting or altering the table is
            logged, the session is rolled back, and the error is re-raised.
    """
    if self._research_persona_cols_checked:
        return

    try:
        # Pick the strategy from the bound engine URL: PRAGMA for SQLite,
        # a probe query for everything else (PostgreSQL expected).
        db_url = str(session_db.bind.url) if session_db.bind else ""

        if 'sqlite' in db_url.lower():
            # SQLite: PRAGMA table_info returns one row per column,
            # with the column name at index 1.
            result = session_db.execute(text("PRAGMA table_info(persona_data)"))
            cols = {row[1] for row in result}

            if 'research_persona' not in cols:
                logger.info("Adding missing column research_persona to persona_data table")
                session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona JSON"))
                session_db.commit()

            if 'research_persona_generated_at' not in cols:
                logger.info("Adding missing column research_persona_generated_at to persona_data table")
                session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona_generated_at TIMESTAMP"))
                session_db.commit()
        else:
            # PostgreSQL: probe the columns; the query fails if either is missing.
            try:
                session_db.execute(text("SELECT research_persona, research_persona_generated_at FROM persona_data LIMIT 0"))
            except Exception:
                # The failed probe leaves the transaction in an aborted state
                # on PostgreSQL; every later statement is rejected until it is
                # rolled back, so clear it before running the ALTERs.
                session_db.rollback()
                logger.info("Adding missing columns research_persona and research_persona_generated_at to persona_data table")
                try:
                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona JSONB"))
                    session_db.execute(text("ALTER TABLE persona_data ADD COLUMN research_persona_generated_at TIMESTAMP"))
                    session_db.commit()
                except Exception as alter_err:
                    logger.error(f"Failed to add research_persona columns: {alter_err}")
                    session_db.rollback()
                    raise
    except Exception as e:
        logger.error(f"Error ensuring research_persona columns: {e}")
        session_db.rollback()
        raise
    finally:
        # Mark as checked even on failure so the migration is attempted only
        # once per instance.
        self._research_persona_cols_checked = True
def get_cached_only(
self,
@@ -46,9 +99,16 @@ class ResearchPersonaService:
Returns:
ResearchPersona if exists in database, None otherwise
"""
db = None
should_close = False
try:
db, should_close = self._get_session(user_id)
if not db:
logger.error(f"Could not get database session for user {user_id}")
return None
# Get persona data record
persona_data = self._get_persona_data_record(user_id)
persona_data = self._get_persona_data_record(user_id, db)
if not persona_data:
logger.debug(f"[get_cached_only] No persona data record found for user {user_id}")
@@ -110,6 +170,9 @@ class ResearchPersonaService:
except Exception as e:
logger.error(f"[get_cached_only] ❌ Error getting research persona for user {user_id}: {e}", exc_info=True)
return None
finally:
if should_close and db:
db.close()
def get_or_generate(
self,
@@ -126,9 +189,16 @@ class ResearchPersonaService:
Returns:
ResearchPersona if successful, None otherwise
"""
db = None
should_close = False
try:
db, should_close = self._get_session(user_id)
if not db:
logger.error(f"Could not get database session for get_or_generate (user {user_id})")
return None
# Get persona data record
persona_data = self._get_persona_data_record(user_id)
persona_data = self._get_persona_data_record(user_id, db)
if not persona_data:
logger.warning(f"No persona data found for user {user_id}, cannot generate research persona")
@@ -168,18 +238,14 @@ class ResearchPersonaService:
# 3. Parsing of existing persona failed
try:
logger.info(f"Generating research persona for user {user_id}")
research_persona = self.generate_research_persona(user_id)
research_persona = self.generate_research_persona(user_id, db)
except HTTPException:
# Re-raise HTTPExceptions (e.g., 429 subscription limit) so they propagate to API
raise
if research_persona:
# Save to database
if self.save_research_persona(user_id, research_persona):
logger.info(f"✅ Research persona generated and saved for user {user_id}")
else:
logger.warning(f"Failed to save research persona for user {user_id}")
# generate_research_persona saves it automatically now
logger.info(f"✅ Research persona generated and saved for user {user_id}")
return research_persona
else:
# Log detailed error for debugging expensive failures
@@ -196,22 +262,36 @@ class ResearchPersonaService:
except Exception as e:
logger.error(f"Error getting/generating research persona for user {user_id}: {e}")
return None
finally:
if should_close and db:
db.close()
def generate_research_persona(self, user_id: str) -> Optional[ResearchPersona]:
def generate_research_persona(self, user_id: str, db=None) -> Optional[ResearchPersona]:
"""
Generate a new research persona for the user.
Args:
user_id: User ID (Clerk string)
db: Optional database session
Returns:
ResearchPersona if successful, None otherwise
"""
session_db = None
should_close = False
try:
session_db = db
if not session_db:
session_db, should_close = self._get_session(user_id)
if not session_db:
logger.error(f"Could not get database session for generate_research_persona (user {user_id})")
return None
logger.info(f"Generating research persona for user {user_id}")
# Collect onboarding data
onboarding_data = self._collect_onboarding_data(user_id)
onboarding_data = self._collect_onboarding_data(user_id, session_db)
if not onboarding_data:
logger.warning(f"Insufficient onboarding data for user {user_id}")
@@ -275,6 +355,12 @@ class ResearchPersonaService:
try:
research_persona = ResearchPersona(**persona_dict)
logger.info(f"✅ Research persona generated successfully for user {user_id}")
# Save the generated persona
save_success = self.save_research_persona(user_id, research_persona, session_db)
if not save_success:
logger.warning(f"Failed to save generated persona for user {user_id}")
return research_persona
except Exception as validation_error:
logger.error(f"Failed to validate ResearchPersona from dict: {validation_error}")
@@ -297,6 +383,9 @@ class ResearchPersonaService:
except Exception as e:
logger.error(f"Error generating research persona for user {user_id}: {e}")
return None
finally:
if should_close and session_db:
session_db.close()
def is_cache_valid(self, persona_data: PersonaData) -> bool:
"""
@@ -323,7 +412,8 @@ class ResearchPersonaService:
def save_research_persona(
self,
user_id: str,
research_persona: ResearchPersona
research_persona: ResearchPersona,
db=None
) -> bool:
"""
Save research persona to database.
@@ -331,12 +421,23 @@ class ResearchPersonaService:
Args:
user_id: User ID (Clerk string)
research_persona: ResearchPersona to save
db: Optional database session
Returns:
True if successful, False otherwise
"""
session_db = None
should_close = False
try:
persona_data = self._get_persona_data_record(user_id)
session_db = db
if not session_db:
session_db, should_close = self._get_session(user_id)
if not session_db:
logger.error(f"Could not get database session for save_research_persona (user {user_id})")
return False
persona_data = self._get_persona_data_record(user_id, session_db)
if not persona_data:
logger.error(f"No persona data record found for user {user_id}")
@@ -349,24 +450,33 @@ class ResearchPersonaService:
persona_data.research_persona = persona_dict
persona_data.research_persona_generated_at = datetime.utcnow()
self.db.commit()
session_db.commit()
logger.info(f"✅ Research persona saved for user {user_id}")
return True
except Exception as e:
logger.error(f"Error saving research persona for user {user_id}: {e}")
self.db.rollback()
if session_db:
session_db.rollback()
return False
finally:
if should_close and session_db:
session_db.close()
def _get_persona_data_record(self, user_id: str) -> Optional[PersonaData]:
def _get_persona_data_record(self, user_id: str, db=None) -> Optional[PersonaData]:
"""Get PersonaData database record for user."""
try:
session_db = db or self.db
if not session_db:
logger.error(f"No database session provided for _get_persona_data_record (user {user_id})")
return None
# Ensure research_persona columns exist before querying
self.onboarding_service._ensure_research_persona_columns(self.db)
self._ensure_research_persona_columns(session_db)
# Get onboarding session
session = self.db.query(OnboardingSession).filter(
session = session_db.query(OnboardingSession).filter(
OnboardingSession.user_id == user_id
).first()
@@ -374,7 +484,7 @@ class ResearchPersonaService:
return None
# Get persona data
persona_data = self.db.query(PersonaData).filter(
persona_data = session_db.query(PersonaData).filter(
PersonaData.session_id == session.id
).first()
@@ -384,7 +494,7 @@ class ResearchPersonaService:
logger.error(f"Error getting persona data record for user {user_id}: {e}")
return None
def _collect_onboarding_data(self, user_id: str) -> Optional[Dict[str, Any]]:
def _collect_onboarding_data(self, user_id: str, db=None) -> Optional[Dict[str, Any]]:
"""
Collect all onboarding data needed for research persona generation.
@@ -392,40 +502,44 @@ class ResearchPersonaService:
Dictionary with website_analysis, persona_data, research_preferences, business_info
"""
try:
# Get website analysis
website_analysis = self.onboarding_service.get_website_analysis(user_id, self.db) or {}
session_db = db or self.db
if not session_db:
logger.error(f"No database session provided for _collect_onboarding_data (user {user_id})")
return None
# Get integrated data via SSOT
integrated_data = self.integration_service.get_integrated_data_sync(user_id, session_db)
# Get persona data
persona_data_dict = self.onboarding_service.get_persona_data(user_id, self.db) or {}
if not integrated_data:
logger.warning(f"No integrated data found for user {user_id}")
return None
website_analysis = integrated_data.get('website_analysis', {})
persona_data_dict = integrated_data.get('persona_data', {})
research_prefs = integrated_data.get('research_preferences', {})
canonical_profile = integrated_data.get('canonical_profile', {})
# Get research preferences
research_prefs = self.onboarding_service.get_research_preferences(user_id, self.db) or {}
# Get business info - construct from persona data and website analysis
business_info = {}
canonical_business = canonical_profile.get('business_info')
if isinstance(canonical_business, dict):
business_info.update(canonical_business)
# Use canonical profile data (SSOT) instead of manual logic if possible
# The canonical profile already handles logic for industry/target_audience from various sources
if not business_info.get('industry') and canonical_profile.get('industry'):
business_info['industry'] = canonical_profile.get('industry')
# Try to extract from persona data
if persona_data_dict:
core_persona = persona_data_dict.get('corePersona') or persona_data_dict.get('core_persona')
if core_persona:
if core_persona.get('industry'):
business_info['industry'] = core_persona['industry']
if core_persona.get('target_audience'):
business_info['target_audience'] = core_persona['target_audience']
if not business_info.get('target_audience') and canonical_profile.get('target_audience'):
business_info['target_audience'] = canonical_profile.get('target_audience')
# Fallback to website analysis if not in persona
# Fallback logic if canonical profile is missing these (though it should have them)
if not business_info.get('industry') and website_analysis:
target_audience_data = website_analysis.get('target_audience', {})
if isinstance(target_audience_data, dict):
industry_focus = target_audience_data.get('industry_focus')
if industry_focus:
business_info['industry'] = industry_focus
demographics = target_audience_data.get('demographics')
if demographics:
business_info['target_audience'] = demographics if isinstance(demographics, str) else str(demographics)
# Check if we have enough data - be more lenient since we can infer from minimal data
# We need at least some basic information to generate a meaningful persona
has_basic_data = bool(
website_analysis or
persona_data_dict or
@@ -457,20 +571,17 @@ class ResearchPersonaService:
business_info['inferred'] = True
# Get competitor analysis data (if available)
competitor_analysis = None
try:
competitor_analysis = self.onboarding_service.get_competitor_analysis(user_id, self.db)
if competitor_analysis:
logger.info(f"Found {len(competitor_analysis)} competitors for research persona generation")
except Exception as e:
logger.debug(f"Could not retrieve competitor analysis for persona generation: {e}")
# Use SSOT (Integrated data contains competitor info)
competitor_analysis = integrated_data.get('competitor_analysis')
if not competitor_analysis:
competitor_analysis = []
return {
"website_analysis": website_analysis,
"persona_data": persona_data_dict,
"research_preferences": research_prefs,
"business_info": business_info,
"competitor_analysis": competitor_analysis # Add competitor data for better preset generation
"competitor_analysis": competitor_analysis
}
except Exception as e:

View File

@@ -258,6 +258,112 @@ class TavilyService:
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
return results
async def crawl(
    self,
    url: str,
    limit: int = 50,
    max_depth: int = 1,
    max_breadth: int = 20,
    extract_depth: str = "basic",
    include_favicon: bool = False,
    instructions: str = "",
    allow_external: bool = True
) -> Dict[str, Any]:
    """
    Crawl a website using Tavily API.

    Args:
        url: The root URL to begin the crawl
        limit: Total number of links the crawler will process
        max_depth: Max depth of the crawl
        max_breadth: Max number of links to follow per level
        extract_depth: 'basic' or 'advanced'
        include_favicon: Whether to include favicon
        instructions: Natural language instructions for the crawler
        allow_external: Whether to return external links

    Returns:
        On HTTP 200: ``{"success": True, "results": [...], "timestamp": ...}``.
        Otherwise ``{"success": False, "error": ..., "details": ...}``. Every
        exception, including the "not enabled" ValueError raised below, is
        caught and reported through the failure dict rather than propagated.
    """
    try:
        self._try_initialize()
        if not self.enabled:
            raise ValueError("Tavily Service is not enabled - API key missing")

        logger.info(f"Starting Tavily crawl for: {url}")

        request_payload = {
            "api_key": self.api_key,
            "url": url,
            "limit": limit,
            "max_depth": max_depth,
            "max_breadth": max_breadth,
            "extract_depth": extract_depth,
            "include_favicon": include_favicon,
            "instructions": instructions,
            "allow_external": allow_external
        }

        # NOTE(review): the "/crawl" path, the body-level "api_key" and the
        # "results" response key were written from the Tavily crawl tool's
        # parameter list, not from the REST reference - confirm against the
        # official API documentation.
        endpoint = f"{self.base_url}/crawl"

        async with aiohttp.ClientSession() as session:
            async with session.post(
                endpoint,
                json=request_payload,
                headers={"Content-Type": "application/json"},
                timeout=aiohttp.ClientTimeout(total=300)  # Crawling takes longer
            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"Tavily Crawl API error: {response.status} - {error_text}")
                    return {
                        "success": False,
                        "error": f"Tavily API error: {response.status}",
                        "details": error_text
                    }

                result = await response.json()
                logger.info("Tavily crawl completed successfully.")
                return {
                    "success": True,
                    "results": result.get("results", []),
                    "timestamp": datetime.utcnow().isoformat()
                }

    except Exception as e:
        logger.error(f"Error in Tavily crawl: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "details": "An unexpected error occurred during crawl"
        }
async def search_industry_trends(
self,