Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts
This commit is contained in:
221
backend/services/seo/advertools_service.py
Normal file
221
backend/services/seo/advertools_service.py
Normal file
@@ -0,0 +1,221 @@
|
||||
import advertools as adv
|
||||
import pandas as pd
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
class AdvertoolsService:
    """
    Centralized service for leveraging the Advertools library for deep SEO intelligence.

    Provides functions for sitemap analysis, content auditing, and link extraction.
    Every public method is a coroutine that runs the blocking advertools call in the
    event loop's default executor and reports failures as
    ``{"success": False, "error": <message>}`` instead of raising.
    """

    # Hosts counted as social profiles; compared after lower-casing and
    # stripping a leading "www." from each link's netloc.
    _SOCIAL_DOMAINS = (
        'twitter.com', 'x.com', 'linkedin.com', 'facebook.com',
        'instagram.com', 'youtube.com', 'github.com',
    )

    def __init__(self):
        self.logger = logger.bind(service="AdvertoolsService")

    @staticmethod
    def _extract_pillar(url: str) -> str:
        """
        Map a URL to its content pillar: the first two path segments joined by "/".

        Returns "home" for a URL with an empty path and "other" when the value
        cannot be parsed as a URL (for example a NaN cell in the sitemap frame).
        """
        # Function-scope import: the module's import block does not provide
        # urlparse, and the previous bare `except:` hid the resulting NameError,
        # which classified every URL as "other".
        from urllib.parse import urlparse

        try:
            parts = urlparse(url).path.strip('/').split('/')
        except (AttributeError, TypeError, ValueError):
            return "other"
        if not parts or not parts[0]:
            return "home"
        return "/".join(parts[:2])  # Capture top 2 segments

    async def _crawl_to_dataframe(self, url_list: List[str], page_limit: int) -> Optional[pd.DataFrame]:
        """
        Crawl ``url_list`` without following links and load the output as a DataFrame.

        A unique temporary ``.jsonl`` file receives the crawl output and is removed
        before returning. Returns None when the crawl wrote nothing.
        """
        temp_file = None
        try:
            # Only the unique path is needed; advertools writes to it itself.
            with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
                temp_file = tf.name

            # advertools crawl is blocking
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, lambda: adv.crawl(
                url_list=url_list,
                output_file=temp_file,
                follow_links=False,
                custom_settings={
                    'LOG_LEVEL': 'WARNING',
                    'CLOSESPIDER_PAGECOUNT': page_limit,  # Guardrail: max pages
                    'DOWNLOAD_TIMEOUT': 30  # Guardrail: 30s timeout per page
                }
            ))

            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
                return None
            return pd.read_json(temp_file, lines=True)
        finally:
            if temp_file and os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except OSError as e:
                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")

    async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
        """
        Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.

        Returns a dict with ``success``, ``timestamp`` and a ``metrics`` dict holding
        ``total_urls``, ``publishing_velocity`` (URLs modified in the last 30 days
        divided by 4), ``stale_content_count`` / ``stale_content_percentage``
        (URLs not modified for 180 days), ``top_pillars`` (15 most common two-segment
        path prefixes) and ``audit_sample_urls`` (up to 15 URLs, most recent first
        when ``lastmod`` is usable).
        """
        try:
            self.logger.info(f"Analyzing sitemap: {sitemap_url}")

            # advertools sitemap_to_df is blocking, run in executor
            loop = asyncio.get_running_loop()
            df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))

            if df is None or df.empty:
                return {"success": False, "error": "Sitemap is empty or could not be parsed."}

            # Convert lastmod to datetime; unparseable values become NaT.
            has_lastmod = False
            if 'lastmod' in df.columns:
                df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
                has_lastmod = not df['lastmod'].isna().all()

            total_urls = len(df)

            if has_lastmod:
                now = datetime.now(df['lastmod'].dt.tz)
                thirty_days_ago = now - timedelta(days=30)
                six_months_ago = now - timedelta(days=180)

                recent_urls = df[df['lastmod'] > thirty_days_ago]
                stale_urls = df[df['lastmod'] < six_months_ago]

                # The 30-day window is approximated as 4 weeks.
                publishing_velocity = len(recent_urls) / 4.0  # URLs per week
                stale_count = len(stale_urls)
            else:
                publishing_velocity = 0
                stale_count = 0

            # Content pillars: most common top-2 folder patterns.
            df['pillar'] = df['loc'].apply(self._extract_pillar)
            pillars = df['pillar'].value_counts().head(15).to_dict()

            # Return a sample of URLs for auditing (top 15 most recent if available)
            if has_lastmod:
                audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
            else:
                audit_urls = df['loc'].head(15).tolist()

            return {
                "success": True,
                "metrics": {
                    "total_urls": total_urls,
                    "publishing_velocity": round(publishing_velocity, 2),
                    "stale_content_count": stale_count,
                    "stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
                    "top_pillars": pillars,
                    "audit_sample_urls": audit_urls
                },
                "timestamp": datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
            return {"success": False, "error": str(e)}

    async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Performs a shallow crawl and theme analysis using word frequency.

        Crawls at most 15 pages, concatenates the ``body_text``/``h1``/``h2``/``title``
        columns that exist, and returns the 20 top word-frequency rows as ``themes``
        together with ``page_count`` and ``avg_word_count``.
        Uses unique temporary files for thread safety.
        """
        try:
            self.logger.info(f"Auditing content for {len(url_list)} URLs")

            crawl_df = await self._crawl_to_dataframe(url_list, page_limit=15)
            if crawl_df is None:
                return {"success": False, "error": "Crawl failed to generate output or output is empty."}

            # Extract themes using word frequency
            text_columns = [col for col in ['body_text', 'h1', 'h2', 'title'] if col in crawl_df.columns]
            if not text_columns:
                return {"success": False, "error": "No text content found to analyze."}

            # astype(str) keeps the join from failing on non-string cells.
            all_text = " ".join(crawl_df[text_columns].fillna("").astype(str).values.flatten())
            if not all_text.strip():
                return {"success": False, "error": "Extracted text is empty."}

            loop = asyncio.get_running_loop()
            word_freq = await loop.run_in_executor(None, lambda: adv.word_frequency([all_text], rm_stopwords=True))
            top_themes = word_freq.head(20).to_dict(orient='records')

            # Additional metric: average whitespace-delimited word count per page
            avg_word_count = 0
            if 'body_text' in crawl_df.columns:
                crawl_df['word_count'] = crawl_df['body_text'].fillna("").astype(str).str.split().str.len()
                avg_word_count = crawl_df['word_count'].mean()

            return {
                "success": True,
                "themes": top_themes,
                "page_count": len(crawl_df),
                "avg_word_count": round(avg_word_count, 1),
                "timestamp": datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to audit content: {str(e)}")
            return {"success": False, "error": str(e)}

    async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Analyzes linking patterns and social media presence using unique temporary files.

        Crawls at most 10 pages, collects every link from the ``links_url`` column
        (either an "@@"-joined string or a list) and returns the unique social
        profile URLs plus total-link and unique-domain counts.
        """
        try:
            self.logger.info(f"Extracting communication style for {len(url_list)} URLs")

            crawl_df = await self._crawl_to_dataframe(url_list, page_limit=10)
            if crawl_df is None:
                return {"success": False, "error": "Link extraction crawl failed."}

            # Extract social links and internal/external stats
            all_links = []
            if 'links_url' in crawl_df.columns:
                for links in crawl_df['links_url'].dropna():
                    if isinstance(links, str):
                        all_links.extend(links.split("@@"))
                    elif isinstance(links, list):
                        all_links.extend(links)

            if not all_links:
                return {
                    "success": True,
                    "social_links": [],
                    "link_stats": {"total_links_found": 0, "unique_domains": 0},
                    "timestamp": datetime.utcnow().isoformat()
                }

            # Analyze links
            link_df = adv.url_to_df(all_links)

            social_links = []
            unique_domains = 0
            if not link_df.empty and 'netloc' in link_df.columns:
                # Normalise hosts so "www.linkedin.com" matches "linkedin.com".
                hosts = (
                    link_df['netloc'].fillna('').astype(str).str.lower()
                    .str.replace(r'^www\.', '', regex=True)
                )
                is_social = hosts.isin(list(self._SOCIAL_DOMAINS))
                social_links = link_df.loc[is_social, 'url'].unique().tolist()
                unique_domains = link_df['netloc'].nunique()

            return {
                "success": True,
                "social_links": social_links,
                "link_stats": {
                    "total_links_found": len(all_links),
                    "unique_domains": unique_domains
                },
                "timestamp": datetime.utcnow().isoformat()
            }
        except Exception as e:
            self.logger.error(f"Failed to extract communication style: {str(e)}")
            return {"success": False, "error": str(e)}
|
||||
94
backend/services/seo/advertools_task_manager.py
Normal file
94
backend/services/seo/advertools_task_manager.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""
|
||||
Advertools Task Restoration Utility
|
||||
Handles creation and restoration of Advertools intelligence tasks for users.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any
|
||||
from loguru import logger
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models.onboarding import WebsiteAnalysis, OnboardingSession
|
||||
from models.advertools_monitoring_models import AdvertoolsTask
|
||||
from services.database import get_all_user_ids, get_session_for_user
|
||||
|
||||
async def restore_advertools_tasks(scheduler: Any) -> int:
    """
    Restore/create Advertools tasks for all users who have completed Step 2.

    For every user that has an onboarding session and a WebsiteAnalysis with a
    website URL, makes sure one weekly ``content_audit`` task and one weekly
    ``site_health`` task exist, creating whichever is missing. Errors for a
    single user are logged and do not stop processing of the remaining users.

    Args:
        scheduler: Accepted for interface compatibility; not used by this function.

    Returns:
        Number of tasks created/restored (counted only after a successful commit)
    """
    logger.info("Restoring Advertools intelligence tasks...")
    total_created = 0

    # (payload type, days until the first execution, label used in the log line)
    task_specs = (
        ("content_audit", 1, "content audit"),  # Start tomorrow
        ("site_health", 2, "site health"),      # Start in 2 days
    )

    user_ids = get_all_user_ids()
    for user_id in user_ids:
        try:
            db = get_session_for_user(user_id)
            if not db:
                continue

            try:
                # Check if user has completed Step 2 (has WebsiteAnalysis)
                session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
                if not session:
                    continue

                analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
                if not analysis or not analysis.website_url:
                    continue

                created_for_user = 0
                for task_type, start_in_days, label in task_specs:
                    # Check for an existing Advertools task of this type
                    existing = db.query(AdvertoolsTask).filter(
                        AdvertoolsTask.user_id == user_id,
                        func.json_extract(AdvertoolsTask.payload, '$.type') == task_type
                    ).first()
                    if existing:
                        continue

                    db.add(AdvertoolsTask(
                        user_id=user_id,
                        website_url=analysis.website_url,
                        status='active',
                        next_execution=datetime.utcnow() + timedelta(days=start_in_days),
                        frequency_days=7,
                        payload={
                            "type": task_type,
                            "website_url": analysis.website_url
                        }
                    ))
                    created_for_user += 1
                    logger.info(f"Created weekly {label} task for user {user_id}")

                db.commit()
                # Count only after the commit succeeded, so a failed commit
                # does not inflate the returned total.
                total_created += created_for_user
            finally:
                db.close()
        except Exception as e:
            logger.error(f"Error restoring Advertools tasks for user {user_id}: {e}")

    return total_created
|
||||
@@ -12,8 +12,7 @@ from sqlalchemy.orm import Session
|
||||
from loguru import logger
|
||||
|
||||
from utils.logger_utils import get_service_logger
|
||||
from services.onboarding.data_service import OnboardingDataService
|
||||
from services.calendar_generation_datasource_framework.data_processing.comprehensive_user_data import ComprehensiveUserDataProcessor
|
||||
from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService
|
||||
|
||||
logger = get_service_logger("competitive_analyzer")
|
||||
|
||||
@@ -23,8 +22,7 @@ class CompetitiveAnalyzer:
|
||||
def __init__(self, db: Session):
|
||||
"""Initialize the competitive analyzer."""
|
||||
self.db = db
|
||||
self.user_data_service = OnboardingDataService(db)
|
||||
self.comprehensive_processor = ComprehensiveUserDataProcessor(db)
|
||||
self.integration_service = OnboardingDataIntegrationService()
|
||||
|
||||
async def get_competitive_insights(self, user_id: str) -> Dict[str, Any]:
|
||||
"""
|
||||
@@ -37,8 +35,9 @@ class CompetitiveAnalyzer:
|
||||
Dictionary containing competitive insights
|
||||
"""
|
||||
try:
|
||||
# Get user's research preferences and competitor data
|
||||
research_prefs = self.user_data_service.get_user_research_preferences(user_id)
|
||||
# Get user's research preferences and competitor data via SSOT
|
||||
onboarding_data = await self.integration_service.process_onboarding_data(user_id, self.db)
|
||||
research_prefs = onboarding_data.get('research_preferences', {})
|
||||
competitors = research_prefs.get('competitors', []) if research_prefs else []
|
||||
|
||||
if not competitors:
|
||||
@@ -51,9 +50,8 @@ class CompetitiveAnalyzer:
|
||||
"last_updated": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Get comprehensive user data including competitor analysis
|
||||
comprehensive_data = self.comprehensive_processor.get_comprehensive_user_data(user_id)
|
||||
competitor_analysis = comprehensive_data.get('competitor_analysis', {})
|
||||
# Get competitor analysis directly from SSOT data
|
||||
competitor_analysis = onboarding_data.get('competitor_analysis', {})
|
||||
|
||||
# Extract competitor keywords and content topics
|
||||
competitor_keywords = self._extract_competitor_keywords(competitor_analysis, competitors)
|
||||
@@ -300,6 +298,7 @@ class CompetitiveAnalyzer:
|
||||
else:
|
||||
keyword_map[keyword] = {
|
||||
'keyword': kw['keyword'],
|
||||
'competitor': kw['competitor'], # Primary competitor
|
||||
'competitors': [kw['competitor']],
|
||||
'source': kw['source'],
|
||||
'volume_estimate': kw['volume_estimate'],
|
||||
|
||||
@@ -9,6 +9,7 @@ OAuth connections from onboarding step 5.
|
||||
from typing import Dict, Any, Optional, List
|
||||
from datetime import datetime, timedelta
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func
|
||||
from loguru import logger
|
||||
|
||||
from utils.logger_utils import get_service_logger
|
||||
@@ -16,9 +17,12 @@ from services.gsc_service import GSCService
|
||||
from services.integrations.bing_oauth import BingOAuthService
|
||||
from services.bing_analytics_storage_service import BingAnalyticsStorageService
|
||||
from services.analytics_cache_service import AnalyticsCacheService
|
||||
from services.onboarding.data_service import OnboardingDataService
|
||||
from api.content_planning.services.content_strategy.onboarding.data_integration import OnboardingDataIntegrationService
|
||||
from .analytics_aggregator import AnalyticsAggregator
|
||||
from .competitive_analyzer import CompetitiveAnalyzer
|
||||
from models.onboarding import SEOPageAudit, WebsiteAnalysis, OnboardingSession
|
||||
from models.website_analysis_monitoring_models import OnboardingFullWebsiteAnalysisTask
|
||||
from models.advertools_monitoring_models import AdvertoolsTask
|
||||
|
||||
logger = get_service_logger("seo_dashboard")
|
||||
|
||||
@@ -30,12 +34,19 @@ class SEODashboardService:
|
||||
self.db = db
|
||||
self.gsc_service = GSCService()
|
||||
self.bing_oauth = BingOAuthService()
|
||||
self.bing_storage = BingAnalyticsStorageService("sqlite:///alwrity.db")
|
||||
# Bing storage is initialized per-user dynamically
|
||||
self.analytics_cache = AnalyticsCacheService()
|
||||
self.user_data_service = OnboardingDataService(db)
|
||||
self.integration_service = OnboardingDataIntegrationService()
|
||||
self.analytics_aggregator = AnalyticsAggregator()
|
||||
self.competitive_analyzer = CompetitiveAnalyzer(db)
|
||||
|
||||
def _get_bing_storage(self, user_id: str) -> BingAnalyticsStorageService:
    """Return a Bing analytics storage service pointed at the given user's SQLite database."""
    # Function-scope import of the per-user database path resolver.
    from services.database import get_user_db_path

    user_db_url = "sqlite:///{}".format(get_user_db_path(user_id))
    return BingAnalyticsStorageService(user_db_url)
|
||||
|
||||
async def get_platform_status(self, user_id: str) -> Dict[str, Any]:
|
||||
"""Get connection status for GSC and Bing platforms."""
|
||||
try:
|
||||
@@ -81,8 +92,10 @@ class SEODashboardService:
|
||||
try:
|
||||
# Get user's website URL if not provided
|
||||
if not site_url:
|
||||
# Try to get from website analysis first
|
||||
website_analysis = self.user_data_service.get_user_website_analysis(int(user_id))
|
||||
# Use SSOT for onboarding data
|
||||
onboarding_data = await self.integration_service.process_onboarding_data(user_id, self.db)
|
||||
website_analysis = onboarding_data.get('website_analysis', {})
|
||||
|
||||
if website_analysis and website_analysis.get('website_url'):
|
||||
site_url = website_analysis['website_url']
|
||||
else:
|
||||
@@ -115,6 +128,10 @@ class SEODashboardService:
|
||||
|
||||
# Generate AI insights
|
||||
ai_insights = await self._generate_ai_insights(summary, timeseries, competitor_insights)
|
||||
|
||||
technical_seo_audit = self._get_technical_seo_audit_overview(user_id, site_url)
|
||||
|
||||
advertools_insights = self._get_advertools_insights(user_id, site_url)
|
||||
|
||||
return {
|
||||
"website_url": site_url,
|
||||
@@ -124,12 +141,71 @@ class SEODashboardService:
|
||||
"competitor_insights": competitor_insights,
|
||||
"health_score": health_score,
|
||||
"ai_insights": ai_insights,
|
||||
"technical_seo_audit": technical_seo_audit,
|
||||
"advertools_insights": advertools_insights,
|
||||
"last_updated": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting dashboard overview for user {user_id}: {e}")
|
||||
raise
|
||||
|
||||
def _get_technical_seo_audit_overview(self, user_id: str, site_url: str) -> Dict[str, Any]:
    """
    Summarise stored per-page SEO audits and the full-site analysis task for a user.

    Audits are filtered by user and, when a site URL is given, by URL prefix;
    they are ordered by ascending score with missing scores last. The result
    reports the page count, rounded average of integer scores, number of pages
    with status 'fix_scheduled', the ten lowest-scoring pages, and the status and
    next execution of the most recently updated analysis task. On any failure
    a dict with status "error" and zeroed counters is returned.
    """
    site_key = (site_url or "").rstrip("/")

    try:
        audit_query = self.db.query(SEOPageAudit).filter(SEOPageAudit.user_id == str(user_id))
        if site_key:
            audit_query = audit_query.filter(SEOPageAudit.website_url.like(f"{site_key}%"))

        # Null scores are coalesced to 1000 so they sort after real scores.
        audits = audit_query.order_by(func.coalesce(SEOPageAudit.overall_score, 1000).asc()).all()
        pages_audited = len(audits)

        int_scores = [audit.overall_score for audit in audits if isinstance(audit.overall_score, int)]
        if int_scores:
            avg_score = round(sum(int_scores) / len(int_scores))
        else:
            avg_score = 0

        fix_scheduled_pages = sum(1 for audit in audits if audit.status == 'fix_scheduled')

        worst_pages = []
        for audit in audits[:10]:
            issues = audit.issues
            worst_pages.append({
                "page_url": audit.page_url,
                "overall_score": audit.overall_score,
                "status": audit.status,
                "issues_count": len(issues) if isinstance(issues, list) else 0
            })

        latest_task = (
            self.db.query(OnboardingFullWebsiteAnalysisTask)
            .filter(
                OnboardingFullWebsiteAnalysisTask.user_id == str(user_id),
                OnboardingFullWebsiteAnalysisTask.website_url.like(f"{site_key}%")
            )
            .order_by(OnboardingFullWebsiteAnalysisTask.updated_at.desc())
            .first()
        )

        task_status = None
        next_execution = None
        if latest_task:
            task_status = latest_task.status
            if latest_task.next_execution:
                next_execution = latest_task.next_execution.isoformat()

        if pages_audited > 0:
            overall_status = "ready"
        elif task_status == "active":
            overall_status = "scheduled"
        else:
            overall_status = "pending"

        return {
            "status": overall_status,
            "task_status": task_status,
            "next_execution": next_execution,
            "pages_audited": pages_audited,
            "avg_score": avg_score,
            "fix_scheduled_pages": fix_scheduled_pages,
            "worst_pages": worst_pages
        }
    except Exception as e:
        logger.warning(f"Failed to build technical SEO audit overview for user {user_id}: {e}")
        return {
            "status": "error",
            "error": str(e),
            "pages_audited": 0,
            "avg_score": 0,
            "fix_scheduled_pages": 0,
            "worst_pages": []
        }
|
||||
|
||||
async def get_gsc_data(self, user_id: str, site_url: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""Get GSC data for the specified site."""
|
||||
@@ -181,13 +257,15 @@ class SEODashboardService:
|
||||
|
||||
# Get data from Bing storage service
|
||||
if site_url:
|
||||
bing_data = self.bing_storage.get_analytics_summary(user_id, site_url, days=30)
|
||||
bing_storage = self._get_bing_storage(user_id)
|
||||
bing_data = bing_storage.get_analytics_summary(user_id, site_url, days=30)
|
||||
else:
|
||||
# Get all sites for user
|
||||
sites = self._get_bing_sites(user_id)
|
||||
if sites:
|
||||
logger.info(f"Using first Bing site for analysis: {sites[0]}")
|
||||
bing_data = self.bing_storage.get_analytics_summary(user_id, sites[0], days=30)
|
||||
bing_storage = self._get_bing_storage(user_id)
|
||||
bing_data = bing_storage.get_analytics_summary(user_id, sites[0], days=30)
|
||||
else:
|
||||
logger.warning(f"No Bing sites found for user {user_id}")
|
||||
return {"error": "No Bing sites found", "data": [], "status": "disconnected"}
|
||||
@@ -249,6 +327,46 @@ class SEODashboardService:
|
||||
"last_updated": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
    """
    Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks.

    Returns augmented themes and the last audit marker from the stored brand
    analysis, site health and the last health-check marker from the stored SEO
    audit, and the status of the user's content-audit and site-health tasks
    ("pending" when no such task exists). Returns an empty dict when the user
    has no onboarding session or when anything fails. ``site_url`` is not used.
    """
    try:
        # 1. Locate the user's onboarding session and its website analysis
        onboarding = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
        if not onboarding:
            return {}

        analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == onboarding.id).first()

        # 2. Read the status of each Advertools task; the last matching task wins
        audit_status = "pending"
        health_status = "pending"
        for task in self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all():
            task_type = task.payload.get('type') if task.payload else None
            if task_type == 'content_audit':
                audit_status = task.status
            elif task_type == 'site_health':
                health_status = task.status

        if analysis:
            brand_analysis = analysis.brand_analysis or {}
            seo_audit = analysis.seo_audit or {}
        else:
            brand_analysis = {}
            seo_audit = {}

        task_summary = {
            "content_audit": audit_status,
            "site_health": health_status
        }
        return {
            "augmented_themes": brand_analysis.get('augmented_themes', []),
            "last_audit": brand_analysis.get('last_advertools_audit'),
            "site_health": seo_audit.get('site_health', {}),
            "last_health_check": seo_audit.get('last_advertools_health_check'),
            "tasks": task_summary
        }
    except Exception as e:
        logger.warning(f"Failed to fetch Advertools insights for user {user_id}: {e}")
        return {}
|
||||
|
||||
def _get_gsc_sites(self, user_id: str) -> List[str]:
|
||||
"""Get GSC sites for user."""
|
||||
try:
|
||||
@@ -394,4 +512,4 @@ class SEODashboardService:
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating AI insights: {e}")
|
||||
return []
|
||||
return []
|
||||
|
||||
603
backend/services/seo/deep_competitor_analysis_service.py
Normal file
603
backend/services/seo/deep_competitor_analysis_service.py
Normal file
@@ -0,0 +1,603 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from services.component_logic.web_crawler_logic import WebCrawlerLogic
|
||||
from services.llm_providers.main_text_generation import llm_text_gen
|
||||
from services.ai_service_manager import AIServiceManager, AIServiceType
|
||||
from services.seo_tools.sitemap_service import SitemapService
|
||||
from services.seo.advertools_service import AdvertoolsService
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
logger = get_service_logger("deep_competitor_analysis")
|
||||
|
||||
|
||||
class DeepCompetitorAnalysisService:
|
||||
def __init__(self):
    """Create the collaborators this service delegates to."""
    # Page crawler used for competitor crawling.
    self.crawler = WebCrawlerLogic()
    # Advertools wrapper used for sitemap metrics and content audits.
    self.advertools = AdvertoolsService()
|
||||
|
||||
async def run(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]],
    max_competitors: int = 25,
    crawl_concurrency: int = 4
) -> Dict[str, Any]:
    """
    Run the deep competitor analysis pipeline end to end.

    Builds the user's baseline, normalizes the competitor list (capped at
    ``max_competitors``), crawls the competitors with ``crawl_concurrency``,
    analyses each crawled competitor with AI one after another, and finally
    asks the AI for an aggregation across all of them.

    Returns a dict with ``baseline``, ``competitors`` (one entry per crawl
    result holding ``input``, ``extraction`` and ``ai_analysis``),
    ``aggregation`` and ``metadata``.
    """
    baseline = self._build_baseline(website_analysis)
    targets = self._normalize_competitors(competitors, max_competitors=max_competitors)

    crawled = await self._crawl_competitors(targets, crawl_concurrency=crawl_concurrency)

    analysed: List[Dict[str, Any]] = []
    for competitor_input, crawl_result in crawled:
        extraction = self._build_extraction_artifact(competitor_input, crawl_result)
        ai_analysis = await self._analyze_competitor_with_ai(
            user_id=user_id,
            baseline=baseline,
            competitor_input=competitor_input,
            extraction=extraction
        )
        entry = {
            "input": competitor_input,
            "extraction": extraction,
            "ai_analysis": ai_analysis
        }
        analysed.append(entry)

    aggregation = await self._aggregate_with_ai(
        user_id=user_id,
        baseline=baseline,
        competitors=analysed
    )

    metadata = {
        "generated_at": datetime.utcnow().isoformat(),
        "competitors_requested": len(targets),
        "competitors_analyzed": len(analysed),
        "crawl_concurrency": crawl_concurrency
    }
    return {
        "baseline": baseline,
        "competitors": analysed,
        "aggregation": aggregation,
        "metadata": metadata
    }
|
||||
|
||||
async def generate_weekly_strategy_brief(
    self,
    *,
    user_id: str,
    website_analysis: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Generates a weekly strategic intelligence brief by analyzing
    recent competitor changes and market shifts.

    For up to 10 normalized competitors the sitemap is discovered and analysed
    (Advertools metrics plus SitemapService structure analysis); URLs modified
    in the last 7 days are compared against the last 90 days to derive a cadence
    shift, and up to 5 recent URLs are content-audited for themes. Competitors
    whose analysis fails or yields no URLs are skipped with a warning. The
    collected data is then sent to the AI service for strategic insights and
    content-gap recommendations.

    Returns:
        A report dict with ``week_commencing``, ``generated_at``, ``metrics``,
        ``insights`` (``the_big_move``, ``low_hanging_fruit``, ``threat_alerts``)
        and ``raw_data``.
    """
    sitemap_service = SitemapService()
    ai_manager = AIServiceManager()

    # Stage 1: Data Collection (User + Competitors)
    baseline = self._build_baseline(website_analysis)
    normalized_competitors = self._normalize_competitors(competitors, max_competitors=10)

    # Naive UTC thresholds; _is_newer_than compares against naive datetimes.
    now = datetime.utcnow()
    seven_days_ago = now - timedelta(days=7)
    ninety_days_ago = now - timedelta(days=90)

    # Fetch competitor sitemaps for recent changes
    competitor_changes: List[Dict[str, Any]] = []
    for comp in normalized_competitors:
        try:
            # Discover exact sitemap URL first (essential for Advertools)
            discovered_sitemap = await sitemap_service.discover_sitemap_url(comp['url'])
            effective_url = discovered_sitemap if discovered_sitemap else comp['url']

            adv_result = await self.advertools.analyze_sitemap(effective_url)

            # REUSE: Use existing SitemapService.analyze_sitemap for robust Stage 1 & 2
            analysis_result = await sitemap_service.analyze_sitemap(
                sitemap_url=effective_url,
                analyze_content_trends=True,
                analyze_publishing_patterns=True,
                include_ai_insights=False,
                user_id=user_id
            )

            if not analysis_result or not analysis_result.get('urls'):
                continue

            urls = analysis_result['urls']
            # `or {}` guards against keys that are present but set to None.
            structure = analysis_result.get('structure_analysis') or {}

            # Enhancement 1: Keyword Clustering (NLP from URLs) - REUSE from SitemapService
            keyword_clusters = structure.get('keyword_clusters') or {}

            # Enhancement 2: Strategic Pillar Mapping - REUSE from SitemapService
            pillars = structure.get('strategic_pillars') or {}

            # Enhancement 3: Advertools Site Hierarchy (from folders)
            adv_metrics = (adv_result.get('metrics') or {}) if adv_result.get('success') else {}
            site_hierarchy = adv_metrics.get('top_pillars', {})

            # Enhancement 4: Content Cadence Trend (Last 7 days vs 90 days)
            recent_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), seven_days_ago)]
            historical_urls = [u for u in urls if self._is_newer_than(u.get('lastmod'), ninety_days_ago)]

            recent_velocity = len(recent_urls) / 7
            historical_velocity = len(historical_urls) / 90
            # The 0.01 floor avoids division by zero when nothing changed in 90 days.
            cadence_shift = ((recent_velocity - historical_velocity) / max(historical_velocity, 0.01)) * 100

            # Advertools Word Frequency (Audit top 5 recent URLs)
            top_themes = []
            if recent_urls:
                audit_urls = [u['loc'] for u in recent_urls[:5]]
                audit_result = await self.advertools.audit_content(audit_urls)
                if audit_result.get('success'):
                    top_themes = audit_result.get('themes', [])

            competitor_changes.append({
                "domain": comp['domain'],
                "name": comp['name'],
                "new_content_count": len(recent_urls),
                "recent_topics": [self._extract_topic_from_url(u['loc']) for u in recent_urls[:10]],
                "total_pages": len(urls),
                "keyword_clusters": keyword_clusters,
                "strategic_pillars": pillars,
                "site_hierarchy": site_hierarchy,
                "top_themes": top_themes,
                "cadence_shift_percent": round(cadence_shift, 1),
                "publishing_velocity": round(recent_velocity, 2),
                "stale_content_pct": adv_metrics.get('stale_content_percentage', 0)
            })
        except Exception as e:
            # Best effort: one failing competitor must not abort the whole brief.
            logger.warning(f"Failed to fetch sitemap for {comp['domain']}: {e}")

    # Stage 2: Differential Analysis (Non-AI Aggregation)
    if competitor_changes:
        avg_competitor_velocity = sum(c['publishing_velocity'] for c in competitor_changes) / len(competitor_changes)
    else:
        avg_competitor_velocity = 0
    market_clusters = self._aggregate_clusters([c['keyword_clusters'] for c in competitor_changes])

    # Stage 3: AI Strategic Intelligence
    # Extract rich user context from baseline
    brand_analysis = baseline.get("brand_analysis", {})
    seo_audit = baseline.get("seo_audit", {})

    user_niche = brand_analysis.get("industry") or "General Business"
    user_topics = brand_analysis.get("topics") or []
    if not user_topics and seo_audit.get("keywords"):
        user_topics = seo_audit.get("keywords")[:5]

    # The baseline built in this class carries no "target_audience" key, so fall
    # back to the raw analysis dict.
    # NOTE(review): assumes website_analysis may carry "target_audience" - confirm against caller.
    target_audience = baseline.get("target_audience")
    if not target_audience and isinstance(website_analysis, dict):
        target_audience = website_analysis.get("target_audience")

    analysis_context = {
        "user_profile": {
            "website_url": baseline.get("website_url"),
            "industry": user_niche,
            "niche_description": brand_analysis.get("description") or brand_analysis.get("summary") or "",
            "core_topics": user_topics,
            "target_audience": target_audience or {},
            "business_objectives": brand_analysis.get("objectives") or "Growth",
            "brand_voice": brand_analysis.get("voice") or "Professional",
            "augmented_themes": brand_analysis.get("augmented_themes", [])  # Added from Advertools
        },
        "market_intelligence": {
            "market_clusters": market_clusters,
            "competitors_analyzed_count": len(competitor_changes),
            "market_opportunities_detected": ["Content Velocity Gap", "Topic Authority Shift", "Stale Content Replacement"],
            "competitor_hierarchies": {c['name']: c['site_hierarchy'] for c in competitor_changes},
            "competitor_content_themes": {c['name']: c['top_themes'] for c in competitor_changes}
        },
        "competitive_landscape_detailed": competitor_changes,
    }

    # Call AI for strategic intelligence
    strategic_intelligence = await ai_manager.generate_strategic_intelligence(analysis_context, user_id=user_id)
    content_gaps = await ai_manager.generate_content_gap_analysis(analysis_context, user_id=user_id)

    # Stage 4: Result Assembly
    # An empty or missing insight list must not raise IndexError on [0].
    strategic_insights: List[Any] = []
    if strategic_intelligence.get("success"):
        candidate = (strategic_intelligence.get("data") or {}).get("strategic_insights")
        if isinstance(candidate, list):
            strategic_insights = candidate

    recommendations: Any = []
    if content_gaps.get("success"):
        recommendations = (content_gaps.get("data") or {}).get("content_recommendations", [])

    report = {
        "week_commencing": seven_days_ago.date().isoformat(),
        "generated_at": datetime.utcnow().isoformat(),
        "metrics": {
            "market_velocity": round(avg_competitor_velocity, 2),
            "market_clusters": market_clusters[:5],
            "aggressive_competitors": [c['name'] for c in competitor_changes if c['cadence_shift_percent'] > 50]
        },
        "insights": {
            "the_big_move": strategic_insights[0] if strategic_insights else {},
            "low_hanging_fruit": recommendations,
            "threat_alerts": strategic_insights[1:]
        },
        "raw_data": {
            "competitor_changes": competitor_changes
        }
    }

    return report
|
||||
|
||||
def _is_newer_than(self, lastmod: Optional[str], threshold: datetime) -> bool:
    """Return True when a ``lastmod`` timestamp is strictly later than ``threshold``.

    Args:
        lastmod: ISO-8601 timestamp string (a trailing ``Z`` is accepted).
            ``None`` or an empty string yields False.
        threshold: Cut-off datetime. A naive threshold is interpreted as UTC.

    Returns:
        False for missing or unparseable values, otherwise whether the parsed
        timestamp is after the threshold.
    """
    from datetime import timezone  # local import keeps this block self-contained

    if not lastmod:
        return False
    try:
        # Spell "Z" as an explicit offset so fromisoformat() accepts it on older Pythons.
        parsed = datetime.fromisoformat(lastmod.replace("Z", "+00:00"))
        # Compare as aware UTC datetimes. Simply dropping tzinfo (as before) ignored
        # the offset, and mixing naive/aware values raised and was reported as False.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        if threshold.tzinfo is None:
            threshold = threshold.replace(tzinfo=timezone.utc)
        return parsed > threshold
    except (AttributeError, TypeError, ValueError):
        # Best-effort helper: malformed or non-string input counts as "not newer".
        return False
|
||||
|
||||
def _aggregate_clusters(self, clusters_list: List[Dict[str, int]]) -> List[str]:
    """Aggregate clusters across competitors to find market-wide themes.

    Each competitor contributes at most one vote per cluster name; the values
    stored in the per-competitor mappings are not used.

    Args:
        clusters_list: One mapping of cluster name -> value per competitor.
            Entries that are not dicts are ignored.

    Returns:
        Up to ten cluster names, most widely shared first. Ties keep the
        order in which the names were first seen.
    """
    from collections import Counter  # local import keeps this block self-contained

    occurrences: Counter = Counter()
    for cluster in clusters_list or []:
        if isinstance(cluster, dict):
            # Iterating the keys counts competitor occurrences, not cluster sizes.
            occurrences.update(cluster.keys())
    # sorted() is stable, so equally common themes stay in first-seen order.
    return sorted(occurrences, key=occurrences.__getitem__, reverse=True)[:10]
|
||||
|
||||
def _extract_topic_from_url(self, url: str) -> str:
    """Helper to get a readable topic from a URL slug.

    The last path segment is taken as the slug; hyphens and underscores become
    spaces and the result is capitalized.

    Args:
        url: Page URL whose final path segment names the content.

    Returns:
        The humanized slug, or ``"New Content"`` when the URL is not a string,
        cannot be parsed, or has no usable path segment (e.g. a site root).
    """
    fallback = "New Content"
    if not isinstance(url, str):
        return fallback
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. a broken IPv6 literal).
        return fallback
    slug = path.strip("/").split("/")[-1]
    topic = slug.replace("-", " ").replace("_", " ").strip().capitalize()
    # A root URL has an empty slug; fall back instead of returning "".
    return topic or fallback
|
||||
|
||||
def _build_baseline(self, website_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Project a website-analysis payload onto the fixed set of baseline fields.

    Args:
        website_analysis: Analysis payload; anything that is not a dict is
            treated as an empty payload.

    Returns:
        A dict with ``website_url`` (``None`` when absent) followed by the
        section keys below, each defaulting to ``{}`` when missing or falsy.
    """
    source = website_analysis if isinstance(website_analysis, dict) else {}
    section_keys = (
        "brand_analysis",
        "content_strategy_insights",
        "seo_audit",
        "style_guidelines",
        "style_patterns",
    )

    baseline: Dict[str, Any] = {"website_url": source.get("website_url")}
    for key in section_keys:
        # Missing, None, or otherwise falsy sections collapse to an empty dict.
        baseline[key] = source.get(key) or {}
    return baseline
|
||||
|
||||
def _normalize_competitors(self, competitors: List[Dict[str, Any]], *, max_competitors: int) -> List[Dict[str, Any]]:
    """Clean, de-duplicate and cap a raw competitor list.

    For each dict entry the URL is read from ``url``, ``website_url`` or
    ``domain`` (first non-empty wins), normalized with ``_normalize_url`` and
    reduced to a domain with ``_extract_domain``. Entries without a usable
    URL/domain, and repeats of an already-seen domain, are dropped.

    Args:
        competitors: Raw competitor records; non-list input yields ``[]`` and
            non-dict entries are skipped.
        max_competitors: Maximum number of records to return. Zero or a
            negative value yields ``[]``.

    Returns:
        Records with ``url``, ``domain``, ``name`` and ``summary`` keys, in
        input order.
    """
    if not isinstance(competitors, list):
        return []
    # Previously the cap was only checked after appending, so a limit of 0
    # still returned one competitor.
    if max_competitors <= 0:
        return []

    seen_domains = set()
    normalized: List[Dict[str, Any]] = []

    for comp in competitors:
        if not isinstance(comp, dict):
            continue

        raw_url = comp.get("url") or comp.get("website_url") or comp.get("domain") or ""
        url = self._normalize_url(raw_url)
        if not url:
            continue

        domain = self._extract_domain(url)
        if not domain or domain in seen_domains:
            continue

        seen_domains.add(domain)
        normalized.append({
            "url": url,
            "domain": domain,
            # Fall back to the domain when no human-readable name is supplied.
            "name": comp.get("name") or comp.get("title") or domain,
            "summary": comp.get("summary") or comp.get("description") or "",
        })

        if len(normalized) >= max_competitors:
            break

    return normalized
|
||||
|
||||
def _normalize_url(self, raw: str) -> Optional[str]:
    """Reduce a user-supplied URL or bare host to ``scheme://netloc``.

    Args:
        raw: URL, protocol-relative URL (``//host/...``) or bare host. Path,
            query and fragment are discarded.

    Returns:
        The normalized origin string, or ``None`` when the input is empty,
        not a string, or does not parse to a scheme plus network location.
    """
    if not raw or not isinstance(raw, str):
        return None

    candidate = raw.strip()
    if not candidate:
        return None

    if candidate.startswith("//"):
        # Protocol-relative reference: supply the scheme only.
        candidate = "https:" + candidate
    elif not candidate.lower().startswith(("http://", "https://")):
        # Case-insensitive check so "HTTPS://host" is not prefixed a second time.
        candidate = "https://" + candidate

    try:
        parsed = urlparse(candidate)
    except ValueError:
        # Raised by urlparse for malformed network locations (e.g. broken IPv6).
        return None

    if not parsed.scheme or not parsed.netloc:
        return None
    return f"{parsed.scheme}://{parsed.netloc}"
|
||||
|
||||
def _extract_domain(self, url: str) -> Optional[str]:
    """Return the lower-cased network location of ``url`` without a leading ``www.``.

    Args:
        url: URL to inspect.

    Returns:
        The network-location part of the URL (as reported by ``urlparse``),
        lower-cased and with one leading ``www.`` removed; ``None`` when it is
        empty or when parsing fails for any reason.
    """
    www_prefix = "www."
    try:
        host = (urlparse(url).netloc or "").lower()
        if host.startswith(www_prefix):
            host = host[len(www_prefix):]
    except Exception:
        # Best-effort: any parsing problem means "no domain".
        return None
    return host if host else None
|
||||
|
||||
async def _crawl_competitors(
    self,
    competitors: List[Dict[str, Any]],
    *,
    crawl_concurrency: int
) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
    """Crawl every competitor, with a bound on how many crawls run at once.

    Args:
        competitors: Competitor records; each is expected to carry a ``url``.
        crawl_concurrency: Upper bound on simultaneous crawls (values below 1
            are raised to 1).

    Returns:
        ``(competitor, crawl_result)`` pairs in input order. A competitor
        without a URL, or whose crawl raises, gets a result of the form
        ``{"success": False, "error": ...}`` instead of propagating the error.
    """
    slots = max(1, int(crawl_concurrency))
    gate = asyncio.Semaphore(slots)

    async def crawl_one(comp: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        async with gate:
            target = comp.get("url")
            if target:
                try:
                    outcome = await self.crawler.crawl_website(target)
                except Exception as exc:
                    # One failing site must not abort the whole batch.
                    outcome = {"success": False, "error": str(exc)}
            else:
                outcome = {"success": False, "error": "missing_url"}
            return comp, outcome

    return await asyncio.gather(*(crawl_one(entry) for entry in competitors))
|
||||
|
||||
def _build_extraction_artifact(self, competitor_input: Dict[str, Any], crawl_result: Dict[str, Any]) -> Dict[str, Any]:
    """Condense a crawl result into the extraction artifact for one competitor.

    Args:
        competitor_input: Competitor record (not read by this method).
        crawl_result: Crawler output; expected to hold ``success``, ``content``,
            ``url`` and ``timestamp`` keys.

    Returns:
        On a failed or malformed crawl, ``{"fetch_status": {"status": "failed",
        "error": ...}}``. Otherwise a dict with ``fetch_status``, ``page_meta``,
        ``structure``, ``signals`` and ``content_excerpt`` (first 2000
        characters of the stripped main content).
    """
    crawl_is_dict = isinstance(crawl_result, dict)
    if not crawl_is_dict or not crawl_result.get("success"):
        failure_reason = crawl_result.get("error") if crawl_is_dict else "unknown_error"
        return {"fetch_status": {"status": "failed", "error": failure_reason}}

    raw_content = crawl_result.get("content")
    content = raw_content if isinstance(raw_content, dict) else {}

    def typed(key: str, expected: type) -> Any:
        # Keep the value only when it has the expected container type;
        # otherwise substitute an empty container of that type.
        value = content.get(key)
        return value if isinstance(value, expected) else expected()

    headings = typed("headings", list)
    links = typed("links", list)
    meta_tags = typed("meta_tags", dict)
    content_structure = typed("content_structure", dict)
    main_content = content.get("main_content") or ""

    nav_labels = self._extract_nav_labels(links)
    cta_signals = self._extract_cta_signals(main_content, links)
    proof_signals = self._extract_proof_signals(main_content, links)

    fetch_status = {
        "status": "ok",
        "fetched_url": crawl_result.get("url"),
        "timestamp": crawl_result.get("timestamp"),
    }
    page_meta = {
        "title": content.get("title") or "",
        "meta_description": content.get("description") or "",
        "og_title": meta_tags.get("og:title"),
        "og_description": meta_tags.get("og:description"),
    }
    structure = {
        # Only plain-string headings are kept, capped at 25.
        "headings": [heading for heading in headings if isinstance(heading, str)][:25],
        "nav_labels": nav_labels,
        "content_structure": content_structure,
    }

    return {
        "fetch_status": fetch_status,
        "page_meta": page_meta,
        "structure": structure,
        "signals": {
            "cta_signals": cta_signals,
            "proof_signals": proof_signals,
        },
        "content_excerpt": main_content.strip()[:2000],
    }
|
||||
|
||||
def _extract_nav_labels(self, links: List[Dict[str, Any]]) -> List[str]:
    """Collect short, distinct link texts from the first 200 links.

    Args:
        links: Link records; the ``text`` key of each dict entry is read.
            Non-dict entries are ignored.

    Returns:
        Up to 25 stripped link texts of at most 50 characters each, in
        first-seen order, de-duplicated case-insensitively (the first
        spelling encountered is the one kept).
    """
    stripped_texts = (
        (link.get("text") or "").strip()
        for link in links[:200]
        if isinstance(link, dict)
    )
    candidates = [text for text in stripped_texts if text and len(text) <= 50]

    # dict preserves insertion order; setdefault keeps the first spelling per key.
    first_by_key: Dict[str, str] = {}
    for label in candidates:
        first_by_key.setdefault(label.lower(), label)

    return list(first_by_key.values())[:25]
|
||||
|
||||
def _extract_cta_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Find call-to-action keywords in page text and in link labels.

    Matching is a case-insensitive substring test against a fixed keyword list.

    Args:
        main_content: Page text; ``None``/empty is treated as no text.
        links: Link records; the ``text`` of the first 200 dict entries is used.

    Returns:
        ``{"keyword_hits": [...], "link_cta_hits": [...]}``, each listing at
        most ten matched keywords in keyword-list order.
    """
    keywords = ["get started", "start", "book", "demo", "trial", "pricing", "contact", "signup", "sign up", "subscribe"]
    body = (main_content or "").lower()

    anchor_texts = []
    for link in links[:200]:
        if not isinstance(link, dict):
            continue
        label = (link.get("text") or "").strip()
        if label:
            anchor_texts.append(label.lower())

    in_body = []
    in_links = []
    for keyword in keywords:
        if keyword in body:
            in_body.append(keyword)
        if any(keyword in anchor for anchor in anchor_texts):
            in_links.append(keyword)

    return {
        "keyword_hits": in_body[:10],
        "link_cta_hits": list(dict.fromkeys(in_links))[:10],
    }
|
||||
|
||||
def _extract_proof_signals(self, main_content: str, links: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Find social-proof keywords in page text and proof-related link targets.

    Args:
        main_content: Page text; ``None``/empty is treated as no text.
        links: Link records; the ``href`` of the first 200 dict entries is used.

    Returns:
        ``{"keyword_hits": [...], "supporting_links": [...]}``: up to ten
        matched keywords (keyword-list order) and up to ten lower-cased hrefs
        that mention case studies, testimonials or customers.
    """
    body = (main_content or "").lower()
    proof_keywords = ["case study", "testimonials", "customers", "trusted by", "reviews", "awards", "partners"]
    keyword_hits = [keyword for keyword in proof_keywords if keyword in body]

    # Compare with separators removed so "case-study" and "case_study" both match.
    href_needles = [k.replace(" ", "") for k in ["case study", "testimonials", "customers"]]

    supporting_links = []
    for link in links[:200]:
        if not isinstance(link, dict):
            continue
        href = (link.get("href") or "").lower()
        squashed = href.replace("-", "").replace("_", "")
        if any(needle in squashed for needle in href_needles):
            supporting_links.append(href)

    return {
        "keyword_hits": keyword_hits[:10],
        "supporting_links": supporting_links[:10],
    }
|
||||
|
||||
async def _analyze_competitor_with_ai(
    self,
    *,
    user_id: str,
    baseline: Dict[str, Any],
    competitor_input: Dict[str, Any],
    extraction: Dict[str, Any]
) -> Dict[str, Any]:
    """Ask the LLM for a structured analysis of one competitor.

    Args:
        user_id: Forwarded to ``llm_text_gen``.
        baseline: The user's baseline insights, embedded in the prompt as JSON.
        competitor_input: Competitor record, embedded in the prompt as JSON.
        extraction: Extraction artifact for the competitor's page.

    Returns:
        The parsed JSON object from the model. When the extraction is not a
        dict or its fetch status is not ``"ok"``, returns
        ``{"status": "skipped", "reason": "crawl_failed"}``. When the model
        call raises or its output is not a JSON object, returns
        ``{"status": "failed", "error": ...}``.
    """
    crawl_ok = isinstance(extraction, dict) and extraction.get("fetch_status", {}).get("status") == "ok"
    if not crawl_ok:
        return {"status": "skipped", "reason": "crawl_failed"}

    # Response shape handed to the model via the json_struct argument.
    response_schema = {
        "positioning": {
            "value_prop": "string",
            "target_audience": "string",
            "market_tier": "string",
            "primary_offer": "string",
        },
        "content_strategy": {
            "themes": ["string"],
            "messaging_angles": ["string"],
            "cta_patterns": ["string"],
            "tone_markers": ["string"],
        },
        "competitive_advantages": ["string"],
        "weaknesses_or_risks": ["string"],
        "comparison_to_user_baseline": {
            "overlaps": ["string"],
            "deltas": ["string"],
            "opportunities": ["string"],
        },
        "confidence": {
            "overall": "number",
            "notes": ["string"],
        },
    }

    prompt_sections = [
        "You are a competitive intelligence analyst.\n",
        "Analyze the competitor homepage extraction and compare it to the user's Step 2 baseline insights.\n",
        "Return strictly the requested JSON.\n\n",
        f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n",
        f"Competitor input: {json.dumps(competitor_input, ensure_ascii=False)}\n\n",
        f"Homepage extraction: {json.dumps(extraction, ensure_ascii=False)}\n",
    ]
    prompt = "".join(prompt_sections)

    try:
        # NOTE(review): llm_text_gen is called without await, so it presumably
        # blocks the event loop for the duration of the request — confirm.
        reply = llm_text_gen(prompt, json_struct=response_schema, user_id=user_id)
        parsed = self._safe_json_parse(reply)
    except Exception as e:
        logger.warning(f"AI competitor analysis failed for {competitor_input.get('domain')}: {e}")
        return {"status": "failed", "error": str(e)}

    if isinstance(parsed, dict):
        return parsed
    return {"status": "failed", "error": "invalid_ai_json"}
|
||||
|
||||
async def _aggregate_with_ai(
    self,
    *,
    user_id: str,
    baseline: Dict[str, Any],
    competitors: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """Ask the LLM for a market-level view across all analysed competitors.

    Args:
        user_id: Forwarded to ``llm_text_gen``.
        baseline: The user's baseline insights, embedded in the prompt as JSON.
        competitors: Per-competitor records; only dict entries whose ``input``
            and ``ai_analysis`` values are both dicts are included.

    Returns:
        The parsed JSON object from the model, or ``{"warnings": [...]}`` when
        the call raises or the output is not a JSON object.
    """
    # Response shape handed to the model via the json_struct argument.
    response_schema = {
        "market_map": {
            "clusters": [
                {
                    "cluster_name": "string",
                    "description": "string",
                    "competitors": ["string"],
                }
            ]
        },
        "common_patterns": {
            "common_themes": ["string"],
            "common_ctas": ["string"],
            "common_proof_signals": ["string"],
        },
        "content_gaps_and_opportunities": [
            {
                "gap": "string",
                "why_it_matters": "string",
                "recommended_content_types": ["string"],
                "impact": "string",
                "effort": "string",
            }
        ],
        "strategic_recommendations": [
            {
                "action": "string",
                "expected_impact": "string",
                "effort": "string",
                "first_steps": ["string"],
            }
        ],
        "warnings": ["string"],
    }

    # Send only the identifying fields plus the analysis for each competitor.
    compact = []
    for item in competitors:
        if not isinstance(item, dict):
            continue
        comp, ai = item.get("input"), item.get("ai_analysis")
        if isinstance(comp, dict) and isinstance(ai, dict):
            compact.append({
                "domain": comp.get("domain"),
                "name": comp.get("name"),
                "ai_analysis": ai,
            })

    prompt_sections = [
        "You are a senior strategy consultant.\n",
        "Using the user's Step 2 baseline insights and per-competitor analyses, produce an aggregated market view.\n",
        "Return strictly the requested JSON.\n\n",
        f"User baseline (Step 2 insights): {json.dumps(baseline, ensure_ascii=False)}\n\n",
        f"Per-competitor analyses: {json.dumps(compact, ensure_ascii=False)}\n",
    ]
    prompt = "".join(prompt_sections)

    try:
        # NOTE(review): llm_text_gen is called without await, so it presumably
        # blocks the event loop for the duration of the request — confirm.
        reply = llm_text_gen(prompt, json_struct=response_schema, user_id=user_id)
        parsed = self._safe_json_parse(reply)
    except Exception as e:
        logger.warning(f"AI aggregation failed: {e}")
        return {"warnings": [str(e)]}

    if isinstance(parsed, dict):
        return parsed
    return {"warnings": ["invalid_ai_json"]}
|
||||
|
||||
def _safe_json_parse(self, text: str) -> Any:
    """Parse JSON from model output, tolerating code fences and surrounding prose.

    The text is first stripped of a leading/trailing Markdown code fence
    (with or without a ``json`` tag) and parsed as-is. If that fails, the span
    from the first ``{`` to the last ``}`` is tried as a JSON object.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON value, or ``None`` when ``text`` is not a string or
        no JSON could be decoded.
    """
    if not isinstance(text, str):
        return None

    # Single backslashes are required inside raw strings: the previous
    # patterns (r"\\s", r"\\{") matched a literal backslash and never fired.
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip(), flags=re.IGNORECASE)
    cleaned = re.sub(r"\s*```$", "", cleaned).strip()

    try:
        return json.loads(cleaned)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; fall through to the
        # embedded-object attempt below.
        pass

    match = re.search(r"\{[\s\S]*\}", cleaned)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except (ValueError, RecursionError):
        return None
|
||||
|
||||
Reference in New Issue
Block a user