Files
ALwrity/backend/services/scheduler/executors/advertools_executor.py

298 lines
13 KiB
Python

import asyncio
from datetime import datetime, timedelta
from typing import Any, Dict, List
from urllib.parse import urlparse
from loguru import logger
from sqlalchemy.orm import Session
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
from services.seo.advertools_service import AdvertoolsService
from services.seo_tools.sitemap_service import SitemapService
from models.advertools_monitoring_models import AdvertoolsTask, AdvertoolsExecutionLog
from models.onboarding import WebsiteAnalysis, OnboardingSession
class AdvertoolsExecutor:
"""
Executor for Advertools-based SEO intelligence tasks.
Handles 'content_audit' and 'site_health' task types.
"""
def __init__(self):
self.advertools_service = AdvertoolsService()
self.sitemap_service = SitemapService()
self.logger = logger.bind(service="AdvertoolsExecutor")
async def execute_task(self, task_stub: Any, db: Session, **kwargs) -> Dict[str, Any]:
"""
Execute an Advertools intelligence task.
Args:
task_stub: Tuple or object containing (id, user_id, payload)
db: Database session
Returns:
Execution result dictionary
"""
start_time = datetime.utcnow()
task_id = getattr(task_stub, 'id', None)
user_id = getattr(task_stub, 'user_id', None)
payload = getattr(task_stub, 'payload', {}) or {}
task_type = payload.get('type')
website_url = payload.get('website_url')
self.logger.info(f"🚀 Starting Advertools task {task_id} ({task_type}) for {website_url}")
# Find the actual task record to update state
task_record = None
if isinstance(task_id, int):
task_record = db.query(AdvertoolsTask).filter(AdvertoolsTask.id == task_id).first()
try:
if not website_url:
raise ValueError("Missing website_url in payload")
# 1. Discover exact sitemap URL first (essential for Advertools)
discovered_sitemap = await self.sitemap_service.discover_sitemap_url(website_url)
effective_url = discovered_sitemap if discovered_sitemap else website_url
# Set status to running for UI feedback
if task_record:
task_record.status = 'running'
db.commit()
result = {}
if task_type == 'content_audit':
# Phase 1: Get sitemap analysis (freshness, URL structure, pillars)
sitemap_result = await self.advertools_service.analyze_sitemap(effective_url)
audit_urls = []
url_structure = {}
freshness = {}
if sitemap_result.get('success'):
metrics = sitemap_result.get('metrics', {})
audit_urls = metrics.get('audit_sample_urls', [])
url_structure = metrics.get('url_structure', {})
freshness = {
"freshness_score": metrics.get('freshness_score'),
"publishing_velocity": metrics.get('publishing_velocity'),
"stale_content_percentage": metrics.get('stale_content_percentage'),
"publishing_recency": metrics.get('publishing_recency'),
"publishing_trend": metrics.get('publishing_trend'),
}
if not audit_urls:
audit_urls = [website_url]
# Phase 2: Theme analysis via content audit
audit_result = await self.advertools_service.audit_content(audit_urls)
# Phase 3: Site structure analysis (links, redirects, image SEO)
site_domain = urlparse(website_url).netloc or website_url
structure_result = await self.advertools_service.analyze_site_structure(
audit_urls, site_domain=site_domain
)
# Phase 4: Robots.txt compliance analysis
robots_result = await self.advertools_service.analyze_robots_txt(website_url)
# Phase 5: Crawl budget analysis
budget_result = await self.advertools_service.analyze_crawl_budget(
effective_url, site_domain
)
# Merge results
result = {
"success": audit_result.get('success', False) or structure_result.get('success', False),
"themes": audit_result.get('themes', []),
"page_count": audit_result.get('page_count', 0),
"avg_word_count": audit_result.get('avg_word_count', 0),
"link_health": structure_result.get('link_health', {}),
"redirect_audit": structure_result.get('redirect_audit', {}),
"image_seo": structure_result.get('image_seo', {}),
"page_status": structure_result.get('page_status', {}),
"url_structure": url_structure,
"freshness": freshness,
"robots_txt": robots_result,
"crawl_budget": budget_result,
"timestamp": datetime.utcnow().isoformat()
}
if result.get('success'):
await self._update_persona_augmentation(user_id, website_url, result, db)
elif task_type == 'site_health':
# Site health: freshness, velocity, URL structure
result = await self.advertools_service.analyze_sitemap(effective_url)
if result.get('success'):
await self._update_site_health_metrics(user_id, website_url, result, db)
else:
raise ValueError(f"Unknown task type: {task_type}")
success = result.get('success', False)
execution_time_ms = int((datetime.utcnow() - start_time).total_seconds() * 1000)
# Update task state
if task_record:
task_record.last_executed = datetime.utcnow()
if success:
task_record.last_success = datetime.utcnow()
task_record.consecutive_failures = 0
task_record.status = 'active'
# Smart Scheduling with Backoff reset
freq_days = task_record.frequency_days or 7
task_record.next_execution = datetime.utcnow() + timedelta(days=freq_days)
else:
task_record.last_failure = datetime.utcnow()
task_record.failure_reason = result.get('error', 'Unknown error')
task_record.consecutive_failures = (task_record.consecutive_failures or 0) + 1
# Exponential Backoff for repeated failures (up to 30 days)
backoff_days = min(30, (task_record.frequency_days or 7) * (2 ** (task_record.consecutive_failures - 1)))
task_record.next_execution = datetime.utcnow() + timedelta(days=backoff_days)
if task_record.consecutive_failures >= 5:
task_record.status = 'failed' # Mark as failed after 5 attempts
# Create execution log
if isinstance(task_id, int):
log_entry = AdvertoolsExecutionLog(
task_id=task_id,
status='success' if success else 'failed',
result_data=result,
error_message=result.get('error'),
execution_time_ms=execution_time_ms
)
db.add(log_entry)
db.commit()
if success:
self.logger.info(f"✅ Advertools task {task_id} completed successfully")
else:
self.logger.warning(f"⚠️ Advertools task {task_id} failed: {result.get('error')}")
return result
except Exception as e:
db.rollback()
self.logger.error(f"❌ Advertools task execution failed: {e}")
# Try to update task record with failure even if main logic failed
if task_record:
try:
task_record.last_executed = datetime.utcnow()
task_record.last_failure = datetime.utcnow()
task_record.failure_reason = str(e)
task_record.consecutive_failures = (task_record.consecutive_failures or 0) + 1
db.commit()
except:
db.rollback()
return {"success": False, "error": str(e)}
async def _update_persona_augmentation(self, user_id: str, website_url: str, audit_result: Dict[str, Any], db: Session):
"""
Updates the user's Brand Persona with discovered themes, site structure,
link health, and redirect data from the content audit.
"""
try:
session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
if not session:
self.logger.warning(f"No onboarding session found for user {user_id}")
return
analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
if not analysis:
self.logger.warning(f"No website analysis found for user {user_id}")
return
current_brand = analysis.brand_analysis or {}
# Core themes
current_brand['augmented_themes'] = audit_result.get('themes', [])
# Link health
current_brand['link_health'] = audit_result.get('link_health', {})
# Redirect audit
current_brand['redirect_audit'] = audit_result.get('redirect_audit', {})
# Image SEO
current_brand['image_seo'] = audit_result.get('image_seo', {})
# Page status distribution
current_brand['page_status'] = audit_result.get('page_status', {})
# URL structure analysis
current_brand['url_structure'] = audit_result.get('url_structure', {})
# Freshness
current_brand['freshness'] = audit_result.get('freshness', {})
# Robots.txt compliance
current_brand['robots_txt'] = audit_result.get('robots_txt', {})
# Crawl budget analysis
current_brand['crawl_budget'] = audit_result.get('crawl_budget', {})
current_brand['last_advertools_audit'] = datetime.utcnow().isoformat()
from sqlalchemy.orm.attributes import flag_modified
flag_modified(analysis, "brand_analysis")
if 'avg_word_count' in audit_result:
current_strategy = analysis.content_strategy_insights or {}
current_strategy['avg_content_length'] = audit_result['avg_word_count']
analysis.content_strategy_insights = current_strategy
flag_modified(analysis, "content_strategy_insights")
self.logger.info(f"Updated persona augmentation for {user_id}")
except Exception as e:
self.logger.error(f"Failed to update persona augmentation: {e}")
raise e
async def _update_site_health_metrics(self, user_id: str, website_url: str, health_result: Dict[str, Any], db: Session):
"""
Updates the WebsiteAnalysis with site health metrics (velocity, freshness,
URL structure analysis, freshness score).
"""
try:
session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
if not session:
return
analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
if not analysis:
return
current_seo = analysis.seo_audit or {}
metrics = health_result.get('metrics', {})
current_seo['site_health'] = {
"total_urls": metrics.get('total_urls'),
"publishing_velocity": metrics.get('publishing_velocity'),
"stale_content_count": metrics.get('stale_content_count'),
"stale_content_percentage": metrics.get('stale_content_percentage'),
"freshness_score": metrics.get('freshness_score'),
"publishing_recency": metrics.get('publishing_recency'),
"publishing_trend": metrics.get('publishing_trend'),
"top_pillars": metrics.get('top_pillars'),
"url_structure": metrics.get('url_structure', {})
}
current_seo['last_advertools_health_check'] = datetime.utcnow().isoformat()
analysis.seo_audit = current_seo
from sqlalchemy.orm.attributes import flag_modified
flag_modified(analysis, "seo_audit")
self.logger.info(f"Updated site health metrics for {user_id}")
except Exception as e:
self.logger.error(f"Failed to update site health metrics: {e}")
raise e