Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts
This commit is contained in:
230
backend/services/scheduler/executors/advertools_executor.py
Normal file
230
backend/services/scheduler/executors/advertools_executor.py
Normal file
@@ -0,0 +1,230 @@
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
|
||||
from services.seo.advertools_service import AdvertoolsService
|
||||
from services.seo_tools.sitemap_service import SitemapService
|
||||
from models.advertools_monitoring_models import AdvertoolsTask, AdvertoolsExecutionLog
|
||||
from models.onboarding import WebsiteAnalysis, OnboardingSession
|
||||
|
||||
class AdvertoolsExecutor:
    """
    Executor for Advertools-based SEO intelligence tasks.

    Handles 'content_audit' and 'site_health' task types.
    """

    def __init__(self):
        self.advertools_service = AdvertoolsService()
        self.sitemap_service = SitemapService()
        self.logger = logger.bind(service="AdvertoolsExecutor")

    async def execute_task(self, task_stub: Any, db: "Session", **kwargs) -> Dict[str, Any]:
        """
        Execute an Advertools intelligence task.

        Args:
            task_stub: Object exposing ``id``, ``user_id`` and ``payload``
                attributes (each is read with ``getattr`` and may be absent).
            db: Database session
            **kwargs: Accepted for caller compatibility; not used here.

        Returns:
            Execution result dictionary. Any exception raised while running
            the task is caught and reported as
            ``{"success": False, "error": "<message>"}``.
        """
        start_time = datetime.utcnow()
        task_id = getattr(task_stub, 'id', None)
        user_id = getattr(task_stub, 'user_id', None)
        payload = getattr(task_stub, 'payload', {}) or {}

        task_type = payload.get('type')
        website_url = payload.get('website_url')

        self.logger.info(f"🚀 Starting Advertools task {task_id} ({task_type}) for {website_url}")

        # Find the actual task record to update state
        task_record = None
        if isinstance(task_id, int):
            task_record = db.query(AdvertoolsTask).filter(AdvertoolsTask.id == task_id).first()

        try:
            if not website_url:
                raise ValueError("Missing website_url in payload")

            # 1. Discover exact sitemap URL first (essential for Advertools)
            discovered_sitemap = await self.sitemap_service.discover_sitemap_url(website_url)
            effective_url = discovered_sitemap if discovered_sitemap else website_url

            # Set status to running for UI feedback
            if task_record:
                task_record.status = 'running'
                db.commit()

            result = {}
            if task_type == 'content_audit':
                # Audit content themes using sample URLs taken from the sitemap.
                sitemap_result = await self.advertools_service.analyze_sitemap(effective_url)

                audit_urls = []
                if sitemap_result.get('success'):
                    # Use the sample URLs returned by the service
                    audit_urls = sitemap_result.get('metrics', {}).get('audit_sample_urls', [])

                if not audit_urls:
                    # Fallback to homepage if sitemap fails or empty
                    audit_urls = [website_url]

                # Run the audit on the sample
                result = await self.advertools_service.audit_content(audit_urls)

                if result.get('success'):
                    await self._update_persona_augmentation(user_id, website_url, result, db)

            elif task_type == 'site_health':
                # Check site health (freshness, velocity)
                result = await self.advertools_service.analyze_sitemap(effective_url)

                if result.get('success'):
                    await self._update_site_health_metrics(user_id, website_url, result, db)

            else:
                raise ValueError(f"Unknown task type: {task_type}")

            success = result.get('success', False)
            execution_time_ms = int((datetime.utcnow() - start_time).total_seconds() * 1000)

            # Update task state
            if task_record:
                if success:
                    self._record_success(task_record)
                else:
                    self._record_failure(task_record, result.get('error', 'Unknown error'))

            # Create execution log
            if isinstance(task_id, int):
                log_entry = AdvertoolsExecutionLog(
                    task_id=task_id,
                    status='success' if success else 'failed',
                    result_data=result,
                    error_message=result.get('error'),
                    execution_time_ms=execution_time_ms
                )
                db.add(log_entry)

            # Single commit persists the task state, the execution log and any
            # WebsiteAnalysis changes made by the _update_* helpers.
            db.commit()

            if success:
                self.logger.info(f"✅ Advertools task {task_id} completed successfully")
            else:
                self.logger.warning(f"⚠️ Advertools task {task_id} failed: {result.get('error')}")

            return result

        except Exception as e:
            db.rollback()
            self.logger.error(f"❌ Advertools task execution failed: {e}")

            # Try to update task record with failure even if main logic failed.
            # The shared helper also moves the task out of 'running' (which may
            # already have been committed above) and schedules the backoff.
            if task_record:
                try:
                    self._record_failure(task_record, str(e))
                    db.commit()
                except Exception as bookkeeping_error:
                    db.rollback()
                    self.logger.error(
                        f"Failed to record failure for Advertools task {task_id}: {bookkeeping_error}"
                    )

            return {"success": False, "error": str(e)}

    @staticmethod
    def _record_success(task_record: Any) -> None:
        """Mark the task as succeeded and schedule its next regular run."""
        now = datetime.utcnow()
        task_record.last_executed = now
        task_record.last_success = now
        task_record.consecutive_failures = 0
        task_record.status = 'active'

        # Smart Scheduling with Backoff reset
        freq_days = task_record.frequency_days or 7
        task_record.next_execution = now + timedelta(days=freq_days)

    @staticmethod
    def _record_failure(task_record: Any, reason: str) -> None:
        """Record a failed run and schedule the retry with exponential backoff.

        The retry delay is ``frequency_days * 2 ** (failures - 1)`` days, capped
        at 30. After 5 consecutive failures the task is marked 'failed';
        otherwise it is put back to 'active' so it does not stay in 'running'.
        """
        now = datetime.utcnow()
        task_record.last_executed = now
        task_record.last_failure = now
        task_record.failure_reason = reason

        failures = (task_record.consecutive_failures or 0) + 1
        task_record.consecutive_failures = failures

        # Exponential Backoff for repeated failures (up to 30 days)
        backoff_days = min(30, (task_record.frequency_days or 7) * (2 ** (failures - 1)))
        task_record.next_execution = now + timedelta(days=backoff_days)

        # NOTE(review): assumes the scheduler retries 'active' tasks once
        # next_execution is due — confirm against the task loader.
        task_record.status = 'failed' if failures >= 5 else 'active'

    async def _update_persona_augmentation(self, user_id: str, website_url: str, audit_result: Dict[str, Any], db: "Session"):
        """
        Updates the user's Brand Persona with discovered themes from the content audit.

        Writes ``augmented_themes`` and ``last_advertools_audit`` into
        ``WebsiteAnalysis.brand_analysis`` and, when the audit reports
        ``avg_word_count``, ``avg_content_length`` into
        ``content_strategy_insights``. Does not commit; the caller commits.
        Returns without changes when the user has no onboarding session or
        website analysis. Any exception is logged and re-raised.
        """
        try:
            session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
            if not session:
                self.logger.warning(f"No onboarding session found for user {user_id}")
                return

            analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
            if not analysis:
                self.logger.warning(f"No website analysis found for user {user_id}")
                return

            # Update brand_analysis with augmented themes
            current_brand = analysis.brand_analysis or {}

            # Add or update the 'augmented_themes' field
            current_brand['augmented_themes'] = audit_result.get('themes', [])
            current_brand['last_advertools_audit'] = datetime.utcnow().isoformat()

            # Assign back: when brand_analysis was empty/None, current_brand is
            # a brand-new dict that the model does not reference yet.
            analysis.brand_analysis = current_brand

            # Force SQLAlchemy to detect change in JSON field
            from sqlalchemy.orm.attributes import flag_modified
            flag_modified(analysis, "brand_analysis")

            # Also update content_strategy_insights if relevant
            if 'avg_word_count' in audit_result:
                current_strategy = analysis.content_strategy_insights or {}
                current_strategy['avg_content_length'] = audit_result['avg_word_count']
                analysis.content_strategy_insights = current_strategy
                flag_modified(analysis, "content_strategy_insights")

            self.logger.info(f"Updated persona augmentation for {user_id}")

        except Exception as e:
            self.logger.error(f"Failed to update persona augmentation: {e}")
            raise

    async def _update_site_health_metrics(self, user_id: str, website_url: str, health_result: Dict[str, Any], db: "Session"):
        """
        Updates the WebsiteAnalysis with site health metrics (velocity, freshness).

        Copies selected keys from ``health_result['metrics']`` into
        ``seo_audit['site_health']`` and stamps
        ``last_advertools_health_check``. Does not commit; the caller commits.
        Returns silently when the user has no onboarding session or website
        analysis. Any exception is logged and re-raised.
        """
        try:
            session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
            if not session:
                return

            analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
            if not analysis:
                return

            # Update seo_audit with health metrics
            current_seo = analysis.seo_audit or {}
            metrics = health_result.get('metrics', {})

            current_seo['site_health'] = {
                "total_urls": metrics.get('total_urls'),
                "publishing_velocity": metrics.get('publishing_velocity'),
                "stale_content_count": metrics.get('stale_content_count'),
                "stale_content_percentage": metrics.get('stale_content_percentage'),
                "top_pillars": metrics.get('top_pillars')
            }
            current_seo['last_advertools_health_check'] = datetime.utcnow().isoformat()

            analysis.seo_audit = current_seo
            # Force SQLAlchemy to detect change in JSON field
            from sqlalchemy.orm.attributes import flag_modified
            flag_modified(analysis, "seo_audit")
            self.logger.info(f"Updated site health metrics for {user_id}")

        except Exception as e:
            self.logger.error(f"Failed to update site health metrics: {e}")
            raise
|
||||
@@ -15,6 +15,7 @@ from ..core.exception_handler import TaskExecutionError, DatabaseError, Schedule
|
||||
from models.platform_insights_monitoring_models import PlatformInsightsTask, PlatformInsightsExecutionLog
|
||||
from services.bing_analytics_storage_service import BingAnalyticsStorageService
|
||||
from services.integrations.bing_oauth import BingOAuthService
|
||||
from services.database import get_user_db_path
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
logger = get_service_logger("bing_insights_executor")
|
||||
@@ -34,8 +35,6 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
def __init__(self):
|
||||
self.logger = logger
|
||||
self.exception_handler = SchedulerExceptionHandler()
|
||||
database_url = os.getenv('DATABASE_URL', 'sqlite:///alwrity.db')
|
||||
self.storage_service = BingAnalyticsStorageService(database_url)
|
||||
self.bing_oauth = BingOAuthService()
|
||||
|
||||
async def execute_task(self, task: PlatformInsightsTask, db: Session) -> TaskExecutionResult:
|
||||
@@ -53,6 +52,11 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
user_id = task.user_id
|
||||
site_url = task.site_url
|
||||
|
||||
# Initialize storage service for this user
|
||||
db_path = get_user_db_path(user_id)
|
||||
database_url = f'sqlite:///{db_path}'
|
||||
storage_service = BingAnalyticsStorageService(database_url)
|
||||
|
||||
try:
|
||||
self.logger.info(
|
||||
f"Executing Bing insights fetch: task_id={task.id} | "
|
||||
@@ -69,7 +73,7 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
db.flush()
|
||||
|
||||
# Fetch insights
|
||||
result = await self._fetch_insights(task, db)
|
||||
result = await self._fetch_insights(task, db, storage_service)
|
||||
|
||||
# Update execution log
|
||||
execution_time_ms = int((time.time() - start_time) * 1000)
|
||||
@@ -184,7 +188,7 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
|
||||
return error_result
|
||||
|
||||
async def _fetch_insights(self, task: PlatformInsightsTask, db: Session) -> TaskExecutionResult:
|
||||
async def _fetch_insights(self, task: PlatformInsightsTask, db: Session, storage_service: BingAnalyticsStorageService) -> TaskExecutionResult:
|
||||
"""
|
||||
Fetch Bing insights data.
|
||||
|
||||
@@ -201,7 +205,7 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
if is_first_run:
|
||||
# First run: Try to load from cache
|
||||
self.logger.info(f"First run for Bing insights task {task.id} - loading cached data")
|
||||
cached_data = self._load_cached_data(user_id, site_url)
|
||||
cached_data = self._load_cached_data(user_id, site_url, storage_service)
|
||||
|
||||
if cached_data:
|
||||
self.logger.info(f"Loaded cached Bing data for user {user_id}")
|
||||
@@ -216,11 +220,11 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
else:
|
||||
# No cached data - try to fetch from API
|
||||
self.logger.info(f"No cached data found, fetching from Bing API")
|
||||
return await self._fetch_fresh_data(user_id, site_url)
|
||||
return await self._fetch_fresh_data(user_id, site_url, storage_service)
|
||||
else:
|
||||
# Subsequent run: Always fetch fresh data
|
||||
self.logger.info(f"Subsequent run for Bing insights task {task.id} - fetching fresh data")
|
||||
return await self._fetch_fresh_data(user_id, site_url)
|
||||
return await self._fetch_fresh_data(user_id, site_url, storage_service)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error fetching Bing insights for user {user_id}: {e}", exc_info=True)
|
||||
@@ -230,11 +234,11 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
result_data={'error': str(e)}
|
||||
)
|
||||
|
||||
def _load_cached_data(self, user_id: str, site_url: Optional[str]) -> Optional[Dict[str, Any]]:
|
||||
def _load_cached_data(self, user_id: str, site_url: Optional[str], storage_service: BingAnalyticsStorageService) -> Optional[Dict[str, Any]]:
|
||||
"""Load most recent cached Bing data from database."""
|
||||
try:
|
||||
# Get analytics summary from storage service
|
||||
summary = self.storage_service.get_analytics_summary(
|
||||
summary = storage_service.get_analytics_summary(
|
||||
user_id=user_id,
|
||||
site_url=site_url or '',
|
||||
days=30
|
||||
@@ -250,7 +254,7 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
self.logger.warning(f"Error loading cached Bing data: {e}")
|
||||
return None
|
||||
|
||||
async def _fetch_fresh_data(self, user_id: str, site_url: Optional[str]) -> TaskExecutionResult:
|
||||
async def _fetch_fresh_data(self, user_id: str, site_url: Optional[str], storage_service: BingAnalyticsStorageService) -> TaskExecutionResult:
|
||||
"""Fetch fresh Bing insights from API."""
|
||||
try:
|
||||
# Check if user has active tokens
|
||||
@@ -288,7 +292,7 @@ class BingInsightsExecutor(TaskExecutor):
|
||||
|
||||
# For now, use stored analytics data (Bing API integration can be added later)
|
||||
# This ensures we have data available even if the API class doesn't exist yet
|
||||
summary = self.storage_service.get_analytics_summary(user_id, site_url, days=30)
|
||||
summary = storage_service.get_analytics_summary(user_id, site_url, days=30)
|
||||
|
||||
if summary and isinstance(summary, dict):
|
||||
# Format insights data from stored analytics
|
||||
|
||||
@@ -0,0 +1,200 @@
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService
|
||||
from models.website_analysis_monitoring_models import (
|
||||
DeepCompetitorAnalysisTask,
|
||||
DeepCompetitorAnalysisExecutionLog
|
||||
)
|
||||
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
|
||||
from services.scheduler.core.failure_detection_service import FailureDetectionService
|
||||
from services.seo.deep_competitor_analysis_service import DeepCompetitorAnalysisService
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
logger = get_service_logger("deep_competitor_analysis_executor")
|
||||
|
||||
|
||||
class DeepCompetitorAnalysisExecutor(TaskExecutor):
    """Scheduler executor for deep competitor analysis tasks.

    Runs either the full analysis (default ``deep_analysis`` mode, one-shot:
    the task is paused afterwards) or the recurring ``strategic_insights``
    mode, which is rescheduled weekly and whose report is prepended to
    ``WebsiteAnalysis.strategic_insights_history``.
    """

    def __init__(self):
        self.analysis_service = DeepCompetitorAnalysisService()
        self.integration_service = OnboardingDataIntegrationService()

    async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
        """Execute one deep competitor analysis task.

        Args:
            task: Expected to be a ``DeepCompetitorAnalysisTask``; anything
                else yields a non-retryable failure result.
            db: Database session used for the execution log and task state.

        Returns:
            A ``TaskExecutionResult``. When no competitors can be found the
            run is logged as "skipped", the task is paused and the result is
            reported as a success so it is not retried. Exceptions raised by
            the analysis are caught and returned as a failure result.
        """
        start_time = time.time()

        if not isinstance(task, DeepCompetitorAnalysisTask):
            return TaskExecutionResult(
                success=False,
                error_message="Invalid task type for deep competitor analysis",
                retryable=False
            )

        task_log = DeepCompetitorAnalysisExecutionLog(
            task_id=task.id,
            status="running",
            execution_date=datetime.utcnow()
        )
        db.add(task_log)
        db.commit()

        user_id = str(task.user_id)

        try:
            integrated = self.integration_service.get_integrated_data_sync(user_id, db)
            if not isinstance(integrated, dict):
                integrated = {}

            # Normalise once so every later use (including the `.get('id')`
            # lookup in the strategic_insights branch) is safe when the
            # integrated data has no usable website analysis.
            website_analysis = integrated.get("website_analysis")
            if not isinstance(website_analysis, dict):
                website_analysis = {}

            payload = task.payload if isinstance(task.payload, dict) else {}
            competitors = self._resolve_competitors(payload, integrated)

            if not isinstance(competitors, list) or not competitors:
                logger.warning(f"Deep competitor analysis skipped for user {user_id}: No competitors found")

                task_log.status = "skipped"
                task_log.result_data = {"status": "skipped", "reason": "no_competitors"}
                task_log.execution_time_ms = int((time.time() - start_time) * 1000)

                # Treated as a success (empty report) so it doesn't retry
                # endlessly; paused so it doesn't run again until triggered
                # manually.
                task.last_executed = datetime.utcnow()
                task.last_success = datetime.utcnow()
                task.status = "paused"
                task.next_execution = None
                task.consecutive_failures = 0

                db.commit()

                return TaskExecutionResult(
                    success=True,
                    result_data={"status": "skipped", "reason": "no_competitors"},
                    execution_time_ms=task_log.execution_time_ms,
                    retryable=False
                )

            max_competitors = int(payload.get("max_competitors") or 25)
            crawl_concurrency = int(payload.get("crawl_concurrency") or 4)
            mode = payload.get("mode", "deep_analysis")

            if mode == "strategic_insights":
                logger.info(f"Executing weekly strategic insights for user {user_id}")
                report = await self.analysis_service.generate_weekly_strategy_brief(
                    user_id=user_id,
                    website_analysis=website_analysis,
                    competitors=competitors
                )

                # Persist to WebsiteAnalysis history
                analysis_id = website_analysis.get('id')
                if analysis_id:
                    from models.onboarding import WebsiteAnalysis
                    from sqlalchemy.orm.attributes import flag_modified

                    wa = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.id == analysis_id).first()
                    if wa:
                        history = wa.strategic_insights_history or []
                        if not isinstance(history, list):
                            history = []
                        # Newest first; keep at most 52 entries.
                        history.insert(0, report)
                        wa.strategic_insights_history = history[:52]
                        flag_modified(wa, "strategic_insights_history")
                        db.commit()
            else:
                report = await self.analysis_service.run(
                    user_id=user_id,
                    website_analysis=website_analysis,
                    competitors=competitors,
                    max_competitors=max_competitors,
                    crawl_concurrency=crawl_concurrency
                )

            task.last_executed = datetime.utcnow()
            task.last_success = datetime.utcnow()

            # If it's a recurring task (strategic_insights), set next execution
            if mode == "strategic_insights":
                task.status = "active"
                task.next_execution = self.calculate_next_execution(task, "weekly", task.last_executed)
            else:
                task.status = "paused"
                task.next_execution = None

            task.consecutive_failures = 0
            task.failure_pattern = None
            task.failure_reason = None

            task_log.status = "success"
            task_log.result_data = report
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)

            db.commit()

            # Best-effort refresh: a failure here is logged but does not fail
            # the already-committed task run.
            try:
                await self.integration_service.refresh_integrated_data(user_id, db)
            except Exception as e:
                logger.warning(f"Deep competitor analysis SSOT refresh failed for user {user_id}: {e}")

            return TaskExecutionResult(
                success=True,
                result_data=report,
                execution_time_ms=task_log.execution_time_ms,
                retryable=False
            )

        except Exception as e:
            db.rollback()
            logger.warning(f"Deep competitor analysis task failed for user {user_id}: {e}")

            failure_detection = FailureDetectionService(db)
            pattern = failure_detection.analyze_task_failures(task.id, "deep_competitor_analysis", user_id)

            task.last_executed = datetime.utcnow()
            task.last_failure = datetime.utcnow()
            task.failure_reason = str(e)
            task.consecutive_failures = (task.consecutive_failures or 0) + 1

            if pattern and pattern.should_cool_off:
                # Stop scheduling until someone intervenes.
                task.status = "needs_intervention"
                task.failure_pattern = {
                    "consecutive_failures": pattern.consecutive_failures,
                    "recent_failures": pattern.recent_failures,
                    "failure_reason": pattern.failure_reason.value,
                    "error_patterns": pattern.error_patterns,
                    "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
                }
                task.next_execution = None
            else:
                task.status = "failed"
                task.next_execution = datetime.utcnow() + timedelta(minutes=30)

            task_log.status = "failed"
            task_log.error_message = str(e)
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)

            db.add(task_log)
            db.commit()

            return TaskExecutionResult(
                success=False,
                error_message=str(e),
                execution_time_ms=task_log.execution_time_ms,
                retryable=(task.status != "needs_intervention"),
                retry_delay=1800
            )

    @staticmethod
    def _resolve_competitors(payload: Dict[str, Any], integrated: Dict[str, Any]) -> Any:
        """Pick the competitor list from the first source that has a non-empty list.

        Order: task payload, then ``research_preferences.competitors``, then
        ``competitor_analysis`` from the integrated data. The returned value
        may still be missing or not a list; the caller validates it.
        """
        competitors = payload.get("competitors")
        if isinstance(competitors, list) and competitors:
            return competitors

        # Try to get from research_preferences
        research_prefs = integrated.get("research_preferences")
        if isinstance(research_prefs, dict):
            competitors = research_prefs.get("competitors")

        # If still not found, try to get from competitor_analysis (Step 3 persistence)
        if not isinstance(competitors, list) or not competitors:
            competitors = integrated.get("competitor_analysis")

        return competitors

    def calculate_next_execution(self, task: Any, frequency: str, last_execution: datetime = None) -> datetime:
        """Return the next run time: 7 days after the base for "weekly", else 365 days.

        The base is ``last_execution`` when given, otherwise the current UTC time.
        """
        base = last_execution or datetime.utcnow()
        if frequency == "weekly":
            return base + timedelta(days=7)
        return base + timedelta(days=365)
|
||||
|
||||
@@ -0,0 +1,179 @@
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models.website_analysis_monitoring_models import (
|
||||
DeepWebsiteCrawlTask,
|
||||
DeepWebsiteCrawlExecutionLog
|
||||
)
|
||||
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
|
||||
from services.scheduler.core.failure_detection_service import FailureDetectionService
|
||||
from services.research.deep_crawl_service import DeepCrawlService
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
logger = get_service_logger("deep_website_crawl_executor")
|
||||
|
||||
|
||||
class DeepWebsiteCrawlExecutor(TaskExecutor):
    """Scheduler executor that runs a deep crawl of a user's website.

    The executor owns the execution log and the task bookkeeping
    (status, next run, failure counters); the crawl service is only asked
    for the crawl result.
    """

    def __init__(self):
        self.crawl_service = DeepCrawlService()

    async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
        """Execute one deep website crawl task.

        Args:
            task: Expected to be a ``DeepWebsiteCrawlTask``; anything else
                yields a non-retryable failure result.
            db: Database session used for the execution log and task state.

        Returns:
            A ``TaskExecutionResult``. On success the task is rescheduled one
            week after this run. Exceptions raised by the crawl are caught
            and returned as a failure result with a one-hour retry delay.
        """
        start_time = time.time()

        if not isinstance(task, DeepWebsiteCrawlTask):
            return TaskExecutionResult(
                success=False,
                error_message="Invalid task type for deep website crawl",
                retryable=False
            )

        task_log = DeepWebsiteCrawlExecutionLog(
            task_id=task.id,
            status="running",
            execution_date=datetime.utcnow()
        )
        db.add(task_log)
        db.commit()

        user_id = str(task.user_id)
        website_url = task.website_url

        try:
            logger.info(f"Executing deep website crawl for user {user_id}, url {website_url}")

            # Deliberately called WITHOUT task_id. Per the notes originally
            # left here, the service's task_id parameter is optional and, when
            # given, makes the service write its own success/failed log and
            # update the task. That would duplicate the log and task updates
            # done below and leave this executor's "running" log behind.
            # NOTE(review): confirm against DeepCrawlService.execute_deep_crawl.
            result = await self.crawl_service.execute_deep_crawl(
                user_id=user_id,
                website_url=website_url
            )

            task.last_executed = datetime.utcnow()
            task.last_success = datetime.utcnow()
            # Kept active because the crawl is a recurring (weekly) task.
            task.status = "active"

            # Calculate next execution
            task.next_execution = self.calculate_next_execution(task, "Weekly", task.last_executed)

            task.consecutive_failures = 0
            task.failure_pattern = None
            task.failure_reason = None

            task_log.status = "success"
            task_log.result_data = result
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)

            db.commit()

            return TaskExecutionResult(
                success=True,
                result_data=result,
                execution_time_ms=task_log.execution_time_ms,
                retryable=False
            )

        except Exception as e:
            db.rollback()
            logger.warning(f"Deep website crawl task failed for user {user_id}: {e}")

            failure_detection = FailureDetectionService(db)
            pattern = failure_detection.analyze_task_failures(task.id, "deep_website_crawl", user_id)

            task.last_executed = datetime.utcnow()
            task.last_failure = datetime.utcnow()
            task.failure_reason = str(e)
            task.consecutive_failures = (task.consecutive_failures or 0) + 1

            if pattern and pattern.should_cool_off:
                # Stop scheduling until someone intervenes.
                task.status = "needs_intervention"
                task.failure_pattern = {
                    "consecutive_failures": pattern.consecutive_failures,
                    "recent_failures": pattern.recent_failures,
                    "failure_reason": pattern.failure_reason.value,
                    "error_patterns": pattern.error_patterns,
                    "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
                }
                task.next_execution = None
            else:
                task.status = "failed"
                task.next_execution = datetime.utcnow() + timedelta(minutes=60)  # Retry in hour

            task_log.status = "failed"
            task_log.error_message = str(e)
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)

            db.add(task_log)
            db.commit()

            return TaskExecutionResult(
                success=False,
                error_message=str(e),
                execution_time_ms=task_log.execution_time_ms,
                retryable=(task.status != "needs_intervention"),
                retry_delay=3600
            )

    def calculate_next_execution(
        self,
        task: Any,
        frequency: str,
        last_execution: Optional[datetime] = None
    ) -> datetime:
        """
        Calculate next execution time based on frequency.

        'Daily' adds one day, 'Weekly' one week and 'Monthly' 30 days to
        ``last_execution`` (current UTC time when not given). Any other
        frequency defaults to one week.
        """
        base = last_execution or datetime.utcnow()
        intervals = {
            'Daily': timedelta(days=1),
            'Weekly': timedelta(weeks=1),
            'Monthly': timedelta(days=30),
        }
        # Default to weekly if unknown
        return base + intervals.get(frequency, timedelta(weeks=1))
|
||||
232
backend/services/scheduler/executors/market_trends_executor.py
Normal file
232
backend/services/scheduler/executors/market_trends_executor.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Market Trends Executor
|
||||
Runs Google Trends (pytrends) periodically and embeds results into the user SIF index.
|
||||
"""
|
||||
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models.website_analysis_monitoring_models import MarketTrendsTask, MarketTrendsExecutionLog
|
||||
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
|
||||
from services.scheduler.core.failure_detection_service import FailureDetectionService
|
||||
from services.intelligence.sif_integration import SIFIntegrationService
|
||||
from services.research.trends.google_trends_service import GoogleTrendsService
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
logger = get_service_logger("market_trends_executor")
|
||||
|
||||
|
||||
class MarketTrendsExecutor(TaskExecutor):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
    """
    Run one market-trends refresh for a task and record the outcome.

    Up to five keywords are chosen (stored strategy data first, the payload's
    "keywords" entry as fallback), analysed through GoogleTrendsService and
    indexed via SIFIntegrationService. A failing trends lookup or an empty
    keyword list does not abort the run: an error-carrying result is indexed
    instead and the run still counts as a success.

    Args:
        task: Expected to be a MarketTrendsTask; anything else is rejected
            as a non-retryable failure.
        db: Session used for the execution log row and the task updates.

    Returns:
        TaskExecutionResult describing the run. Failures are retryable
        unless the task was moved to "needs_intervention".
    """
    started_at = time.time()

    if not isinstance(task, MarketTrendsTask):
        return TaskExecutionResult(success=False, error_message="Invalid task type for market trends", retryable=False)

    # Persist a "running" log row before any work is attempted.
    task_log = MarketTrendsExecutionLog(task_id=task.id, status="running", execution_date=datetime.utcnow())
    db.add(task_log)
    db.commit()

    user_id = str(task.user_id)
    website_url = task.website_url
    payload = task.payload or {}

    try:
        geo = payload.get("geo") or "US"
        timeframe = payload.get("timeframe") or "today 12-m"

        sif_service = SIFIntegrationService(user_id)

        def error_result(message: str, used_keywords: List[str]) -> Dict[str, Any]:
            # Carries the request parameters next to the error so the run can still be indexed.
            return {
                "error": message,
                "keywords": used_keywords,
                "timeframe": timeframe,
                "geo": geo,
                "timestamp": datetime.utcnow().isoformat(),
                "cached": False,
            }

        candidates = await self._select_keywords_for_user(db=db, user_id=user_id, website_url=website_url)
        if not candidates:
            candidates = payload.get("keywords") or []

        # Stringify, trim and drop blanks, then cap the list at five entries.
        keywords: List[str] = []
        for raw in (candidates or []):
            cleaned = str(raw).strip()
            if cleaned:
                keywords.append(cleaned)
        keywords = keywords[:5]

        trends_result: Dict[str, Any]
        if not keywords:
            trends_result = error_result("No keywords available for market trends run", [])
        else:
            try:
                trends_result = await GoogleTrendsService().analyze_trends(
                    keywords=keywords, timeframe=timeframe, geo=geo, user_id=user_id
                )
            except Exception as trends_err:
                trends_result = error_result(str(trends_err), keywords)

        run_id = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
        await sif_service.index_market_trends_run(trends_result=trends_result, run_id=run_id)

        # Success bookkeeping: reschedule and clear any failure state.
        task.last_executed = datetime.utcnow()
        task.last_success = datetime.utcnow()

        interval_hours = task.frequency_hours or 72
        task.next_execution = datetime.utcnow() + timedelta(hours=interval_hours)
        task.status = "active"

        task.consecutive_failures = 0
        task.failure_pattern = None
        task.failure_reason = None

        task_log.status = "success"
        task_log.result_data = {
            "run_id": run_id,
            "keywords": trends_result.get("keywords", keywords),
            "geo": geo,
            "timeframe": timeframe,
            "cached": trends_result.get("cached", False),
        }
        task_log.execution_time_ms = int((time.time() - started_at) * 1000)

        db.commit()

        return TaskExecutionResult(
            success=True,
            result_data=task_log.result_data,
            execution_time_ms=task_log.execution_time_ms,
            retryable=False,
        )

    except Exception as e:
        db.rollback()
        logger.warning(f"Market trends task failed for user {user_id}: {e}")

        detector = FailureDetectionService(db)
        pattern = detector.analyze_task_failures(task.id, "market_trends", user_id)

        task.last_executed = datetime.utcnow()
        task.last_failure = datetime.utcnow()
        task.failure_reason = str(e)
        task.consecutive_failures = (task.consecutive_failures or 0) + 1

        cool_off = bool(pattern and pattern.should_cool_off)
        if not cool_off:
            # Ordinary failure: keep the task active and try again in six hours.
            task.status = "active"
            task.next_execution = datetime.utcnow() + timedelta(hours=6)
        else:
            # Repeated failures: stop scheduling until someone intervenes.
            task.status = "needs_intervention"
            task.failure_pattern = {
                "consecutive_failures": pattern.consecutive_failures,
                "recent_failures": pattern.recent_failures,
                "failure_reason": pattern.failure_reason.value,
                "error_patterns": pattern.error_patterns,
                "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat(),
            }
            task.next_execution = None

        task_log.status = "failed"
        task_log.error_message = str(e)
        task_log.execution_time_ms = int((time.time() - started_at) * 1000)

        db.add(task_log)
        db.commit()

        return TaskExecutionResult(
            success=False,
            error_message=str(e),
            execution_time_ms=task_log.execution_time_ms,
            retryable=(task.status != "needs_intervention"),
            retry_delay=21600,
        )
|
||||
|
||||
async def _select_keywords_for_user(self, db: Session, user_id: str, website_url: str) -> List[str]:
    """
    Collect up to five candidate keywords for a user from stored data.

    The most recently updated EnhancedContentStrategy row is consulted first
    (emerging trends, industry trends, market gaps, competitor content
    strategies). Only when that yields nothing are the topic clusters of the
    newest WebsiteAnalysis row used. The lookup is best effort: a source that
    fails is logged and skipped rather than aborting the caller.

    Args:
        db: Session used to run the two queries.
        user_id: Owner of the strategy / onboarding rows.
        website_url: Accepted for interface compatibility; not used here.

    Returns:
        At most five non-empty keywords, de-duplicated case-insensitively
        with the first spelling and the original order kept.
    """
    keywords: List[str] = []

    try:
        from sqlalchemy import select, desc
        from models.enhanced_strategy_models import EnhancedContentStrategy

        stmt = (
            select(EnhancedContentStrategy)
            .where(EnhancedContentStrategy.user_id == user_id)
            .order_by(desc(EnhancedContentStrategy.updated_at))
        )
        strategy = db.execute(stmt).scalars().first()
        if strategy:
            for field_name in (
                "emerging_trends",
                "industry_trends",
                "market_gaps",
                "competitor_content_strategies",
            ):
                field_value = getattr(strategy, field_name)
                if field_value:
                    keywords.extend(self._extract_strings(field_value))
    except Exception as strategy_err:
        # Best effort: fall through to the next source, but leave a trace.
        logger.warning(f"Could not read content strategy keywords for user {user_id}: {strategy_err}")

    if not keywords:
        try:
            from sqlalchemy import select, desc
            from models.onboarding import WebsiteAnalysis, OnboardingSession

            stmt = (
                select(WebsiteAnalysis)
                .join(OnboardingSession, WebsiteAnalysis.session_id == OnboardingSession.id)
                .where(OnboardingSession.user_id == user_id)
                .order_by(desc(WebsiteAnalysis.created_at))
            )
            wa = db.execute(stmt).scalars().first()
            if wa and wa.content_strategy_insights:
                ai_strategy = wa.content_strategy_insights.get("ai_strategy", {})
                topic_clusters = ai_strategy.get("topic_clusters") or []
                keywords.extend(self._extract_strings(topic_clusters))
        except Exception as analysis_err:
            logger.warning(f"Could not read website analysis keywords for user {user_id}: {analysis_err}")

    # Case-insensitive de-duplication that keeps first occurrences in order.
    deduped: List[str] = []
    seen = set()
    for k in keywords:
        kk = str(k).strip()
        if not kk:
            continue
        key = kk.lower()
        if key in seen:
            continue
        seen.add(key)
        deduped.append(kk)

    return deduped[:5]
|
||||
|
||||
def _extract_strings(self, value: Any) -> List[str]:
|
||||
if value is None:
|
||||
return []
|
||||
if isinstance(value, str):
|
||||
return [value]
|
||||
if isinstance(value, list):
|
||||
out: List[str] = []
|
||||
for item in value:
|
||||
out.extend(self._extract_strings(item))
|
||||
return out
|
||||
if isinstance(value, dict):
|
||||
out: List[str] = []
|
||||
for k in ["keyword", "topic", "title", "name", "label"]:
|
||||
if k in value and value.get(k):
|
||||
out.append(str(value.get(k)))
|
||||
return out
|
||||
return [str(value)]
|
||||
|
||||
def calculate_next_execution(self, task: Any, frequency: str, last_execution: datetime = None) -> datetime:
    """
    Compute the next run time from the task's own interval.

    Args:
        task: Object whose "frequency_hours" attribute gives the interval;
            a missing or falsy value means 72 hours.
        frequency: Ignored by this executor.
        last_execution: Starting point; the current UTC time when missing.

    Returns:
        The starting point advanced by the interval in hours.
    """
    start = last_execution if last_execution else datetime.utcnow()
    interval_hours = getattr(task, "frequency_hours", 72)
    if not interval_hours:
        interval_hours = 72
    return start + timedelta(hours=interval_hours)
|
||||
@@ -21,6 +21,7 @@ from services.gsc_service import GSCService
|
||||
from services.integrations.bing_oauth import BingOAuthService
|
||||
from services.integrations.wordpress_oauth import WordPressOAuthService
|
||||
from services.wix_service import WixService
|
||||
from services.database import get_user_db_path
|
||||
|
||||
logger = get_service_logger("oauth_token_monitoring_executor")
|
||||
|
||||
@@ -289,8 +290,8 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
|
||||
GSC service auto-refreshes tokens if expired when loading credentials.
|
||||
"""
|
||||
try:
|
||||
# Use absolute database path for consistency with onboarding
|
||||
db_path = os.path.abspath("alwrity.db")
|
||||
# Use dynamic database path
|
||||
db_path = get_user_db_path(user_id)
|
||||
gsc_service = GSCService(db_path=db_path)
|
||||
credentials = gsc_service.load_user_credentials(user_id)
|
||||
|
||||
@@ -341,9 +342,8 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
|
||||
Checks token expiration and attempts refresh if needed.
|
||||
"""
|
||||
try:
|
||||
# Use absolute database path for consistency with onboarding
|
||||
db_path = os.path.abspath("alwrity.db")
|
||||
bing_service = BingOAuthService(db_path=db_path)
|
||||
# Initialize Bing service
|
||||
bing_service = BingOAuthService()
|
||||
|
||||
# Get token status (includes expired tokens)
|
||||
token_status = bing_service.get_user_token_status(user_id)
|
||||
@@ -502,8 +502,8 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
|
||||
and require user re-authorization. We only check if token is valid.
|
||||
"""
|
||||
try:
|
||||
# Use absolute database path for consistency with onboarding
|
||||
db_path = os.path.abspath("alwrity.db")
|
||||
# Use dynamic database path
|
||||
db_path = get_user_db_path(user_id)
|
||||
wordpress_service = WordPressOAuthService(db_path=db_path)
|
||||
tokens = wordpress_service.get_user_tokens(user_id)
|
||||
|
||||
|
||||
@@ -0,0 +1,584 @@
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import aiohttp
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models.onboarding import SEOPageAudit
|
||||
from models.website_analysis_monitoring_models import (
|
||||
OnboardingFullWebsiteAnalysisTask,
|
||||
OnboardingFullWebsiteAnalysisExecutionLog
|
||||
)
|
||||
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
|
||||
from services.scheduler.core.failure_detection_service import FailureDetectionService
|
||||
|
||||
from services.seo_analyzer.analyzers import (
|
||||
MetaDataAnalyzer,
|
||||
TechnicalSEOAnalyzer,
|
||||
ContentAnalyzer,
|
||||
URLStructureAnalyzer,
|
||||
AccessibilityAnalyzer,
|
||||
UserExperienceAnalyzer
|
||||
)
|
||||
|
||||
|
||||
class OnboardingFullWebsiteAnalysisExecutor(TaskExecutor):
|
||||
def __init__(self):
    """Set up the bound logger, crawl limits, status thresholds and score weights."""
    self.logger = logger.bind(component="OnboardingFullWebsiteAnalysisExecutor")

    # Crawl limits: default URL cap when the payload gives none, total HTTP
    # timeout in seconds, and the number of concurrent page fetches.
    self.max_urls_default, self.http_timeout_seconds, self.http_concurrency = 500, 25, 10

    # Overall scores at or above these values map to 'healthy' / 'needs_review'.
    self.healthy_threshold, self.warning_threshold = 80, 60

    # Per-category weights used for the overall page score.
    self.weights = dict(
        meta=0.15,
        content=0.20,
        technical=0.20,
        performance=0.20,
        accessibility=0.10,
        ux=0.10,
        security=0.05,
    )
|
||||
|
||||
async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
    """
    Run the one-off full-site SEO audit for an onboarding task.

    Discovers the site's URLs, audits each one and stores a summary on the
    execution log. On success the task is paused with no next execution.
    On failure it is either rescheduled in 30 minutes or, when failure
    detection asks for a cool-off, moved to "needs_intervention".

    Args:
        task: Expected to be an OnboardingFullWebsiteAnalysisTask; anything
            else is rejected as a non-retryable failure.
        db: Session used for the execution log row and the task updates.

    Returns:
        TaskExecutionResult carrying the audit summary or the error message.
    """
    start_time = time.time()

    if not isinstance(task, OnboardingFullWebsiteAnalysisTask):
        return TaskExecutionResult(
            success=False,
            error_message="Invalid task type for onboarding full website analysis",
            retryable=False
        )

    # Persist a 'running' log row before any work is attempted.
    task_log = OnboardingFullWebsiteAnalysisExecutionLog(
        task_id=task.id,
        status='running',
        execution_date=datetime.utcnow()
    )
    db.add(task_log)
    db.commit()

    user_id = str(task.user_id)
    website_url = task.website_url
    payload = task.payload or {}

    try:
        # Parsed inside the try so a malformed payload value is recorded as a
        # failed run instead of escaping with the log row left at 'running'.
        max_urls = int(payload.get('max_urls') or self.max_urls_default)

        urls = await self._discover_urls(website_url, max_urls=max_urls)
        if not urls:
            raise ValueError("No URLs discovered for full-site analysis")

        results = await self._audit_urls(user_id, website_url, urls, db)

        task.last_executed = datetime.utcnow()
        task.last_success = datetime.utcnow()
        task.status = 'paused'
        task.next_execution = None
        task.consecutive_failures = 0
        task.failure_pattern = None
        task.failure_reason = None

        task_log.status = 'success'
        task_log.result_data = results
        task_log.execution_time_ms = int((time.time() - start_time) * 1000)

        db.commit()

        return TaskExecutionResult(
            success=True,
            result_data=results,
            execution_time_ms=task_log.execution_time_ms,
            retryable=False
        )

    except Exception as e:
        db.rollback()
        # loguru attaches the traceback through opt(exception=True). The error
        # text is passed as a format argument so braces in it cannot break
        # message formatting inside this handler.
        self.logger.opt(exception=True).error("Full-site SEO audit task failed: {}", e)

        failure_detection = FailureDetectionService(db)
        pattern = failure_detection.analyze_task_failures(task.id, 'onboarding_full_website_analysis', user_id)

        task.last_executed = datetime.utcnow()
        task.last_failure = datetime.utcnow()
        task.failure_reason = str(e)
        task.consecutive_failures = (task.consecutive_failures or 0) + 1

        if pattern and pattern.should_cool_off:
            # Repeated failures: stop scheduling until someone intervenes.
            task.status = "needs_intervention"
            task.failure_pattern = {
                "consecutive_failures": pattern.consecutive_failures,
                "recent_failures": pattern.recent_failures,
                "failure_reason": pattern.failure_reason.value,
                "error_patterns": pattern.error_patterns,
                "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
            }
            task.next_execution = None
        else:
            task.status = "failed"
            task.next_execution = datetime.utcnow() + timedelta(minutes=30)

        task_log.status = 'failed'
        task_log.error_message = str(e)
        task_log.execution_time_ms = int((time.time() - start_time) * 1000)

        db.add(task_log)
        db.commit()

        return TaskExecutionResult(
            success=False,
            error_message=str(e),
            execution_time_ms=task_log.execution_time_ms,
            retryable=(task.status != "needs_intervention"),
            retry_delay=1800
        )
|
||||
|
||||
def calculate_next_execution(
    self,
    task: Any,
    frequency: str,
    last_execution: Optional[datetime] = None
) -> datetime:
    """
    Return a run time 365 days after the previous execution.

    Args:
        task: Not consulted.
        frequency: Not consulted.
        last_execution: Starting point; the current UTC time when missing.

    Returns:
        The starting point plus 365 days.
    """
    anchor = last_execution
    if not anchor:
        anchor = datetime.utcnow()
    return anchor + timedelta(days=365)
|
||||
|
||||
async def _discover_urls(self, website_url: str, max_urls: int) -> List[str]:
|
||||
base = self._normalize_url(website_url)
|
||||
parsed = urlparse(base)
|
||||
root = f"{parsed.scheme}://{parsed.netloc}"
|
||||
|
||||
sitemap_urls: List[str] = []
|
||||
|
||||
robots = await self._fetch_text(urljoin(root, "/robots.txt"))
|
||||
if robots:
|
||||
for line in robots.splitlines():
|
||||
if line.lower().startswith("sitemap:"):
|
||||
sitemap_urls.append(line.split(":", 1)[1].strip())
|
||||
|
||||
if not sitemap_urls:
|
||||
candidates = [
|
||||
urljoin(root, "/sitemap.xml"),
|
||||
urljoin(root, "/sitemap_index.xml"),
|
||||
urljoin(root, "/wp-sitemap.xml"),
|
||||
]
|
||||
sitemap_urls.extend(candidates)
|
||||
|
||||
discovered: List[str] = []
|
||||
seen: Set[str] = set()
|
||||
|
||||
for sm in sitemap_urls:
|
||||
if len(discovered) >= max_urls:
|
||||
break
|
||||
urls_from_sm = await self._parse_sitemap(sm, max_urls=max_urls - len(discovered))
|
||||
for u in urls_from_sm:
|
||||
n = self._normalize_url(u)
|
||||
if n not in seen and self._same_site(root, n):
|
||||
seen.add(n)
|
||||
discovered.append(n)
|
||||
if len(discovered) >= max_urls:
|
||||
break
|
||||
|
||||
if not discovered:
|
||||
discovered.append(base)
|
||||
|
||||
return discovered
|
||||
|
||||
async def _parse_sitemap(self, sitemap_url: str, max_urls: int) -> List[str]:
|
||||
xml_text = await self._fetch_text(sitemap_url)
|
||||
if not xml_text:
|
||||
return []
|
||||
|
||||
try:
|
||||
import xml.etree.ElementTree as ET
|
||||
root = ET.fromstring(xml_text)
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
ns = ""
|
||||
if root.tag.startswith("{"):
|
||||
ns = root.tag.split("}", 1)[0] + "}"
|
||||
|
||||
urls: List[str] = []
|
||||
|
||||
if root.tag.endswith("sitemapindex"):
|
||||
locs = root.findall(f".//{ns}sitemap/{ns}loc")
|
||||
for loc in locs:
|
||||
if len(urls) >= max_urls:
|
||||
break
|
||||
child_url = (loc.text or "").strip()
|
||||
if not child_url:
|
||||
continue
|
||||
child_urls = await self._parse_sitemap(child_url, max_urls=max_urls - len(urls))
|
||||
urls.extend(child_urls)
|
||||
else:
|
||||
locs = root.findall(f".//{ns}url/{ns}loc")
|
||||
for loc in locs:
|
||||
if len(urls) >= max_urls:
|
||||
break
|
||||
u = (loc.text or "").strip()
|
||||
if u:
|
||||
urls.append(u)
|
||||
|
||||
return urls
|
||||
|
||||
async def _fetch_text(self, url: str) -> Optional[str]:
    """
    GET a URL and return its body as text.

    Args:
        url: Address to fetch; redirects are followed.

    Returns:
        The response text (undecodable bytes ignored), or None when the
        status is 400 or higher or any exception occurs.
    """
    request_headers = {"User-Agent": "ALwrity-SEO-Audit/1.0"}
    try:
        client_timeout = aiohttp.ClientTimeout(total=self.http_timeout_seconds)
        async with aiohttp.ClientSession(timeout=client_timeout) as http:
            async with http.get(url, allow_redirects=True, headers=request_headers) as response:
                if response.status < 400:
                    return await response.text(errors="ignore")
                return None
    except Exception:
        return None
|
||||
|
||||
async def _audit_urls(self, user_id: str, website_url: str, urls: List[str], db: Session) -> Dict[str, Any]:
    """
    Audit every URL concurrently and summarise the outcome.

    Args:
        user_id: Owner of the stored page audits.
        website_url: Site the pages belong to.
        urls: Page URLs to audit.
        db: Session handed to each page audit.

    Returns:
        Summary dict with the discovered / audited / failed counts, the
        rounded average score, the number of 'fix_scheduled' pages and the
        ten lowest-scoring pages.
    """
    client_timeout = aiohttp.ClientTimeout(total=self.http_timeout_seconds)
    tcp_connector = aiohttp.TCPConnector(limit=self.http_concurrency)

    # Bounds how many page audits are in flight at once.
    limiter = asyncio.Semaphore(self.http_concurrency)

    async with aiohttp.ClientSession(timeout=client_timeout, connector=tcp_connector) as session:
        async def audit_one(url: str) -> Dict[str, Any]:
            async with limiter:
                return await self._audit_single_url(user_id, website_url, url, session, db)

        # Exceptions are returned as results so one bad page cannot abort the batch.
        outcomes = await asyncio.gather(*[audit_one(u) for u in urls], return_exceptions=True)

    successes = []
    failures = []
    for outcome in outcomes:
        if isinstance(outcome, dict) and outcome.get('success'):
            successes.append(outcome)
        else:
            failures.append(outcome)

    if successes:
        avg_score = round(sum(page['overall_score'] for page in successes) / len(successes))
    else:
        avg_score = 0

    fix_scheduled = len([page for page in successes if page.get('status') == 'fix_scheduled'])

    page_summaries = [
        {'page_url': page['page_url'], 'overall_score': page['overall_score'], 'status': page.get('status')}
        for page in successes
    ]
    worst_pages = sorted(page_summaries, key=lambda entry: entry['overall_score'])[:10]

    return {
        'website_url': website_url,
        'pages_discovered': len(urls),
        'pages_audited': len(successes),
        'pages_failed': len(failures),
        'avg_score': avg_score,
        'fix_scheduled_pages': fix_scheduled,
        'worst_pages': worst_pages,
    }
|
||||
|
||||
async def _audit_single_url(
    self,
    user_id: str,
    website_url: str,
    page_url: str,
    session: aiohttp.ClientSession,
    db: Session
) -> Dict[str, Any]:
    """
    Fetch one page, run every analyzer on it and store the audit.

    Args:
        user_id: Owner of the stored audit row.
        website_url: Site the page belongs to.
        page_url: Page to fetch and analyse.
        session: Shared HTTP client session.
        db: Session used to upsert the audit row.

    Returns:
        {'success': True, 'page_url', 'overall_score', 'status'} for an
        analysed page. {'success': False, 'page_url', 'error'} when the
        fetch raised, the status was 400 or higher, or the response was not
        HTML; in those cases an 'error' audit row with score 0 is stored.
    """
    def record_error(details: Dict[str, Any]) -> None:
        # Unreachable / non-HTML pages are stored with a zero score.
        self._upsert_page_audit(
            db=db,
            user_id=user_id,
            website_url=website_url,
            page_url=page_url,
            overall_score=0,
            status='error',
            audit_data=details
        )

    fetch_start = time.time()
    try:
        async with session.get(page_url, allow_redirects=True, headers={"User-Agent": "ALwrity-SEO-Audit/1.0"}) as resp:
            status = resp.status
            content_type = resp.headers.get("Content-Type", "")
            text = await resp.text(errors="ignore")
            headers = dict(resp.headers)
    except Exception as e:
        record_error({'error': str(e)})
        return {'success': False, 'page_url': page_url, 'error': str(e)}

    # Elapsed time covers the request and reading the body.
    load_time = time.time() - fetch_start

    if status >= 400 or "text/html" not in content_type.lower():
        record_error({'http_status': status, 'content_type': content_type})
        return {'success': False, 'page_url': page_url, 'error': f'HTTP {status} / {content_type}'}

    soup = BeautifulSoup(text, 'html.parser')

    meta = MetaDataAnalyzer().analyze(soup)
    content = ContentAnalyzer().analyze(soup)
    technical = TechnicalSEOAnalyzer().analyze(page_url, soup)
    url_structure = URLStructureAnalyzer().analyze(page_url)
    accessibility = AccessibilityAnalyzer().analyze(text)
    ux = UserExperienceAnalyzer().analyze(text, page_url)

    performance = self._performance_from_fetch(load_time, headers)
    security = self._security_from_headers(headers)

    reports_by_category = (
        ('meta', meta),
        ('content', content),
        ('technical', technical),
        ('performance', performance),
        ('accessibility', accessibility),
        ('ux', ux),
        ('security', security),
        ('url_structure', url_structure),
    )
    category_scores = {name: report.get('score', 0) for name, report in reports_by_category}

    overall_score = self._weighted_score(category_scores)

    if overall_score < self.warning_threshold:
        page_status = 'fix_scheduled'
    elif overall_score < self.healthy_threshold:
        page_status = 'needs_review'
    else:
        page_status = 'healthy'

    audit_data = {
        'meta': meta,
        'content_health': content,
        'technical': technical,
        'performance': performance,
        'url_structure': url_structure,
        'accessibility': accessibility,
        'ux': ux,
        'security_headers': security,
        'overall_score': overall_score,
    }

    issues = self._collect_findings(audit_data, key='issues')
    warnings = self._collect_findings(audit_data, key='warnings')
    recommendations = self._collect_findings(audit_data, key='recommendations')

    self._upsert_page_audit(
        db=db,
        user_id=user_id,
        website_url=website_url,
        page_url=page_url,
        overall_score=overall_score,
        status=page_status,
        category_scores=category_scores,
        issues=issues,
        warnings=warnings,
        recommendations=recommendations,
        audit_data=audit_data
    )

    return {
        'success': True,
        'page_url': page_url,
        'overall_score': overall_score,
        'status': page_status
    }
|
||||
|
||||
def _weighted_score(self, category_scores: Dict[str, int]) -> int:
|
||||
total = 0.0
|
||||
for key, weight in self.weights.items():
|
||||
total += float(category_scores.get(key, 0)) * weight
|
||||
return int(round(total))
|
||||
|
||||
def _collect_findings(self, audit_data: Dict[str, Any], key: str) -> List[Dict[str, Any]]:
|
||||
findings: List[Dict[str, Any]] = []
|
||||
for category, data in audit_data.items():
|
||||
if not isinstance(data, dict):
|
||||
continue
|
||||
items = data.get(key)
|
||||
if not isinstance(items, list):
|
||||
continue
|
||||
for item in items:
|
||||
if isinstance(item, dict):
|
||||
enriched = dict(item)
|
||||
enriched.setdefault('category', category)
|
||||
findings.append(enriched)
|
||||
return findings
|
||||
|
||||
def _performance_from_fetch(self, load_time: float, headers: Dict[str, str]) -> Dict[str, Any]:
|
||||
issues: List[Dict[str, Any]] = []
|
||||
warnings: List[Dict[str, Any]] = []
|
||||
recommendations: List[Dict[str, Any]] = []
|
||||
|
||||
if load_time > 3:
|
||||
issues.append({
|
||||
'type': 'critical',
|
||||
'message': f'Page load time too slow ({load_time:.2f}s)',
|
||||
'location': 'Page performance',
|
||||
'current_value': f'{load_time:.2f}s',
|
||||
'fix': 'Optimize page speed (target < 3 seconds)',
|
||||
'code_example': 'Optimize images, minify CSS/JS, use CDN',
|
||||
'action': 'optimize_page_speed'
|
||||
})
|
||||
elif load_time > 2:
|
||||
warnings.append({
|
||||
'type': 'warning',
|
||||
'message': f'Page load time could be improved ({load_time:.2f}s)',
|
||||
'location': 'Page performance',
|
||||
'current_value': f'{load_time:.2f}s',
|
||||
'fix': 'Optimize for faster loading',
|
||||
'code_example': 'Compress images, enable caching',
|
||||
'action': 'improve_page_speed'
|
||||
})
|
||||
|
||||
content_encoding = headers.get('Content-Encoding')
|
||||
if not content_encoding:
|
||||
warnings.append({
|
||||
'type': 'warning',
|
||||
'message': 'No compression detected',
|
||||
'location': 'Server configuration',
|
||||
'fix': 'Enable GZIP/Brotli compression',
|
||||
'code_example': 'Enable compression in server or CDN',
|
||||
'action': 'enable_compression'
|
||||
})
|
||||
|
||||
cache_headers = ['Cache-Control', 'Expires', 'ETag']
|
||||
has_cache = any(headers.get(h) for h in cache_headers)
|
||||
if not has_cache:
|
||||
warnings.append({
|
||||
'type': 'warning',
|
||||
'message': 'No caching headers found',
|
||||
'location': 'Server configuration',
|
||||
'fix': 'Add caching headers',
|
||||
'code_example': 'Cache-Control: max-age=31536000',
|
||||
'action': 'add_caching_headers'
|
||||
})
|
||||
|
||||
score = max(0, 100 - len(issues) * 25 - len(warnings) * 10)
|
||||
return {
|
||||
'score': score,
|
||||
'load_time': load_time,
|
||||
'is_compressed': bool(content_encoding),
|
||||
'has_cache': has_cache,
|
||||
'issues': issues,
|
||||
'warnings': warnings,
|
||||
'recommendations': recommendations
|
||||
}
|
||||
|
||||
def _security_from_headers(self, headers: Dict[str, str]) -> Dict[str, Any]:
|
||||
security_headers = {
|
||||
'X-Frame-Options': headers.get('X-Frame-Options'),
|
||||
'X-Content-Type-Options': headers.get('X-Content-Type-Options'),
|
||||
'X-XSS-Protection': headers.get('X-XSS-Protection'),
|
||||
'Strict-Transport-Security': headers.get('Strict-Transport-Security'),
|
||||
'Content-Security-Policy': headers.get('Content-Security-Policy'),
|
||||
'Referrer-Policy': headers.get('Referrer-Policy')
|
||||
}
|
||||
|
||||
issues: List[Dict[str, Any]] = []
|
||||
warnings: List[Dict[str, Any]] = []
|
||||
recommendations: List[Dict[str, Any]] = []
|
||||
present_headers: List[str] = []
|
||||
missing_headers: List[str] = []
|
||||
|
||||
for header_name, header_value in security_headers.items():
|
||||
if header_value:
|
||||
present_headers.append(header_name)
|
||||
continue
|
||||
|
||||
missing_headers.append(header_name)
|
||||
if header_name in ['X-Frame-Options', 'X-Content-Type-Options']:
|
||||
issues.append({
|
||||
'type': 'critical',
|
||||
'message': f'Missing {header_name} header',
|
||||
'location': 'Server configuration',
|
||||
'fix': f'Add {header_name} header',
|
||||
'code_example': f'{header_name}: DENY' if header_name == 'X-Frame-Options' else f'{header_name}: nosniff',
|
||||
'action': f'add_{header_name.lower().replace("-", "_")}_header'
|
||||
})
|
||||
else:
|
||||
warnings.append({
|
||||
'type': 'warning',
|
||||
'message': f'Missing {header_name} header',
|
||||
'location': 'Server configuration',
|
||||
'fix': f'Add {header_name} header for better security',
|
||||
'code_example': f'{header_name}: max-age=31536000',
|
||||
'action': f'add_{header_name.lower().replace("-", "_")}_header'
|
||||
})
|
||||
|
||||
score = min(100, len(present_headers) * 16)
|
||||
return {
|
||||
'score': score,
|
||||
'present_headers': present_headers,
|
||||
'missing_headers': missing_headers,
|
||||
'total_headers': len(present_headers),
|
||||
'issues': issues,
|
||||
'warnings': warnings,
|
||||
'recommendations': recommendations
|
||||
}
|
||||
|
||||
def _upsert_page_audit(
    self,
    db: Session,
    user_id: str,
    website_url: str,
    page_url: str,
    overall_score: int,
    status: str,
    category_scores: Optional[Dict[str, Any]] = None,
    issues: Optional[List[Dict[str, Any]]] = None,
    warnings: Optional[List[Dict[str, Any]]] = None,
    recommendations: Optional[List[Dict[str, Any]]] = None,
    audit_data: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Insert or refresh the stored audit for one page and commit.

    The row is looked up by (user_id, page_url). An existing row has every
    audit field overwritten, including with None for arguments that were
    not supplied; otherwise a new row is created.

    Args:
        db: Session used for the lookup, the write and the commit.
        user_id: Owner of the audit row.
        website_url: Site the page belongs to.
        page_url: Audited page; part of the lookup key.
        overall_score: Overall page score.
        status: Page status label.
        category_scores: Per-category scores, if any.
        issues: Collected issues, if any.
        warnings: Collected warnings, if any.
        recommendations: Collected recommendations, if any.
        audit_data: Full audit payload, if any.
    """
    row = db.query(SEOPageAudit).filter(
        SEOPageAudit.user_id == user_id,
        SEOPageAudit.page_url == page_url
    ).first()

    field_values = {
        'website_url': website_url,
        'overall_score': overall_score,
        'status': status,
        'category_scores': category_scores,
        'issues': issues,
        'warnings': warnings,
        'recommendations': recommendations,
        'audit_data': audit_data,
        'last_analyzed_at': datetime.utcnow(),
    }

    if not row:
        db.add(SEOPageAudit(user_id=user_id, page_url=page_url, **field_values))
    else:
        for field_name, field_value in field_values.items():
            setattr(row, field_name, field_value)
        db.add(row)

    db.commit()
|
||||
|
||||
def _normalize_url(self, url: str) -> str:
|
||||
u = (url or "").strip()
|
||||
if not u:
|
||||
return ""
|
||||
if not u.startswith("http://") and not u.startswith("https://"):
|
||||
u = "https://" + u
|
||||
parsed = urlparse(u)
|
||||
normalized = parsed._replace(fragment="").geturl()
|
||||
return normalized.rstrip("/")
|
||||
|
||||
def _same_site(self, root: str, url: str) -> bool:
|
||||
try:
|
||||
a = urlparse(root)
|
||||
b = urlparse(url)
|
||||
return a.netloc == b.netloc
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
153
backend/services/scheduler/executors/sif_indexing_executor.py
Normal file
153
backend/services/scheduler/executors/sif_indexing_executor.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""
|
||||
SIF Indexing Executor
|
||||
Executes SIF indexing tasks (Step 2 metadata and User Website Content).
|
||||
"""
|
||||
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Optional
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models.website_analysis_monitoring_models import (
|
||||
SIFIndexingTask,
|
||||
SIFIndexingExecutionLog
|
||||
)
|
||||
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
|
||||
from services.scheduler.core.failure_detection_service import FailureDetectionService
|
||||
from services.intelligence.sif_integration import SIFIntegrationService
|
||||
from utils.logger_utils import get_service_logger
|
||||
|
||||
logger = get_service_logger("sif_indexing_executor")
|
||||
|
||||
|
||||
class SIFIndexingExecutor(TaskExecutor):
    """
    Executor for SIF indexing tasks.

    Handles:
    - Indexing Step 2 Website Analysis Data (Metadata)
    - Harvesting and Indexing User Website Content (Deep Crawl)
    - Scheduling recurring updates (snapshot refresh)
    """

    # Hours between runs when the task row carries no ``frequency_hours``.
    DEFAULT_FREQUENCY_HOURS = 48
    # Delay before a failed run is retried (used both for the stored
    # ``next_execution`` and for the ``retry_delay`` reported to the caller).
    RETRY_DELAY_SECONDS = 3600
    # Length of the cool-off window recorded when a task needs intervention.
    COOL_OFF_DAYS = 7

    def __init__(self):
        """Create the executor; it keeps no per-instance state."""
        pass

    async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
        """
        Run one SIF indexing pass for ``task`` and persist the outcome.

        An execution-log row is written with status "running" before any
        work starts. The onboarding metadata sync and the website content
        sync are then awaited in turn. When both calls return without
        raising, the run is recorded as a success (even if neither reported
        synced data, which is only logged as a warning) and the next run is
        scheduled ``frequency_hours`` ahead. Any exception is recorded as a
        failure: the task is either rescheduled for a retry or, when failure
        analysis asks for a cool-off, marked "needs_intervention".

        Args:
            task: The task row to execute; must be a ``SIFIndexingTask``.
            db: Database session used for the log row and task updates.

        Returns:
            A ``TaskExecutionResult`` describing success or failure. A
            non-``SIFIndexingTask`` argument yields a non-retryable failure
            without touching the database.
        """
        start_time = time.time()

        if not isinstance(task, SIFIndexingTask):
            return TaskExecutionResult(
                success=False,
                error_message="Invalid task type for SIF indexing",
                retryable=False
            )

        # Captured up front so the failure path does not have to re-read it
        # from the ORM object after a rollback.
        task_id = task.id

        task_log = SIFIndexingExecutionLog(
            task_id=task_id,
            status="running",
            execution_date=datetime.utcnow()
        )
        db.add(task_log)
        db.commit()

        user_id = str(task.user_id)
        website_url = task.website_url

        try:
            logger.info(f"Executing SIF indexing for user {user_id} ({website_url})")

            # Initialize SIF Service
            sif_service = SIFIntegrationService(user_id)

            # 1. Sync Step 2 Metadata (WebsiteAnalysis, CompetitorAnalysis)
            metadata_synced = await sif_service.sync_onboarding_data_to_sif()

            # 2. Sync User Website Content (Deep Crawl / Snapshot)
            content_synced = await sif_service.sync_user_website_content(website_url)

            # Both operations were attempted without error, which counts as a
            # successful run; an empty result is only worth a warning.
            if not (metadata_synced or content_synced):
                logger.warning(f"SIF indexing completed but no data was synced/indexed for {user_id}")

            # One timestamp keeps last_executed, last_success and
            # next_execution mutually consistent.
            now = datetime.utcnow()
            task.last_executed = now
            task.last_success = now

            # Schedule next execution (Recurring)
            frequency_hours = task.frequency_hours or self.DEFAULT_FREQUENCY_HOURS
            task.next_execution = now + timedelta(hours=frequency_hours)
            task.status = "active"

            # A successful run clears any previously recorded failure state.
            task.consecutive_failures = 0
            task.failure_pattern = None
            task.failure_reason = None

            result_data = {
                "metadata_synced": metadata_synced,
                "content_synced": content_synced,
                "website_url": website_url
            }
            elapsed_ms = int((time.time() - start_time) * 1000)

            task_log.status = "success"
            task_log.result_data = result_data
            task_log.execution_time_ms = elapsed_ms

            db.commit()

            # Return the local values rather than re-reading ORM attributes,
            # which a default-configured session expires on commit.
            return TaskExecutionResult(
                success=True,
                result_data=result_data,
                execution_time_ms=elapsed_ms,
                retryable=False
            )

        except Exception as e:
            db.rollback()
            logger.warning(f"SIF indexing task failed for user {user_id}: {e}")
            error_text = str(e)

            # Failure analysis is advisory. If it raises, still record the
            # failure so the log row does not stay in "running" forever.
            pattern = None
            try:
                failure_detection = FailureDetectionService(db)
                pattern = failure_detection.analyze_task_failures(task_id, "sif_indexing", user_id)
            except Exception as analysis_error:
                db.rollback()  # keep the session usable for the writes below
                logger.warning(
                    f"Failure analysis unavailable for SIF indexing task {task_id}: {analysis_error}"
                )

            now = datetime.utcnow()
            task.last_executed = now
            task.last_failure = now
            task.failure_reason = error_text
            task.consecutive_failures = (task.consecutive_failures or 0) + 1

            needs_intervention = bool(pattern and pattern.should_cool_off)
            if needs_intervention:
                task.status = "needs_intervention"
                task.failure_pattern = {
                    "consecutive_failures": pattern.consecutive_failures,
                    "recent_failures": pattern.recent_failures,
                    "failure_reason": pattern.failure_reason.value,
                    "error_patterns": pattern.error_patterns,
                    "cool_off_until": (now + timedelta(days=self.COOL_OFF_DAYS)).isoformat()
                }
                task.next_execution = None
            else:
                # Retry sooner if it's a transient failure
                task.status = "active"  # Keep active for retry
                task.next_execution = now + timedelta(seconds=self.RETRY_DELAY_SECONDS)

            elapsed_ms = int((time.time() - start_time) * 1000)
            task_log.status = "failed"
            task_log.error_message = error_text
            task_log.execution_time_ms = elapsed_ms

            db.add(task_log)
            db.commit()

            return TaskExecutionResult(
                success=False,
                error_message=error_text,
                execution_time_ms=elapsed_ms,
                retryable=not needs_intervention,
                retry_delay=self.RETRY_DELAY_SECONDS
            )

    def calculate_next_execution(
        self,
        task: Any,
        frequency: str,
        last_execution: Optional[datetime] = None
    ) -> datetime:
        """
        Return when ``task`` should next run.

        Not strictly used here because ``execute_task`` schedules the next
        run itself; provided for interface compliance.

        Args:
            task: Task whose ``frequency_hours`` attribute (if set) gives the
                interval; a missing or falsy value falls back to the default.
            frequency: Unused by this executor.
            last_execution: Base time for the calculation; defaults to the
                current UTC time.

        Returns:
            ``last_execution`` (or now) plus the task's interval in hours.
        """
        base = last_execution or datetime.utcnow()
        hours = getattr(task, 'frequency_hours', None) or self.DEFAULT_FREQUENCY_HOURS
        return base + timedelta(hours=hours)
|
||||
@@ -282,11 +282,18 @@ class WebsiteAnalysisExecutor(TaskExecutor):
|
||||
None,
|
||||
partial(self.style_logic.analyze_style_patterns, crawl_result['content'])
|
||||
)
|
||||
|
||||
async def run_seo_audit():
|
||||
loop = asyncio.get_event_loop()
|
||||
return await loop.run_in_executor(
|
||||
None,
|
||||
partial(self.style_logic.perform_seo_audit, website_url, crawl_result['content'])
|
||||
)
|
||||
|
||||
# Execute style analysis, patterns analysis and the SEO audit in parallel
|
||||
style_analysis, patterns_result = await asyncio.gather(
|
||||
style_analysis, patterns_result, seo_audit_result = await asyncio.gather(
|
||||
run_style_analysis(),
|
||||
run_patterns_analysis(),
|
||||
run_seo_audit(),
|
||||
return_exceptions=True
|
||||
)
|
||||
|
||||
@@ -302,6 +309,12 @@ class WebsiteAnalysisExecutor(TaskExecutor):
|
||||
if isinstance(patterns_result, Exception):
|
||||
self.logger.warning(f"Patterns analysis exception: {patterns_result}")
|
||||
patterns_result = None
|
||||
|
||||
seo_audit = None
|
||||
if isinstance(seo_audit_result, Exception):
|
||||
self.logger.warning(f"SEO audit exception: {seo_audit_result}")
|
||||
else:
|
||||
seo_audit = seo_audit_result
|
||||
|
||||
# Step 3: Generate style guidelines
|
||||
style_guidelines = None
|
||||
@@ -320,6 +333,7 @@ class WebsiteAnalysisExecutor(TaskExecutor):
|
||||
'style_analysis': style_analysis.get('analysis') if style_analysis and style_analysis.get('success') else None,
|
||||
'style_patterns': patterns_result if patterns_result and not isinstance(patterns_result, Exception) else None,
|
||||
'style_guidelines': style_guidelines,
|
||||
'seo_audit': seo_audit,
|
||||
}
|
||||
|
||||
# Step 4: Store results based on task type
|
||||
@@ -366,10 +380,12 @@ class WebsiteAnalysisExecutor(TaskExecutor):
|
||||
):
|
||||
"""Update existing WebsiteAnalysis record for user's website."""
|
||||
try:
|
||||
# Convert Clerk user ID to integer (same as component_logic.py)
|
||||
# Use the same conversion logic as the website analysis API
|
||||
import hashlib
|
||||
user_id_int = int(hashlib.sha256(user_id.encode()).hexdigest()[:15], 16)
|
||||
session = db.query(OnboardingSession).filter(
|
||||
OnboardingSession.user_id == user_id
|
||||
).order_by(OnboardingSession.updated_at.desc()).first()
|
||||
|
||||
if not session:
|
||||
raise ValueError(f"No onboarding session found for user {user_id}")
|
||||
|
||||
# Use WebsiteAnalysisService to update
|
||||
analysis_service = WebsiteAnalysisService(db)
|
||||
@@ -380,13 +396,15 @@ class WebsiteAnalysisExecutor(TaskExecutor):
|
||||
'style_analysis': analysis_data.get('style_analysis'),
|
||||
'style_patterns': analysis_data.get('style_patterns'),
|
||||
'style_guidelines': analysis_data.get('style_guidelines'),
|
||||
'seo_audit': analysis_data.get('seo_audit'),
|
||||
}
|
||||
|
||||
# Save/update analysis
|
||||
analysis_id = analysis_service.save_analysis(
|
||||
session_id=user_id_int,
|
||||
session_id=session.id,
|
||||
website_url=website_url,
|
||||
analysis_data=response_data
|
||||
analysis_data=response_data,
|
||||
preserve_persona=True
|
||||
)
|
||||
|
||||
if analysis_id:
|
||||
@@ -490,3 +508,82 @@ class WebsiteAnalysisExecutor(TaskExecutor):
|
||||
)
|
||||
return last_execution + timedelta(days=task.frequency_days)
|
||||
|
||||
async def _perform_full_site_analysis(self, user_id: str, website_url: str, db: Session, max_pages: int = 50):
    """
    Discover sitemap and perform non-AI SEO audit on all found pages.

    The sitemap for ``website_url`` is located and read without content
    trend, publishing pattern or AI analysis. Up to ``max_pages`` of its
    URLs are then audited one by one, and each result is stored as an
    ``SEOPageAudit`` row: an existing row for the same user and page is
    updated, otherwise a new one is inserted. Every page is committed on
    its own, so a failure on one page is logged and rolled back without
    affecting the others. Errors are logged and never raised to the caller.

    Args:
        user_id: Owner of the audit rows.
        website_url: Site whose sitemap is scanned.
        db: Database session used for lookups and writes.
        max_pages: Upper bound on the number of sitemap URLs audited per
            call (defaults to 50, the previous fixed limit).
    """

    def _audit_columns(audit_result):
        # Single mapping from an audit result to the stored columns, shared
        # by the update and insert paths so the two cannot drift apart.
        # ``or {}`` also covers a 'summary' key that is present but None.
        summary = audit_result.get('summary') or {}
        return {
            'overall_score': audit_result.get('overall_score'),
            'category_scores': {
                k: v.get('score')
                for k, v in audit_result.items()
                if isinstance(v, dict) and 'score' in v
            },
            'issues': summary.get('critical_issues', []),
            'warnings': summary.get('warnings', []),
            'audit_data': audit_result,
            'status': 'completed',
        }

    try:
        self.logger.info(f"Starting full site scan for {website_url}")
        sitemap_service = SitemapService()

        # 1. Discover Sitemap
        sitemap_url = await sitemap_service.discover_sitemap_url(website_url)
        if not sitemap_url:
            self.logger.warning(f"No sitemap found for {website_url}, skipping full site scan")
            return

        # 2. Get URLs (Raw mode)
        sitemap_data = await sitemap_service.analyze_sitemap(
            sitemap_url=sitemap_url,
            analyze_content_trends=False,
            analyze_publishing_patterns=False,
            include_ai_insights=False
        )

        urls = [u.get('loc') for u in sitemap_data.get('urls', []) if u.get('loc')]
        self.logger.info(f"Found {len(urls)} URLs in sitemap for {website_url}")

        # 3. Batch Process (capped for safety)
        urls_to_scan = urls[:max_pages]

        # The loop is the same for every page, so look it up once.
        loop = asyncio.get_running_loop()

        for page_url in urls_to_scan:
            try:
                # Check if exists
                existing = db.query(SEOPageAudit).filter(
                    SEOPageAudit.user_id == user_id,
                    SEOPageAudit.page_url == page_url
                ).first()

                # Run in executor to avoid blocking
                # Pass empty content dict to trigger internal fetching in perform_seo_audit
                audit_result = await loop.run_in_executor(
                    None,
                    partial(self.style_logic.perform_seo_audit, page_url, {})
                )

                columns = _audit_columns(audit_result)

                if existing:
                    for column, value in columns.items():
                        setattr(existing, column, value)
                    existing.last_analyzed_at = datetime.utcnow()
                else:
                    db.add(SEOPageAudit(
                        user_id=user_id,
                        website_url=website_url,
                        page_url=page_url,
                        analysis_source='scheduled_full_site',
                        **columns
                    ))

                db.commit()  # Commit each page to show progress

            except Exception as e:
                # One bad page must not abort the whole scan.
                self.logger.error(f"Error auditing page {page_url}: {e}")
                db.rollback()

        self.logger.info(f"Completed full site scan for {website_url}")

    except Exception as e:
        # Best-effort background scan: log and swallow.
        self.logger.error(f"Error in full site analysis: {e}")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user