Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

This commit is contained in:
ajaysi
2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions

View File

@@ -0,0 +1,230 @@
import asyncio
from datetime import datetime, timedelta
from typing import Any, Dict, List
from loguru import logger
from sqlalchemy.orm import Session
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
from services.seo.advertools_service import AdvertoolsService
from services.seo_tools.sitemap_service import SitemapService
from models.advertools_monitoring_models import AdvertoolsTask, AdvertoolsExecutionLog
from models.onboarding import WebsiteAnalysis, OnboardingSession
class AdvertoolsExecutor:
    """
    Executor for Advertools-based SEO intelligence tasks.

    Handles the 'content_audit' and 'site_health' task types. For tasks that
    map to an AdvertoolsTask row it also maintains the row's scheduling state
    (status, next execution, failure counters) and writes an
    AdvertoolsExecutionLog entry for every run that produces a result.
    """

    # Fallback scheduling interval (days) when the task row has none set.
    DEFAULT_FREQUENCY_DAYS = 7
    # Upper bound (days) for the exponential retry backoff.
    MAX_BACKOFF_DAYS = 30
    # Number of consecutive failures after which a task is marked 'failed'.
    MAX_CONSECUTIVE_FAILURES = 5

    def __init__(self):
        self.advertools_service = AdvertoolsService()
        self.sitemap_service = SitemapService()
        self.logger = logger.bind(service="AdvertoolsExecutor")

    async def execute_task(self, task_stub: Any, db: Session, **kwargs) -> Dict[str, Any]:
        """
        Execute an Advertools intelligence task.

        Args:
            task_stub: Object exposing `id`, `user_id` and `payload`
                attributes (read with getattr, so missing ones become None/{}).
                The payload must carry 'type' and 'website_url'.
            db: Database session
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The service result dictionary on a completed run, or
            {"success": False, "error": <message>} when an exception occurred.
        """
        start_time = datetime.utcnow()
        task_id = getattr(task_stub, 'id', None)
        user_id = getattr(task_stub, 'user_id', None)
        payload = getattr(task_stub, 'payload', {}) or {}
        task_type = payload.get('type')
        website_url = payload.get('website_url')

        self.logger.info(f"🚀 Starting Advertools task {task_id} ({task_type}) for {website_url}")

        # Find the actual task record to update state
        task_record = None
        if isinstance(task_id, int):
            task_record = db.query(AdvertoolsTask).filter(AdvertoolsTask.id == task_id).first()

        try:
            if not website_url:
                raise ValueError("Missing website_url in payload")

            # 1. Discover exact sitemap URL first (essential for Advertools)
            discovered_sitemap = await self.sitemap_service.discover_sitemap_url(website_url)
            effective_url = discovered_sitemap if discovered_sitemap else website_url

            # Set status to running for UI feedback
            if task_record:
                task_record.status = 'running'
                db.commit()

            result: Dict[str, Any] = {}
            if task_type == 'content_audit':
                # Audit content themes using sample URLs from the sitemap.
                sitemap_result = await self.advertools_service.analyze_sitemap(effective_url)
                audit_urls = []
                if sitemap_result.get('success'):
                    # Use the sample URLs returned by the service
                    audit_urls = sitemap_result.get('metrics', {}).get('audit_sample_urls', [])
                if not audit_urls:
                    # Fallback to homepage if sitemap fails or is empty
                    audit_urls = [website_url]

                result = await self.advertools_service.audit_content(audit_urls)
                if result.get('success'):
                    await self._update_persona_augmentation(user_id, website_url, result, db)
            elif task_type == 'site_health':
                # Check site health (freshness, velocity)
                result = await self.advertools_service.analyze_sitemap(effective_url)
                if result.get('success'):
                    await self._update_site_health_metrics(user_id, website_url, result, db)
            else:
                raise ValueError(f"Unknown task type: {task_type}")

            success = result.get('success', False)
            execution_time_ms = int((datetime.utcnow() - start_time).total_seconds() * 1000)

            # Update task state
            if task_record:
                if success:
                    now = datetime.utcnow()
                    task_record.last_executed = now
                    task_record.last_success = now
                    task_record.consecutive_failures = 0
                    task_record.status = 'active'
                    # Regular cadence; a success also resets any backoff.
                    freq_days = task_record.frequency_days or self.DEFAULT_FREQUENCY_DAYS
                    task_record.next_execution = now + timedelta(days=freq_days)
                else:
                    self._record_failure(task_record, result.get('error', 'Unknown error'))

            # Create execution log
            if isinstance(task_id, int):
                log_entry = AdvertoolsExecutionLog(
                    task_id=task_id,
                    status='success' if success else 'failed',
                    result_data=result,
                    error_message=result.get('error'),
                    execution_time_ms=execution_time_ms
                )
                db.add(log_entry)

            db.commit()

            if success:
                self.logger.info(f"✅ Advertools task {task_id} completed successfully")
            else:
                self.logger.warning(f"⚠️ Advertools task {task_id} failed: {result.get('error')}")
            return result
        except Exception as e:
            db.rollback()
            self.logger.error(f"❌ Advertools task execution failed: {e}")
            # Try to update task record with failure even if main logic failed.
            # The same bookkeeping as a "soft" failure is applied so the task
            # is not left in the 'running' state committed above.
            if task_record:
                try:
                    self._record_failure(task_record, str(e))
                    db.commit()
                except Exception as update_err:
                    db.rollback()
                    self.logger.error(
                        f"Failed to persist failure state for Advertools task {task_id}: {update_err}"
                    )
            return {"success": False, "error": str(e)}

    def _record_failure(self, task_record: Any, reason: str) -> None:
        """
        Apply failure bookkeeping to a task record (does not commit).

        Increments the consecutive-failure counter, schedules the next attempt
        with exponential backoff capped at MAX_BACKOFF_DAYS, and sets the
        status to 'failed' once MAX_CONSECUTIVE_FAILURES is reached, otherwise
        back to 'active' so the record does not stay 'running'.
        """
        now = datetime.utcnow()
        failures = (task_record.consecutive_failures or 0) + 1
        base_days = task_record.frequency_days or self.DEFAULT_FREQUENCY_DAYS
        backoff_days = min(self.MAX_BACKOFF_DAYS, base_days * (2 ** (failures - 1)))

        task_record.last_executed = now
        task_record.last_failure = now
        task_record.failure_reason = reason
        task_record.consecutive_failures = failures
        task_record.next_execution = now + timedelta(days=backoff_days)
        if failures >= self.MAX_CONSECUTIVE_FAILURES:
            task_record.status = 'failed'
        else:
            task_record.status = 'active'

    async def _update_persona_augmentation(self, user_id: str, website_url: str, audit_result: Dict[str, Any], db: Session):
        """
        Updates the user's Brand Persona with discovered themes from the content audit.

        Changes are staged on the session only; the caller commits. Any error
        is logged and re-raised.
        """
        try:
            session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
            if not session:
                self.logger.warning(f"No onboarding session found for user {user_id}")
                return

            analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
            if not analysis:
                self.logger.warning(f"No website analysis found for user {user_id}")
                return

            from sqlalchemy.orm.attributes import flag_modified

            # Add or update the augmented themes on brand_analysis.
            current_brand = analysis.brand_analysis or {}
            current_brand['augmented_themes'] = audit_result.get('themes', [])
            current_brand['last_advertools_audit'] = datetime.utcnow().isoformat()
            # Assign back: when brand_analysis was empty, current_brand is a
            # brand-new dict that the model does not reference yet.
            analysis.brand_analysis = current_brand
            # Force SQLAlchemy to detect the in-place change in the JSON field
            flag_modified(analysis, "brand_analysis")

            # Also update content_strategy_insights if relevant
            if 'avg_word_count' in audit_result:
                current_strategy = analysis.content_strategy_insights or {}
                current_strategy['avg_content_length'] = audit_result['avg_word_count']
                analysis.content_strategy_insights = current_strategy
                flag_modified(analysis, "content_strategy_insights")

            self.logger.info(f"Updated persona augmentation for {user_id}")
        except Exception as e:
            self.logger.error(f"Failed to update persona augmentation: {e}")
            raise

    async def _update_site_health_metrics(self, user_id: str, website_url: str, health_result: Dict[str, Any], db: Session):
        """
        Updates the WebsiteAnalysis with site health metrics (velocity, freshness).

        Returns silently when the user has no onboarding session or website
        analysis. Changes are staged on the session only; the caller commits.
        Any error is logged and re-raised.
        """
        try:
            session = db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
            if not session:
                return

            analysis = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
            if not analysis:
                return

            from sqlalchemy.orm.attributes import flag_modified

            # Update seo_audit with health metrics
            current_seo = analysis.seo_audit or {}
            metrics = health_result.get('metrics', {})
            current_seo['site_health'] = {
                "total_urls": metrics.get('total_urls'),
                "publishing_velocity": metrics.get('publishing_velocity'),
                "stale_content_count": metrics.get('stale_content_count'),
                "stale_content_percentage": metrics.get('stale_content_percentage'),
                "top_pillars": metrics.get('top_pillars')
            }
            current_seo['last_advertools_health_check'] = datetime.utcnow().isoformat()
            analysis.seo_audit = current_seo
            flag_modified(analysis, "seo_audit")

            self.logger.info(f"Updated site health metrics for {user_id}")
        except Exception as e:
            self.logger.error(f"Failed to update site health metrics: {e}")
            raise

View File

@@ -15,6 +15,7 @@ from ..core.exception_handler import TaskExecutionError, DatabaseError, Schedule
from models.platform_insights_monitoring_models import PlatformInsightsTask, PlatformInsightsExecutionLog
from services.bing_analytics_storage_service import BingAnalyticsStorageService
from services.integrations.bing_oauth import BingOAuthService
from services.database import get_user_db_path
from utils.logger_utils import get_service_logger
logger = get_service_logger("bing_insights_executor")
@@ -34,8 +35,6 @@ class BingInsightsExecutor(TaskExecutor):
def __init__(self):
self.logger = logger
self.exception_handler = SchedulerExceptionHandler()
database_url = os.getenv('DATABASE_URL', 'sqlite:///alwrity.db')
self.storage_service = BingAnalyticsStorageService(database_url)
self.bing_oauth = BingOAuthService()
async def execute_task(self, task: PlatformInsightsTask, db: Session) -> TaskExecutionResult:
@@ -53,6 +52,11 @@ class BingInsightsExecutor(TaskExecutor):
user_id = task.user_id
site_url = task.site_url
# Initialize storage service for this user
db_path = get_user_db_path(user_id)
database_url = f'sqlite:///{db_path}'
storage_service = BingAnalyticsStorageService(database_url)
try:
self.logger.info(
f"Executing Bing insights fetch: task_id={task.id} | "
@@ -69,7 +73,7 @@ class BingInsightsExecutor(TaskExecutor):
db.flush()
# Fetch insights
result = await self._fetch_insights(task, db)
result = await self._fetch_insights(task, db, storage_service)
# Update execution log
execution_time_ms = int((time.time() - start_time) * 1000)
@@ -184,7 +188,7 @@ class BingInsightsExecutor(TaskExecutor):
return error_result
async def _fetch_insights(self, task: PlatformInsightsTask, db: Session) -> TaskExecutionResult:
async def _fetch_insights(self, task: PlatformInsightsTask, db: Session, storage_service: BingAnalyticsStorageService) -> TaskExecutionResult:
"""
Fetch Bing insights data.
@@ -201,7 +205,7 @@ class BingInsightsExecutor(TaskExecutor):
if is_first_run:
# First run: Try to load from cache
self.logger.info(f"First run for Bing insights task {task.id} - loading cached data")
cached_data = self._load_cached_data(user_id, site_url)
cached_data = self._load_cached_data(user_id, site_url, storage_service)
if cached_data:
self.logger.info(f"Loaded cached Bing data for user {user_id}")
@@ -216,11 +220,11 @@ class BingInsightsExecutor(TaskExecutor):
else:
# No cached data - try to fetch from API
self.logger.info(f"No cached data found, fetching from Bing API")
return await self._fetch_fresh_data(user_id, site_url)
return await self._fetch_fresh_data(user_id, site_url, storage_service)
else:
# Subsequent run: Always fetch fresh data
self.logger.info(f"Subsequent run for Bing insights task {task.id} - fetching fresh data")
return await self._fetch_fresh_data(user_id, site_url)
return await self._fetch_fresh_data(user_id, site_url, storage_service)
except Exception as e:
self.logger.error(f"Error fetching Bing insights for user {user_id}: {e}", exc_info=True)
@@ -230,11 +234,11 @@ class BingInsightsExecutor(TaskExecutor):
result_data={'error': str(e)}
)
def _load_cached_data(self, user_id: str, site_url: Optional[str]) -> Optional[Dict[str, Any]]:
def _load_cached_data(self, user_id: str, site_url: Optional[str], storage_service: BingAnalyticsStorageService) -> Optional[Dict[str, Any]]:
"""Load most recent cached Bing data from database."""
try:
# Get analytics summary from storage service
summary = self.storage_service.get_analytics_summary(
summary = storage_service.get_analytics_summary(
user_id=user_id,
site_url=site_url or '',
days=30
@@ -250,7 +254,7 @@ class BingInsightsExecutor(TaskExecutor):
self.logger.warning(f"Error loading cached Bing data: {e}")
return None
async def _fetch_fresh_data(self, user_id: str, site_url: Optional[str]) -> TaskExecutionResult:
async def _fetch_fresh_data(self, user_id: str, site_url: Optional[str], storage_service: BingAnalyticsStorageService) -> TaskExecutionResult:
"""Fetch fresh Bing insights from API."""
try:
# Check if user has active tokens
@@ -288,7 +292,7 @@ class BingInsightsExecutor(TaskExecutor):
# For now, use stored analytics data (Bing API integration can be added later)
# This ensures we have data available even if the API class doesn't exist yet
summary = self.storage_service.get_analytics_summary(user_id, site_url, days=30)
summary = storage_service.get_analytics_summary(user_id, site_url, days=30)
if summary and isinstance(summary, dict):
# Format insights data from stored analytics

View File

@@ -0,0 +1,200 @@
import time
from datetime import datetime, timedelta
from typing import Any, Dict
from sqlalchemy.orm import Session
from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService
from models.website_analysis_monitoring_models import (
DeepCompetitorAnalysisTask,
DeepCompetitorAnalysisExecutionLog
)
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
from services.scheduler.core.failure_detection_service import FailureDetectionService
from services.seo.deep_competitor_analysis_service import DeepCompetitorAnalysisService
from utils.logger_utils import get_service_logger
logger = get_service_logger("deep_competitor_analysis_executor")
class DeepCompetitorAnalysisExecutor(TaskExecutor):
    """
    Scheduler executor for deep competitor analysis tasks.

    Depending on the payload's "mode" it either runs a full deep analysis or
    generates the weekly strategic insights brief, then records the outcome on
    the task row and on a DeepCompetitorAnalysisExecutionLog entry.
    """

    # Delay before a failed run is retried; drives both the task's
    # next_execution and the retry_delay reported to the scheduler.
    RETRY_DELAY_SECONDS = 1800
    # Maximum number of strategy briefs kept in strategic_insights_history.
    MAX_INSIGHTS_HISTORY = 52

    def __init__(self):
        self.analysis_service = DeepCompetitorAnalysisService()
        self.integration_service = OnboardingDataIntegrationService()

    async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
        """
        Run one deep competitor analysis task.

        Args:
            task: Expected to be a DeepCompetitorAnalysisTask; anything else
                yields a non-retryable failure result.
            db: Database session used for the task, its log and history rows.

        Returns:
            TaskExecutionResult describing success, skip (no competitors) or
            failure. Exceptions raised by the analysis are caught and turned
            into a failure result.
        """
        start_time = time.time()
        if not isinstance(task, DeepCompetitorAnalysisTask):
            return TaskExecutionResult(
                success=False,
                error_message="Invalid task type for deep competitor analysis",
                retryable=False
            )

        task_log = DeepCompetitorAnalysisExecutionLog(
            task_id=task.id,
            status="running",
            execution_date=datetime.utcnow()
        )
        db.add(task_log)
        db.commit()

        user_id = str(task.user_id)
        try:
            integrated = self.integration_service.get_integrated_data_sync(user_id, db)
            if not isinstance(integrated, dict):
                integrated = {}
            # Normalise once: the key may be missing or hold a non-dict value,
            # and the value is dereferenced with .get() further down.
            website_analysis = integrated.get("website_analysis")
            if not isinstance(website_analysis, dict):
                website_analysis = {}

            payload = task.payload if isinstance(task.payload, dict) else {}
            competitors = self._resolve_competitors(payload, integrated)

            if not competitors:
                logger.warning(f"Deep competitor analysis skipped for user {user_id}: No competitors found")
                task_log.status = "skipped"
                task_log.result_data = {"status": "skipped", "reason": "no_competitors"}
                task_log.execution_time_ms = int((time.time() - start_time) * 1000)

                # Treated as a success (empty report) so it doesn't retry
                # endlessly, and paused so it doesn't run again until
                # triggered manually.
                task.last_executed = datetime.utcnow()
                task.last_success = datetime.utcnow()
                task.status = "paused"
                task.next_execution = None
                task.consecutive_failures = 0
                db.commit()

                return TaskExecutionResult(
                    success=True,
                    result_data={"status": "skipped", "reason": "no_competitors"},
                    execution_time_ms=task_log.execution_time_ms,
                    retryable=False
                )

            max_competitors = int(payload.get("max_competitors") or 25)
            crawl_concurrency = int(payload.get("crawl_concurrency") or 4)
            mode = payload.get("mode", "deep_analysis")

            if mode == "strategic_insights":
                logger.info(f"Executing weekly strategic insights for user {user_id}")
                report = await self.analysis_service.generate_weekly_strategy_brief(
                    user_id=user_id,
                    website_analysis=website_analysis,
                    competitors=competitors
                )

                # Persist to WebsiteAnalysis history (newest first, capped).
                analysis_id = website_analysis.get('id')
                if analysis_id:
                    from models.onboarding import WebsiteAnalysis
                    from sqlalchemy.orm.attributes import flag_modified

                    wa = db.query(WebsiteAnalysis).filter(WebsiteAnalysis.id == analysis_id).first()
                    if wa:
                        history = wa.strategic_insights_history or []
                        if not isinstance(history, list):
                            history = []
                        history.insert(0, report)
                        wa.strategic_insights_history = history[:self.MAX_INSIGHTS_HISTORY]
                        flag_modified(wa, "strategic_insights_history")
                        db.commit()
            else:
                report = await self.analysis_service.run(
                    user_id=user_id,
                    website_analysis=website_analysis,
                    competitors=competitors,
                    max_competitors=max_competitors,
                    crawl_concurrency=crawl_concurrency
                )

            task.last_executed = datetime.utcnow()
            task.last_success = datetime.utcnow()
            # Only the strategic_insights mode is recurring; a deep analysis
            # is paused after it has run.
            if mode == "strategic_insights":
                task.status = "active"
                task.next_execution = self.calculate_next_execution(task, "weekly", task.last_executed)
            else:
                task.status = "paused"
                task.next_execution = None
            task.consecutive_failures = 0
            task.failure_pattern = None
            task.failure_reason = None

            task_log.status = "success"
            task_log.result_data = report
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)
            db.commit()

            # Best effort: a refresh failure must not fail the finished task.
            try:
                await self.integration_service.refresh_integrated_data(user_id, db)
            except Exception as e:
                logger.warning(f"Deep competitor analysis SSOT refresh failed for user {user_id}: {e}")

            return TaskExecutionResult(
                success=True,
                result_data=report,
                execution_time_ms=task_log.execution_time_ms,
                retryable=False
            )
        except Exception as e:
            db.rollback()
            logger.warning(f"Deep competitor analysis task failed for user {user_id}: {e}")

            failure_detection = FailureDetectionService(db)
            pattern = failure_detection.analyze_task_failures(task.id, "deep_competitor_analysis", user_id)

            task.last_executed = datetime.utcnow()
            task.last_failure = datetime.utcnow()
            task.failure_reason = str(e)
            task.consecutive_failures = (task.consecutive_failures or 0) + 1

            if pattern and pattern.should_cool_off:
                task.status = "needs_intervention"
                task.failure_pattern = {
                    "consecutive_failures": pattern.consecutive_failures,
                    "recent_failures": pattern.recent_failures,
                    "failure_reason": pattern.failure_reason.value,
                    "error_patterns": pattern.error_patterns,
                    "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
                }
                task.next_execution = None
            else:
                task.status = "failed"
                task.next_execution = datetime.utcnow() + timedelta(seconds=self.RETRY_DELAY_SECONDS)

            task_log.status = "failed"
            task_log.error_message = str(e)
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)
            db.add(task_log)
            db.commit()

            return TaskExecutionResult(
                success=False,
                error_message=str(e),
                execution_time_ms=task_log.execution_time_ms,
                retryable=(task.status != "needs_intervention"),
                retry_delay=self.RETRY_DELAY_SECONDS
            )

    @staticmethod
    def _resolve_competitors(payload: Dict[str, Any], integrated: Dict[str, Any]) -> list:
        """
        Return the first non-empty competitor list, or [] when none exists.

        Sources are tried in order: the task payload, the integrated
        research_preferences, then the integrated competitor_analysis.
        """
        candidates = [payload.get("competitors")]
        research_prefs = integrated.get("research_preferences")
        if isinstance(research_prefs, dict):
            candidates.append(research_prefs.get("competitors"))
        candidates.append(integrated.get("competitor_analysis"))

        for candidate in candidates:
            if isinstance(candidate, list) and candidate:
                return candidate
        return []

    def calculate_next_execution(self, task: Any, frequency: str, last_execution: datetime = None) -> datetime:
        """
        Return the next run time: one week after `last_execution` (or now)
        for "weekly", otherwise 365 days after it.
        """
        base = last_execution or datetime.utcnow()
        if frequency == "weekly":
            return base + timedelta(days=7)
        return base + timedelta(days=365)

View File

@@ -0,0 +1,179 @@
import time
from datetime import datetime, timedelta
from typing import Any, Dict, Optional
from sqlalchemy.orm import Session
from models.website_analysis_monitoring_models import (
DeepWebsiteCrawlTask,
DeepWebsiteCrawlExecutionLog
)
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
from services.scheduler.core.failure_detection_service import FailureDetectionService
from services.research.deep_crawl_service import DeepCrawlService
from utils.logger_utils import get_service_logger
logger = get_service_logger("deep_website_crawl_executor")
class DeepWebsiteCrawlExecutor(TaskExecutor):
    """
    Scheduler executor that runs a deep website crawl for a task and records
    the outcome on the task row and on a DeepWebsiteCrawlExecutionLog entry.
    """

    # Delay before a failed crawl is retried; drives both the task's
    # next_execution and the retry_delay reported to the scheduler.
    RETRY_DELAY_SECONDS = 3600

    def __init__(self):
        self.crawl_service = DeepCrawlService()

    async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
        """
        Run one deep website crawl task.

        Args:
            task: Expected to be a DeepWebsiteCrawlTask; anything else yields
                a non-retryable failure result.
            db: Database session used for the task and its execution log.

        Returns:
            TaskExecutionResult describing success or failure. Exceptions
            raised by the crawl are caught and turned into a failure result.
        """
        start_time = time.time()
        if not isinstance(task, DeepWebsiteCrawlTask):
            return TaskExecutionResult(
                success=False,
                error_message="Invalid task type for deep website crawl",
                retryable=False
            )

        task_log = DeepWebsiteCrawlExecutionLog(
            task_id=task.id,
            status="running",
            execution_date=datetime.utcnow()
        )
        db.add(task_log)
        db.commit()

        user_id = str(task.user_id)
        website_url = task.website_url
        try:
            logger.info(f"Executing deep website crawl for user {user_id}, url {website_url}")

            # task_id is deliberately NOT forwarded. Per the original author's
            # notes, DeepCrawlService.execute_deep_crawl takes an optional
            # task_id and, when given one, writes its own execution log and
            # updates the task itself - duplicating the "running" log created
            # above and the bookkeeping below. Without it the service is
            # expected to just return the result dict.
            # NOTE(review): confirm against DeepCrawlService.
            result = await self.crawl_service.execute_deep_crawl(
                user_id=user_id,
                website_url=website_url
            )

            task.last_executed = datetime.utcnow()
            task.last_success = datetime.utcnow()
            # The crawl is kept as a recurring (weekly) task.
            task.status = "active"
            task.next_execution = self.calculate_next_execution(task, "Weekly", task.last_executed)
            task.consecutive_failures = 0
            task.failure_pattern = None
            task.failure_reason = None

            task_log.status = "success"
            task_log.result_data = result
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)
            db.commit()

            return TaskExecutionResult(
                success=True,
                result_data=result,
                execution_time_ms=task_log.execution_time_ms,
                retryable=False
            )
        except Exception as e:
            db.rollback()
            logger.warning(f"Deep website crawl task failed for user {user_id}: {e}")

            failure_detection = FailureDetectionService(db)
            pattern = failure_detection.analyze_task_failures(task.id, "deep_website_crawl", user_id)

            task.last_executed = datetime.utcnow()
            task.last_failure = datetime.utcnow()
            task.failure_reason = str(e)
            task.consecutive_failures = (task.consecutive_failures or 0) + 1

            if pattern and pattern.should_cool_off:
                task.status = "needs_intervention"
                task.failure_pattern = {
                    "consecutive_failures": pattern.consecutive_failures,
                    "recent_failures": pattern.recent_failures,
                    "failure_reason": pattern.failure_reason.value,
                    "error_patterns": pattern.error_patterns,
                    "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
                }
                task.next_execution = None
            else:
                task.status = "failed"
                # Retry in an hour
                task.next_execution = datetime.utcnow() + timedelta(seconds=self.RETRY_DELAY_SECONDS)

            task_log.status = "failed"
            task_log.error_message = str(e)
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)
            db.add(task_log)
            db.commit()

            return TaskExecutionResult(
                success=False,
                error_message=str(e),
                execution_time_ms=task_log.execution_time_ms,
                retryable=(task.status != "needs_intervention"),
                retry_delay=self.RETRY_DELAY_SECONDS
            )

    def calculate_next_execution(
        self,
        task: Any,
        frequency: str,
        last_execution: Optional[datetime] = None
    ) -> datetime:
        """
        Calculate next execution time based on frequency.

        Recognised frequencies are 'Daily', 'Weekly' and 'Monthly' (30 days);
        any other value falls back to weekly. `last_execution` defaults to
        the current UTC time.
        """
        if not last_execution:
            last_execution = datetime.utcnow()

        intervals = {
            'Daily': timedelta(days=1),
            'Weekly': timedelta(weeks=1),
            'Monthly': timedelta(days=30),
        }
        # Default to weekly if unknown
        return last_execution + intervals.get(frequency, timedelta(weeks=1))

View File

@@ -0,0 +1,232 @@
"""
Market Trends Executor
Runs Google Trends (pytrends) periodically and embeds results into the user SIF index.
"""
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from sqlalchemy.orm import Session
from models.website_analysis_monitoring_models import MarketTrendsTask, MarketTrendsExecutionLog
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
from services.scheduler.core.failure_detection_service import FailureDetectionService
from services.intelligence.sif_integration import SIFIntegrationService
from services.research.trends.google_trends_service import GoogleTrendsService
from utils.logger_utils import get_service_logger
logger = get_service_logger("market_trends_executor")
class MarketTrendsExecutor(TaskExecutor):
    """
    Scheduler executor that runs a Google Trends analysis for a user's
    keywords and indexes the result through SIFIntegrationService.
    """

    # Maximum number of keywords sent to a single trends run.
    MAX_KEYWORDS = 5
    # Fallback interval between runs when the task has no frequency_hours.
    DEFAULT_FREQUENCY_HOURS = 72
    # Delay before a failed run is retried; drives both the task's
    # next_execution and the retry_delay reported to the scheduler.
    RETRY_DELAY_SECONDS = 21600

    def __init__(self):
        pass

    async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
        """
        Run one market trends task.

        Keywords come from the user's stored strategy/analysis data, falling
        back to the task payload. A trends lookup error (or having no
        keywords) is recorded inside the indexed result and the run still
        counts as a success; only other exceptions produce a failure result.

        Args:
            task: Expected to be a MarketTrendsTask; anything else yields a
                non-retryable failure result.
            db: Database session used for the task and its execution log.

        Returns:
            TaskExecutionResult describing success or failure.
        """
        start_time = time.time()
        if not isinstance(task, MarketTrendsTask):
            return TaskExecutionResult(success=False, error_message="Invalid task type for market trends", retryable=False)

        task_log = MarketTrendsExecutionLog(task_id=task.id, status="running", execution_date=datetime.utcnow())
        db.add(task_log)
        db.commit()

        user_id = str(task.user_id)
        website_url = task.website_url
        # A non-dict payload is treated as empty instead of failing on .get().
        payload = task.payload if isinstance(task.payload, dict) else {}
        try:
            geo = payload.get("geo") or "US"
            timeframe = payload.get("timeframe") or "today 12-m"
            sif_service = SIFIntegrationService(user_id)

            keywords = await self._select_keywords_for_user(db=db, user_id=user_id, website_url=website_url)
            if not keywords:
                keywords = payload.get("keywords") or []
                # A bare string would otherwise be split into characters.
                if isinstance(keywords, str):
                    keywords = [keywords]
            keywords = [str(k).strip() for k in (keywords or []) if str(k).strip()]
            keywords = keywords[:self.MAX_KEYWORDS]

            trends_result: Dict[str, Any]
            if keywords:
                try:
                    trends_result = await GoogleTrendsService().analyze_trends(
                        keywords=keywords, timeframe=timeframe, geo=geo, user_id=user_id
                    )
                except Exception as trends_err:
                    trends_result = self._trends_error(str(trends_err), keywords, timeframe, geo)
            else:
                trends_result = self._trends_error(
                    "No keywords available for market trends run", [], timeframe, geo
                )

            run_id = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
            await sif_service.index_market_trends_run(trends_result=trends_result, run_id=run_id)

            task.last_executed = datetime.utcnow()
            task.last_success = datetime.utcnow()
            frequency_hours = task.frequency_hours or self.DEFAULT_FREQUENCY_HOURS
            task.next_execution = datetime.utcnow() + timedelta(hours=frequency_hours)
            task.status = "active"
            task.consecutive_failures = 0
            task.failure_pattern = None
            task.failure_reason = None

            task_log.status = "success"
            task_log.result_data = {
                "run_id": run_id,
                "keywords": trends_result.get("keywords", keywords),
                "geo": geo,
                "timeframe": timeframe,
                "cached": trends_result.get("cached", False),
            }
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)
            db.commit()

            return TaskExecutionResult(
                success=True,
                result_data=task_log.result_data,
                execution_time_ms=task_log.execution_time_ms,
                retryable=False,
            )
        except Exception as e:
            db.rollback()
            logger.warning(f"Market trends task failed for user {user_id}: {e}")

            failure_detection = FailureDetectionService(db)
            pattern = failure_detection.analyze_task_failures(task.id, "market_trends", user_id)

            task.last_executed = datetime.utcnow()
            task.last_failure = datetime.utcnow()
            task.failure_reason = str(e)
            task.consecutive_failures = (task.consecutive_failures or 0) + 1

            if pattern and pattern.should_cool_off:
                task.status = "needs_intervention"
                task.failure_pattern = {
                    "consecutive_failures": pattern.consecutive_failures,
                    "recent_failures": pattern.recent_failures,
                    "failure_reason": pattern.failure_reason.value,
                    "error_patterns": pattern.error_patterns,
                    "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat(),
                }
                task.next_execution = None
            else:
                task.status = "active"
                task.next_execution = datetime.utcnow() + timedelta(seconds=self.RETRY_DELAY_SECONDS)

            task_log.status = "failed"
            task_log.error_message = str(e)
            task_log.execution_time_ms = int((time.time() - start_time) * 1000)
            db.add(task_log)
            db.commit()

            return TaskExecutionResult(
                success=False,
                error_message=str(e),
                execution_time_ms=task_log.execution_time_ms,
                retryable=(task.status != "needs_intervention"),
                retry_delay=self.RETRY_DELAY_SECONDS,
            )

    @staticmethod
    def _trends_error(message: str, keywords: List[str], timeframe: str, geo: str) -> Dict[str, Any]:
        """Build the placeholder trends result recorded when no data was fetched."""
        return {
            "error": message,
            "keywords": keywords,
            "timeframe": timeframe,
            "geo": geo,
            "timestamp": datetime.utcnow().isoformat(),
            "cached": False,
        }

    async def _select_keywords_for_user(self, db: Session, user_id: str, website_url: str) -> List[str]:
        """
        Collect up to MAX_KEYWORDS distinct keywords for the user.

        The most recently updated EnhancedContentStrategy is consulted first;
        if it yields nothing, topic clusters from the latest WebsiteAnalysis
        are used. Lookups are best effort: a failure is logged and treated as
        "no keywords from this source". Duplicates are removed
        case-insensitively, keeping first-seen order and spelling.
        """
        keywords: List[str] = []
        try:
            from sqlalchemy import select, desc
            from models.enhanced_strategy_models import EnhancedContentStrategy

            stmt = (
                select(EnhancedContentStrategy)
                .where(EnhancedContentStrategy.user_id == user_id)
                .order_by(desc(EnhancedContentStrategy.updated_at))
            )
            strategy = db.execute(stmt).scalars().first()
            if strategy:
                for source in (
                    strategy.emerging_trends,
                    strategy.industry_trends,
                    strategy.market_gaps,
                    strategy.competitor_content_strategies,
                ):
                    if source:
                        keywords.extend(self._extract_strings(source))
        except Exception as e:
            logger.warning(f"Market trends keyword lookup from content strategy failed for user {user_id}: {e}")

        if not keywords:
            try:
                from sqlalchemy import select, desc
                from models.onboarding import WebsiteAnalysis, OnboardingSession

                stmt = (
                    select(WebsiteAnalysis)
                    .join(OnboardingSession, WebsiteAnalysis.session_id == OnboardingSession.id)
                    .where(OnboardingSession.user_id == user_id)
                    .order_by(desc(WebsiteAnalysis.created_at))
                )
                wa = db.execute(stmt).scalars().first()
                if wa and wa.content_strategy_insights:
                    ai_strategy = wa.content_strategy_insights.get("ai_strategy", {})
                    topic_clusters = ai_strategy.get("topic_clusters") or []
                    keywords.extend(self._extract_strings(topic_clusters))
            except Exception as e:
                logger.warning(f"Market trends keyword lookup from website analysis failed for user {user_id}: {e}")

        deduped: List[str] = []
        seen = set()
        for k in keywords:
            kk = str(k).strip()
            if not kk:
                continue
            key = kk.lower()
            if key in seen:
                continue
            seen.add(key)
            deduped.append(kk)
        return deduped[:self.MAX_KEYWORDS]

    def _extract_strings(self, value: Any) -> List[str]:
        """
        Flatten a loosely structured value into a list of strings.

        Strings are returned as-is, lists are flattened recursively, dicts
        contribute their truthy "keyword", "topic", "title", "name" and
        "label" values, None yields nothing and anything else is stringified.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, list):
            out: List[str] = []
            for item in value:
                out.extend(self._extract_strings(item))
            return out
        if isinstance(value, dict):
            return [
                str(value[k])
                for k in ("keyword", "topic", "title", "name", "label")
                if value.get(k)
            ]
        return [str(value)]

    def calculate_next_execution(self, task: Any, frequency: str, last_execution: datetime = None) -> datetime:
        """
        Return `last_execution` (or now) plus the task's frequency_hours,
        defaulting to DEFAULT_FREQUENCY_HOURS. `frequency` is ignored.
        """
        base = last_execution or datetime.utcnow()
        hours = getattr(task, "frequency_hours", self.DEFAULT_FREQUENCY_HOURS) or self.DEFAULT_FREQUENCY_HOURS
        return base + timedelta(hours=hours)

View File

@@ -21,6 +21,7 @@ from services.gsc_service import GSCService
from services.integrations.bing_oauth import BingOAuthService
from services.integrations.wordpress_oauth import WordPressOAuthService
from services.wix_service import WixService
from services.database import get_user_db_path
logger = get_service_logger("oauth_token_monitoring_executor")
@@ -289,8 +290,8 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
GSC service auto-refreshes tokens if expired when loading credentials.
"""
try:
# Use absolute database path for consistency with onboarding
db_path = os.path.abspath("alwrity.db")
# Use dynamic database path
db_path = get_user_db_path(user_id)
gsc_service = GSCService(db_path=db_path)
credentials = gsc_service.load_user_credentials(user_id)
@@ -341,9 +342,8 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
Checks token expiration and attempts refresh if needed.
"""
try:
# Use absolute database path for consistency with onboarding
db_path = os.path.abspath("alwrity.db")
bing_service = BingOAuthService(db_path=db_path)
# Initialize Bing service
bing_service = BingOAuthService()
# Get token status (includes expired tokens)
token_status = bing_service.get_user_token_status(user_id)
@@ -502,8 +502,8 @@ class OAuthTokenMonitoringExecutor(TaskExecutor):
and require user re-authorization. We only check if token is valid.
"""
try:
# Use absolute database path for consistency with onboarding
db_path = os.path.abspath("alwrity.db")
# Use dynamic database path
db_path = get_user_db_path(user_id)
wordpress_service = WordPressOAuthService(db_path=db_path)
tokens = wordpress_service.get_user_tokens(user_id)

View File

@@ -0,0 +1,584 @@
import asyncio
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse
import aiohttp
from bs4 import BeautifulSoup
from loguru import logger
from sqlalchemy.orm import Session
from models.onboarding import SEOPageAudit
from models.website_analysis_monitoring_models import (
OnboardingFullWebsiteAnalysisTask,
OnboardingFullWebsiteAnalysisExecutionLog
)
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
from services.scheduler.core.failure_detection_service import FailureDetectionService
from services.seo_analyzer.analyzers import (
MetaDataAnalyzer,
TechnicalSEOAnalyzer,
ContentAnalyzer,
URLStructureAnalyzer,
AccessibilityAnalyzer,
UserExperienceAnalyzer
)
class OnboardingFullWebsiteAnalysisExecutor(TaskExecutor):
def __init__(self):
    """Set up the bound logger, crawl limits, status thresholds and score weights."""
    self.logger = logger.bind(component="OnboardingFullWebsiteAnalysisExecutor")

    # Crawl limits: default page cap, per-request timeout and parallelism.
    self.max_urls_default = 500
    self.http_timeout_seconds = 25
    self.http_concurrency = 10

    # Overall-score cut-offs used to label a page.
    self.healthy_threshold = 80
    self.warning_threshold = 60

    # Per-category weights used for the overall score; they sum to 1.0.
    self.weights = dict(
        meta=0.15,
        content=0.20,
        technical=0.20,
        performance=0.20,
        accessibility=0.10,
        ux=0.10,
        security=0.05,
    )
async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
    """
    Run a full-site SEO audit for an onboarding task.

    An execution-log row is written first with status 'running'. The site's
    URLs are then discovered and audited, and the outcome is recorded on both
    the task and the log row.

    Args:
        task: Must be an OnboardingFullWebsiteAnalysisTask; anything else is
            rejected with a non-retryable result and no log row is written.
        db: SQLAlchemy session used for the task, log and page-audit rows.

    Returns:
        TaskExecutionResult. On success the task is set to 'paused' with no
        next execution. On failure the task is either rescheduled in 30
        minutes ('failed') or parked as 'needs_intervention' when the failure
        detection service asks for a cool-off.
    """
    start_time = time.time()
    if not isinstance(task, OnboardingFullWebsiteAnalysisTask):
        return TaskExecutionResult(
            success=False,
            error_message="Invalid task type for onboarding full website analysis",
            retryable=False
        )

    task_log = OnboardingFullWebsiteAnalysisExecutionLog(
        task_id=task.id,
        status='running',
        execution_date=datetime.utcnow()
    )
    db.add(task_log)
    db.commit()

    user_id = str(task.user_id)
    website_url = task.website_url

    try:
        # Parse the payload inside the try block so that a malformed
        # 'max_urls' value is recorded as a task failure instead of escaping
        # and leaving the log row stuck in 'running'.
        payload = task.payload or {}
        max_urls = int(payload.get('max_urls') or self.max_urls_default)

        urls = await self._discover_urls(website_url, max_urls=max_urls)
        if not urls:
            raise ValueError("No URLs discovered for full-site analysis")

        results = await self._audit_urls(user_id, website_url, urls, db)

        task.last_executed = datetime.utcnow()
        task.last_success = datetime.utcnow()
        task.status = 'paused'
        task.next_execution = None
        task.consecutive_failures = 0
        task.failure_pattern = None
        task.failure_reason = None

        task_log.status = 'success'
        task_log.result_data = results
        task_log.execution_time_ms = int((time.time() - start_time) * 1000)
        db.commit()

        return TaskExecutionResult(
            success=True,
            result_data=results,
            execution_time_ms=task_log.execution_time_ms,
            retryable=False
        )
    except Exception as e:
        db.rollback()
        # loguru has no ``exc_info`` keyword: passing any kwarg makes it run
        # str.format() on the message, which raises when the exception text
        # contains braces. ``opt(exception=True)`` attaches the traceback and
        # the positional argument keeps the exception text out of the template.
        self.logger.opt(exception=True).error("Full-site SEO audit task failed: {}", e)

        failure_detection = FailureDetectionService(db)
        pattern = failure_detection.analyze_task_failures(task.id, 'onboarding_full_website_analysis', user_id)

        task.last_executed = datetime.utcnow()
        task.last_failure = datetime.utcnow()
        task.failure_reason = str(e)
        task.consecutive_failures = (task.consecutive_failures or 0) + 1

        if pattern and pattern.should_cool_off:
            task.status = "needs_intervention"
            task.failure_pattern = {
                "consecutive_failures": pattern.consecutive_failures,
                "recent_failures": pattern.recent_failures,
                "failure_reason": pattern.failure_reason.value,
                "error_patterns": pattern.error_patterns,
                "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
            }
            task.next_execution = None
        else:
            task.status = "failed"
            task.next_execution = datetime.utcnow() + timedelta(minutes=30)

        task_log.status = 'failed'
        task_log.error_message = str(e)
        task_log.execution_time_ms = int((time.time() - start_time) * 1000)
        db.add(task_log)
        db.commit()

        return TaskExecutionResult(
            success=False,
            error_message=str(e),
            execution_time_ms=task_log.execution_time_ms,
            retryable=(task.status != "needs_intervention"),
            retry_delay=1800
        )
def calculate_next_execution(
    self,
    task: Any,
    frequency: str,
    last_execution: Optional[datetime] = None
) -> datetime:
    """
    Return a next-run time 365 days after ``last_execution``.

    When ``last_execution`` is not given the current UTC time is used as the
    starting point. ``task`` and ``frequency`` are not consulted.
    """
    one_year = timedelta(days=365)
    if last_execution:
        return last_execution + one_year
    return datetime.utcnow() + one_year
async def _discover_urls(self, website_url: str, max_urls: int) -> List[str]:
base = self._normalize_url(website_url)
parsed = urlparse(base)
root = f"{parsed.scheme}://{parsed.netloc}"
sitemap_urls: List[str] = []
robots = await self._fetch_text(urljoin(root, "/robots.txt"))
if robots:
for line in robots.splitlines():
if line.lower().startswith("sitemap:"):
sitemap_urls.append(line.split(":", 1)[1].strip())
if not sitemap_urls:
candidates = [
urljoin(root, "/sitemap.xml"),
urljoin(root, "/sitemap_index.xml"),
urljoin(root, "/wp-sitemap.xml"),
]
sitemap_urls.extend(candidates)
discovered: List[str] = []
seen: Set[str] = set()
for sm in sitemap_urls:
if len(discovered) >= max_urls:
break
urls_from_sm = await self._parse_sitemap(sm, max_urls=max_urls - len(discovered))
for u in urls_from_sm:
n = self._normalize_url(u)
if n not in seen and self._same_site(root, n):
seen.add(n)
discovered.append(n)
if len(discovered) >= max_urls:
break
if not discovered:
discovered.append(base)
return discovered
async def _parse_sitemap(self, sitemap_url: str, max_urls: int) -> List[str]:
xml_text = await self._fetch_text(sitemap_url)
if not xml_text:
return []
try:
import xml.etree.ElementTree as ET
root = ET.fromstring(xml_text)
except Exception:
return []
ns = ""
if root.tag.startswith("{"):
ns = root.tag.split("}", 1)[0] + "}"
urls: List[str] = []
if root.tag.endswith("sitemapindex"):
locs = root.findall(f".//{ns}sitemap/{ns}loc")
for loc in locs:
if len(urls) >= max_urls:
break
child_url = (loc.text or "").strip()
if not child_url:
continue
child_urls = await self._parse_sitemap(child_url, max_urls=max_urls - len(urls))
urls.extend(child_urls)
else:
locs = root.findall(f".//{ns}url/{ns}loc")
for loc in locs:
if len(urls) >= max_urls:
break
u = (loc.text or "").strip()
if u:
urls.append(u)
return urls
async def _fetch_text(self, url: str) -> Optional[str]:
    """
    Download ``url`` and return the response body as text.

    A fresh client session with the executor's total timeout is used for the
    request and redirects are followed. Returns ``None`` for HTTP status
    codes of 400 and above and for any error raised while fetching.
    """
    try:
        client_timeout = aiohttp.ClientTimeout(total=self.http_timeout_seconds)
        async with aiohttp.ClientSession(timeout=client_timeout) as http:
            async with http.get(
                url,
                allow_redirects=True,
                headers={"User-Agent": "ALwrity-SEO-Audit/1.0"},
            ) as response:
                if response.status < 400:
                    return await response.text(errors="ignore")
                return None
    except Exception:
        # Best effort: callers treat None as "nothing available".
        return None
async def _audit_urls(self, user_id: str, website_url: str, urls: List[str], db: Session) -> Dict[str, Any]:
    """
    Audit every URL concurrently and summarize the outcome.

    All pages share one HTTP session; a semaphore and the connector limit
    both cap parallelism at ``self.http_concurrency``. Exceptions raised by
    individual audits are collected rather than propagated and count as
    failed pages.

    Returns:
        A summary dict with the page counts, the rounded average score of the
        successful audits, the number of pages marked 'fix_scheduled', and
        the ten lowest-scoring pages.
    """
    client_timeout = aiohttp.ClientTimeout(total=self.http_timeout_seconds)
    pool = aiohttp.TCPConnector(limit=self.http_concurrency)
    gate = asyncio.Semaphore(self.http_concurrency)

    async with aiohttp.ClientSession(timeout=client_timeout, connector=pool) as http:

        async def guarded_audit(target: str) -> Dict[str, Any]:
            async with gate:
                return await self._audit_single_url(user_id, website_url, target, http, db)

        outcomes = await asyncio.gather(
            *[guarded_audit(target) for target in urls],
            return_exceptions=True
        )

    # Split the outcomes: only dict results flagged 'success' count as audited.
    passed = []
    failed = []
    for outcome in outcomes:
        if isinstance(outcome, dict) and outcome.get('success'):
            passed.append(outcome)
        else:
            failed.append(outcome)

    if passed:
        avg_score = round(sum(page['overall_score'] for page in passed) / len(passed))
    else:
        avg_score = 0

    fix_scheduled = len([page for page in passed if page.get('status') == 'fix_scheduled'])

    page_summaries = [
        {
            'page_url': page['page_url'],
            'overall_score': page['overall_score'],
            'status': page.get('status'),
        }
        for page in passed
    ]
    page_summaries.sort(key=lambda entry: entry['overall_score'])

    return {
        'website_url': website_url,
        'pages_discovered': len(urls),
        'pages_audited': len(passed),
        'pages_failed': len(failed),
        'avg_score': avg_score,
        'fix_scheduled_pages': fix_scheduled,
        'worst_pages': page_summaries[:10],
    }
async def _audit_single_url(
    self,
    user_id: str,
    website_url: str,
    page_url: str,
    session: aiohttp.ClientSession,
    db: Session
) -> Dict[str, Any]:
    """
    Fetch one page, run the SEO analyzers on it and persist the result.

    Pages that cannot be fetched, respond with status >= 400 or are not
    served as ``text/html`` are stored with score 0 and status 'error'.
    Otherwise the analyzer results are combined into a weighted overall
    score, mapped to 'healthy' / 'needs_review' / 'fix_scheduled' and stored.

    Returns:
        ``{'success': True, 'page_url', 'overall_score', 'status'}`` for an
        audited page, or ``{'success': False, 'page_url', 'error'}`` otherwise.
    """

    def store_error(details: Dict[str, Any]) -> None:
        # Persist a zero-score 'error' row for a page that could not be audited.
        self._upsert_page_audit(
            db=db,
            user_id=user_id,
            website_url=website_url,
            page_url=page_url,
            overall_score=0,
            status='error',
            audit_data=details
        )

    fetch_start = time.time()
    try:
        async with session.get(
            page_url,
            allow_redirects=True,
            headers={"User-Agent": "ALwrity-SEO-Audit/1.0"}
        ) as resp:
            status = resp.status
            content_type = resp.headers.get("Content-Type", "")
            text = await resp.text(errors="ignore")
            headers = dict(resp.headers)
    except Exception as e:
        store_error({'error': str(e)})
        return {'success': False, 'page_url': page_url, 'error': str(e)}
    load_time = time.time() - fetch_start

    auditable = status < 400 and "text/html" in content_type.lower()
    if not auditable:
        store_error({'http_status': status, 'content_type': content_type})
        return {'success': False, 'page_url': page_url, 'error': f'HTTP {status} / {content_type}'}

    soup = BeautifulSoup(text, 'html.parser')

    # Analyzer results, one dict per category.
    meta = MetaDataAnalyzer().analyze(soup)
    content = ContentAnalyzer().analyze(soup)
    technical = TechnicalSEOAnalyzer().analyze(page_url, soup)
    url_structure = URLStructureAnalyzer().analyze(page_url)
    accessibility = AccessibilityAnalyzer().analyze(text)
    ux = UserExperienceAnalyzer().analyze(text, page_url)
    performance = self._performance_from_fetch(load_time, headers)
    security = self._security_from_headers(headers)

    scored_sections = (
        ('meta', meta),
        ('content', content),
        ('technical', technical),
        ('performance', performance),
        ('accessibility', accessibility),
        ('ux', ux),
        ('security', security),
        ('url_structure', url_structure),
    )
    category_scores = {label: section.get('score', 0) for label, section in scored_sections}
    overall_score = self._weighted_score(category_scores)

    page_status = (
        'healthy' if overall_score >= self.healthy_threshold
        else 'needs_review' if overall_score >= self.warning_threshold
        else 'fix_scheduled'
    )

    audit_data = {
        'meta': meta,
        'content_health': content,
        'technical': technical,
        'performance': performance,
        'url_structure': url_structure,
        'accessibility': accessibility,
        'ux': ux,
        'security_headers': security,
        'overall_score': overall_score,
    }

    issues, warnings, recommendations = [
        self._collect_findings(audit_data, key=finding_kind)
        for finding_kind in ('issues', 'warnings', 'recommendations')
    ]

    self._upsert_page_audit(
        db=db,
        user_id=user_id,
        website_url=website_url,
        page_url=page_url,
        overall_score=overall_score,
        status=page_status,
        category_scores=category_scores,
        issues=issues,
        warnings=warnings,
        recommendations=recommendations,
        audit_data=audit_data
    )

    return {
        'success': True,
        'page_url': page_url,
        'overall_score': overall_score,
        'status': page_status
    }
def _weighted_score(self, category_scores: Dict[str, int]) -> int:
total = 0.0
for key, weight in self.weights.items():
total += float(category_scores.get(key, 0)) * weight
return int(round(total))
def _collect_findings(self, audit_data: Dict[str, Any], key: str) -> List[Dict[str, Any]]:
findings: List[Dict[str, Any]] = []
for category, data in audit_data.items():
if not isinstance(data, dict):
continue
items = data.get(key)
if not isinstance(items, list):
continue
for item in items:
if isinstance(item, dict):
enriched = dict(item)
enriched.setdefault('category', category)
findings.append(enriched)
return findings
def _performance_from_fetch(self, load_time: float, headers: Dict[str, str]) -> Dict[str, Any]:
issues: List[Dict[str, Any]] = []
warnings: List[Dict[str, Any]] = []
recommendations: List[Dict[str, Any]] = []
if load_time > 3:
issues.append({
'type': 'critical',
'message': f'Page load time too slow ({load_time:.2f}s)',
'location': 'Page performance',
'current_value': f'{load_time:.2f}s',
'fix': 'Optimize page speed (target < 3 seconds)',
'code_example': 'Optimize images, minify CSS/JS, use CDN',
'action': 'optimize_page_speed'
})
elif load_time > 2:
warnings.append({
'type': 'warning',
'message': f'Page load time could be improved ({load_time:.2f}s)',
'location': 'Page performance',
'current_value': f'{load_time:.2f}s',
'fix': 'Optimize for faster loading',
'code_example': 'Compress images, enable caching',
'action': 'improve_page_speed'
})
content_encoding = headers.get('Content-Encoding')
if not content_encoding:
warnings.append({
'type': 'warning',
'message': 'No compression detected',
'location': 'Server configuration',
'fix': 'Enable GZIP/Brotli compression',
'code_example': 'Enable compression in server or CDN',
'action': 'enable_compression'
})
cache_headers = ['Cache-Control', 'Expires', 'ETag']
has_cache = any(headers.get(h) for h in cache_headers)
if not has_cache:
warnings.append({
'type': 'warning',
'message': 'No caching headers found',
'location': 'Server configuration',
'fix': 'Add caching headers',
'code_example': 'Cache-Control: max-age=31536000',
'action': 'add_caching_headers'
})
score = max(0, 100 - len(issues) * 25 - len(warnings) * 10)
return {
'score': score,
'load_time': load_time,
'is_compressed': bool(content_encoding),
'has_cache': has_cache,
'issues': issues,
'warnings': warnings,
'recommendations': recommendations
}
def _security_from_headers(self, headers: Dict[str, str]) -> Dict[str, Any]:
security_headers = {
'X-Frame-Options': headers.get('X-Frame-Options'),
'X-Content-Type-Options': headers.get('X-Content-Type-Options'),
'X-XSS-Protection': headers.get('X-XSS-Protection'),
'Strict-Transport-Security': headers.get('Strict-Transport-Security'),
'Content-Security-Policy': headers.get('Content-Security-Policy'),
'Referrer-Policy': headers.get('Referrer-Policy')
}
issues: List[Dict[str, Any]] = []
warnings: List[Dict[str, Any]] = []
recommendations: List[Dict[str, Any]] = []
present_headers: List[str] = []
missing_headers: List[str] = []
for header_name, header_value in security_headers.items():
if header_value:
present_headers.append(header_name)
continue
missing_headers.append(header_name)
if header_name in ['X-Frame-Options', 'X-Content-Type-Options']:
issues.append({
'type': 'critical',
'message': f'Missing {header_name} header',
'location': 'Server configuration',
'fix': f'Add {header_name} header',
'code_example': f'{header_name}: DENY' if header_name == 'X-Frame-Options' else f'{header_name}: nosniff',
'action': f'add_{header_name.lower().replace("-", "_")}_header'
})
else:
warnings.append({
'type': 'warning',
'message': f'Missing {header_name} header',
'location': 'Server configuration',
'fix': f'Add {header_name} header for better security',
'code_example': f'{header_name}: max-age=31536000',
'action': f'add_{header_name.lower().replace("-", "_")}_header'
})
score = min(100, len(present_headers) * 16)
return {
'score': score,
'present_headers': present_headers,
'missing_headers': missing_headers,
'total_headers': len(present_headers),
'issues': issues,
'warnings': warnings,
'recommendations': recommendations
}
def _upsert_page_audit(
    self,
    db: Session,
    user_id: str,
    website_url: str,
    page_url: str,
    overall_score: int,
    status: str,
    category_scores: Optional[Dict[str, Any]] = None,
    issues: Optional[List[Dict[str, Any]]] = None,
    warnings: Optional[List[Dict[str, Any]]] = None,
    recommendations: Optional[List[Dict[str, Any]]] = None,
    audit_data: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Insert or update the SEOPageAudit row for one (user, page) pair.

    An existing row matched on ``user_id`` and ``page_url`` has every audit
    field overwritten; otherwise a new row is added. ``last_analyzed_at`` is
    set to the current UTC time in both cases, and the change is committed
    immediately.

    Raises:
        Whatever the session raises while querying or committing. The
        session is rolled back first so that it stays usable for the other
        pages audited with the same session.
    """
    # One definition of the written fields keeps insert and update in sync.
    fields = {
        'website_url': website_url,
        'overall_score': overall_score,
        'status': status,
        'category_scores': category_scores,
        'issues': issues,
        'warnings': warnings,
        'recommendations': recommendations,
        'audit_data': audit_data,
        'last_analyzed_at': datetime.utcnow(),
    }
    try:
        existing = db.query(SEOPageAudit).filter(
            SEOPageAudit.user_id == user_id,
            SEOPageAudit.page_url == page_url
        ).first()
        if existing:
            for attribute, value in fields.items():
                setattr(existing, attribute, value)
        else:
            db.add(SEOPageAudit(user_id=user_id, page_url=page_url, **fields))
        db.commit()
    except Exception:
        # A failed flush or commit leaves the session in a state that rejects
        # further work until it is rolled back.
        db.rollback()
        raise
def _normalize_url(self, url: str) -> str:
u = (url or "").strip()
if not u:
return ""
if not u.startswith("http://") and not u.startswith("https://"):
u = "https://" + u
parsed = urlparse(u)
normalized = parsed._replace(fragment="").geturl()
return normalized.rstrip("/")
def _same_site(self, root: str, url: str) -> bool:
try:
a = urlparse(root)
b = urlparse(url)
return a.netloc == b.netloc
except Exception:
return False

View File

@@ -0,0 +1,153 @@
"""
SIF Indexing Executor
Executes SIF indexing tasks (Step 2 metadata and User Website Content).
"""
import time
from datetime import datetime, timedelta
from typing import Any, Optional
from sqlalchemy.orm import Session
from models.website_analysis_monitoring_models import (
SIFIndexingTask,
SIFIndexingExecutionLog
)
from services.scheduler.core.executor_interface import TaskExecutor, TaskExecutionResult
from services.scheduler.core.failure_detection_service import FailureDetectionService
from services.intelligence.sif_integration import SIFIntegrationService
from utils.logger_utils import get_service_logger
logger = get_service_logger("sif_indexing_executor")
class SIFIndexingExecutor(TaskExecutor):
"""
Executor for SIF indexing tasks.
Handles:
- Indexing Step 2 Website Analysis Data (Metadata)
- Harvesting and Indexing User Website Content (Deep Crawl)
- Scheduling recurring updates (snapshot refresh)
"""
def __init__(self):
pass
async def execute_task(self, task: Any, db: Session) -> TaskExecutionResult:
    """
    Run one SIF indexing pass for a task and record the outcome.

    An execution-log row is written first with status "running". The
    onboarding metadata and the user's website content are then synced
    through a SIFIntegrationService created for the task's user.

    A run that raises no exception is recorded as a success even when neither
    sync reported data (only a warning is logged), and the task is
    rescheduled ``frequency_hours`` (default 48) from now. On an exception
    the task is retried in 60 minutes, or parked as "needs_intervention" when
    the failure detection service asks for a cool-off.

    Args:
        task: Must be a SIFIndexingTask; anything else is rejected with a
            non-retryable result.
        db: SQLAlchemy session used for the task and log rows.
    """
    start_time = time.time()

    def elapsed_ms() -> int:
        return int((time.time() - start_time) * 1000)

    if not isinstance(task, SIFIndexingTask):
        return TaskExecutionResult(
            success=False,
            error_message="Invalid task type for SIF indexing",
            retryable=False
        )

    task_log = SIFIndexingExecutionLog(
        task_id=task.id,
        status="running",
        execution_date=datetime.utcnow()
    )
    db.add(task_log)
    db.commit()

    user_id = str(task.user_id)
    website_url = task.website_url

    try:
        logger.info(f"Executing SIF indexing for user {user_id} ({website_url})")
        sif_service = SIFIntegrationService(user_id)

        # Step 2 metadata first, then the website content snapshot.
        metadata_synced = await sif_service.sync_onboarding_data_to_sif()
        content_synced = await sif_service.sync_user_website_content(website_url)

        # Nothing synced is worth a warning but is not treated as a failure.
        if not (metadata_synced or content_synced):
            logger.warning(f"SIF indexing completed but no data was synced/indexed for {user_id}")

        task.last_executed = datetime.utcnow()
        task.last_success = datetime.utcnow()

        # Recurring task: schedule the next refresh.
        frequency_hours = task.frequency_hours or 48
        task.next_execution = datetime.utcnow() + timedelta(hours=frequency_hours)
        task.status = "active"
        task.consecutive_failures = 0
        task.failure_pattern = None
        task.failure_reason = None

        task_log.status = "success"
        task_log.result_data = {
            "metadata_synced": metadata_synced,
            "content_synced": content_synced,
            "website_url": website_url
        }
        task_log.execution_time_ms = elapsed_ms()
        db.commit()

        return TaskExecutionResult(
            success=True,
            result_data=task_log.result_data,
            execution_time_ms=task_log.execution_time_ms,
            retryable=False
        )
    except Exception as e:
        db.rollback()
        logger.warning(f"SIF indexing task failed for user {user_id}: {e}")

        pattern = FailureDetectionService(db).analyze_task_failures(task.id, "sif_indexing", user_id)

        task.last_executed = datetime.utcnow()
        task.last_failure = datetime.utcnow()
        task.failure_reason = str(e)
        task.consecutive_failures = (task.consecutive_failures or 0) + 1

        if not (pattern and pattern.should_cool_off):
            # Transient failure: stay active and retry in an hour.
            task.status = "active"
            task.next_execution = datetime.utcnow() + timedelta(minutes=60)
        else:
            task.status = "needs_intervention"
            task.failure_pattern = {
                "consecutive_failures": pattern.consecutive_failures,
                "recent_failures": pattern.recent_failures,
                "failure_reason": pattern.failure_reason.value,
                "error_patterns": pattern.error_patterns,
                "cool_off_until": (datetime.utcnow() + timedelta(days=7)).isoformat()
            }
            task.next_execution = None

        task_log.status = "failed"
        task_log.error_message = str(e)
        task_log.execution_time_ms = elapsed_ms()
        db.add(task_log)
        db.commit()

        return TaskExecutionResult(
            success=False,
            error_message=str(e),
            execution_time_ms=task_log.execution_time_ms,
            retryable=(task.status != "needs_intervention"),
            retry_delay=3600
        )
def calculate_next_execution(self, task: Any, frequency: str, last_execution: datetime = None) -> datetime:
    """
    Return the next run time for a SIF indexing task.

    The interval comes from ``task.frequency_hours`` (48 hours when missing
    or falsy) and is added to ``last_execution`` or, when that is not given,
    to the current UTC time. ``frequency`` is not used. ``execute_task``
    computes its own schedule, so this method mainly satisfies the executor
    interface.
    """
    interval_hours = getattr(task, 'frequency_hours', 48)
    if not interval_hours:
        interval_hours = 48
    anchor = last_execution if last_execution else datetime.utcnow()
    return anchor + timedelta(hours=interval_hours)

View File

@@ -282,11 +282,18 @@ class WebsiteAnalysisExecutor(TaskExecutor):
None,
partial(self.style_logic.analyze_style_patterns, crawl_result['content'])
)
async def run_seo_audit():
loop = asyncio.get_event_loop()
return await loop.run_in_executor(
None,
partial(self.style_logic.perform_seo_audit, website_url, crawl_result['content'])
)
# Execute style and patterns analysis in parallel
style_analysis, patterns_result = await asyncio.gather(
style_analysis, patterns_result, seo_audit_result = await asyncio.gather(
run_style_analysis(),
run_patterns_analysis(),
run_seo_audit(),
return_exceptions=True
)
@@ -302,6 +309,12 @@ class WebsiteAnalysisExecutor(TaskExecutor):
if isinstance(patterns_result, Exception):
self.logger.warning(f"Patterns analysis exception: {patterns_result}")
patterns_result = None
seo_audit = None
if isinstance(seo_audit_result, Exception):
self.logger.warning(f"SEO audit exception: {seo_audit_result}")
else:
seo_audit = seo_audit_result
# Step 3: Generate style guidelines
style_guidelines = None
@@ -320,6 +333,7 @@ class WebsiteAnalysisExecutor(TaskExecutor):
'style_analysis': style_analysis.get('analysis') if style_analysis and style_analysis.get('success') else None,
'style_patterns': patterns_result if patterns_result and not isinstance(patterns_result, Exception) else None,
'style_guidelines': style_guidelines,
'seo_audit': seo_audit,
}
# Step 4: Store results based on task type
@@ -366,10 +380,12 @@ class WebsiteAnalysisExecutor(TaskExecutor):
):
"""Update existing WebsiteAnalysis record for user's website."""
try:
# Convert Clerk user ID to integer (same as component_logic.py)
# Use the same conversion logic as the website analysis API
import hashlib
user_id_int = int(hashlib.sha256(user_id.encode()).hexdigest()[:15], 16)
session = db.query(OnboardingSession).filter(
OnboardingSession.user_id == user_id
).order_by(OnboardingSession.updated_at.desc()).first()
if not session:
raise ValueError(f"No onboarding session found for user {user_id}")
# Use WebsiteAnalysisService to update
analysis_service = WebsiteAnalysisService(db)
@@ -380,13 +396,15 @@ class WebsiteAnalysisExecutor(TaskExecutor):
'style_analysis': analysis_data.get('style_analysis'),
'style_patterns': analysis_data.get('style_patterns'),
'style_guidelines': analysis_data.get('style_guidelines'),
'seo_audit': analysis_data.get('seo_audit'),
}
# Save/update analysis
analysis_id = analysis_service.save_analysis(
session_id=user_id_int,
session_id=session.id,
website_url=website_url,
analysis_data=response_data
analysis_data=response_data,
preserve_persona=True
)
if analysis_id:
@@ -490,3 +508,82 @@ class WebsiteAnalysisExecutor(TaskExecutor):
)
return last_execution + timedelta(days=task.frequency_days)
async def _perform_full_site_analysis(self, user_id: str, website_url: str, db: Session):
    """
    Discover the sitemap and run the non-AI SEO audit on the pages it lists.

    At most the first 50 sitemap URLs are audited. Each page is audited in
    the default executor so the event loop is not blocked, then stored as an
    SEOPageAudit row (updated when a row for the same user and page already
    exists) and committed on its own so progress is visible. A failure on one
    page is logged and rolled back without stopping the scan; a failure
    outside the per-page loop is logged and ends the scan.

    Args:
        user_id: Owner of the audit rows.
        website_url: Site whose sitemap is scanned.
        db: SQLAlchemy session used for the audit rows.
    """
    try:
        self.logger.info(f"Starting full site scan for {website_url}")
        sitemap_service = SitemapService()

        # 1. Discover Sitemap
        sitemap_url = await sitemap_service.discover_sitemap_url(website_url)
        if not sitemap_url:
            self.logger.warning(f"No sitemap found for {website_url}, skipping full site scan")
            return

        # 2. Get URLs (Raw mode)
        sitemap_data = await sitemap_service.analyze_sitemap(
            sitemap_url=sitemap_url,
            analyze_content_trends=False,
            analyze_publishing_patterns=False,
            include_ai_insights=False
        )
        urls = [u.get('loc') for u in sitemap_data.get('urls', []) if u.get('loc')]
        self.logger.info(f"Found {len(urls)} URLs in sitemap for {website_url}")

        # 3. Batch Process (Limit to 50 for safety during testing)
        urls_to_scan = urls[:50]
        # The loop does not change between pages, so look it up once.
        loop = asyncio.get_event_loop()

        for page_url in urls_to_scan:
            try:
                existing = db.query(SEOPageAudit).filter(
                    SEOPageAudit.user_id == user_id,
                    SEOPageAudit.page_url == page_url
                ).first()

                # Pass empty content dict to trigger internal fetching in perform_seo_audit
                audit_result = await loop.run_in_executor(
                    None,
                    partial(self.style_logic.perform_seo_audit, page_url, {})
                )

                # Tolerate an audit result whose 'summary' is present but None.
                summary = audit_result.get('summary') or {}

                # One definition of the stored fields keeps insert and update
                # in sync; in particular new rows now get 'last_analyzed_at'
                # just like updated rows.
                fields = {
                    'overall_score': audit_result.get('overall_score'),
                    'category_scores': {
                        k: v.get('score')
                        for k, v in audit_result.items()
                        if isinstance(v, dict) and 'score' in v
                    },
                    'issues': summary.get('critical_issues', []),
                    'warnings': summary.get('warnings', []),
                    'audit_data': audit_result,
                    'last_analyzed_at': datetime.utcnow(),
                    'status': 'completed',
                }

                if existing:
                    for attribute, value in fields.items():
                        setattr(existing, attribute, value)
                else:
                    db.add(SEOPageAudit(
                        user_id=user_id,
                        website_url=website_url,
                        page_url=page_url,
                        analysis_source='scheduled_full_site',
                        **fields
                    ))

                db.commit()  # Commit each page to show progress
            except Exception as e:
                self.logger.error(f"Error auditing page {page_url}: {e}")
                db.rollback()

        self.logger.info(f"Completed full site scan for {website_url}")
    except Exception as e:
        self.logger.error(f"Error in full site analysis: {e}")