# File: ALwrity/backend/services/website_analysis_monitoring_service.py
# (file-viewer metadata: 461 lines, 20 KiB, Python)

"""
Website Analysis Monitoring Service
Creates and manages website analysis monitoring tasks.
"""
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any, Optional
from sqlalchemy.orm import Session
from urllib.parse import urlparse
import hashlib
from models.website_analysis_monitoring_models import WebsiteAnalysisTask
from models.onboarding import OnboardingSession
from models.scheduler_models import SchedulerEventLog
from api.content_planning.services.content_strategy.onboarding import OnboardingDataIntegrationService
from services.database import get_db_session
from utils.logger_utils import get_service_logger
logger = get_service_logger("website_analysis_monitoring")
def _record_website_analysis_job_event(
    db,
    user_id: str,
    event_date: datetime,
    succeeded: bool,
    error_message: Optional[str],
    extra_event_data: Dict[str, Any],
) -> None:
    """Best-effort insert of one SchedulerEventLog row for the task-creation job.

    A failure to write the row is logged as a warning and the session is
    rolled back; the failure is not propagated by this helper's own handler.

    Args:
        db: Open database session used to add and commit the row.
        user_id: Clerk user ID the job ran for.
        event_date: Timestamp stored as the event date (the job start time).
        succeeded: True records ``job_completed``, False records ``job_failed``.
        error_message: Error text stored on the row, or None on success.
        extra_event_data: Extra keys appended to the ``event_data`` payload.
    """
    try:
        event_data = {
            "job_function": "generate_website_analysis_tasks_task",
            "status": "success" if succeeded else "failed",
        }
        event_data.update(extra_event_data)
        event_log = SchedulerEventLog(
            event_type="job_completed" if succeeded else "job_failed",
            event_date=event_date,
            job_id=f"website_analysis_tasks_{user_id}",
            job_type="one_time",
            user_id=user_id,
            error_message=error_message,
            event_data=event_data,
        )
        db.add(event_log)
        db.commit()
    except Exception as log_error:
        logger.warning(f"Failed to log website analysis task creation event for user {user_id}: {log_error}")
        db.rollback()


async def generate_website_analysis_tasks_task(user_id: str):
    """Scheduled job: create website analysis tasks for one user and log the outcome.

    Opens a database session via ``get_db_session``, delegates the work to
    ``create_website_analysis_tasks`` and records a ``SchedulerEventLog`` row
    describing the result (``job_completed`` or ``job_failed``).  Exceptions
    raised while creating the tasks are caught and logged instead of being
    propagated; the session is always closed.

    Args:
        user_id: Clerk user ID the tasks are created for.
    """
    db = None
    # Naive UTC timestamp, matching the other timestamps written by this module.
    start_time = datetime.utcnow()
    try:
        db = get_db_session(user_id)
        if not db:
            raise RuntimeError(f"Failed to get database session for user {user_id}")

        result = create_website_analysis_tasks(user_id=user_id, db=db)
        success = bool(result.get("success"))
        failure_text = None
        if not success:
            failure_text = str(result.get("error") or "website analysis task creation failed")

        _record_website_analysis_job_event(
            db=db,
            user_id=user_id,
            event_date=start_time,
            succeeded=success,
            error_message=failure_text,
            extra_event_data={"tasks_created": int(result.get("tasks_created") or 0)},
        )
    except Exception as e:
        logger.error(f"Scheduled website analysis task creation failed for user {user_id}: {e}", exc_info=True)
        if db:
            # The session may still hold a failed or partial transaction from the
            # work above; reset it so the failure event is not lost for that reason.
            try:
                db.rollback()
            except Exception as rollback_error:
                logger.warning(
                    f"Failed to roll back session before logging failure for user {user_id}: {rollback_error}"
                )
            _record_website_analysis_job_event(
                db=db,
                user_id=user_id,
                event_date=start_time,
                succeeded=False,
                error_message=str(e),
                extra_event_data={"exception_type": type(e).__name__},
            )
    finally:
        if db:
            db.close()
def schedule_website_analysis_task_creation(user_id: str, delay_minutes: int = 5) -> str:
    """Queue a one-time job that creates the user's website analysis tasks.

    The job calls ``generate_website_analysis_tasks_task`` with the given
    ``user_id`` and is registered under the id
    ``website_analysis_tasks_<user_id>`` with ``replace_existing=True``.

    Args:
        user_id: Clerk user ID passed to the job as a keyword argument.
        delay_minutes: Minutes from now (UTC) at which the job should run.

    Returns:
        Whatever ``scheduler.schedule_one_time_task`` returns.
    """
    # Imported at call time, as in the original implementation.
    from services.scheduler import get_scheduler

    task_scheduler = get_scheduler()
    fire_at = datetime.now(timezone.utc) + timedelta(minutes=delay_minutes)
    job_arguments = {"user_id": user_id}
    return task_scheduler.schedule_one_time_task(
        func=generate_website_analysis_tasks_task,
        run_date=fire_at,
        job_id=f"website_analysis_tasks_{user_id}",
        kwargs=job_arguments,
        replace_existing=True,
    )
def clerk_user_id_to_int(user_id: str) -> int:
    """Map a Clerk user ID to a stable integer usable as a database session_id.

    The value is derived from the SHA-256 digest of the ID, so the same ID
    always yields the same integer.  The original notes state that this MUST
    match the pattern used in component_logic.py for onboarding, so the
    arithmetic below must not be altered.

    Args:
        user_id: Clerk user ID (e.g., 'user_33Gz1FPI86VDXhRY8QN4ragRFGN')

    Returns:
        int: Deterministic integer in the range [0, 2147483647).
    """
    digest = hashlib.sha256(user_id.encode()).digest()
    # The first four digest bytes read big-endian are exactly the first
    # eight characters of the hex digest interpreted as base 16.
    leading_word = int.from_bytes(digest[:4], "big")
    return leading_word % 2147483647
def _is_blank_url(value: Any) -> bool:
    """Return True when *value* is None, empty, or a whitespace-only string."""
    if not value:
        return True
    return isinstance(value, str) and not value.strip()


def _latest_website_analysis_record(user_id: str, db: Session):
    """Return the newest WebsiteAnalysis row for the user's hash-based session_id, or None."""
    from models.onboarding import WebsiteAnalysis

    # The session_id MUST be derived with the same hash as onboarding uses.
    session_id_int = clerk_user_id_to_int(user_id)
    logger.info(
        f"[Website Analysis Tasks] Querying WebsiteAnalysis with hash-based session_id: {session_id_int} "
        f"for user {user_id}"
    )
    return (
        db.query(WebsiteAnalysis)
        .filter(WebsiteAnalysis.session_id == session_id_int)
        .order_by(WebsiteAnalysis.created_at.desc())
        .first()
    )


def _url_from_crawl_result(crawl_result: Any) -> Optional[str]:
    """Return the first usable URL found in a stored crawl result, or None.

    Candidates are checked in this order: ``url``, ``website_url``,
    ``content.domain_info.domain``, ``content.domain_info.url``.  Values that
    are not non-blank strings are skipped.  ``https://`` is prepended when the
    candidate has no http(s) scheme.
    """
    if not isinstance(crawl_result, dict):
        return None

    candidates = [crawl_result.get('url'), crawl_result.get('website_url')]
    content = crawl_result.get('content')
    if isinstance(content, dict):
        domain_info = content.get('domain_info')
        if isinstance(domain_info, dict):
            candidates.append(domain_info.get('domain'))
            candidates.append(domain_info.get('url'))

    for candidate in candidates:
        if not isinstance(candidate, str) or not candidate.strip():
            continue
        candidate = candidate.strip()
        if not candidate.startswith(('http://', 'https://')):
            candidate = f"https://{candidate}"
        return candidate
    return None


def _recover_website_url(user_id: str, db: Session) -> Optional[str]:
    """Look the website URL up directly on the latest WebsiteAnalysis row.

    Uses the row's ``website_url`` when it is non-blank, otherwise tries to
    extract a URL from the row's ``crawl_result``.  Returns None when nothing
    usable is found.
    """
    analysis = _latest_website_analysis_record(user_id, db)
    if not analysis:
        logger.warning(
            f"[Website Analysis Tasks] No WebsiteAnalysis record found for "
            f"hash-based session_id {clerk_user_id_to_int(user_id)} (user {user_id})"
        )
        return None

    logger.info(
        f"[Website Analysis Tasks] Direct model access - "
        f"website_url: {repr(analysis.website_url)}, "
        f"type: {type(analysis.website_url).__name__ if analysis.website_url else 'None'}, "
        f"id: {analysis.id}, session_id: {analysis.session_id}"
    )
    if not _is_blank_url(analysis.website_url):
        logger.info(
            f"[Website Analysis Tasks] ✅ Retrieved website_url via hash-based query: {analysis.website_url}"
        )
        return analysis.website_url

    crawl_result = analysis.crawl_result
    if not crawl_result or not isinstance(crawl_result, dict):
        logger.warning(
            f"[Website Analysis Tasks] website_url is NULL and crawl_result is empty or invalid. "
            f"Analysis ID: {analysis.id}, Status: {analysis.status}, "
            f"crawl_result type: {type(crawl_result).__name__ if crawl_result else 'None'}"
        )
        return None

    crawl_url = _url_from_crawl_result(crawl_result)
    if crawl_url:
        logger.info(f"[Website Analysis Tasks] ✅ Extracted website_url from crawl_result: {crawl_url}")
        return crawl_url

    logger.warning(
        f"[Website Analysis Tasks] Cannot extract URL from crawl_result. "
        f"crawl_result keys: {list(crawl_result.keys())}, "
        f"Analysis ID: {analysis.id}"
    )
    return None


def create_website_analysis_tasks(user_id: str, db: Session) -> Dict[str, Any]:
    """
    Create website analysis tasks for user's website and all competitors.
    This should be called after onboarding completion.

    The website URL is taken from the integrated onboarding data; when that is
    missing or blank, the latest WebsiteAnalysis row (looked up by the
    hash-based session_id) and its crawl_result are used as fallbacks.  A URL
    that is still empty or whitespace-only after the fallbacks is rejected.
    One task is created (or refreshed) for the user's site with a 30 day
    frequency, and one per competitor with a 10 day frequency and a 5 minute
    initial delay.  Competitor entries that are not dicts or carry no URL are
    skipped.  The session is committed on success and rolled back on error.

    Args:
        user_id: Clerk user ID (string)
        db: Database session

    Returns:
        Dictionary with ``success``; on success also ``tasks_created`` and
        ``tasks`` (id, url, type, next_check), on failure ``error``.
    """
    try:
        logger.info(f"[Website Analysis Tasks] Creating tasks for user: {user_id}")

        # Get user's website URL from onboarding using SSOT
        integration_service = OnboardingDataIntegrationService()
        integrated_data = integration_service.get_integrated_data_sync(user_id, db)
        website_analysis = integrated_data.get('website_analysis', {})

        if not website_analysis:
            logger.warning(f"[Website Analysis Tasks] No website analysis found for user {user_id}")
            # Try direct query using hash-based session_id (must match onboarding pattern)
            try:
                analysis = _latest_website_analysis_record(user_id, db)
                if analysis:
                    logger.info(
                        f"[Website Analysis Tasks] ✅ Found analysis via hash-based query: {analysis.website_url}"
                    )
                    website_analysis = analysis.to_dict()
            except Exception as e:
                logger.debug(f"[Website Analysis Tasks] Direct query fallback failed: {e}")

        if not website_analysis:
            return {
                'success': False,
                'error': 'No website analysis found. Complete onboarding first.'
            }

        website_url = website_analysis.get('website_url')
        # Log the actual value for debugging (always log, not just debug level)
        logger.info(
            f"[Website Analysis Tasks] website_url from dict: {repr(website_url)} "
            f"(type: {type(website_url).__name__}, truthy: {bool(website_url)})"
        )

        if _is_blank_url(website_url):
            logger.warning(
                f"[Website Analysis Tasks] No website URL found for user {user_id}. "
                f"Analysis keys: {list(website_analysis.keys()) if website_analysis else 'None'}, "
                f"website_url value: {repr(website_url)}"
            )
            try:
                website_url = _recover_website_url(user_id, db)
            except Exception as e:
                logger.warning(f"[Website Analysis Tasks] Hash-based query fallback failed: {e}", exc_info=True)

        # Use the same blank test as above: a whitespace-only string is truthy
        # and would otherwise slip through and get a task created for it.
        if _is_blank_url(website_url):
            return {
                'success': False,
                'error': 'No website URL found in onboarding data. Please complete step 2 (Website Analysis) in onboarding.'
            }

        logger.info(f"[Website Analysis Tasks] User website URL: {website_url}")
        tasks_created = []

        # 1. Create task for user's website (optional recurring every 30 days)
        user_task = _create_or_update_task(
            db=db,
            user_id=user_id,
            website_url=website_url,
            task_type='user_website',
            frequency_days=30  # Optional: recurring every 30 days
        )
        if user_task:
            tasks_created.append(user_task)
            logger.info(f"Created user website analysis task for {website_url}")

        # 2. Get competitors from onboarding.  Entries that are not dicts cannot
        # be read with .get(); drop them so one malformed entry does not abort
        # (and roll back) the whole run.
        competitors = [
            entry for entry in _get_competitors_from_onboarding(user_id, db)
            if isinstance(entry, dict)
        ]
        logger.info(
            f"[Website Analysis Tasks] Found {len(competitors)} competitors for user {user_id}. "
            f"Competitors: {[c.get('url') or c.get('website_url') or c.get('domain') for c in competitors]}"
        )

        # 3. Create task for each competitor
        for competitor in competitors:
            competitor_url = competitor.get('url') or competitor.get('website_url')
            if not competitor_url:
                continue

            # Extract competitor identifier
            competitor_id = competitor.get('domain') or competitor.get('id') or _extract_domain(competitor_url)
            competitor_task = _create_or_update_task(
                db=db,
                user_id=user_id,
                website_url=competitor_url,
                task_type='competitor',
                competitor_id=competitor_id,
                frequency_days=10,  # Recurring every 10 days
                initial_delay_minutes=5
            )
            if competitor_task:
                tasks_created.append(competitor_task)
                logger.info(f"Created competitor analysis task for {competitor_url}")

        db.commit()
        logger.info(f"Created {len(tasks_created)} website analysis tasks for user {user_id}")

        return {
            'success': True,
            'tasks_created': len(tasks_created),
            'tasks': [{
                'id': t.id,
                'url': t.website_url,
                'type': t.task_type,
                'next_check': t.next_check.isoformat() if t.next_check else None
            } for t in tasks_created]
        }
    except Exception as e:
        logger.error(f"Error creating website analysis tasks for user {user_id}: {e}", exc_info=True)
        db.rollback()
        return {
            'success': False,
            'error': str(e)
        }
def _create_or_update_task(
    db: Session,
    user_id: str,
    website_url: str,
    task_type: str,
    competitor_id: Optional[str] = None,
    frequency_days: int = 10,
    initial_delay_minutes: Optional[int] = None
) -> Optional[WebsiteAnalysisTask]:
    """Return the task for (user, url, type), refreshing it or creating it.

    An existing task is reactivated, gets the new frequency and has its next
    check pushed ``frequency_days`` into the future; ``initial_delay_minutes``
    is not applied to existing tasks.  A new task is added and flushed (not
    committed); its first check is ``initial_delay_minutes`` from now when
    that is given, otherwise ``frequency_days`` from now.

    Returns:
        The updated or newly created task, or None when any error occurred
        (the error is logged, not raised).
    """
    try:
        matching = db.query(WebsiteAnalysisTask).filter(
            WebsiteAnalysisTask.user_id == user_id,
            WebsiteAnalysisTask.website_url == website_url,
            WebsiteAnalysisTask.task_type == task_type
        )
        found = matching.first()

        if not found:
            # No task yet: work out the first check time, then insert.
            offset = timedelta(days=frequency_days)
            if initial_delay_minutes is not None:
                offset = timedelta(minutes=initial_delay_minutes)
            first_check = datetime.utcnow() + offset

            created = WebsiteAnalysisTask(
                user_id=user_id,
                website_url=website_url,
                task_type=task_type,
                competitor_id=competitor_id,
                status='active',
                frequency_days=frequency_days,
                next_check=first_check
            )
            db.add(created)
            # Flush so the generated id is available for the log line below.
            db.flush()
            logger.info(f"Created new website analysis task {created.id} for {website_url}")
            return created

        # Task already present: refresh it in place.
        found.status = 'active'
        found.frequency_days = frequency_days
        found.next_check = datetime.utcnow() + timedelta(days=frequency_days)
        found.updated_at = datetime.utcnow()
        if competitor_id:
            found.competitor_id = competitor_id
        logger.info(f"Updated existing website analysis task {found.id}")
        return found
    except Exception as e:
        logger.error(f"Error creating/updating task: {e}", exc_info=True)
        return None
def _run_coroutine_from_sync(coro):
    """Run *coro* to completion from synchronous code and return its result.

    A private event loop is created for the coroutine and closed afterwards,
    so the thread's current loop (if any) is left untouched.  When an event
    loop is already running in the calling thread, ``run_until_complete``
    cannot be used there, so the private loop is driven on a worker thread
    while the caller blocks on the result.
    """
    import asyncio
    import concurrent.futures

    def _drive():
        private_loop = asyncio.new_event_loop()
        try:
            return private_loop.run_until_complete(coro)
        finally:
            private_loop.close()

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: drive the coroutine right here.
        return _drive()

    # NOTE(review): assumes the coroutine does not depend on objects bound to
    # the caller's running loop (which stays blocked until it finishes) - confirm.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(_drive).result()


def _get_competitors_from_onboarding(user_id: str, db: Session) -> List[Dict[str, Any]]:
    """
    Get competitors from onboarding database.

    Sources are tried in order until one yields a non-empty value:
      1. ``competitor_analysis`` in the integrated onboarding data
      2. ``research_preferences['competitors']`` in the integrated data
      3. onboarding session ``step_data['step3_research_data']['competitors']``
      4. ``Step3ResearchService.get_research_data`` (legacy fallback)

    Keys that are present but hold ``None`` are treated as missing so that the
    remaining fallbacks are still tried.

    Args:
        user_id: Clerk user ID (string)
        db: Database session

    Returns:
        The competitors list; an empty list when nothing was found, when the
        value found is not a list, or when an error occurred (errors are
        logged, not raised).
    """
    try:
        # Get onboarding session using SSOT
        integration_service = OnboardingDataIntegrationService()
        integrated_data = integration_service.get_integrated_data_sync(user_id, db)

        # Priority 1: Check competitor_analysis (from CompetitorAnalysis table)
        competitors = integrated_data.get('competitor_analysis') or []

        # Priority 2: Check research_preferences
        if not competitors:
            research_preferences = integrated_data.get('research_preferences') or {}
            if isinstance(research_preferences, dict):
                competitors = research_preferences.get('competitors') or []

        session = integrated_data.get('onboarding_session')

        # Method 1: session step_data fallback
        if not competitors and session:
            raw_step_data = getattr(session, 'step_data', None)
            if raw_step_data:
                step_data = raw_step_data if isinstance(raw_step_data, dict) else {}
                research_data = step_data.get('step3_research_data') or {}
                if isinstance(research_data, dict):
                    competitors = research_data.get('competitors') or []
                logger.info(f"[Competitor Retrieval] Method 1 (step_data): found {len(competitors)} competitors")

        # Method 2: If still not found, try Step3ResearchService (Legacy Fallback)
        if not competitors:
            logger.info(f"[Competitor Retrieval] Attempting Step3ResearchService for user {user_id}")
            try:
                # We need session_id for Step3ResearchService
                if session and hasattr(session, 'id'):
                    from api.onboarding_utils.step3_research_service import Step3ResearchService

                    step3_service = Step3ResearchService()
                    # This function is reachable from the async scheduler job, where a
                    # loop is already running; the helper handles both situations.
                    research_data_result = _run_coroutine_from_sync(
                        step3_service.get_research_data(str(session.id))
                    )
                    logger.info(f"[Competitor Retrieval] Step3ResearchService result: {research_data_result.get('success')}")
                    if research_data_result.get('success'):
                        research_data = research_data_result.get('research_data') or {}
                        step3_data = research_data.get('step3_research_data') or {}
                        competitors = step3_data.get('competitors') or []
                        logger.info(f"[Competitor Retrieval] Retrieved {len(competitors)} competitors from Step3ResearchService")
                    else:
                        logger.warning(f"[Competitor Retrieval] Step3ResearchService returned error: {research_data_result.get('error')}")
            except Exception as e:
                logger.warning(f"[Competitor Retrieval] Could not fetch competitors from Step3ResearchService: {e}", exc_info=True)

        # Ensure competitors is a list
        if not isinstance(competitors, list):
            competitors = []

        logger.info(f"Found {len(competitors)} competitors for user {user_id}")
        return competitors
    except Exception as e:
        logger.error(f"Error getting competitors from onboarding: {e}", exc_info=True)
        return []
def _extract_domain(url: str) -> str:
"""Extract domain from URL."""
try:
parsed = urlparse(url)
return parsed.netloc or url
except Exception:
return url