370 lines
16 KiB
Python
370 lines
16 KiB
Python
"""
|
|
Website Analysis Monitoring Service
|
|
Creates and manages website analysis monitoring tasks.
|
|
"""
|
|
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Any, Optional
|
|
from sqlalchemy.orm import Session
|
|
from urllib.parse import urlparse
|
|
import hashlib
|
|
|
|
from models.website_analysis_monitoring_models import WebsiteAnalysisTask
|
|
from models.onboarding import OnboardingSession
|
|
from services.onboarding.database_service import OnboardingDatabaseService
|
|
from utils.logger_utils import get_service_logger
|
|
|
|
logger = get_service_logger("website_analysis_monitoring")
|
|
|
|
|
|
def clerk_user_id_to_int(user_id: str) -> int:
|
|
"""
|
|
Convert Clerk user ID to consistent integer for database session_id.
|
|
Uses SHA256 hashing for deterministic, consistent results.
|
|
This MUST match the pattern used in component_logic.py for onboarding.
|
|
|
|
Args:
|
|
user_id: Clerk user ID (e.g., 'user_33Gz1FPI86VDXhRY8QN4ragRFGN')
|
|
|
|
Returns:
|
|
int: Deterministic integer derived from user ID
|
|
"""
|
|
user_id_hash = hashlib.sha256(user_id.encode()).hexdigest()
|
|
return int(user_id_hash[:8], 16) % 2147483647
|
|
|
|
|
|
def create_website_analysis_tasks(user_id: str, db: Session) -> Dict[str, Any]:
|
|
"""
|
|
Create website analysis tasks for user's website and all competitors.
|
|
|
|
This should be called after onboarding completion.
|
|
|
|
Args:
|
|
user_id: Clerk user ID (string)
|
|
db: Database session
|
|
|
|
Returns:
|
|
Dictionary with success status and task details
|
|
"""
|
|
try:
|
|
logger.info(f"[Website Analysis Tasks] Creating tasks for user: {user_id}")
|
|
|
|
# Get user's website URL from onboarding
|
|
onboarding_service = OnboardingDatabaseService(db=db)
|
|
website_analysis = onboarding_service.get_website_analysis(user_id, db)
|
|
|
|
if not website_analysis:
|
|
logger.warning(f"[Website Analysis Tasks] No website analysis found for user {user_id}")
|
|
# Try direct query using hash-based session_id (must match onboarding pattern)
|
|
try:
|
|
from models.onboarding import WebsiteAnalysis
|
|
session_id_int = clerk_user_id_to_int(user_id)
|
|
|
|
logger.info(
|
|
f"[Website Analysis Tasks] Querying WebsiteAnalysis with hash-based session_id: {session_id_int}"
|
|
)
|
|
|
|
analysis = db.query(WebsiteAnalysis).filter(
|
|
WebsiteAnalysis.session_id == session_id_int
|
|
).order_by(WebsiteAnalysis.created_at.desc()).first()
|
|
|
|
if analysis:
|
|
logger.info(f"[Website Analysis Tasks] ✅ Found analysis via hash-based query: {analysis.website_url}")
|
|
website_analysis = analysis.to_dict()
|
|
except Exception as e:
|
|
logger.debug(f"[Website Analysis Tasks] Direct query fallback failed: {e}")
|
|
|
|
if not website_analysis:
|
|
return {
|
|
'success': False,
|
|
'error': 'No website analysis found. Complete onboarding first.'
|
|
}
|
|
|
|
website_url = website_analysis.get('website_url')
|
|
|
|
# Log the actual value for debugging (always log, not just debug level)
|
|
logger.info(
|
|
f"[Website Analysis Tasks] website_url from dict: {repr(website_url)} "
|
|
f"(type: {type(website_url).__name__}, truthy: {bool(website_url)})"
|
|
)
|
|
|
|
# Check if website_url is None, empty string, or whitespace
|
|
if not website_url or (isinstance(website_url, str) and not website_url.strip()):
|
|
# Log what we actually got for debugging
|
|
logger.warning(
|
|
f"[Website Analysis Tasks] No website URL found for user {user_id}. "
|
|
f"Analysis keys: {list(website_analysis.keys()) if website_analysis else 'None'}, "
|
|
f"website_url value: {repr(website_url)}"
|
|
)
|
|
|
|
# Try direct access to the model using hash-based session_id
|
|
# This MUST use the same hash function as onboarding (clerk_user_id_to_int)
|
|
try:
|
|
from models.onboarding import WebsiteAnalysis
|
|
session_id_int = clerk_user_id_to_int(user_id)
|
|
|
|
logger.info(
|
|
f"[Website Analysis Tasks] Querying WebsiteAnalysis with hash-based session_id: {session_id_int} "
|
|
f"for user {user_id}"
|
|
)
|
|
|
|
analysis = db.query(WebsiteAnalysis).filter(
|
|
WebsiteAnalysis.session_id == session_id_int
|
|
).order_by(WebsiteAnalysis.created_at.desc()).first()
|
|
|
|
if analysis:
|
|
logger.info(
|
|
f"[Website Analysis Tasks] Direct model access - "
|
|
f"website_url: {repr(analysis.website_url)}, "
|
|
f"type: {type(analysis.website_url).__name__ if analysis.website_url else 'None'}, "
|
|
f"id: {analysis.id}, session_id: {analysis.session_id}"
|
|
)
|
|
|
|
if analysis.website_url:
|
|
website_url = analysis.website_url
|
|
logger.info(f"[Website Analysis Tasks] ✅ Retrieved website_url via hash-based query: {website_url}")
|
|
else:
|
|
# Try to extract URL from crawl_result if website_url is NULL
|
|
if analysis.crawl_result and isinstance(analysis.crawl_result, dict):
|
|
# Check multiple possible locations for URL
|
|
crawl_url = (
|
|
analysis.crawl_result.get('url') or
|
|
analysis.crawl_result.get('website_url') or
|
|
(analysis.crawl_result.get('content', {}).get('domain_info', {}).get('domain') if isinstance(analysis.crawl_result.get('content'), dict) else None)
|
|
)
|
|
|
|
# If still not found, check if crawl_result has nested structure
|
|
if not crawl_url and 'content' in analysis.crawl_result:
|
|
content = analysis.crawl_result.get('content', {})
|
|
if isinstance(content, dict):
|
|
# Check domain_info for domain
|
|
domain_info = content.get('domain_info', {})
|
|
if isinstance(domain_info, dict):
|
|
crawl_url = domain_info.get('domain') or domain_info.get('url')
|
|
|
|
if crawl_url:
|
|
# Ensure it's a full URL (add https:// if missing)
|
|
if crawl_url and not crawl_url.startswith(('http://', 'https://')):
|
|
crawl_url = f"https://{crawl_url}"
|
|
logger.info(f"[Website Analysis Tasks] ✅ Extracted website_url from crawl_result: {crawl_url}")
|
|
website_url = crawl_url
|
|
else:
|
|
logger.warning(
|
|
f"[Website Analysis Tasks] Cannot extract URL from crawl_result. "
|
|
f"crawl_result keys: {list(analysis.crawl_result.keys()) if isinstance(analysis.crawl_result, dict) else 'not a dict'}, "
|
|
f"Analysis ID: {analysis.id}"
|
|
)
|
|
else:
|
|
logger.warning(
|
|
f"[Website Analysis Tasks] website_url is NULL and crawl_result is empty or invalid. "
|
|
f"Analysis ID: {analysis.id}, Status: {analysis.status}, "
|
|
f"crawl_result type: {type(analysis.crawl_result).__name__ if analysis.crawl_result else 'None'}"
|
|
)
|
|
else:
|
|
logger.warning(
|
|
f"[Website Analysis Tasks] No WebsiteAnalysis record found for "
|
|
f"hash-based session_id {session_id_int} (user {user_id})"
|
|
)
|
|
except Exception as e:
|
|
logger.warning(f"[Website Analysis Tasks] Hash-based query fallback failed: {e}", exc_info=True)
|
|
|
|
if not website_url:
|
|
return {
|
|
'success': False,
|
|
'error': 'No website URL found in onboarding data. Please complete step 2 (Website Analysis) in onboarding.'
|
|
}
|
|
|
|
logger.info(f"[Website Analysis Tasks] User website URL: {website_url}")
|
|
|
|
tasks_created = []
|
|
|
|
# 1. Create task for user's website (optional recurring every 30 days)
|
|
user_task = _create_or_update_task(
|
|
db=db,
|
|
user_id=user_id,
|
|
website_url=website_url,
|
|
task_type='user_website',
|
|
frequency_days=30 # Optional: recurring every 30 days
|
|
)
|
|
if user_task:
|
|
tasks_created.append(user_task)
|
|
logger.info(f"Created user website analysis task for {website_url}")
|
|
|
|
# 2. Get competitors from onboarding
|
|
competitors = _get_competitors_from_onboarding(user_id, db)
|
|
logger.info(
|
|
f"[Website Analysis Tasks] Found {len(competitors)} competitors for user {user_id}. "
|
|
f"Competitors: {[c.get('url') or c.get('website_url') or c.get('domain') for c in competitors]}"
|
|
)
|
|
|
|
# 3. Create task for each competitor
|
|
for competitor in competitors:
|
|
competitor_url = competitor.get('url') or competitor.get('website_url')
|
|
if not competitor_url:
|
|
continue
|
|
|
|
# Extract competitor identifier
|
|
competitor_id = competitor.get('domain') or competitor.get('id') or _extract_domain(competitor_url)
|
|
|
|
competitor_task = _create_or_update_task(
|
|
db=db,
|
|
user_id=user_id,
|
|
website_url=competitor_url,
|
|
task_type='competitor',
|
|
competitor_id=competitor_id,
|
|
frequency_days=10 # Recurring every 10 days
|
|
)
|
|
if competitor_task:
|
|
tasks_created.append(competitor_task)
|
|
logger.info(f"Created competitor analysis task for {competitor_url}")
|
|
|
|
db.commit()
|
|
|
|
logger.info(f"Created {len(tasks_created)} website analysis tasks for user {user_id}")
|
|
|
|
return {
|
|
'success': True,
|
|
'tasks_created': len(tasks_created),
|
|
'tasks': [{
|
|
'id': t.id,
|
|
'url': t.website_url,
|
|
'type': t.task_type,
|
|
'next_check': t.next_check.isoformat() if t.next_check else None
|
|
} for t in tasks_created]
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error creating website analysis tasks for user {user_id}: {e}", exc_info=True)
|
|
db.rollback()
|
|
return {
|
|
'success': False,
|
|
'error': str(e)
|
|
}
|
|
|
|
|
|
def _create_or_update_task(
|
|
db: Session,
|
|
user_id: str,
|
|
website_url: str,
|
|
task_type: str,
|
|
competitor_id: Optional[str] = None,
|
|
frequency_days: int = 10
|
|
) -> Optional[WebsiteAnalysisTask]:
|
|
"""Create or update a website analysis task."""
|
|
try:
|
|
# Check if task already exists
|
|
existing = db.query(WebsiteAnalysisTask).filter(
|
|
WebsiteAnalysisTask.user_id == user_id,
|
|
WebsiteAnalysisTask.website_url == website_url,
|
|
WebsiteAnalysisTask.task_type == task_type
|
|
).first()
|
|
|
|
if existing:
|
|
# Update existing task
|
|
existing.status = 'active'
|
|
existing.frequency_days = frequency_days
|
|
existing.next_check = datetime.utcnow() + timedelta(days=frequency_days)
|
|
existing.updated_at = datetime.utcnow()
|
|
if competitor_id:
|
|
existing.competitor_id = competitor_id
|
|
logger.info(f"Updated existing website analysis task {existing.id}")
|
|
return existing
|
|
|
|
# Create new task
|
|
task = WebsiteAnalysisTask(
|
|
user_id=user_id,
|
|
website_url=website_url,
|
|
task_type=task_type,
|
|
competitor_id=competitor_id,
|
|
status='active',
|
|
frequency_days=frequency_days,
|
|
next_check=datetime.utcnow() + timedelta(days=frequency_days)
|
|
)
|
|
db.add(task)
|
|
db.flush()
|
|
logger.info(f"Created new website analysis task {task.id} for {website_url}")
|
|
return task
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error creating/updating task: {e}", exc_info=True)
|
|
return None
|
|
|
|
|
|
def _get_competitors_from_onboarding(user_id: str, db: Session) -> List[Dict[str, Any]]:
|
|
"""
|
|
Get competitors from onboarding database.
|
|
|
|
Competitors are stored in onboarding_sessions.step_data['step3_research_data']['competitors']
|
|
or via Step3ResearchService.
|
|
"""
|
|
try:
|
|
# Get onboarding session
|
|
onboarding_service = OnboardingDatabaseService(db=db)
|
|
session = onboarding_service.get_session_by_user(user_id, db)
|
|
|
|
if not session:
|
|
logger.warning(f"No onboarding session found for user {user_id}")
|
|
return []
|
|
|
|
# Try to get from step_data JSON column
|
|
competitors = []
|
|
|
|
# Method 1: Check if step_data column exists and has competitors
|
|
if hasattr(session, 'step_data') and session.step_data:
|
|
step_data = session.step_data if isinstance(session.step_data, dict) else {}
|
|
research_data = step_data.get('step3_research_data', {})
|
|
competitors = research_data.get('competitors', [])
|
|
logger.info(f"[Competitor Retrieval] Method 1 (step_data): found {len(competitors)} competitors")
|
|
|
|
# Method 2: If not found, try Step3ResearchService
|
|
if not competitors:
|
|
logger.info(f"[Competitor Retrieval] Attempting Step3ResearchService for user {user_id}, session_id: {session.id}")
|
|
try:
|
|
from api.onboarding_utils.step3_research_service import Step3ResearchService
|
|
import asyncio
|
|
step3_service = Step3ResearchService()
|
|
|
|
# Run async function - handle both new and existing event loops
|
|
try:
|
|
loop = asyncio.get_event_loop()
|
|
except RuntimeError:
|
|
loop = asyncio.new_event_loop()
|
|
asyncio.set_event_loop(loop)
|
|
|
|
research_data_result = loop.run_until_complete(
|
|
step3_service.get_research_data(str(session.id))
|
|
)
|
|
|
|
logger.info(f"[Competitor Retrieval] Step3ResearchService result: {research_data_result.get('success')}")
|
|
|
|
if research_data_result.get('success'):
|
|
research_data = research_data_result.get('research_data', {})
|
|
step3_data = research_data.get('step3_research_data', {})
|
|
competitors = step3_data.get('competitors', [])
|
|
logger.info(f"[Competitor Retrieval] Retrieved {len(competitors)} competitors from Step3ResearchService")
|
|
else:
|
|
logger.warning(f"[Competitor Retrieval] Step3ResearchService returned error: {research_data_result.get('error')}")
|
|
except Exception as e:
|
|
logger.warning(f"[Competitor Retrieval] Could not fetch competitors from Step3ResearchService: {e}", exc_info=True)
|
|
|
|
# Ensure competitors is a list
|
|
if not isinstance(competitors, list):
|
|
competitors = []
|
|
|
|
logger.info(f"Found {len(competitors)} competitors for user {user_id}")
|
|
return competitors
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting competitors from onboarding: {e}", exc_info=True)
|
|
return []
|
|
|
|
|
|
def _extract_domain(url: str) -> str:
|
|
"""Extract domain from URL."""
|
|
try:
|
|
parsed = urlparse(url)
|
|
return parsed.netloc or url
|
|
except Exception:
|
|
return url
|
|
|