Scheduled research persona generation

2025-11-05 08:51:00 +05:30
parent 55087c4f37
commit d99c7c83a7
98 changed files with 14518 additions and 828 deletions
--- a/backend/services/scheduler/utils/__init__.py
+++ b/backend/services/scheduler/utils/__init__.py
@@ -1,4 +1,12 @@
 """
-Scheduler utilities.
+Scheduler Utilities Package
 """

+from .task_loader import load_due_monitoring_tasks
+from .user_job_store import extract_domain_root, get_user_job_store_name
+
+__all__ = [
+    'load_due_monitoring_tasks',
+    'extract_domain_root',
+    'get_user_job_store_name'
+]
--- a/backend/services/scheduler/utils/oauth_token_task_loader.py
+++ b/backend/services/scheduler/utils/oauth_token_task_loader.py
@@ -0,0 +1,54 @@
+"""
+OAuth Token Monitoring Task Loader
+Functions to load due OAuth token monitoring tasks from database.
+"""
+
+from datetime import datetime
+from typing import List, Optional, Union
+from sqlalchemy.orm import Session
+from sqlalchemy import and_, or_
+
+from models.oauth_token_monitoring_models import OAuthTokenMonitoringTask
+
+
+def load_due_oauth_token_monitoring_tasks(
+    db: Session,
+    user_id: Optional[Union[str, int]] = None
+) -> List[OAuthTokenMonitoringTask]:
+    """
+    Load all OAuth token monitoring tasks that are due for execution.
+    
+    Criteria:
+    - status == 'active' (only check active tasks)
+    - next_check <= now (or is None for first execution)
+    - Optional: user_id filter for specific user (for user isolation)
+    
+    User isolation is enforced through filtering by user_id when provided.
+    If no user_id is provided, loads tasks for all users (for system-wide monitoring).
+    
+    Args:
+        db: Database session
+        user_id: Optional user ID (Clerk string or int; compared as a string) to filter tasks (if None, loads all users' tasks)
+        
+    Returns:
+        List of due OAuthTokenMonitoringTask instances
+    """
+    now = datetime.utcnow()
+    
+    # Build query for due tasks
+    query = db.query(OAuthTokenMonitoringTask).filter(
+        and_(
+            OAuthTokenMonitoringTask.status == 'active',
+            or_(
+                OAuthTokenMonitoringTask.next_check <= now,
+                OAuthTokenMonitoringTask.next_check.is_(None)
+            )
+        )
+    )
+    
+    # Apply user filter if provided (for user isolation)
+    if user_id is not None:
+        query = query.filter(OAuthTokenMonitoringTask.user_id == str(user_id))
+    
+    return query.all()
+
--- a/backend/services/scheduler/utils/task_loader.py
+++ b/backend/services/scheduler/utils/task_loader.py
@@ -4,7 +4,7 @@ Functions to load due tasks from database.
 """

 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
 from sqlalchemy.orm import Session, joinedload
 from sqlalchemy import and_, or_

@@ -14,7 +14,7 @@ from models.enhanced_strategy_models import EnhancedContentStrategy

 def load_due_monitoring_tasks(
    db: Session,
-    user_id: Optional[int] = None
+    user_id: Optional[Union[str, int]] = None
 ) -> List[MonitoringTask]:
    """
    Load all monitoring tasks that are due for execution.
@@ -22,14 +22,17 @@ def load_due_monitoring_tasks(
    Criteria:
    - status == 'active'
    - next_execution <= now (or is None for first execution)
-    - Optional: user_id filter for specific user (for future admin features)
+    - Optional: user_id filter for specific user (for user isolation)
    
    Note: Strategy relationship is eagerly loaded to ensure user_id is accessible
    during task execution for user isolation.
    
+    User isolation is enforced through filtering by user_id when provided.
+    If no user_id is provided, loads tasks for all users (for system-wide monitoring).
+    
    Args:
        db: Database session
-        user_id: Optional user ID to filter tasks (if None, loads all users' tasks)
+        user_id: Optional user ID (Clerk string or int) to filter tasks (if None, loads all users' tasks)
        
    Returns:
        List of due MonitoringTask instances with strategy relationship loaded
--- a/backend/services/scheduler/utils/user_job_store.py
+++ b/backend/services/scheduler/utils/user_job_store.py
@@ -0,0 +1,129 @@
+"""
+User Job Store Utilities
+Utilities for managing per-user job stores based on website root.
+"""
+
+from typing import Optional
+from urllib.parse import urlparse
+from loguru import logger
+from sqlalchemy.orm import Session as SQLSession
+
+from services.database import get_db_session
+from models.onboarding import OnboardingSession, WebsiteAnalysis
+
+
+def extract_domain_root(url: str) -> str:
+    """
+    Extract domain root from a website URL for use as job store identifier.
+    
+    Examples:
+        https://www.example.com -> example
+        https://blog.example.com -> example
+        https://example.co.uk -> example
+        http://subdomain.example.com/path -> example
+    
+    Args:
+        url: Website URL
+        
+    Returns:
+        Domain root (e.g., 'example'), or 'default' if extraction fails or the cleaned root is shorter than 2 characters
+    """
+    try:
+        parsed = urlparse(url)
+        hostname = parsed.netloc or parsed.path.split('/')[0]
+        
+        # Remove www. prefix if present
+        if hostname.startswith('www.'):
+            hostname = hostname[4:]
+        
+        # Split by dots and get the root domain
+        # For example.com -> example, for example.co.uk -> example
+        parts = hostname.split('.')
+        if len(parts) >= 2:
+            # Handle two-part suffixes (e.g., co.uk, com.au) where the second-to-last label belongs to the suffix
+            if len(parts) >= 3 and parts[-2] in ['co', 'com', 'net', 'org']:
+                root = parts[-3]
+            else:
+                root = parts[-2]
+        else:
+            root = parts[0] if parts else 'default'
+        
+        # Clean and validate root
+        root = root.lower().strip()
+        # Remove invalid characters for job store name
+        root = ''.join(c for c in root if c.isalnum() or c in ['-', '_'])
+        
+        if not root or len(root) < 2:
+            return 'default'
+        
+        return root
+        
+    except Exception as e:
+        logger.warning(f"Failed to extract domain root from URL '{url}': {e}")
+        return 'default'
+
+
+def get_user_job_store_name(user_id: str, db: SQLSession = None) -> str:
+    """
+    Get job store name for a user based on their website root from onboarding.
+    
+    Args:
+        user_id: User ID (Clerk string)
+        db: Optional database session (if not provided, one is created here and closed before returning)
+        
+    Returns:
+        Job store name (e.g., 'example' or 'default')
+    """
+    db_session = db
+    close_db = False
+    
+    try:
+        if not db_session:
+            db_session = get_db_session()
+            close_db = True
+        
+        if not db_session:
+            logger.warning(f"Could not get database session for user {user_id}, using default job store")
+            return 'default'
+        
+        # Get user's website URL from onboarding
+        # Query directly since user_id is a string (Clerk ID)
+        onboarding_session = db_session.query(OnboardingSession).filter(
+            OnboardingSession.user_id == user_id
+        ).order_by(OnboardingSession.updated_at.desc()).first()
+        
+        if not onboarding_session:
+            logger.debug(
+                f"[Job Store] No onboarding session found for user {user_id}, using default job store. "
+                f"This is normal if user hasn't completed onboarding."
+            )
+            return 'default'
+        
+        # Get the latest website analysis for this session
+        website_analysis = db_session.query(WebsiteAnalysis).filter(
+            WebsiteAnalysis.session_id == onboarding_session.id
+        ).order_by(WebsiteAnalysis.updated_at.desc()).first()
+        
+        if not website_analysis or not website_analysis.website_url:
+            logger.debug(
+                f"[Job Store] No website URL found for user {user_id} (session_id: {onboarding_session.id}), "
+                f"using default job store. This is normal if website analysis wasn't completed."
+            )
+            return 'default'
+        
+        website_url = website_analysis.website_url
+        domain_root = extract_domain_root(website_url)
+        
+        logger.debug(f"Job store for user {user_id}: {domain_root} (from {website_url})")
+        return domain_root
+        
+    except Exception as e:
+        logger.error(f"Error getting job store name for user {user_id}: {e}")
+        return 'default'
+    finally:
+        if close_db and db_session:
+            try:
+                db_session.close()
+            except Exception:
+                pass
+