ALwrity onboarding fixes

2025-10-04 13:24:41 +05:30
parent 510b79bbf8
commit 14dfb2e5c0
14 changed files with 1182 additions and 446 deletions
--- a/backend/.onboarding_progress_user_33Gz1FPI86VDXhRY8QN4ragRFGN.json
+++ b/backend/.onboarding_progress_user_33Gz1FPI86VDXhRY8QN4ragRFGN.json
@@ -5,7 +5,7 @@
      "title": "AI LLM Providers",
      "description": "Configure AI language model providers",
      "status": "completed",
-      "completed_at": "2025-09-30T11:54:21.688932",
+      "completed_at": "2025-10-03T17:29:12.878656",
      "data": {
        "api_keys": {
          "gemini": "AIzaSyB6QrCiOBAzh8xLdmSumec2ysdHeyqyxgw",
@@ -19,9 +19,175 @@
      "step_number": 2,
      "title": "Website Analysis",
      "description": "Set up website analysis and crawling",
-      "status": "pending",
-      "completed_at": null,
-      "data": null,
+      "status": "completed",
+      "completed_at": "2025-10-03T17:42:17.953305",
+      "data": {
+        "website": "https://alwrity.com",
+        "domainName": "Alwrity.com",
+        "analysis": {
+          "writing_style": {
+            "tone": "Informative, enthusiastic",
+            "voice": "Helpful, direct",
+            "complexity": "Moderate",
+            "engagement_level": "High",
+            "brand_personality": "Innovative, friendly",
+            "formality_level": "Semi-formal",
+            "emotional_appeal": "Benefit-driven"
+          },
+          "content_characteristics": {
+            "sentence_structure": "Varied",
+            "vocabulary_level": "Accessible",
+            "paragraph_organization": "Clear, concise",
+            "content_flow": "Logical, user-centric",
+            "readability_score": "Good",
+            "content_density": "Moderate",
+            "visual_elements_usage": "High"
+          },
+          "target_audience": {
+            "demographics": [
+              "Marketers",
+              "Bloggers",
+              "Content creators"
+            ],
+            "expertise_level": "Beginner to intermediate",
+            "industry_focus": "General, tech",
+            "geographic_focus": "",
+            "psychographic_profile": [
+              "Tech-savvy",
+              "Value-conscious",
+              "Efficiency-seeking"
+            ],
+            "pain_points": [
+              "Time-consuming content creation",
+              "Lack of SEO knowledge",
+              "Writer's block"
+            ],
+            "motivations": [
+              "Efficiency",
+              "Increased website traffic",
+              "Content quality"
+            ]
+          },
+          "content_type": {
+            "primary_type": "AI writing tool promotion",
+            "secondary_types": [
+              "Blog posts",
+              "Marketing materials"
+            ],
+            "purpose": "Promote AI tools, generate leads",
+            "call_to_action": "Try Now!",
+            "conversion_focus": "Tool usage",
+            "educational_value": "Moderate"
+          },
+          "recommended_settings": {
+            "writing_tone": "Enthusiastic, informative",
+            "target_audience": "Content creators, marketers",
+            "content_type": "AI tool promotion, blog",
+            "creativity_level": "High",
+            "geographic_location": "",
+            "industry_context": "AI, Content Creation",
+            "brand_alignment": "Strong"
+          },
+          "guidelines": {
+            "tone_recommendations": [
+              "Informative & Enthusiastic: Maintain a helpful and engaging tone. Example: 'Let's dive into how AI can revolutionize your content creation!'"
+            ],
+            "structure_guidelines": [
+              "Clear & Concise: Use headings, subheadings, and bullet points for easy readability. Example: Break down complex topics into digestible sections.",
+              "Logical Flow: Organize content with a user-centric approach, starting with the problem and offering solutions. Example: Start with the pain points of content creation and then introduce your AI tools."
+            ],
+            "vocabulary_suggestions": [
+              "Accessible Language: Avoid jargon; use clear and concise language. Example: Instead of 'leverage AI,' use 'use AI.'"
+            ],
+            "engagement_tips": [
+              "Visuals: Incorporate images, screenshots, and videos to enhance understanding. Example: Include screenshots of your AI tools in action.",
+              "Benefit-Driven: Focus on the benefits for the user. Example: 'Save time and create high-quality content with our AI.'"
+            ],
+            "audience_considerations": [
+              "Targeted Content: Address the needs of marketers, bloggers, and content creators. Example: Provide specific examples relevant to their workflows.",
+              "Address Pain Points: Acknowledge and solve common content creation challenges. Example: Offer solutions to writer's block and SEO optimization."
+            ],
+            "brand_alignment": [
+              "Helpful & Innovative Voice: Maintain a helpful, innovative, and friendly brand voice. Example: Offer free resources and tutorials.",
+              "Open Source Focus: Highlight the open-source nature of the tools. Example: Mention the benefits of open-source for users.",
+              "Value Proposition: Clearly communicate the value of the tools. Example: 'Create fact-based, multilingual content efficiently.'"
+            ],
+            "seo_optimization": [
+              "Keyword Research: Identify relevant keywords and incorporate them naturally. Example: Use keywords like 'AI content creation,' 'SEO optimization,' and 'free AI tools.'",
+              "Optimize Headings & Meta Descriptions: Use keywords in headings and create compelling meta descriptions. Example: Write a meta description that includes a clear call to action and keyword."
+            ],
+            "conversion_optimization": [
+              "Clear CTAs: Include clear calls to action. Example: 'Try our free AI tool today!'",
+              "Focus on Benefits: Emphasize the value proposition. Example: 'Sign up to get instant access to AI-powered content creation.'"
+            ]
+          },
+          "best_practices": [
+            "Provide In-Depth Tutorials: Offer detailed guides and tutorials to showcase the tools' capabilities.",
+            "Showcase Diverse Use Cases: Demonstrate how the tools can be applied in various scenarios.",
+            "Build Community: Encourage user interaction and feedback.",
+            "Integrate with Platforms: Explore integrations with popular content platforms."
+          ],
+          "avoid_elements": [
+            "Overly Technical Jargon: Steer clear of overly complex technical terms that may alienate the audience.",
+            "Misleading Claims: Avoid making exaggerated claims about AI capabilities.",
+            "Negative Brand Association: Do not use language that portrays the brand as anything other than helpful and accessible."
+          ],
+          "content_strategy": "Focus on creating informative, user-centric content that highlights the benefits of open-source AI tools for content creation, addressing the pain points of the target audience while providing practical solutions and SEO optimization.",
+          "ai_generation_tips": [
+            "Fact-Checking: Always verify the information generated by AI tools.",
+            "Human Oversight: Review and refine AI-generated content for accuracy, clarity, and brand voice.",
+            "Experimentation: Test different prompts and inputs to optimize output."
+          ],
+          "competitive_advantages": [
+            "Fact-Based Content: Emphasize the ability to generate fact-based content.",
+            "Open Source: Highlight the benefits of open-source for users (e.g., transparency, community support, customization).",
+            "Multilingual Support: Promote the multilingual capabilities of the tools."
+          ],
+          "content_calendar_suggestions": [
+            "Tutorials: Create step-by-step guides on using the AI tools for different content types.",
+            "Use Case Studies: Showcase successful implementations of the tools.",
+            "SEO Optimization Guides: Provide tips and best practices for improving search engine rankings.",
+            "Industry News & Trends: Share insights on the latest developments in AI and content creation."
+          ],
+          "style_patterns": {
+            "patterns": {
+              "sentence_length": "short to medium",
+              "vocabulary_patterns": [
+                "keywords related to AI and content generation",
+                "action-oriented verbs"
+              ],
+              "rhetorical_devices": [
+                "repetition",
+                "call to action"
+              ],
+              "paragraph_structure": "varied, often short and focused",
+              "transition_phrases": [
+                "Click to",
+                "and"
+              ]
+            },
+            "style_consistency": "consistent in tone and purpose",
+            "unique_elements": [
+              "focus on AI-powered content creation tools",
+              "integration with platforms like WordPress"
+            ],
+            "meta": {
+              "schema_version": "1.1",
+              "confidence": 0.8,
+              "notes": "The content is promotional and tool-focused.",
+              "uncertainty": {
+                "fields": []
+              }
+            }
+          },
+          "style_consistency": "consistent in tone and purpose",
+          "unique_elements": [
+            "focus on AI-powered content creation tools",
+            "integration with platforms like WordPress"
+          ]
+        },
+        "useAnalysisForGenAI": true
+      },
      "validation_errors": []
    },
    {
@@ -61,9 +227,9 @@
      "validation_errors": []
    }
  ],
-  "current_step": 2,
+  "current_step": 3,
  "started_at": "2025-09-29T17:22:14.375002",
-  "last_updated": "2025-09-30T11:54:21.688938",
+  "last_updated": "2025-10-03T17:42:17.953324",
  "is_completed": false,
  "completed_at": null
 }
--- a/backend/api/component_logic.py
+++ b/backend/api/component_logic.py
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
 from loguru import logger
 from typing import Dict, Any
 from datetime import datetime
+import hashlib

 from models.component_logic import (
    UserInfoRequest, UserInfoResponse,
@@ -45,6 +46,23 @@ research_utilities = ResearchUtilities()
 # Create router
 router = APIRouter(prefix="/api/onboarding", tags=["component_logic"])

+# Utility function for consistent user ID to integer conversion
+def clerk_user_id_to_int(user_id: str) -> int:
+    """
+    Convert Clerk user ID to consistent integer for database session_id.
+    Uses SHA256 hashing for deterministic, consistent results across all requests.
+    
+    Args:
+        user_id: Clerk user ID (e.g., 'user_2qA6V8bFFnhPRGp8JYxP4YTJtHl')
+    
+    Returns:
+        int: Deterministic integer derived from user ID
+    """
+    # Hash with SHA256 so the same user ID yields the same digest in every process (Python's built-in hash() of a str can differ between processes)
+    user_id_hash = hashlib.sha256(user_id.encode()).hexdigest()
+    # Keep the first 8 hex digits (32 bits) and reduce modulo 2147483647 (2**31 - 1): result is in 0..2147483646, so distinct user IDs can collide; presumably sized for a signed 32-bit INT column - confirm against the schema
+    return int(user_id_hash[:8], 16) % 2147483647
+
 # AI Research Endpoints

@router.post("/ai-research/validate-user", response_model=UserInfoResponse)
@@ -99,11 +117,8 @@ async def configure_research_preferences(
                preferences_service = ResearchPreferencesService(db)
                
                # Use authenticated Clerk user ID for proper user isolation
-                # Convert user_id to int if service expects it, or update service to accept string
-                try:
-                    user_id_int = int(user_id.replace('user_', '').replace('-', '')[:8], 16) % 2147483647
-                except:
-                    user_id_int = hash(user_id) % 2147483647
+                # Use consistent SHA256-based conversion
+                user_id_int = clerk_user_id_to_int(user_id)
                
                # Save preferences with user ID (not session_id)
                preferences_id = preferences_service.save_preferences_with_style_data(user_id_int, preferences)
@@ -504,10 +519,8 @@ async def complete_style_detection(
        analysis_service = WebsiteAnalysisService(db_session)
        
        # Use authenticated Clerk user ID for proper user isolation
-        try:
-            user_id_int = int(user_id.replace('user_', '').replace('-', '')[:8], 16) % 2147483647
-        except:
-            user_id_int = hash(user_id) % 2147483647
+        # Use consistent SHA256-based conversion
+        user_id_int = clerk_user_id_to_int(user_id)
        
        # Check for existing analysis if URL is provided
        existing_analysis = None
@@ -536,11 +549,44 @@ async def complete_style_detection(
                timestamp=datetime.now().isoformat()
            )
        
-        # Step 2: Analyze style
-        style_analysis = style_logic.analyze_content_style(crawl_result['content'])
+        # Steps 2-3: Run style and patterns analysis concurrently (2 AI calls in one parallel batch); guidelines (step 4) depend on the style result and run afterwards
+        import asyncio
+        from functools import partial
+        
+        # Prepare parallel tasks
+        logger.info("[complete_style_detection] Starting parallel AI analysis...")
+        
+        async def run_style_analysis():
+            """Run style analysis in executor"""
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(None, partial(style_logic.analyze_content_style, crawl_result['content']))
+        
+        async def run_patterns_analysis():
+            """Run patterns analysis in executor (if requested)"""
+            if not request.include_patterns:
+                return None
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(None, partial(style_logic.analyze_style_patterns, crawl_result['content']))
+        
+        # Execute style and patterns analysis in parallel
+        style_analysis, patterns_result = await asyncio.gather(
+            run_style_analysis(),
+            run_patterns_analysis(),
+            return_exceptions=True
+        )
+        
+        # Check if style_analysis failed
+        if isinstance(style_analysis, Exception):
+            error_msg = str(style_analysis)
+            logger.error(f"Style analysis failed with exception: {error_msg}")
+            analysis_service.save_error_analysis(user_id_int, request.url or "text_sample", error_msg)
+            return StyleDetectionResponse(
+                success=False,
+                error=f"Style analysis failed: {error_msg}",
+                timestamp=datetime.now().isoformat()
+            )
        
        if not style_analysis or not style_analysis.get('success'):
-            # Check if it's an API key issue
            error_msg = style_analysis.get('error', 'Unknown error') if style_analysis else 'Analysis failed'
            if 'API key' in error_msg or 'configure' in error_msg:
                return StyleDetectionResponse(
@@ -549,7 +595,6 @@ async def complete_style_detection(
                    timestamp=datetime.now().isoformat()
                )
            else:
-                # Save error analysis
                analysis_service.save_error_analysis(user_id_int, request.url or "text_sample", error_msg)
                return StyleDetectionResponse(
                    success=False,
@@ -557,17 +602,20 @@ async def complete_style_detection(
                    timestamp=datetime.now().isoformat()
                )
        
-        # Step 3: Analyze patterns (optional)
+        # Process patterns result
        style_patterns = None
-        if request.include_patterns:
-            patterns_result = style_logic.analyze_style_patterns(crawl_result['content'])
-            if patterns_result and patterns_result.get('success'):
+        if request.include_patterns and patterns_result and not isinstance(patterns_result, Exception):
+            if patterns_result.get('success'):
                style_patterns = patterns_result.get('patterns')
        
-        # Step 4: Generate guidelines (optional)
+        # Step 4: Generate guidelines (depends on style_analysis, must run after)
        style_guidelines = None
        if request.include_guidelines:
-            guidelines_result = style_logic.generate_style_guidelines(style_analysis.get('analysis', {}))
+            loop = asyncio.get_event_loop()
+            guidelines_result = await loop.run_in_executor(
+                None, 
+                partial(style_logic.generate_style_guidelines, style_analysis.get('analysis', {}))
+            )
            if guidelines_result and guidelines_result.get('success'):
                style_guidelines = guidelines_result.get('guidelines')
        
@@ -628,10 +676,8 @@ async def check_existing_analysis(
        analysis_service = WebsiteAnalysisService(db_session)
        
        # Use authenticated Clerk user ID for proper user isolation
-        try:
-            user_id_int = int(user_id.replace('user_', '').replace('-', '')[:8], 16) % 2147483647
-        except:
-            user_id_int = hash(user_id) % 2147483647
+        # Use consistent SHA256-based conversion
+        user_id_int = clerk_user_id_to_int(user_id)
        
        # Check for existing analysis for THIS USER ONLY
        existing_analysis = analysis_service.check_existing_analysis(user_id_int, website_url)
@@ -684,10 +730,8 @@ async def get_session_analyses(current_user: Dict[str, Any] = Depends(get_curren
        analysis_service = WebsiteAnalysisService(db_session)
        
        # Use authenticated Clerk user ID for proper user isolation
-        try:
-            user_id_int = int(user_id.replace('user_', '').replace('-', '')[:8], 16) % 2147483647
-        except:
-            user_id_int = hash(user_id) % 2147483647
+        # Use consistent SHA256-based conversion
+        user_id_int = clerk_user_id_to_int(user_id)
        
        # Get analyses for THIS USER ONLY (not all users!)
        analyses = analysis_service.get_session_analyses(user_id_int)
--- a/backend/middleware/auth_middleware.py
+++ b/backend/middleware/auth_middleware.py
@@ -117,26 +117,24 @@ class ClerkAuthMiddleware:
                    # Use cached PyJWKClient to avoid repeated JWKS fetches
                    if jwks_url not in self._jwks_client_cache:
                        logger.info(f"Creating new PyJWKClient for {jwks_url} with caching enabled")
-                        # Create client with caching: cache_keys=True, max_cached_keys=16, cache_jwk_set_timeout=3600 (1 hour)
+                        # Create client with caching enabled (cache_keys=True keeps keys in memory)
                        self._jwks_client_cache[jwks_url] = PyJWKClient(
                            jwks_url,
                            cache_keys=True,
-                            max_cached_keys=16,
-                            cache_jwk_set_timeout=3600,  # Cache JWKS for 1 hour
-                            timeout=10  # 10 second timeout for JWKS fetch
+                            max_cached_keys=16
                        )
                    
                    jwks_client = self._jwks_client_cache[jwks_url]
                    signing_key = jwks_client.get_signing_key_from_jwt(token)
                    
                    # Verify and decode the token with clock skew tolerance
-                    # Add 60 seconds leeway to handle clock skew between client/server
+                    # Add 300 seconds (5 minutes) leeway to handle clock skew and token refresh delays
                    decoded_token = jwt.decode(
                        token,
                        signing_key.key,
                        algorithms=["RS256"],
                        options={"verify_signature": True, "verify_exp": True},
-                        leeway=60  # Allow 60 seconds clock skew
+                        leeway=300  # Allow 5 minutes leeway for token refresh during navigation
                    )
                    
                    # Extract user information
@@ -171,7 +169,7 @@ class ClerkAuthMiddleware:
                    decoded_token = jwt.decode(
                        token, 
                        options={"verify_signature": False},
-                        leeway=60  # Allow 60 seconds clock skew
+                        leeway=300  # Allow 5 minutes leeway for token refresh
                    )
                    
                    # Extract user information from the token
--- a/backend/services/website_analysis_service.py
+++ b/backend/services/website_analysis_service.py
@@ -41,11 +41,17 @@ class WebsiteAnalysisService:
            
            if existing_analysis:
                # Update existing analysis
-                existing_analysis.writing_style = analysis_data.get('style_analysis', {}).get('writing_style')
-                existing_analysis.content_characteristics = analysis_data.get('style_analysis', {}).get('content_characteristics')
-                existing_analysis.target_audience = analysis_data.get('style_analysis', {}).get('target_audience')
-                existing_analysis.content_type = analysis_data.get('style_analysis', {}).get('content_type')
-                existing_analysis.recommended_settings = analysis_data.get('style_analysis', {}).get('recommended_settings')
+                style_analysis = analysis_data.get('style_analysis', {})
+                existing_analysis.writing_style = style_analysis.get('writing_style')
+                existing_analysis.content_characteristics = style_analysis.get('content_characteristics')
+                existing_analysis.target_audience = style_analysis.get('target_audience')
+                existing_analysis.content_type = style_analysis.get('content_type')
+                existing_analysis.recommended_settings = style_analysis.get('recommended_settings')
+                # Store brand_analysis and content_strategy_insights if model supports it
+                if hasattr(existing_analysis, 'brand_analysis'):
+                    existing_analysis.brand_analysis = style_analysis.get('brand_analysis')
+                if hasattr(existing_analysis, 'content_strategy_insights'):
+                    existing_analysis.content_strategy_insights = style_analysis.get('content_strategy_insights')
                existing_analysis.crawl_result = analysis_data.get('crawl_result')
                existing_analysis.style_patterns = analysis_data.get('style_patterns')
                existing_analysis.style_guidelines = analysis_data.get('style_guidelines')
@@ -59,20 +65,28 @@ class WebsiteAnalysisService:
                return existing_analysis.id
            else:
                # Create new analysis
-                analysis = WebsiteAnalysis(
-                    session_id=session_id,
-                    website_url=website_url,
-                    writing_style=analysis_data.get('style_analysis', {}).get('writing_style'),
-                    content_characteristics=analysis_data.get('style_analysis', {}).get('content_characteristics'),
-                    target_audience=analysis_data.get('style_analysis', {}).get('target_audience'),
-                    content_type=analysis_data.get('style_analysis', {}).get('content_type'),
-                    recommended_settings=analysis_data.get('style_analysis', {}).get('recommended_settings'),
-                    crawl_result=analysis_data.get('crawl_result'),
-                    style_patterns=analysis_data.get('style_patterns'),
-                    style_guidelines=analysis_data.get('style_guidelines'),
-                    status='completed',
-                    warning_message=analysis_data.get('warning')
-                )
+                style_analysis = analysis_data.get('style_analysis', {})
+                analysis_args = {
+                    'session_id': session_id,
+                    'website_url': website_url,
+                    'writing_style': style_analysis.get('writing_style'),
+                    'content_characteristics': style_analysis.get('content_characteristics'),
+                    'target_audience': style_analysis.get('target_audience'),
+                    'content_type': style_analysis.get('content_type'),
+                    'recommended_settings': style_analysis.get('recommended_settings'),
+                    'crawl_result': analysis_data.get('crawl_result'),
+                    'style_patterns': analysis_data.get('style_patterns'),
+                    'style_guidelines': analysis_data.get('style_guidelines'),
+                    'status': 'completed',
+                    'warning_message': analysis_data.get('warning')
+                }
+                # Add brand_analysis and content_strategy_insights if model supports it
+                if hasattr(WebsiteAnalysis, 'brand_analysis'):
+                    analysis_args['brand_analysis'] = style_analysis.get('brand_analysis')
+                if hasattr(WebsiteAnalysis, 'content_strategy_insights'):
+                    analysis_args['content_strategy_insights'] = style_analysis.get('content_strategy_insights')
+                
+                analysis = WebsiteAnalysis(**analysis_args)
                
                self.db.add(analysis)
                self.db.commit()