Alwrity version 0.5.4

2025-08-11 10:54:50 +05:30
parent 13ca78f653
commit 39b96c44da
44 changed files with 10448 additions and 2119 deletions
--- a/backend/api/content_planning/services/content_strategy/utils/__init__.py
+++ b/backend/api/content_planning/services/content_strategy/utils/__init__.py
@@ -3,7 +3,54 @@ Utils Module
 Data processing and validation utilities.
 """

-from .data_processors import DataProcessorService
+from .data_processors import (
+    DataProcessorService,
+    get_onboarding_data,
+    transform_onboarding_data_to_fields,
+    get_data_sources,
+    get_detailed_input_data_points,
+    get_fallback_onboarding_data,
+    get_website_analysis_data,
+    get_research_preferences_data,
+    get_api_keys_data
+)
 from .validators import ValidationService
+from .strategy_utils import (
+    StrategyUtils,
+    calculate_strategic_scores,
+    extract_market_positioning,
+    extract_competitive_advantages,
+    extract_strategic_risks,
+    extract_opportunity_analysis,
+    initialize_caches,
+    calculate_data_quality_scores,
+    extract_content_preferences_from_style,
+    extract_brand_voice_from_guidelines,
+    extract_editorial_guidelines_from_style,
+    create_field_mappings
+)

-__all__ = ['DataProcessorService', 'ValidationService'] 
+__all__ = [
+    'DataProcessorService',
+    'get_onboarding_data',
+    'transform_onboarding_data_to_fields',
+    'get_data_sources',
+    'get_detailed_input_data_points',
+    'get_fallback_onboarding_data',
+    'get_website_analysis_data',
+    'get_research_preferences_data',
+    'get_api_keys_data',
+    'ValidationService',
+    'StrategyUtils',
+    'calculate_strategic_scores',
+    'extract_market_positioning',
+    'extract_competitive_advantages',
+    'extract_strategic_risks',
+    'extract_opportunity_analysis',
+    'initialize_caches',
+    'calculate_data_quality_scores',
+    'extract_content_preferences_from_style',
+    'extract_brand_voice_from_guidelines',
+    'extract_editorial_guidelines_from_style',
+    'create_field_mappings'
+] 
--- a/backend/api/content_planning/services/content_strategy/utils/data_processors.py
+++ b/backend/api/content_planning/services/content_strategy/utils/data_processors.py
@@ -1,451 +1,539 @@
 """
-Data Processor Service
-Data processing utilities.
+Data processing utilities for content strategy operations.
+Provides functions for transforming onboarding data into strategy fields,
+managing data sources, and processing various data types.
 """

 import logging
-import json
-import re
-from typing import Dict, Any, List, Optional, Union
-from datetime import datetime, timedelta
+from typing import Dict, List, Any, Optional, Union
+from datetime import datetime
+from sqlalchemy.orm import Session
+
+from models.onboarding import OnboardingSession, WebsiteAnalysis, ResearchPreferences, APIKey

 logger = logging.getLogger(__name__)

+
 class DataProcessorService:
-    """Service for data processing utilities."""
+    """Service for processing and transforming data for content strategy operations."""

    def __init__(self):
-        self.cleaning_patterns = {
-            'html_tags': re.compile(r'<[^>]+>'),
-            'extra_whitespace': re.compile(r'\s+'),
-            'special_chars': re.compile(r'[^\w\s\-.,!?;:()]'),
-            'multiple_spaces': re.compile(r'\s{2,}'),
-            'leading_trailing_spaces': re.compile(r'^\s+|\s+$')
+        self.logger = logging.getLogger(__name__)
+    
+    async def get_onboarding_data(self, user_id: int) -> Dict[str, Any]:
+        """
+        Get comprehensive onboarding data for intelligent auto-population via AutoFillService.
+        
+        Args:
+            user_id: The user ID to get onboarding data for
+            
+        Returns:
+            Dictionary containing comprehensive onboarding data
+        """
+        try:
+            from services.database import get_db_session
+            from ..autofill import AutoFillService
+            temp_db = get_db_session()
+            try:
+                service = AutoFillService(temp_db)
+                payload = await service.get_autofill(user_id)
+                self.logger.info(f"Retrieved comprehensive onboarding data for user {user_id}")
+                return payload
+            except Exception as e:
+                self.logger.error(f"Error getting onboarding data: {str(e)}")
+                raise
+            finally:
+                temp_db.close()
+        except Exception as e:
+            self.logger.error(f"Error getting onboarding data: {str(e)}")
+            raise
+    
+    def transform_onboarding_data_to_fields(self, processed_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Transform processed onboarding data into field-specific format for frontend.
+        
+        Args:
+            processed_data: Dictionary containing processed onboarding data
+            
+        Returns:
+            Dictionary with field-specific data for strategy builder
+        """
+        fields = {}
+        
+        website_data = processed_data.get('website_analysis', {})
+        research_data = processed_data.get('research_preferences', {})
+        api_data = processed_data.get('api_keys_data', {})
+        session_data = processed_data.get('onboarding_session', {})
+        
+        # Business Context Fields
+        if 'content_goals' in website_data and website_data.get('content_goals'):
+            fields['business_objectives'] = {
+                'value': website_data.get('content_goals'),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        
+        # Prefer explicit target_metrics; otherwise derive from performance_metrics
+        if website_data.get('target_metrics'):
+            fields['target_metrics'] = {
+                'value': website_data.get('target_metrics'),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        elif website_data.get('performance_metrics'):
+            fields['target_metrics'] = {
+                'value': website_data.get('performance_metrics'),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        
+        # Content budget: website data preferred, else onboarding session budget
+        if website_data.get('content_budget') is not None:
+            fields['content_budget'] = {
+                'value': website_data.get('content_budget'),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        elif isinstance(session_data, dict) and session_data.get('budget') is not None:
+            fields['content_budget'] = {
+                'value': session_data.get('budget'),
+                'source': 'onboarding_session',
+                'confidence': 0.7
+            }
+        
+        # Team size: website data preferred, else onboarding session team_size
+        if website_data.get('team_size') is not None:
+            fields['team_size'] = {
+                'value': website_data.get('team_size'),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        elif isinstance(session_data, dict) and session_data.get('team_size') is not None:
+            fields['team_size'] = {
+                'value': session_data.get('team_size'),
+                'source': 'onboarding_session',
+                'confidence': 0.7
+            }
+        
+        # Implementation timeline: website data preferred, else onboarding session timeline
+        if website_data.get('implementation_timeline'):
+            fields['implementation_timeline'] = {
+                'value': website_data.get('implementation_timeline'),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        elif isinstance(session_data, dict) and session_data.get('timeline'):
+            fields['implementation_timeline'] = {
+                'value': session_data.get('timeline'),
+                'source': 'onboarding_session',
+                'confidence': 0.7
+            }
+        
+        # Market share: explicit if present; otherwise derive rough share from performance metrics if available
+        if website_data.get('market_share'):
+            fields['market_share'] = {
+                'value': website_data.get('market_share'),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        elif website_data.get('performance_metrics'):
+            fields['market_share'] = {
+                'value': website_data.get('performance_metrics').get('estimated_market_share', None),
+                'source': 'website_analysis',
+                'confidence': website_data.get('confidence_level')
+            }
+        
+        fields['performance_metrics'] = {
+            'value': website_data.get('performance_metrics', {}),
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.8)
        }
-
-    def transform_data_structure(self, data: Union[Dict, List, str], target_format: str = 'dict') -> Union[Dict, List, str]:
-        """Transform data between different structures."""
-        try:
-            if target_format == 'dict':
-                if isinstance(data, dict):
-                    return data
-                elif isinstance(data, list):
-                    return {str(i): item for i, item in enumerate(data)}
-                elif isinstance(data, str):
-                    try:
-                        return json.loads(data)
-                    except json.JSONDecodeError:
-                        return {'value': data}
-                else:
-                    return {'value': str(data)}
+        
+        # Audience Intelligence Fields
+        # Extract audience data from research_data structure
+        audience_research = research_data.get('audience_research', {})
+        content_prefs = research_data.get('content_preferences', {})
+        
+        fields['content_preferences'] = {
+            'value': content_prefs,
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['consumption_patterns'] = {
+            'value': audience_research.get('consumption_patterns', {}),
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['audience_pain_points'] = {
+            'value': audience_research.get('audience_pain_points', []),
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['buying_journey'] = {
+            'value': audience_research.get('buying_journey', {}),
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['seasonal_trends'] = {
+            'value': ['Q1: Planning', 'Q2: Execution', 'Q3: Optimization', 'Q4: Review'],
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.7)
+        }
+        
+        fields['engagement_metrics'] = {
+            'value': {
+                'avg_session_duration': website_data.get('performance_metrics', {}).get('avg_session_duration', 180),
+                'bounce_rate': website_data.get('performance_metrics', {}).get('bounce_rate', 45.5),
+                'pages_per_session': 2.5
+            },
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.8)
+        }
+        
+        # Competitive Intelligence Fields
+        fields['top_competitors'] = {
+            'value': website_data.get('competitors', [
+                'Competitor A - Industry Leader',
+                'Competitor B - Emerging Player', 
+                'Competitor C - Niche Specialist'
+            ]),
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.8)
+        }
+        
+        fields['competitor_content_strategies'] = {
+            'value': ['Educational content', 'Case studies', 'Thought leadership'],
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.7)
+        }
+        
+        fields['market_gaps'] = {
+            'value': website_data.get('market_gaps', []),
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.8)
+        }
+        
+        fields['industry_trends'] = {
+            'value': ['Digital transformation', 'AI/ML adoption', 'Remote work'],
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.8)
+        }
+        
+        fields['emerging_trends'] = {
+            'value': ['Voice search optimization', 'Video content', 'Interactive content'],
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.7)
+        }
+        
+        # Content Strategy Fields
+        fields['preferred_formats'] = {
+            'value': content_prefs.get('preferred_formats', [
+                'Blog posts', 'Whitepapers', 'Webinars', 'Case studies', 'Videos'
+            ]),
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['content_mix'] = {
+            'value': {
+                'blog_posts': 40,
+                'whitepapers': 20,
+                'webinars': 15,
+                'case_studies': 15,
+                'videos': 10
+            },
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['content_frequency'] = {
+            'value': 'Weekly',
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['optimal_timing'] = {
+            'value': {
+                'best_days': ['Tuesday', 'Wednesday', 'Thursday'],
+                'best_times': ['9:00 AM', '1:00 PM', '3:00 PM']
+            },
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.7)
+        }
+        
+        fields['quality_metrics'] = {
+            'value': {
+                'readability_score': 8.5,
+                'engagement_target': 5.0,
+                'conversion_target': 2.0
+            },
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['editorial_guidelines'] = {
+            'value': {
+                'tone': content_prefs.get('content_style', ['Professional', 'Educational']),
+                'length': content_prefs.get('content_length', 'Medium (1000-2000 words)'),
+                'formatting': ['Use headers', 'Include visuals', 'Add CTAs']
+            },
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        fields['brand_voice'] = {
+            'value': {
+                'tone': 'Professional yet approachable',
+                'style': 'Educational and authoritative',
+                'personality': 'Expert, helpful, trustworthy'
+            },
+            'source': 'research_preferences',
+            'confidence': research_data.get('confidence_level', 0.8)
+        }
+        
+        # Performance & Analytics Fields
+        fields['traffic_sources'] = {
+            'value': website_data.get('traffic_sources', {}),
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.8)
+        }
+        
+        fields['conversion_rates'] = {
+            'value': {
+                'overall': website_data.get('performance_metrics', {}).get('conversion_rate', 3.2),
+                'blog': 2.5,
+                'landing_pages': 4.0,
+                'email': 5.5
+            },
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.8)
+        }
+        
+        fields['content_roi_targets'] = {
+            'value': {
+                'target_roi': 300,
+                'cost_per_lead': 50,
+                'lifetime_value': 500
+            },
+            'source': 'website_analysis',
+            'confidence': website_data.get('confidence_level', 0.7)
+        }
+        
+        fields['ab_testing_capabilities'] = {
+            'value': True,
+            'source': 'api_keys_data',
+            'confidence': api_data.get('confidence_level', 0.8)
+        }
+        
+        return fields
+    
+    def get_data_sources(self, processed_data: Dict[str, Any]) -> Dict[str, str]:
+        """
+        Get data sources for each field.
+        
+        Args:
+            processed_data: Dictionary containing processed data
            
-            elif target_format == 'list':
-                if isinstance(data, list):
-                    return data
-                elif isinstance(data, dict):
-                    return list(data.values())
-                elif isinstance(data, str):
-                    return [data]
-                else:
-                    return [str(data)]
+        Returns:
+            Dictionary mapping field names to their data sources
+        """
+        sources = {}
+        
+        # Map fields to their data sources
+        website_fields = ['business_objectives', 'target_metrics', 'content_budget', 'team_size', 
+                         'implementation_timeline', 'market_share', 'competitive_position', 
+                         'performance_metrics', 'engagement_metrics', 'top_competitors', 
+                         'competitor_content_strategies', 'market_gaps', 'industry_trends', 
+                         'emerging_trends', 'traffic_sources', 'conversion_rates', 'content_roi_targets']
+        
+        research_fields = ['content_preferences', 'consumption_patterns', 'audience_pain_points', 
+                          'buying_journey', 'seasonal_trends', 'preferred_formats', 'content_mix', 
+                          'content_frequency', 'optimal_timing', 'quality_metrics', 'editorial_guidelines', 
+                          'brand_voice']
+        
+        api_fields = ['ab_testing_capabilities']
+        
+        for field in website_fields:
+            sources[field] = 'website_analysis'
+        
+        for field in research_fields:
+            sources[field] = 'research_preferences'
+        
+        for field in api_fields:
+            sources[field] = 'api_keys_data'
+        
+        return sources
+    
+    def get_detailed_input_data_points(self, processed_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Get detailed input data points for transparency.
+        
+        Args:
+            processed_data: Dictionary containing processed data
            
-            elif target_format == 'string':
-                if isinstance(data, str):
-                    return data
-                elif isinstance(data, (dict, list)):
-                    return json.dumps(data, default=str)
-                else:
-                    return str(data)
-            
-            else:
-                logger.warning(f"Unknown target format: {target_format}")
-                return data
-                
-        except Exception as e:
-            logger.error(f"Error transforming data structure: {str(e)}")
-            return data
-
-    def clean_text_data(self, text: str, cleaning_level: str = 'standard') -> str:
-        """Clean and normalize text data."""
-        try:
-            if not isinstance(text, str):
-                text = str(text)
-            
-            if cleaning_level == 'minimal':
-                # Basic cleaning
-                cleaned = self.cleaning_patterns['leading_trailing_spaces'].sub('', text)
-                cleaned = self.cleaning_patterns['multiple_spaces'].sub(' ', cleaned)
-                return cleaned.strip()
-            
-            elif cleaning_level == 'standard':
-                # Standard cleaning
-                cleaned = self.cleaning_patterns['html_tags'].sub('', text)
-                cleaned = self.cleaning_patterns['leading_trailing_spaces'].sub('', cleaned)
-                cleaned = self.cleaning_patterns['multiple_spaces'].sub(' ', cleaned)
-                return cleaned.strip()
-            
-            elif cleaning_level == 'aggressive':
-                # Aggressive cleaning
-                cleaned = self.cleaning_patterns['html_tags'].sub('', text)
-                cleaned = self.cleaning_patterns['special_chars'].sub('', cleaned)
-                cleaned = self.cleaning_patterns['leading_trailing_spaces'].sub('', cleaned)
-                cleaned = self.cleaning_patterns['multiple_spaces'].sub(' ', cleaned)
-                return cleaned.strip()
-            
-            else:
-                logger.warning(f"Unknown cleaning level: {cleaning_level}")
-                return text.strip()
-                
-        except Exception as e:
-            logger.error(f"Error cleaning text data: {str(e)}")
-            return str(text)
-
-    def clean_dict_data(self, data: Dict[str, Any], cleaning_level: str = 'standard') -> Dict[str, Any]:
-        """Clean dictionary data recursively."""
-        try:
-            cleaned_data = {}
-            
-            for key, value in data.items():
-                # Clean key
-                cleaned_key = self.clean_text_data(str(key), cleaning_level)
-                
-                # Clean value
-                if isinstance(value, str):
-                    cleaned_value = self.clean_text_data(value, cleaning_level)
-                elif isinstance(value, dict):
-                    cleaned_value = self.clean_dict_data(value, cleaning_level)
-                elif isinstance(value, list):
-                    cleaned_value = [self.clean_text_data(str(item), cleaning_level) if isinstance(item, str) else item for item in value]
-                else:
-                    cleaned_value = value
-                
-                cleaned_data[cleaned_key] = cleaned_value
-            
-            return cleaned_data
-            
-        except Exception as e:
-            logger.error(f"Error cleaning dict data: {str(e)}")
-            return data
-
-    def enrich_data_with_metadata(self, data: Dict[str, Any], source: str = 'unknown') -> Dict[str, Any]:
-        """Enrich data with metadata."""
-        try:
-            enriched_data = data.copy()
-            
-            # Add metadata
-            enriched_data['_metadata'] = {
-                'processed_at': datetime.utcnow().isoformat(),
-                'source': source,
-                'data_type': self._determine_data_type(data),
-                'size': len(str(data)),
-                'field_count': len(data) if isinstance(data, dict) else 0
+        Returns:
+            Dictionary with detailed data points
+        """
+        return {
+            'website_analysis': {
+                'total_fields': len(processed_data.get('website_analysis', {})),
+                'confidence_level': processed_data.get('website_analysis', {}).get('confidence_level', 0.8),
+                'data_freshness': processed_data.get('website_analysis', {}).get('data_freshness', 'recent')
+            },
+            'research_preferences': {
+                'total_fields': len(processed_data.get('research_preferences', {})),
+                'confidence_level': processed_data.get('research_preferences', {}).get('confidence_level', 0.8),
+                'data_freshness': processed_data.get('research_preferences', {}).get('data_freshness', 'recent')
+            },
+            'api_keys_data': {
+                'total_fields': len(processed_data.get('api_keys_data', {})),
+                'confidence_level': processed_data.get('api_keys_data', {}).get('confidence_level', 0.8),
+                'data_freshness': processed_data.get('api_keys_data', {}).get('data_freshness', 'recent')
            }
+        }
+    
+    def get_fallback_onboarding_data(self) -> Dict[str, Any]:
+        """
+        Get fallback onboarding data for compatibility.
+        
+        Returns:
+            Never returns normally: always raises RuntimeError because fallback data is disabled
+        """
+        raise RuntimeError("Fallback onboarding data is disabled. Real data required.")
+    
+    async def get_website_analysis_data(self, user_id: int) -> Dict[str, Any]:
+        """
+        Get website analysis data from onboarding.
+        
+        Args:
+            user_id: The user ID to get data for
            
-            return enriched_data
-            
-        except Exception as e:
-            logger.error(f"Error enriching data with metadata: {str(e)}")
-            return data
-
-    def _determine_data_type(self, data: Any) -> str:
-        """Determine the type of data."""
+        Returns:
+            Dictionary with website analysis data (not implemented yet: currently always raises RuntimeError)
+        """
        try:
-            if isinstance(data, dict):
-                return 'object'
-            elif isinstance(data, list):
-                return 'array'
-            elif isinstance(data, str):
-                return 'string'
-            elif isinstance(data, (int, float)):
-                return 'number'
-            elif isinstance(data, bool):
-                return 'boolean'
-            else:
-                return 'unknown'
-                
+            raise RuntimeError("Website analysis data retrieval not implemented. Real data required.")
        except Exception as e:
-            logger.error(f"Error determining data type: {str(e)}")
-            return 'unknown'
-
-    def validate_data_completeness(self, data: Dict[str, Any], required_fields: List[str]) -> Dict[str, Any]:
-        """Validate data completeness against required fields."""
+            self.logger.error(f"Error getting website analysis data: {str(e)}")
+            raise
+    
+    async def get_research_preferences_data(self, user_id: int) -> Dict[str, Any]:
+        """
+        Get research preferences data from onboarding.
+        
+        Args:
+            user_id: The user ID to get data for
+            
+        Returns:
+            Dictionary with research preferences data (not implemented yet: currently always raises RuntimeError)
+        """
        try:
-            validation_result = {
-                'is_complete': True,
-                'missing_fields': [],
-                'present_fields': [],
-                'completeness_score': 0.0,
-                'validation_timestamp': datetime.utcnow().isoformat()
-            }
-            
-            present_count = 0
-            for field in required_fields:
-                if field in data and data[field] is not None and data[field] != '':
-                    validation_result['present_fields'].append(field)
-                    present_count += 1
-                else:
-                    validation_result['missing_fields'].append(field)
-            
-            # Calculate completeness score
-            if required_fields:
-                validation_result['completeness_score'] = present_count / len(required_fields)
-                validation_result['is_complete'] = validation_result['completeness_score'] >= 0.8
-            
-            return validation_result
-            
+            raise RuntimeError("Research preferences data retrieval not implemented. Real data required.")
        except Exception as e:
-            logger.error(f"Error validating data completeness: {str(e)}")
-            return {
-                'is_complete': False,
-                'missing_fields': required_fields,
-                'present_fields': [],
-                'completeness_score': 0.0,
-                'validation_timestamp': datetime.utcnow().isoformat(),
-                'error': str(e)
-            }
-
-    def normalize_field_values(self, data: Dict[str, Any], field_mappings: Dict[str, str]) -> Dict[str, Any]:
-        """Normalize field values based on mappings."""
+            self.logger.error(f"Error getting research preferences data: {str(e)}")
+            raise
+    
+    async def get_api_keys_data(self, user_id: int) -> Dict[str, Any]:
+        """
+        Get API keys and external data from onboarding.
+        
+        Args:
+            user_id: The user ID to get data for
+            
+        Returns:
+            Dictionary with API keys data (not implemented yet: currently always raises RuntimeError)
+        """
        try:
-            normalized_data = {}
-            
-            for original_field, normalized_field in field_mappings.items():
-                if original_field in data:
-                    normalized_data[normalized_field] = data[original_field]
-            
-            return normalized_data
-            
+            raise RuntimeError("API keys/external data retrieval not implemented. Real data required.")
        except Exception as e:
-            logger.error(f"Error normalizing field values: {str(e)}")
-            return data
+            self.logger.error(f"Error getting API keys data: {str(e)}")
+            raise
+    
+    async def process_website_analysis(self, website_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Process website analysis data (deprecated).
+        
+        Args:
+            website_data: Raw website analysis data
+            
+        Returns:
+            Processed website analysis data
+        """
+        raise RuntimeError("Deprecated: use AutoFillService normalizers")
+    
+    async def process_research_preferences(self, research_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Process research preferences data (deprecated).
+        
+        Args:
+            research_data: Raw research preferences data
+            
+        Returns:
+            Processed research preferences data
+        """
+        raise RuntimeError("Deprecated: use AutoFillService normalizers")
+    
+    async def process_api_keys_data(self, api_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Process API keys data (deprecated).
+        
+        Args:
+            api_data: Raw API keys data
+            
+        Returns:
+            Processed API keys data
+        """
+        raise RuntimeError("Deprecated: use AutoFillService normalizers")

-    def merge_data_sources(self, data_sources: List[Dict[str, Any]], merge_strategy: str = 'prefer_first') -> Dict[str, Any]:
-        """Merge multiple data sources."""
-        try:
-            if not data_sources:
-                return {}
-            
-            if len(data_sources) == 1:
-                return data_sources[0]
-            
-            merged_data = {}
-            
-            if merge_strategy == 'prefer_first':
-                # Prefer first non-empty value
-                for source in data_sources:
-                    for key, value in source.items():
-                        if key not in merged_data or merged_data[key] is None or merged_data[key] == '':
-                            merged_data[key] = value
-            
-            elif merge_strategy == 'prefer_last':
-                # Prefer last non-empty value
-                for source in data_sources:
-                    for key, value in source.items():
-                        if value is not None and value != '':
-                            merged_data[key] = value
-            
-            elif merge_strategy == 'combine':
-                # Combine all values
-                for source in data_sources:
-                    for key, value in source.items():
-                        if key not in merged_data:
-                            merged_data[key] = []
-                        if isinstance(merged_data[key], list):
-                            merged_data[key].append(value)
-                        else:
-                            merged_data[key] = [merged_data[key], value]
-            
-            elif merge_strategy == 'intersection':
-                # Only include fields present in all sources
-                common_keys = set(data_sources[0].keys())
-                for source in data_sources[1:]:
-                    common_keys = common_keys.intersection(set(source.keys()))
-                
-                for key in common_keys:
-                    values = [source[key] for source in data_sources if key in source]
-                    merged_data[key] = values[0] if values else None
-            
-            return merged_data
-            
-        except Exception as e:
-            logger.error(f"Error merging data sources: {str(e)}")
-            return data_sources[0] if data_sources else {}

-    def filter_data_by_criteria(self, data: Dict[str, Any], criteria: Dict[str, Any]) -> Dict[str, Any]:
-        """Filter data based on criteria."""
-        try:
-            filtered_data = {}
-            
-            for key, value in data.items():
-                include_field = True
-                
-                # Check if field should be included based on criteria
-                if 'include_fields' in criteria and key not in criteria['include_fields']:
-                    include_field = False
-                
-                if 'exclude_fields' in criteria and key in criteria['exclude_fields']:
-                    include_field = False
-                
-                # Check value-based criteria
-                if 'min_length' in criteria and isinstance(value, str) and len(value) < criteria['min_length']:
-                    include_field = False
-                
-                if 'max_length' in criteria and isinstance(value, str) and len(value) > criteria['max_length']:
-                    include_field = False
-                
-                if 'required_values' in criteria and key in criteria['required_values']:
-                    if value not in criteria['required_values'][key]:
-                        include_field = False
-                
-                if include_field:
-                    filtered_data[key] = value
-            
-            return filtered_data
-            
-        except Exception as e:
-            logger.error(f"Error filtering data by criteria: {str(e)}")
-            return data
+# Standalone functions for backward compatibility
+async def get_onboarding_data(user_id: int) -> Dict[str, Any]:
+    """Delegate to DataProcessorService.get_onboarding_data (module-level compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return await DataProcessorService().get_onboarding_data(user_id)

-    def format_data_for_output(self, data: Dict[str, Any], output_format: str = 'json') -> Union[str, Dict[str, Any]]:
-        """Format data for different output formats."""
-        try:
-            if output_format == 'json':
-                return json.dumps(data, indent=2, default=str)
-            
-            elif output_format == 'dict':
-                return data
-            
-            elif output_format == 'csv':
-                # Convert to CSV format (simplified)
-                csv_lines = []
-                if data:
-                    # Headers
-                    headers = list(data.keys())
-                    csv_lines.append(','.join(headers))
-                    
-                    # Values
-                    values = [str(data.get(header, '')) for header in headers]
-                    csv_lines.append(','.join(values))
-                
-                return '\n'.join(csv_lines)
-            
-            elif output_format == 'xml':
-                # Convert to XML format (simplified)
-                xml_lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<data>']
-                
-                for key, value in data.items():
-                    xml_lines.append(f'  <{key}>{value}</{key}>')
-                
-                xml_lines.append('</data>')
-                return '\n'.join(xml_lines)
-            
-            else:
-                logger.warning(f"Unknown output format: {output_format}")
-                return data
-                
-        except Exception as e:
-            logger.error(f"Error formatting data for output: {str(e)}")
-            return str(data)

-    def validate_data_types(self, data: Dict[str, Any], type_schema: Dict[str, str]) -> Dict[str, Any]:
-        """Validate data types against a schema."""
-        try:
-            validation_result = {
-                'is_valid': True,
-                'type_errors': [],
-                'validation_timestamp': datetime.utcnow().isoformat()
-            }
-            
-            for field, expected_type in type_schema.items():
-                if field in data:
-                    value = data[field]
-                    actual_type = self._determine_data_type(value)
-                    
-                    if actual_type != expected_type:
-                        validation_result['type_errors'].append({
-                            'field': field,
-                            'expected_type': expected_type,
-                            'actual_type': actual_type,
-                            'value': value
-                        })
-                        validation_result['is_valid'] = False
-            
-            return validation_result
-            
-        except Exception as e:
-            logger.error(f"Error validating data types: {str(e)}")
-            return {
-                'is_valid': False,
-                'type_errors': [{'error': str(e)}],
-                'validation_timestamp': datetime.utcnow().isoformat()
-            }
+def transform_onboarding_data_to_fields(processed_data: Dict[str, Any]) -> Dict[str, Any]:
+    """Delegate to DataProcessorService.transform_onboarding_data_to_fields (compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return DataProcessorService().transform_onboarding_data_to_fields(processed_data)

-    def sanitize_sensitive_data(self, data: Dict[str, Any], sensitive_fields: List[str]) -> Dict[str, Any]:
-        """Sanitize sensitive data fields."""
-        try:
-            sanitized_data = data.copy()
-            
-            for field in sensitive_fields:
-                if field in sanitized_data:
-                    value = sanitized_data[field]
-                    if isinstance(value, str) and len(value) > 4:
-                        # Replace with asterisks, keeping first and last character
-                        sanitized_data[field] = value[0] + '*' * (len(value) - 2) + value[-1]
-                    else:
-                        sanitized_data[field] = '***'
-            
-            return sanitized_data
-            
-        except Exception as e:
-            logger.error(f"Error sanitizing sensitive data: {str(e)}")
-            return data

-    def calculate_data_statistics(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """Calculate statistics about the data."""
-        try:
-            stats = {
-                'total_fields': len(data),
-                'string_fields': 0,
-                'numeric_fields': 0,
-                'boolean_fields': 0,
-                'object_fields': 0,
-                'array_fields': 0,
-                'null_fields': 0,
-                'empty_fields': 0,
-                'average_field_length': 0.0
-            }
-            
-            total_length = 0
-            field_count = 0
-            
-            for key, value in data.items():
-                if value is None:
-                    stats['null_fields'] += 1
-                elif value == '':
-                    stats['empty_fields'] += 1
-                else:
-                    data_type = self._determine_data_type(value)
-                    if data_type == 'string':
-                        stats['string_fields'] += 1
-                        total_length += len(str(value))
-                        field_count += 1
-                    elif data_type == 'number':
-                        stats['numeric_fields'] += 1
-                    elif data_type == 'boolean':
-                        stats['boolean_fields'] += 1
-                    elif data_type == 'object':
-                        stats['object_fields'] += 1
-                    elif data_type == 'array':
-                        stats['array_fields'] += 1
-            
-            if field_count > 0:
-                stats['average_field_length'] = total_length / field_count
-            
-            return stats
-            
-        except Exception as e:
-            logger.error(f"Error calculating data statistics: {str(e)}")
-            return {
-                'error': str(e),
-                'total_fields': 0
-            } 
+def get_data_sources(processed_data: Dict[str, Any]) -> Dict[str, str]:
+    """Delegate to DataProcessorService.get_data_sources (module-level compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return DataProcessorService().get_data_sources(processed_data)
+
+
+def get_detailed_input_data_points(processed_data: Dict[str, Any]) -> Dict[str, Any]:
+    """Delegate to DataProcessorService.get_detailed_input_data_points (compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return DataProcessorService().get_detailed_input_data_points(processed_data)
+
+
+def get_fallback_onboarding_data() -> Dict[str, Any]:
+    """Delegate to DataProcessorService.get_fallback_onboarding_data (compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return DataProcessorService().get_fallback_onboarding_data()
+
+
+async def get_website_analysis_data(user_id: int) -> Dict[str, Any]:
+    """Delegate to DataProcessorService.get_website_analysis_data (compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return await DataProcessorService().get_website_analysis_data(user_id)
+
+
+async def get_research_preferences_data(user_id: int) -> Dict[str, Any]:
+    """Delegate to DataProcessorService.get_research_preferences_data (compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return await DataProcessorService().get_research_preferences_data(user_id)
+
+
+async def get_api_keys_data(user_id: int) -> Dict[str, Any]:
+    """Delegate to DataProcessorService.get_api_keys_data (module-level compatibility shim)."""
+    # The service is instantiated per call; no instance is shared between calls.
+    return await DataProcessorService().get_api_keys_data(user_id)
--- a/backend/api/content_planning/services/content_strategy/utils/strategy_utils.py
+++ b/backend/api/content_planning/services/content_strategy/utils/strategy_utils.py
@@ -0,0 +1,355 @@
+"""
+Strategy utility functions for analysis, scoring, and data processing.
+Provides utility functions for content strategy operations including strategic scoring,
+market positioning analysis, competitive advantages, risk assessment, and opportunity analysis.
+"""
+
+import logging
+from typing import Dict, List, Any, Optional, Union
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+
+def calculate_strategic_scores(ai_recommendations: Dict[str, Any]) -> Dict[str, float]:
+    """
+    Calculate strategic performance scores from AI recommendations.
+    
+    The overall score is the confidence-weighted mean of ``metrics['score']`` over
+    every entry that is a dict with a dict-valued 'metrics' key; the other four
+    scores are fixed multiples of it (1.1, 0.9, 0.95 and 1.05) and are not clamped.
+    
+    Args:
+        ai_recommendations: Mapping of analysis type to its result; a missing or
+            None 'score' counts as 50 and a missing or None 'confidence' as 0.5
+        
+    Returns:
+        Dictionary with the five score keys; all 0.0 when nothing contributes
+    """
+    weighted_sum = 0.0
+    confidence_sum = 0.0
+    
+    for recommendations in ai_recommendations.values():
+        metrics = recommendations.get('metrics') if isinstance(recommendations, dict) else None
+        if not isinstance(metrics, dict):
+            continue  # entries without a usable metrics mapping contribute nothing
+        
+        # An explicit None must not reach the arithmetic below (TypeError)
+        score = metrics.get('score')
+        score = 50 if score is None else score
+        confidence = metrics.get('confidence')
+        confidence = 0.5 if confidence is None else confidence
+        
+        weighted_sum += score * confidence
+        confidence_sum += confidence
+    
+    overall = weighted_sum / confidence_sum if confidence_sum > 0 else 0.0
+    
+    return {
+        'overall_score': overall,
+        'content_quality_score': overall * 1.1,
+        'engagement_score': overall * 0.9,
+        'conversion_score': overall * 0.95,
+        'innovation_score': overall * 1.05
+    }
+
+
+def extract_market_positioning(ai_recommendations: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Return market positioning data; currently a fixed placeholder, not derived from the input.
+    
+    Args:
+        ai_recommendations: Dictionary containing AI analysis results (not read by this function)
+        
+    Returns:
+        Dictionary with industry_position, competitive_advantage, market_share and positioning_score
+    """
+    return {
+        'industry_position': 'emerging',
+        'competitive_advantage': 'AI-powered content',
+        'market_share': '2.5%',
+        'positioning_score': 4
+    }
+
+
+def extract_competitive_advantages(ai_recommendations: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Return competitive advantages; currently a fixed placeholder list, not derived from the input.
+    
+    Args:
+        ai_recommendations: Dictionary containing AI analysis results (not read by this function)
+        
+    Returns:
+        List of two dictionaries, each with advantage, impact and implementation keys
+    """
+    return [
+        {
+            'advantage': 'AI-powered content creation',
+            'impact': 'High',
+            'implementation': 'In Progress'
+        },
+        {
+            'advantage': 'Data-driven strategy',
+            'impact': 'Medium',
+            'implementation': 'Complete'
+        }
+    ]
+
+
+def extract_strategic_risks(ai_recommendations: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Return strategic risks; currently a fixed placeholder list, not derived from the input.
+    
+    Args:
+        ai_recommendations: Dictionary containing AI analysis results (not read by this function)
+        
+    Returns:
+        List of two dictionaries, each with risk, probability and impact keys
+    """
+    return [
+        {
+            'risk': 'Content saturation in market',
+            'probability': 'Medium',
+            'impact': 'High'
+        },
+        {
+            'risk': 'Algorithm changes affecting reach',
+            'probability': 'High',
+            'impact': 'Medium'
+        }
+    ]
+
+
+def extract_opportunity_analysis(ai_recommendations: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Return opportunity analysis; currently a fixed placeholder list, not derived from the input.
+    
+    Args:
+        ai_recommendations: Dictionary containing AI analysis results (not read by this function)
+        
+    Returns:
+        List of two dictionaries, each with opportunity, potential_impact and implementation_ease keys
+    """
+    return [
+        {
+            'opportunity': 'Video content expansion',
+            'potential_impact': 'High',
+            'implementation_ease': 'Medium'
+        },
+        {
+            'opportunity': 'Social media engagement',
+            'potential_impact': 'Medium',
+            'implementation_ease': 'High'
+        }
+    ]
+
+
+def initialize_caches() -> Dict[str, Any]:
+    """
+    Build a new set of empty in-memory caches for strategy operations.
+    
+    Returns:
+        Dictionary with a performance_metrics tracker plus empty strategy, AI analysis and onboarding caches
+    """
+    metrics_tracker = {
+        'response_times': [],
+        'cache_hit_rates': {},
+        'error_rates': {},
+        'throughput_metrics': {}
+    }
+    caches = {'performance_metrics': metrics_tracker}
+    for cache_name in ('strategy_cache', 'ai_analysis_cache', 'onboarding_cache'):
+        caches[cache_name] = {}
+    
+    return caches
+
+
+def calculate_data_quality_scores(data_sources: Dict[str, Any]) -> Dict[str, float]:
+    """
+    Score each data source by weighting its completeness, freshness and confidence.
+    
+    Args:
+        data_sources: Mapping of source name to a dict of quality factors (each defaults to 0.5)
+        
+    Returns:
+        Mapping of source name to a score rounded to two decimals; non-dict sources score 0.5
+    """
+    scores = {}
+    
+    for name, details in data_sources.items():
+        if not isinstance(details, dict):
+            scores[name] = 0.5  # Default score
+            continue
+        
+        completeness = details.get('completeness', 0.5)
+        freshness = details.get('freshness', 0.5)
+        confidence = details.get('confidence', 0.5)
+        
+        # Weights: 40% completeness, 30% freshness, 30% confidence
+        weighted = completeness * 0.4 + freshness * 0.3 + confidence * 0.3
+        scores[name] = round(weighted, 2)
+    
+    return scores
+
+
+def extract_content_preferences_from_style(writing_style: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Pick the content preference fields out of a writing style analysis.
+    
+    Args:
+        writing_style: Dictionary containing writing style analysis
+        
+    Returns:
+        Dictionary with tone, complexity, engagement_level and content_type (defaults fill missing keys)
+    """
+    fallbacks = (
+        ('tone', 'professional'),
+        ('complexity', 'intermediate'),
+        ('engagement_level', 'medium'),
+        ('content_type', 'blog')
+    )
+    
+    return {field: writing_style.get(field, fallback) for field, fallback in fallbacks}
+
+
+def extract_brand_voice_from_guidelines(style_guidelines: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Pick the brand voice fields out of a set of style guidelines.
+    
+    Args:
+        style_guidelines: Dictionary containing style guidelines
+        
+    Returns:
+        Dictionary with tone, personality, style and voice_characteristics (defaults fill missing keys)
+    """
+    fallbacks = (
+        ('tone', 'professional'),
+        ('personality', 'authoritative'),
+        ('style', 'formal'),
+        ('voice_characteristics', [])
+    )
+    
+    return {field: style_guidelines.get(field, fallback) for field, fallback in fallbacks}
+
+
+def extract_editorial_guidelines_from_style(writing_style: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Pick the editorial guideline fields out of a writing style analysis.
+    
+    Args:
+        writing_style: Dictionary containing writing style analysis
+        
+    Returns:
+        Dictionary with sentence_structure, vocabulary_level, paragraph_organization and style_rules
+    """
+    fallbacks = (
+        ('sentence_structure', 'clear'),
+        ('vocabulary_level', 'intermediate'),
+        ('paragraph_organization', 'logical'),
+        ('style_rules', [])
+    )
+    
+    return {field: writing_style.get(field, fallback) for field, fallback in fallbacks}
+
+
+def create_field_mappings() -> Dict[str, str]:
+    """
+    Build the static lookup of strategy field name to the data source label it is mapped to.
+    
+    Returns:
+        New dictionary on every call; values are 'website_analysis', 'research_preferences' or 'onboarding_session'
+    """
+    return {
+        'business_objectives': 'website_analysis',
+        'target_metrics': 'research_preferences',
+        'content_budget': 'onboarding_session',
+        'team_size': 'onboarding_session',
+        'implementation_timeline': 'onboarding_session',
+        'market_share': 'website_analysis',
+        'competitive_position': 'website_analysis',
+        'performance_metrics': 'website_analysis',
+        'content_preferences': 'website_analysis',
+        'consumption_patterns': 'research_preferences',
+        'audience_pain_points': 'website_analysis',
+        'buying_journey': 'website_analysis',
+        'seasonal_trends': 'research_preferences',
+        'engagement_metrics': 'website_analysis',
+        'top_competitors': 'website_analysis',
+        'competitor_content_strategies': 'website_analysis',
+        'market_gaps': 'website_analysis',
+        'industry_trends': 'website_analysis',
+        'emerging_trends': 'website_analysis',
+        'preferred_formats': 'website_analysis',
+        'content_mix': 'research_preferences',
+        'content_frequency': 'research_preferences',
+        'optimal_timing': 'research_preferences',
+        'quality_metrics': 'website_analysis',
+        'editorial_guidelines': 'website_analysis',
+        'brand_voice': 'website_analysis',
+        'traffic_sources': 'website_analysis',
+        'conversion_rates': 'website_analysis',
+        'content_roi_targets': 'website_analysis',
+        'ab_testing_capabilities': 'onboarding_session'
+    }
+
+
+class StrategyUtils:
+    """
+    Namespace-style wrapper around this module's strategy utility functions.
+    Each static method forwards its arguments to the module-level function of the same name.
+    """
+    
+    @staticmethod
+    def calculate_strategic_scores(ai_recommendations: Dict[str, Any]) -> Dict[str, float]:
+        """Delegate to the module-level calculate_strategic_scores and return its result."""
+        return calculate_strategic_scores(ai_recommendations)
+    
+    @staticmethod
+    def extract_market_positioning(ai_recommendations: Dict[str, Any]) -> Dict[str, Any]:
+        """Delegate to the module-level extract_market_positioning and return its result."""
+        return extract_market_positioning(ai_recommendations)
+    
+    @staticmethod
+    def extract_competitive_advantages(ai_recommendations: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Delegate to the module-level extract_competitive_advantages and return its result."""
+        return extract_competitive_advantages(ai_recommendations)
+    
+    @staticmethod
+    def extract_strategic_risks(ai_recommendations: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Delegate to the module-level extract_strategic_risks and return its result."""
+        return extract_strategic_risks(ai_recommendations)
+    
+    @staticmethod
+    def extract_opportunity_analysis(ai_recommendations: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Delegate to the module-level extract_opportunity_analysis and return its result."""
+        return extract_opportunity_analysis(ai_recommendations)
+    
+    @staticmethod
+    def initialize_caches() -> Dict[str, Any]:
+        """Delegate to the module-level initialize_caches and return its result."""
+        return initialize_caches()
+    
+    @staticmethod
+    def calculate_data_quality_scores(data_sources: Dict[str, Any]) -> Dict[str, float]:
+        """Delegate to the module-level calculate_data_quality_scores and return its result."""
+        return calculate_data_quality_scores(data_sources)
+    
+    @staticmethod
+    def extract_content_preferences_from_style(writing_style: Dict[str, Any]) -> Dict[str, Any]:
+        """Delegate to the module-level extract_content_preferences_from_style and return its result."""
+        return extract_content_preferences_from_style(writing_style)
+    
+    @staticmethod
+    def extract_brand_voice_from_guidelines(style_guidelines: Dict[str, Any]) -> Dict[str, Any]:
+        """Delegate to the module-level extract_brand_voice_from_guidelines and return its result."""
+        return extract_brand_voice_from_guidelines(style_guidelines)
+    
+    @staticmethod
+    def extract_editorial_guidelines_from_style(writing_style: Dict[str, Any]) -> Dict[str, Any]:
+        """Delegate to the module-level extract_editorial_guidelines_from_style and return its result."""
+        return extract_editorial_guidelines_from_style(writing_style)
+    
+    @staticmethod
+    def create_field_mappings() -> Dict[str, str]:
+        """Delegate to the module-level create_field_mappings and return its result."""
+        return create_field_mappings()