ALwrity version 0.5.4
This commit is contained in:
@@ -1,10 +1,16 @@
|
||||
"""
|
||||
Onboarding Module
|
||||
Onboarding data integration and processing services.
|
||||
Onboarding data integration and processing.
|
||||
"""
|
||||
|
||||
from .data_integration import OnboardingDataIntegrationService
|
||||
from .field_transformation import FieldTransformationService
|
||||
from .data_quality import DataQualityService
|
||||
from .field_transformation import FieldTransformationService
|
||||
from .data_processor import OnboardingDataProcessor
|
||||
|
||||
__all__ = ['OnboardingDataIntegrationService', 'FieldTransformationService', 'DataQualityService']
|
||||
__all__ = [
|
||||
'OnboardingDataIntegrationService',
|
||||
'DataQualityService',
|
||||
'FieldTransformationService',
|
||||
'OnboardingDataProcessor'
|
||||
]
|
||||
@@ -305,19 +305,28 @@ class OnboardingDataIntegrationService:
|
||||
).first()
|
||||
|
||||
if existing_record:
|
||||
existing_record.website_analysis_data = integrated_data.get('website_analysis', {})
|
||||
existing_record.research_preferences_data = integrated_data.get('research_preferences', {})
|
||||
existing_record.api_keys_data = integrated_data.get('api_keys_data', {})
|
||||
# Use legacy columns that are known to exist
|
||||
if hasattr(existing_record, 'website_analysis_data'):
|
||||
existing_record.website_analysis_data = integrated_data.get('website_analysis', {})
|
||||
if hasattr(existing_record, 'research_preferences_data'):
|
||||
existing_record.research_preferences_data = integrated_data.get('research_preferences', {})
|
||||
if hasattr(existing_record, 'api_keys_data'):
|
||||
existing_record.api_keys_data = integrated_data.get('api_keys_data', {})
|
||||
existing_record.updated_at = datetime.utcnow()
|
||||
else:
|
||||
new_record = OnboardingDataIntegration(
|
||||
user_id=user_id,
|
||||
website_analysis_data=integrated_data.get('website_analysis', {}),
|
||||
research_preferences_data=integrated_data.get('research_preferences', {}),
|
||||
api_keys_data=integrated_data.get('api_keys_data', {}),
|
||||
created_at=datetime.utcnow(),
|
||||
updated_at=datetime.utcnow()
|
||||
)
|
||||
new_kwargs = {
|
||||
'user_id': user_id,
|
||||
'created_at': datetime.utcnow(),
|
||||
'updated_at': datetime.utcnow()
|
||||
}
|
||||
if 'website_analysis' in integrated_data:
|
||||
new_kwargs['website_analysis_data'] = integrated_data.get('website_analysis', {})
|
||||
if 'research_preferences' in integrated_data:
|
||||
new_kwargs['research_preferences_data'] = integrated_data.get('research_preferences', {})
|
||||
if 'api_keys_data' in integrated_data:
|
||||
new_kwargs['api_keys_data'] = integrated_data.get('api_keys_data', {})
|
||||
|
||||
new_record = OnboardingDataIntegration(**new_kwargs)
|
||||
db.add(new_record)
|
||||
|
||||
db.commit()
|
||||
@@ -326,6 +335,8 @@ class OnboardingDataIntegrationService:
|
||||
except Exception as e:
|
||||
logger.error(f"Error storing integrated data for user {user_id}: {str(e)}")
|
||||
db.rollback()
|
||||
# Soft-fail storage: do not break the refresh path
|
||||
return
|
||||
|
||||
def _get_fallback_data(self) -> Dict[str, Any]:
|
||||
"""Get fallback data when processing fails."""
|
||||
|
||||
@@ -0,0 +1,301 @@
|
||||
"""
|
||||
Onboarding Data Processor
|
||||
Handles processing and transformation of onboarding data for strategic intelligence.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, Any, Optional, Union
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
# Import database models
|
||||
from models.onboarding import OnboardingSession, WebsiteAnalysis, ResearchPreferences, APIKey
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class OnboardingDataProcessor:
|
||||
"""Processes and transforms onboarding data for strategic intelligence generation."""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def process_onboarding_data(self, user_id: int, db: Session) -> Optional[Dict[str, Any]]:
|
||||
"""Process onboarding data for a user and return structured data for strategic intelligence."""
|
||||
try:
|
||||
logger.info(f"Processing onboarding data for user {user_id}")
|
||||
|
||||
# Get onboarding session
|
||||
onboarding_session = db.query(OnboardingSession).filter(
|
||||
OnboardingSession.user_id == user_id
|
||||
).first()
|
||||
|
||||
if not onboarding_session:
|
||||
logger.warning(f"No onboarding session found for user {user_id}")
|
||||
return None
|
||||
|
||||
# Get website analysis data
|
||||
website_analysis = db.query(WebsiteAnalysis).filter(
|
||||
WebsiteAnalysis.session_id == onboarding_session.id
|
||||
).first()
|
||||
|
||||
# Get research preferences data
|
||||
research_preferences = db.query(ResearchPreferences).filter(
|
||||
ResearchPreferences.session_id == onboarding_session.id
|
||||
).first()
|
||||
|
||||
# Get API keys data
|
||||
api_keys = db.query(APIKey).filter(
|
||||
APIKey.session_id == onboarding_session.id
|
||||
).all()
|
||||
|
||||
# Process each data type
|
||||
processed_data = {
|
||||
'website_analysis': await self._process_website_analysis(website_analysis),
|
||||
'research_preferences': await self._process_research_preferences(research_preferences),
|
||||
'api_keys_data': await self._process_api_keys_data(api_keys),
|
||||
'session_data': self._process_session_data(onboarding_session)
|
||||
}
|
||||
|
||||
# Transform into strategic intelligence format
|
||||
strategic_data = self._transform_to_strategic_format(processed_data)
|
||||
|
||||
logger.info(f"Successfully processed onboarding data for user {user_id}")
|
||||
return strategic_data
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing onboarding data for user {user_id}: {str(e)}")
|
||||
return None
|
||||
|
||||
async def _process_website_analysis(self, website_analysis: Optional[WebsiteAnalysis]) -> Dict[str, Any]:
|
||||
"""Process website analysis data."""
|
||||
if not website_analysis:
|
||||
return {}
|
||||
|
||||
try:
|
||||
return {
|
||||
'website_url': getattr(website_analysis, 'website_url', ''),
|
||||
'industry': getattr(website_analysis, 'industry', 'Technology'), # Default value if attribute doesn't exist
|
||||
'content_goals': getattr(website_analysis, 'content_goals', []),
|
||||
'performance_metrics': getattr(website_analysis, 'performance_metrics', {}),
|
||||
'traffic_sources': getattr(website_analysis, 'traffic_sources', []),
|
||||
'content_gaps': getattr(website_analysis, 'content_gaps', []),
|
||||
'topics': getattr(website_analysis, 'topics', []),
|
||||
'content_quality_score': getattr(website_analysis, 'content_quality_score', 0),
|
||||
'seo_opportunities': getattr(website_analysis, 'seo_opportunities', []),
|
||||
'competitors': getattr(website_analysis, 'competitors', []),
|
||||
'competitive_advantages': getattr(website_analysis, 'competitive_advantages', []),
|
||||
'market_gaps': getattr(website_analysis, 'market_gaps', []),
|
||||
'last_updated': website_analysis.updated_at.isoformat() if hasattr(website_analysis, 'updated_at') and website_analysis.updated_at else None
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing website analysis: {str(e)}")
|
||||
return {}
|
||||
|
||||
async def _process_research_preferences(self, research_preferences: Optional[ResearchPreferences]) -> Dict[str, Any]:
|
||||
"""Process research preferences data."""
|
||||
if not research_preferences:
|
||||
return {}
|
||||
|
||||
try:
|
||||
return {
|
||||
'content_preferences': {
|
||||
'preferred_formats': research_preferences.content_types,
|
||||
'content_topics': research_preferences.research_topics,
|
||||
'content_style': research_preferences.writing_style.get('tone', []) if research_preferences.writing_style else [],
|
||||
'content_length': research_preferences.content_length,
|
||||
'visual_preferences': research_preferences.visual_preferences
|
||||
},
|
||||
'audience_research': {
|
||||
'target_audience': research_preferences.target_audience.get('demographics', []) if research_preferences.target_audience else [],
|
||||
'audience_pain_points': research_preferences.target_audience.get('pain_points', []) if research_preferences.target_audience else [],
|
||||
'buying_journey': research_preferences.target_audience.get('buying_journey', {}) if research_preferences.target_audience else {},
|
||||
'consumption_patterns': research_preferences.target_audience.get('consumption_patterns', {}) if research_preferences.target_audience else {}
|
||||
},
|
||||
'research_goals': {
|
||||
'primary_goals': research_preferences.research_topics,
|
||||
'secondary_goals': research_preferences.content_types,
|
||||
'success_metrics': research_preferences.success_metrics
|
||||
},
|
||||
'last_updated': research_preferences.updated_at.isoformat() if research_preferences.updated_at else None
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing research preferences: {str(e)}")
|
||||
return {}
|
||||
|
||||
async def _process_api_keys_data(self, api_keys: List[APIKey]) -> Dict[str, Any]:
|
||||
"""Process API keys data."""
|
||||
try:
|
||||
processed_data = {
|
||||
'analytics_data': {},
|
||||
'social_media_data': {},
|
||||
'competitor_data': {},
|
||||
'last_updated': None
|
||||
}
|
||||
|
||||
for api_key in api_keys:
|
||||
if api_key.provider == 'google_analytics':
|
||||
processed_data['analytics_data']['google_analytics'] = {
|
||||
'connected': True,
|
||||
'data_available': True,
|
||||
'metrics': api_key.metrics if api_key.metrics else {}
|
||||
}
|
||||
elif api_key.provider == 'google_search_console':
|
||||
processed_data['analytics_data']['google_search_console'] = {
|
||||
'connected': True,
|
||||
'data_available': True,
|
||||
'metrics': api_key.metrics if api_key.metrics else {}
|
||||
}
|
||||
elif api_key.provider in ['linkedin', 'twitter', 'facebook']:
|
||||
processed_data['social_media_data'][api_key.provider] = {
|
||||
'connected': True,
|
||||
'followers': api_key.metrics.get('followers', 0) if api_key.metrics else 0
|
||||
}
|
||||
elif api_key.provider in ['semrush', 'ahrefs', 'moz']:
|
||||
processed_data['competitor_data'][api_key.provider] = {
|
||||
'connected': True,
|
||||
'competitors_analyzed': api_key.metrics.get('competitors_analyzed', 0) if api_key.metrics else 0
|
||||
}
|
||||
|
||||
# Update last_updated if this key is more recent
|
||||
if api_key.updated_at and (not processed_data['last_updated'] or api_key.updated_at > datetime.fromisoformat(processed_data['last_updated'])):
|
||||
processed_data['last_updated'] = api_key.updated_at.isoformat()
|
||||
|
||||
return processed_data
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing API keys data: {str(e)}")
|
||||
return {}
|
||||
|
||||
def _process_session_data(self, onboarding_session: OnboardingSession) -> Dict[str, Any]:
|
||||
"""Process onboarding session data."""
|
||||
try:
|
||||
return {
|
||||
'session_id': getattr(onboarding_session, 'id', None),
|
||||
'user_id': getattr(onboarding_session, 'user_id', None),
|
||||
'created_at': onboarding_session.created_at.isoformat() if hasattr(onboarding_session, 'created_at') and onboarding_session.created_at else None,
|
||||
'updated_at': onboarding_session.updated_at.isoformat() if hasattr(onboarding_session, 'updated_at') and onboarding_session.updated_at else None,
|
||||
'completion_status': getattr(onboarding_session, 'completion_status', 'in_progress'),
|
||||
'session_data': getattr(onboarding_session, 'session_data', {}),
|
||||
'progress_percentage': getattr(onboarding_session, 'progress_percentage', 0),
|
||||
'last_activity': getattr(onboarding_session, 'last_activity', None)
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing session data: {str(e)}")
|
||||
return {}
|
||||
|
||||
def _transform_to_strategic_format(self, processed_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Transform processed onboarding data into strategic intelligence format."""
|
||||
try:
|
||||
website_data = processed_data.get('website_analysis', {})
|
||||
research_data = processed_data.get('research_preferences', {})
|
||||
api_data = processed_data.get('api_keys_data', {})
|
||||
session_data = processed_data.get('session_data', {})
|
||||
|
||||
# Return data in nested format that field transformation service expects
|
||||
return {
|
||||
'website_analysis': {
|
||||
'content_goals': website_data.get('content_goals', []),
|
||||
'performance_metrics': website_data.get('performance_metrics', {}),
|
||||
'competitors': website_data.get('competitors', []),
|
||||
'content_gaps': website_data.get('content_gaps', []),
|
||||
'industry': website_data.get('industry', 'Technology'),
|
||||
'target_audience': website_data.get('target_audience', {}),
|
||||
'business_type': website_data.get('business_type', 'Technology')
|
||||
},
|
||||
'research_preferences': {
|
||||
'content_types': research_data.get('content_preferences', {}).get('preferred_formats', []),
|
||||
'research_topics': research_data.get('research_topics', []),
|
||||
'performance_tracking': research_data.get('performance_tracking', []),
|
||||
'competitor_analysis': research_data.get('competitor_analysis', []),
|
||||
'target_audience': research_data.get('audience_research', {}).get('target_audience', {}),
|
||||
'industry_focus': research_data.get('industry_focus', []),
|
||||
'trend_analysis': research_data.get('trend_analysis', []),
|
||||
'content_calendar': research_data.get('content_calendar', {})
|
||||
},
|
||||
'onboarding_session': {
|
||||
'session_data': {
|
||||
'budget': session_data.get('budget', 3000),
|
||||
'team_size': session_data.get('team_size', 2),
|
||||
'timeline': session_data.get('timeline', '3 months'),
|
||||
'brand_voice': session_data.get('brand_voice', 'Professional yet approachable')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error transforming to strategic format: {str(e)}")
|
||||
return {}
|
||||
|
||||
def calculate_data_quality_scores(self, processed_data: Dict[str, Any]) -> Dict[str, float]:
|
||||
"""Calculate quality scores for each data source."""
|
||||
scores = {}
|
||||
|
||||
for source, data in processed_data.items():
|
||||
if data and isinstance(data, dict):
|
||||
# Simple scoring based on data completeness
|
||||
total_fields = len(data)
|
||||
present_fields = len([v for v in data.values() if v is not None and v != {}])
|
||||
completeness = present_fields / total_fields if total_fields > 0 else 0.0
|
||||
scores[source] = completeness * 100
|
||||
else:
|
||||
scores[source] = 0.0
|
||||
|
||||
return scores
|
||||
|
||||
def calculate_confidence_levels(self, processed_data: Dict[str, Any]) -> Dict[str, float]:
|
||||
"""Calculate confidence levels for processed data."""
|
||||
confidence_levels = {}
|
||||
|
||||
# Base confidence on data source quality
|
||||
base_confidence = {
|
||||
'website_analysis': 0.8,
|
||||
'research_preferences': 0.7,
|
||||
'api_keys_data': 0.6,
|
||||
'session_data': 0.9
|
||||
}
|
||||
|
||||
for source, data in processed_data.items():
|
||||
if data and isinstance(data, dict):
|
||||
# Adjust confidence based on data completeness
|
||||
quality_score = self.calculate_data_quality_scores({source: data})[source] / 100
|
||||
base_conf = base_confidence.get(source, 0.5)
|
||||
confidence_levels[source] = base_conf * quality_score
|
||||
else:
|
||||
confidence_levels[source] = 0.0
|
||||
|
||||
return confidence_levels
|
||||
|
||||
def calculate_data_freshness(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Calculate data freshness for onboarding data."""
|
||||
try:
|
||||
updated_at = session_data.get('updated_at')
|
||||
if not updated_at:
|
||||
return {'status': 'unknown', 'age_days': 'unknown'}
|
||||
|
||||
# Convert string to datetime if needed
|
||||
if isinstance(updated_at, str):
|
||||
try:
|
||||
updated_at = datetime.fromisoformat(updated_at.replace('Z', '+00:00'))
|
||||
except ValueError:
|
||||
return {'status': 'unknown', 'age_days': 'unknown'}
|
||||
|
||||
age_days = (datetime.utcnow() - updated_at).days
|
||||
|
||||
if age_days <= 7:
|
||||
status = 'fresh'
|
||||
elif age_days <= 30:
|
||||
status = 'recent'
|
||||
elif age_days <= 90:
|
||||
status = 'aging'
|
||||
else:
|
||||
status = 'stale'
|
||||
|
||||
return {
|
||||
'status': status,
|
||||
'age_days': age_days,
|
||||
'last_updated': updated_at.isoformat() if hasattr(updated_at, 'isoformat') else str(updated_at)
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calculating data freshness: {str(e)}")
|
||||
return {'status': 'unknown', 'age_days': 'unknown'}
|
||||
@@ -92,7 +92,8 @@ class DataQualityService:
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error assessing data quality: {str(e)}")
|
||||
return self._get_fallback_quality_assessment()
|
||||
# Raise exception instead of returning fallback data
|
||||
raise Exception(f"Failed to assess data quality: {str(e)}")
|
||||
|
||||
def _assess_website_analysis_quality(self, website_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Assess quality of website analysis data."""
|
||||
@@ -501,22 +502,6 @@ class DataQualityService:
|
||||
logger.error(f"Error identifying quality issues: {str(e)}")
|
||||
return ["Unable to identify issues due to assessment error"]
|
||||
|
||||
def _get_fallback_quality_assessment(self) -> Dict[str, Any]:
|
||||
"""Get fallback quality assessment when assessment fails."""
|
||||
return {
|
||||
'overall_score': 0.0,
|
||||
'completeness': 0.0,
|
||||
'freshness': 0.0,
|
||||
'accuracy': 0.0,
|
||||
'relevance': 0.0,
|
||||
'consistency': 0.0,
|
||||
'confidence': 0.0,
|
||||
'quality_level': 'poor',
|
||||
'recommendations': ['Unable to assess data quality'],
|
||||
'issues': ['Quality assessment failed'],
|
||||
'assessment_timestamp': datetime.utcnow().isoformat()
|
||||
}
|
||||
|
||||
def validate_field_data(self, field_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate individual field data."""
|
||||
try:
|
||||
|
||||
@@ -147,48 +147,108 @@ class FieldTransformationService:
|
||||
}
|
||||
|
||||
def transform_onboarding_data_to_fields(self, integrated_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Transform integrated onboarding data to strategic input fields."""
|
||||
"""Transform onboarding data to strategic input fields."""
|
||||
try:
|
||||
logger.info("Transforming onboarding data to strategic fields")
|
||||
|
||||
|
||||
transformed_fields = {}
|
||||
data_sources = {}
|
||||
|
||||
for field_id, mapping_config in self.field_mappings.items():
|
||||
try:
|
||||
# Extract data from sources
|
||||
source_data = self._extract_source_data(integrated_data, mapping_config['sources'])
|
||||
|
||||
if source_data:
|
||||
# Apply transformation
|
||||
transformation_method = getattr(self, mapping_config['transformation'])
|
||||
transformed_value = transformation_method(source_data, integrated_data)
|
||||
|
||||
if transformed_value:
|
||||
transformed_fields[field_id] = transformed_value
|
||||
data_sources[field_id] = self._get_data_source_info(mapping_config['sources'], integrated_data)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error transforming field {field_id}: {str(e)}")
|
||||
continue
|
||||
|
||||
result = {
|
||||
'fields': transformed_fields,
|
||||
'sources': data_sources,
|
||||
'transformation_metadata': {
|
||||
'total_fields_processed': len(self.field_mappings),
|
||||
'successful_transformations': len(transformed_fields),
|
||||
'transformation_timestamp': datetime.utcnow().isoformat()
|
||||
}
|
||||
transformation_metadata = {
|
||||
'total_fields': 0,
|
||||
'populated_fields': 0,
|
||||
'data_sources_used': [],
|
||||
'confidence_scores': {}
|
||||
}
|
||||
|
||||
logger.info(f"Successfully transformed {len(transformed_fields)} fields from onboarding data")
|
||||
return result
|
||||
|
||||
|
||||
# Process each field mapping
|
||||
for field_name, mapping in self.field_mappings.items():
|
||||
try:
|
||||
sources = mapping.get('sources', [])
|
||||
transformation_method = mapping.get('transformation')
|
||||
|
||||
# Extract source data
|
||||
source_data = self._extract_source_data(integrated_data, sources)
|
||||
|
||||
# Apply transformation if method exists
|
||||
if transformation_method and hasattr(self, transformation_method):
|
||||
transform_func = getattr(self, transformation_method)
|
||||
field_value = transform_func(source_data, integrated_data)
|
||||
else:
|
||||
# Default transformation - use first available source data
|
||||
field_value = self._default_transformation(source_data, field_name)
|
||||
|
||||
# If no value found, provide default based on field type
|
||||
if field_value is None or field_value == "":
|
||||
field_value = self._get_default_value_for_field(field_name)
|
||||
|
||||
if field_value is not None:
|
||||
transformed_fields[field_name] = {
|
||||
'value': field_value,
|
||||
'source': sources[0] if sources else 'default',
|
||||
'confidence': self._calculate_field_confidence(source_data, sources),
|
||||
'auto_populated': True
|
||||
}
|
||||
transformation_metadata['populated_fields'] += 1
|
||||
|
||||
transformation_metadata['total_fields'] += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error transforming field {field_name}: {str(e)}")
|
||||
# Don't provide fallback data - let the error propagate
|
||||
transformation_metadata['total_fields'] += 1
|
||||
|
||||
logger.info(f"Successfully transformed {transformation_metadata['populated_fields']} fields from onboarding data")
|
||||
|
||||
return {
|
||||
'fields': transformed_fields,
|
||||
'sources': self._get_data_source_info(list(self.field_mappings.keys()), integrated_data),
|
||||
'transformation_metadata': transformation_metadata
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error transforming onboarding data to fields: {str(e)}")
|
||||
logger.error(f"Error in transform_onboarding_data_to_fields: {str(e)}")
|
||||
return {'fields': {}, 'sources': {}, 'transformation_metadata': {'error': str(e)}}
|
||||
|
||||
def get_data_sources(self, integrated_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get data sources information for the transformed fields."""
|
||||
try:
|
||||
sources_info = {}
|
||||
for field_name, mapping in self.field_mappings.items():
|
||||
sources = mapping.get('sources', [])
|
||||
sources_info[field_name] = {
|
||||
'sources': sources,
|
||||
'source_count': len(sources),
|
||||
'has_data': any(self._has_source_data(integrated_data, source) for source in sources)
|
||||
}
|
||||
return sources_info
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting data sources: {str(e)}")
|
||||
return {}
|
||||
|
||||
def get_detailed_input_data_points(self, integrated_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get detailed input data points for debugging and analysis."""
|
||||
try:
|
||||
data_points = {}
|
||||
for field_name, mapping in self.field_mappings.items():
|
||||
sources = mapping.get('sources', [])
|
||||
source_data = {}
|
||||
|
||||
for source in sources:
|
||||
source_data[source] = {
|
||||
'exists': self._has_source_data(integrated_data, source),
|
||||
'value': self._get_nested_value(integrated_data, source),
|
||||
'type': type(self._get_nested_value(integrated_data, source)).__name__
|
||||
}
|
||||
|
||||
data_points[field_name] = {
|
||||
'sources': source_data,
|
||||
'transformation_method': mapping.get('transformation'),
|
||||
'has_data': any(source_data[source]['exists'] for source in sources)
|
||||
}
|
||||
return data_points
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting detailed input data points: {str(e)}")
|
||||
return {}
|
||||
|
||||
def _extract_source_data(self, integrated_data: Dict[str, Any], sources: List[str]) -> Dict[str, Any]:
|
||||
"""Extract data from specified sources."""
|
||||
source_data = {}
|
||||
@@ -362,22 +422,34 @@ class FieldTransformationService:
|
||||
return None
|
||||
|
||||
def extract_competitive_position(self, source_data: Dict[str, Any], integrated_data: Dict[str, Any]) -> Optional[str]:
|
||||
"""Extract competitive position from competitor data."""
|
||||
"""Extract and normalize competitive position to one of Leader, Challenger, Niche, Emerging."""
|
||||
try:
|
||||
position_indicators = []
|
||||
text_blobs: list[str] = []
|
||||
|
||||
if 'website_analysis.competitors' in source_data:
|
||||
competitors = source_data['website_analysis.competitors']
|
||||
if competitors:
|
||||
position_indicators.append(f"Competitors: {competitors}")
|
||||
if isinstance(competitors, (str, list, dict)):
|
||||
text_blobs.append(str(competitors))
|
||||
|
||||
if 'research_preferences.competitor_analysis' in source_data:
|
||||
analysis = source_data['research_preferences.competitor_analysis']
|
||||
if analysis:
|
||||
position_indicators.append(f"Analysis: {analysis}")
|
||||
|
||||
return '; '.join(position_indicators) if position_indicators else None
|
||||
if isinstance(analysis, (str, list, dict)):
|
||||
text_blobs.append(str(analysis))
|
||||
|
||||
blob = ' '.join(text_blobs).lower()
|
||||
|
||||
# Simple keyword heuristics
|
||||
if any(kw in blob for kw in ['leader', 'market leader', 'category leader', 'dominant']):
|
||||
return 'Leader'
|
||||
if any(kw in blob for kw in ['challenger', 'fast follower', 'aggressive']):
|
||||
return 'Challenger'
|
||||
if any(kw in blob for kw in ['niche', 'niche player', 'specialized']):
|
||||
return 'Niche'
|
||||
if any(kw in blob for kw in ['emerging', 'new entrant', 'startup', 'growing']):
|
||||
return 'Emerging'
|
||||
|
||||
# No clear signal; let default take over
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting competitive position: {str(e)}")
|
||||
return None
|
||||
@@ -427,6 +499,15 @@ class FieldTransformationService:
|
||||
if research_audience:
|
||||
patterns.append(f"Research Audience: {research_audience}")
|
||||
|
||||
# If we have consumption data as a dict, format it nicely
|
||||
if isinstance(integrated_data.get('consumption_patterns'), dict):
|
||||
consumption_data = integrated_data['consumption_patterns']
|
||||
if isinstance(consumption_data, dict):
|
||||
formatted_patterns = []
|
||||
for platform, percentage in consumption_data.items():
|
||||
formatted_patterns.append(f"{platform.title()}: {percentage}%")
|
||||
patterns.append(', '.join(formatted_patterns))
|
||||
|
||||
return '; '.join(patterns) if patterns else None
|
||||
|
||||
except Exception as e:
|
||||
@@ -465,6 +546,16 @@ class FieldTransformationService:
|
||||
audience = source_data['website_analysis.target_audience']
|
||||
if audience:
|
||||
return f"Journey based on: {audience}"
|
||||
|
||||
# If we have buying journey data as a dict, format it nicely
|
||||
if isinstance(integrated_data.get('buying_journey'), dict):
|
||||
journey_data = integrated_data['buying_journey']
|
||||
if isinstance(journey_data, dict):
|
||||
formatted_journey = []
|
||||
for stage, percentage in journey_data.items():
|
||||
formatted_journey.append(f"{stage.title()}: {percentage}%")
|
||||
return ', '.join(formatted_journey)
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
@@ -599,16 +690,51 @@ class FieldTransformationService:
|
||||
return None
|
||||
|
||||
def extract_preferred_formats(self, source_data: Dict[str, Any], integrated_data: Dict[str, Any]) -> Optional[str]:
|
||||
"""Extract preferred content formats."""
|
||||
"""Extract preferred content formats and normalize to UI option labels array."""
|
||||
try:
|
||||
def to_canonical(label: str) -> Optional[str]:
|
||||
normalized = label.strip().lower()
|
||||
mapping = {
|
||||
'blog': 'Blog Posts',
|
||||
'blog post': 'Blog Posts',
|
||||
'blog posts': 'Blog Posts',
|
||||
'article': 'Blog Posts',
|
||||
'articles': 'Blog Posts',
|
||||
'video': 'Videos',
|
||||
'videos': 'Videos',
|
||||
'infographic': 'Infographics',
|
||||
'infographics': 'Infographics',
|
||||
'webinar': 'Webinars',
|
||||
'webinars': 'Webinars',
|
||||
'podcast': 'Podcasts',
|
||||
'podcasts': 'Podcasts',
|
||||
'case study': 'Case Studies',
|
||||
'case studies': 'Case Studies',
|
||||
'whitepaper': 'Whitepapers',
|
||||
'whitepapers': 'Whitepapers',
|
||||
'social': 'Social Media Posts',
|
||||
'social media': 'Social Media Posts',
|
||||
'social media posts': 'Social Media Posts'
|
||||
}
|
||||
return mapping.get(normalized, None)
|
||||
|
||||
if 'research_preferences.content_types' in source_data:
|
||||
content_types = source_data['research_preferences.content_types']
|
||||
canonical: list[str] = []
|
||||
if isinstance(content_types, list):
|
||||
return ', '.join(content_types)
|
||||
for item in content_types:
|
||||
if isinstance(item, str):
|
||||
canon = to_canonical(item)
|
||||
if canon and canon not in canonical:
|
||||
canonical.append(canon)
|
||||
elif isinstance(content_types, str):
|
||||
return content_types
|
||||
for part in content_types.split(','):
|
||||
canon = to_canonical(part)
|
||||
if canon and canon not in canonical:
|
||||
canonical.append(canon)
|
||||
if canonical:
|
||||
return canonical
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting preferred formats: {str(e)}")
|
||||
return None
|
||||
@@ -654,6 +780,20 @@ class FieldTransformationService:
|
||||
calendar = source_data['research_preferences.content_calendar']
|
||||
if calendar:
|
||||
return str(calendar)
|
||||
|
||||
# If we have optimal timing data as a dict, format it nicely
|
||||
if isinstance(integrated_data.get('optimal_timing'), dict):
|
||||
timing_data = integrated_data['optimal_timing']
|
||||
if isinstance(timing_data, dict):
|
||||
formatted_timing = []
|
||||
if 'best_days' in timing_data:
|
||||
days = timing_data['best_days']
|
||||
if isinstance(days, list):
|
||||
formatted_timing.append(f"Best Days: {', '.join(days)}")
|
||||
if 'best_time' in timing_data:
|
||||
formatted_timing.append(f"Best Time: {timing_data['best_time']}")
|
||||
return ', '.join(formatted_timing)
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
@@ -668,7 +808,19 @@ class FieldTransformationService:
|
||||
if isinstance(metrics, dict):
|
||||
quality_metrics = {k: v for k, v in metrics.items() if 'quality' in k.lower()}
|
||||
if quality_metrics:
|
||||
return ', '.join([f"{k}: {v}" for k, v in quality_metrics.items()])
|
||||
return ', '.join([f"{k.title()}: {v}" for k, v in quality_metrics.items()])
|
||||
elif isinstance(metrics, str):
|
||||
return metrics
|
||||
|
||||
# If we have quality metrics data as a dict, format it nicely
|
||||
if isinstance(integrated_data.get('quality_metrics'), dict):
|
||||
quality_data = integrated_data['quality_metrics']
|
||||
if isinstance(quality_data, dict):
|
||||
formatted_metrics = []
|
||||
for metric, value in quality_data.items():
|
||||
formatted_metrics.append(f"{metric.title()}: {value}")
|
||||
return ', '.join(formatted_metrics)
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
@@ -725,7 +877,9 @@ class FieldTransformationService:
|
||||
if isinstance(metrics, dict):
|
||||
traffic_metrics = {k: v for k, v in metrics.items() if 'traffic' in k.lower()}
|
||||
if traffic_metrics:
|
||||
return ', '.join([f"{k}: {v}" for k, v in traffic_metrics.items()])
|
||||
return ', '.join([f"{k.title()}: {v}%" for k, v in traffic_metrics.items()])
|
||||
elif isinstance(metrics, str):
|
||||
return metrics
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
@@ -740,7 +894,9 @@ class FieldTransformationService:
|
||||
if isinstance(metrics, dict):
|
||||
conversion_metrics = {k: v for k, v in metrics.items() if 'conversion' in k.lower()}
|
||||
if conversion_metrics:
|
||||
return ', '.join([f"{k}: {v}" for k, v in conversion_metrics.items()])
|
||||
return ', '.join([f"{k.title()}: {v}%" for k, v in conversion_metrics.items()])
|
||||
elif isinstance(metrics, str):
|
||||
return metrics
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
@@ -770,21 +926,135 @@ class FieldTransformationService:
|
||||
logger.error(f"Error extracting ROI targets: {str(e)}")
|
||||
return None
|
||||
|
||||
def extract_ab_testing_capabilities(self, source_data: Dict[str, Any], integrated_data: Dict[str, Any]) -> Optional[str]:
|
||||
def extract_ab_testing_capabilities(self, source_data: Dict[str, Any], integrated_data: Dict[str, Any]) -> Optional[bool]:
|
||||
"""Extract A/B testing capabilities from team size."""
|
||||
try:
|
||||
if 'onboarding_session.session_data.team_size' in source_data:
|
||||
team_size = source_data['onboarding_session.session_data.team_size']
|
||||
if team_size:
|
||||
# Simple logic based on team size
|
||||
if int(team_size) > 5:
|
||||
return "Advanced A/B testing capabilities"
|
||||
elif int(team_size) > 2:
|
||||
return "Basic A/B testing capabilities"
|
||||
else:
|
||||
return "Limited A/B testing capabilities"
|
||||
return None
|
||||
# Return boolean based on team size
|
||||
team_size_int = int(team_size) if isinstance(team_size, (str, int, float)) else 1
|
||||
return team_size_int > 2 # True if team size > 2, False otherwise
|
||||
|
||||
# Default to False if no team size data
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting A/B testing capabilities: {str(e)}")
|
||||
return False
|
||||
|
||||
def _get_default_value_for_field(self, field_name: str) -> Any:
|
||||
"""Get default value for a field when no data is available."""
|
||||
# Provide sensible defaults for required fields
|
||||
default_values = {
|
||||
'business_objectives': 'Lead Generation, Brand Awareness',
|
||||
'target_metrics': 'Traffic Growth: 30%, Engagement Rate: 5%, Conversion Rate: 2%',
|
||||
'content_budget': 1000,
|
||||
'team_size': 1,
|
||||
'implementation_timeline': '3 months',
|
||||
'market_share': 'Small but growing',
|
||||
'competitive_position': 'Niche',
|
||||
'performance_metrics': 'Current Traffic: 1000, Current Engagement: 3%',
|
||||
'content_preferences': 'Blog posts, Social media content',
|
||||
'consumption_patterns': 'Mobile: 60%, Desktop: 40%',
|
||||
'audience_pain_points': 'Time constraints, Content quality',
|
||||
'buying_journey': 'Awareness: 40%, Consideration: 35%, Decision: 25%',
|
||||
'seasonal_trends': 'Q4 peak, Summer slowdown',
|
||||
'engagement_metrics': 'Likes: 100, Shares: 20, Comments: 15',
|
||||
'top_competitors': 'Competitor A, Competitor B',
|
||||
'competitor_content_strategies': 'Blog-focused, Video-heavy',
|
||||
'market_gaps': 'Underserved niche, Content gap',
|
||||
'industry_trends': 'AI integration, Video content',
|
||||
'emerging_trends': 'Voice search, Interactive content',
|
||||
'preferred_formats': ['Blog Posts', 'Videos', 'Infographics'],
|
||||
'content_mix': 'Educational: 40%, Entertaining: 30%, Promotional: 30%',
|
||||
'content_frequency': 'Weekly',
|
||||
'optimal_timing': 'Best Days: Tuesday, Thursday, Best Time: 10 AM',
|
||||
'quality_metrics': 'Readability: 8, Engagement: 7, SEO Score: 6',
|
||||
'editorial_guidelines': 'Professional tone, Clear structure',
|
||||
'brand_voice': 'Professional yet approachable',
|
||||
'traffic_sources': 'Organic: 60%, Social: 25%, Direct: 15%',
|
||||
'conversion_rates': 'Overall: 2%, Blog: 3%, Landing Pages: 5%',
|
||||
'content_roi_targets': 'Target ROI: 300%, Break Even: 6 months',
|
||||
'ab_testing_capabilities': False
|
||||
}
|
||||
|
||||
return default_values.get(field_name, None)
|
||||
|
||||
def _default_transformation(self, source_data: Dict[str, Any], field_name: str) -> Any:
|
||||
"""Default transformation when no specific method is available."""
|
||||
try:
|
||||
# Try to find any non-empty value in source data
|
||||
for key, value in source_data.items():
|
||||
if value is not None and value != "":
|
||||
# For budget and team_size, try to convert to number
|
||||
if field_name in ['content_budget', 'team_size'] and isinstance(value, (str, int, float)):
|
||||
try:
|
||||
return int(value) if field_name == 'team_size' else float(value)
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
# For other fields, return the first non-empty value
|
||||
return value
|
||||
|
||||
# If no value found, return None
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Error in default transformation for {field_name}: {str(e)}")
|
||||
return None
|
||||
|
||||
def _calculate_field_confidence(self, source_data: Dict[str, Any], sources: List[str]) -> float:
|
||||
"""Calculate confidence score for a field based on data quality and source availability."""
|
||||
try:
|
||||
if not source_data:
|
||||
return 0.3 # Low confidence when no data
|
||||
|
||||
# Check data quality indicators
|
||||
data_quality_score = 0.0
|
||||
total_indicators = 0
|
||||
|
||||
# Check if data is not empty
|
||||
for key, value in source_data.items():
|
||||
if value is not None and value != "":
|
||||
data_quality_score += 1.0
|
||||
total_indicators += 1
|
||||
|
||||
# Check source availability
|
||||
source_availability = len([s for s in sources if self._has_source_data(source_data, s)]) / max(len(sources), 1)
|
||||
|
||||
# Calculate final confidence
|
||||
if total_indicators > 0:
|
||||
data_quality = data_quality_score / total_indicators
|
||||
confidence = (data_quality + source_availability) / 2
|
||||
return min(confidence, 1.0) # Cap at 1.0
|
||||
else:
|
||||
return 0.3 # Default low confidence
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calculating field confidence: {str(e)}")
|
||||
return 0.3 # Default low confidence
|
||||
|
||||
def _has_source_data(self, integrated_data: Dict[str, Any], source_path: str) -> bool:
|
||||
"""Check if source data exists in integrated data."""
|
||||
try:
|
||||
value = self._get_nested_value(integrated_data, source_path)
|
||||
return value is not None and value != ""
|
||||
except Exception as e:
|
||||
logger.debug(f"Error checking source data for {source_path}: {str(e)}")
|
||||
return False
|
||||
|
||||
def _get_nested_value(self, data: Dict[str, Any], path: str) -> Any:
|
||||
"""Get nested value from dictionary using dot notation."""
|
||||
try:
|
||||
keys = path.split('.')
|
||||
value = data
|
||||
|
||||
for key in keys:
|
||||
if isinstance(value, dict) and key in value:
|
||||
value = value[key]
|
||||
else:
|
||||
return None
|
||||
|
||||
return value
|
||||
except Exception as e:
|
||||
logger.debug(f"Error getting nested value for {path}: {str(e)}")
|
||||
return None
|
||||
Reference in New Issue
Block a user