Added documentation for the auto-population feature and the analytics integration.

This commit is contained in:
ajaysi
2026-01-17 11:01:10 +05:30
parent 8193cdba67
commit 1db10ccd0f
61 changed files with 6773 additions and 579 deletions

View File

@@ -17,8 +17,11 @@ from models.onboarding import (
OnboardingSession,
WebsiteAnalysis,
ResearchPreferences,
APIKey
APIKey,
PersonaData,
CompetitorAnalysis
)
import os
logger = logging.getLogger(__name__)
@@ -29,8 +32,13 @@ class OnboardingDataIntegrationService:
self.data_freshness_threshold = timedelta(hours=24)
self.max_analysis_age = timedelta(days=7)
async def process_onboarding_data(self, user_id: int, db: Session) -> Dict[str, Any]:
"""Process and integrate all onboarding data for a user."""
async def process_onboarding_data(self, user_id: str, db: Session) -> Dict[str, Any]:
"""Process and integrate all onboarding data for a user.
Args:
user_id: Clerk user ID (string format, e.g., 'user_xxx')
db: Database session
"""
try:
logger.info(f"Processing onboarding data for user: {user_id}")
@@ -39,6 +47,10 @@ class OnboardingDataIntegrationService:
research_preferences = self._get_research_preferences(user_id, db)
api_keys_data = self._get_api_keys_data(user_id, db)
onboarding_session = self._get_onboarding_session(user_id, db)
persona_data = self._get_persona_data(user_id, db)
competitor_analysis = self._get_competitor_analysis(user_id, db)
gsc_analytics = await self._get_gsc_analytics(user_id)
bing_analytics = await self._get_bing_analytics(user_id)
# Log data source status
logger.info(f"Data source status for user {user_id}:")
@@ -46,6 +58,10 @@ class OnboardingDataIntegrationService:
logger.info(f" - Research preferences: {'✅ Found' if research_preferences else '❌ Missing'}")
logger.info(f" - API keys data: {'✅ Found' if api_keys_data else '❌ Missing'}")
logger.info(f" - Onboarding session: {'✅ Found' if onboarding_session else '❌ Missing'}")
logger.info(f" - Persona data: {'✅ Found' if persona_data else '❌ Missing'}")
logger.info(f" - Competitor analysis: {'✅ Found' if competitor_analysis else '❌ Missing'}")
logger.info(f" - GSC Analytics: {'✅ Found' if gsc_analytics else '❌ Missing'}")
logger.info(f" - Bing Analytics: {'✅ Found' if bing_analytics else '❌ Missing'}")
# Process and integrate data
integrated_data = {
@@ -53,7 +69,11 @@ class OnboardingDataIntegrationService:
'research_preferences': research_preferences,
'api_keys_data': api_keys_data,
'onboarding_session': onboarding_session,
'data_quality': self._assess_data_quality(website_analysis, research_preferences, api_keys_data),
'persona_data': persona_data,
'competitor_analysis': competitor_analysis,
'gsc_analytics': gsc_analytics,
'bing_analytics': bing_analytics,
'data_quality': self._assess_data_quality(website_analysis, research_preferences, api_keys_data, persona_data, competitor_analysis, gsc_analytics, bing_analytics),
'processing_timestamp': datetime.utcnow().isoformat()
}
@@ -76,7 +96,7 @@ class OnboardingDataIntegrationService:
logger.error("Traceback:\n%s", traceback.format_exc())
return self._get_fallback_data()
def _get_website_analysis(self, user_id: int, db: Session) -> Dict[str, Any]:
def _get_website_analysis(self, user_id: str, db: Session) -> Dict[str, Any]:
"""Get website analysis data for the user."""
try:
# Get the latest onboarding session for the user
@@ -109,7 +129,7 @@ class OnboardingDataIntegrationService:
logger.error(f"Error getting website analysis for user {user_id}: {str(e)}")
return {}
def _get_research_preferences(self, user_id: int, db: Session) -> Dict[str, Any]:
def _get_research_preferences(self, user_id: str, db: Session) -> Dict[str, Any]:
"""Get research preferences data for the user."""
try:
# Get the latest onboarding session for the user
@@ -142,7 +162,7 @@ class OnboardingDataIntegrationService:
logger.error(f"Error getting research preferences for user {user_id}: {str(e)}")
return {}
def _get_api_keys_data(self, user_id: int, db: Session) -> Dict[str, Any]:
def _get_api_keys_data(self, user_id: str, db: Session) -> Dict[str, Any]:
"""Get API keys data for the user."""
try:
# Get the latest onboarding session for the user
@@ -179,7 +199,7 @@ class OnboardingDataIntegrationService:
logger.error(f"Error getting API keys data for user {user_id}: {str(e)}")
return {}
def _get_onboarding_session(self, user_id: int, db: Session) -> Dict[str, Any]:
def _get_onboarding_session(self, user_id: str, db: Session) -> Dict[str, Any]:
"""Get onboarding session data for the user."""
try:
# Get the latest onboarding session for the user
@@ -210,7 +230,7 @@ class OnboardingDataIntegrationService:
logger.error(f"Error getting onboarding session for user {user_id}: {str(e)}")
return {}
def _assess_data_quality(self, website_analysis: Dict, research_preferences: Dict, api_keys_data: Dict) -> Dict[str, Any]:
def _assess_data_quality(self, website_analysis: Dict, research_preferences: Dict, api_keys_data: Dict, persona_data: Dict = None, competitor_analysis: List = None, gsc_analytics: Dict = None, bing_analytics: Dict = None) -> Dict[str, Any]:
"""Assess the quality and completeness of onboarding data."""
try:
quality_metrics = {
@@ -244,6 +264,26 @@ class OnboardingDataIntegrationService:
if api_keys_data:
filled_fields += 1
# Persona data completeness
total_fields += 1
if persona_data and persona_data.get('core_persona'):
filled_fields += 1
# Competitor analysis completeness
total_fields += 1
if competitor_analysis and len(competitor_analysis) > 0:
filled_fields += 1
# GSC analytics completeness
total_fields += 1
if gsc_analytics and (gsc_analytics.get('data') or gsc_analytics.get('metrics')):
filled_fields += 1
# Bing analytics completeness
total_fields += 1
if bing_analytics and (bing_analytics.get('data') or bing_analytics.get('summary')):
filled_fields += 1
quality_metrics['completeness'] = filled_fields / total_fields if total_fields > 0 else 0.0
# Calculate freshness
@@ -251,17 +291,36 @@ class OnboardingDataIntegrationService:
for data_source in [website_analysis, research_preferences]:
if data_source.get('data_freshness'):
freshness_scores.append(data_source['data_freshness'])
if persona_data and persona_data.get('data_freshness'):
freshness_scores.append(persona_data['data_freshness'])
if competitor_analysis:
for competitor in competitor_analysis:
if competitor.get('data_freshness'):
freshness_scores.append(competitor['data_freshness'])
break # Just use first competitor's freshness
if gsc_analytics and gsc_analytics.get('data_freshness'):
freshness_scores.append(gsc_analytics['data_freshness'])
if bing_analytics and bing_analytics.get('data_freshness'):
freshness_scores.append(bing_analytics['data_freshness'])
quality_metrics['freshness'] = sum(freshness_scores) / len(freshness_scores) if freshness_scores else 0.0
# Calculate relevance (based on data presence and quality)
relevance_score = 0.0
if website_analysis.get('domain'):
relevance_score += 0.4
relevance_score += 0.20
if research_preferences.get('research_topics'):
relevance_score += 0.3
relevance_score += 0.15
if api_keys_data:
relevance_score += 0.3
relevance_score += 0.10
if persona_data and persona_data.get('core_persona'):
relevance_score += 0.15
if competitor_analysis and len(competitor_analysis) > 0:
relevance_score += 0.15
if gsc_analytics and (gsc_analytics.get('data') or gsc_analytics.get('metrics')):
relevance_score += 0.15 # Real analytics data is highly relevant
if bing_analytics and (bing_analytics.get('data') or bing_analytics.get('summary')):
relevance_score += 0.10 # Real analytics data is highly relevant
quality_metrics['relevance'] = relevance_score
@@ -313,7 +372,7 @@ class OnboardingDataIntegrationService:
logger.error(f"Error checking API data availability: {str(e)}")
return False
async def _store_integrated_data(self, user_id: int, integrated_data: Dict[str, Any], db: Session) -> None:
async def _store_integrated_data(self, user_id: str, integrated_data: Dict[str, Any], db: Session) -> None:
"""Store integrated onboarding data."""
try:
# Create or update integrated data record
@@ -355,6 +414,200 @@ class OnboardingDataIntegrationService:
# Soft-fail storage: do not break the refresh path
return
def _get_persona_data(self, user_id: str, db: Session) -> Dict[str, Any]:
"""Get persona data for the user."""
try:
# Get the latest onboarding session for the user
session = db.query(OnboardingSession).filter(
OnboardingSession.user_id == user_id
).order_by(OnboardingSession.updated_at.desc()).first()
if not session:
logger.warning(f"No onboarding session found for user {user_id}")
return {}
# Get persona data for this session
persona = db.query(PersonaData).filter(
PersonaData.session_id == session.id
).first()
if not persona:
logger.warning(f"No persona data found for user {user_id}")
return {}
# Convert to dictionary and add metadata
persona_dict = persona.to_dict()
persona_dict['data_freshness'] = self._calculate_freshness(persona.updated_at)
persona_dict['confidence_level'] = 0.9
logger.info(f"Retrieved persona data for user {user_id}")
return persona_dict
except Exception as e:
logger.error(f"Error getting persona data for user {user_id}: {str(e)}")
return {}
def _get_competitor_analysis(self, user_id: str, db: Session) -> List[Dict[str, Any]]:
"""Get competitor analysis data for the user."""
try:
# Get the latest onboarding session for the user
session = db.query(OnboardingSession).filter(
OnboardingSession.user_id == user_id
).order_by(OnboardingSession.updated_at.desc()).first()
if not session:
logger.warning(f"🔍 COMPETITOR VALIDATION: No onboarding session found for user {user_id}")
return []
logger.warning(f"🔍 COMPETITOR VALIDATION: Found session {session.id} for user {user_id}")
# Get all competitor analyses for this session
competitor_records = db.query(CompetitorAnalysis).filter(
CompetitorAnalysis.session_id == session.id
).order_by(CompetitorAnalysis.updated_at.desc()).all()
if not competitor_records:
logger.warning(f"🔍 COMPETITOR VALIDATION: No competitor analysis records found for user {user_id}, session {session.id}")
logger.warning(f" Checking all sessions for user {user_id}...")
# Check all sessions for this user
all_sessions = db.query(OnboardingSession).filter(
OnboardingSession.user_id == user_id
).all()
logger.warning(f" Total sessions for user: {len(all_sessions)}")
for sess in all_sessions:
comp_count = db.query(CompetitorAnalysis).filter(
CompetitorAnalysis.session_id == sess.id
).count()
session_timestamp = getattr(sess, 'started_at', None) or getattr(sess, 'updated_at', None)
logger.warning(f" Session {sess.id} (timestamp: {session_timestamp}): {comp_count} competitors")
return []
logger.warning(f"🔍 COMPETITOR VALIDATION: Found {len(competitor_records)} competitor records for user {user_id}")
# Convert to list of dictionaries
# Use to_dict() which includes competitor_url, competitor_domain, analysis_data
competitors = []
for record in competitor_records:
competitor_dict = record.to_dict()
# Ensure analysis_data is included (to_dict() should include it)
if 'analysis_data' not in competitor_dict and record.analysis_data:
competitor_dict['analysis_data'] = record.analysis_data
competitor_dict['data_freshness'] = self._calculate_freshness(record.updated_at)
competitor_dict['confidence_level'] = 0.9 if record.status == 'completed' else 0.5
competitors.append(competitor_dict)
logger.info(f"Retrieved {len(competitors)} competitor analyses for user {user_id}")
if competitors:
logger.warning(f"🔍 Sample competitor keys: {list(competitors[0].keys())}")
logger.warning(f"🔍 Sample competitor has analysis_data: {'analysis_data' in competitors[0]}")
if 'analysis_data' in competitors[0]:
logger.warning(f"🔍 Sample analysis_data keys: {list(competitors[0]['analysis_data'].keys()) if isinstance(competitors[0]['analysis_data'], dict) else 'Not a dict'}")
return competitors
except Exception as e:
logger.error(f"Error getting competitor analysis for user {user_id}: {str(e)}")
return []
async def _get_gsc_analytics(self, user_id: str) -> Dict[str, Any]:
"""Get Google Search Console analytics data for the user."""
try:
from services.seo.dashboard_service import SEODashboardService
from services.database import get_db_session
db = get_db_session()
try:
dashboard_service = SEODashboardService(db)
gsc_data = await dashboard_service.get_gsc_data(user_id)
finally:
db.close()
if gsc_data and gsc_data.get('status') != 'disconnected' and not gsc_data.get('error'):
logger.info(f"Retrieved GSC analytics for user {user_id}")
return {
'data': gsc_data.get('data', {}),
'metrics': gsc_data.get('metrics', {}),
'date_range': gsc_data.get('date_range', {}),
'data_freshness': 1.0, # GSC data is typically fresh
'confidence_level': 0.9
}
else:
logger.warning(f"No GSC analytics found or not connected for user {user_id}")
return {}
except Exception as e:
logger.error(f"Error getting GSC analytics for user {user_id}: {str(e)}")
return {}
async def _get_bing_analytics(self, user_id: str) -> Dict[str, Any]:
"""Get Bing Webmaster Tools analytics data for the user."""
try:
from services.seo.dashboard_service import SEODashboardService
from services.bing_analytics_storage_service import BingAnalyticsStorageService
from services.database import get_db_session
db = get_db_session()
try:
dashboard_service = SEODashboardService(db)
bing_data = await dashboard_service.get_bing_data(user_id)
finally:
db.close()
# Also try to get from storage service for more detailed metrics
bing_storage = BingAnalyticsStorageService(os.getenv('DATABASE_URL', 'sqlite:///alwrity.db'))
# Get site URL from onboarding session if available
site_url = None
try:
from services.database import get_db_session
with get_db_session() as db:
session = db.query(OnboardingSession).filter(
OnboardingSession.user_id == user_id
).order_by(OnboardingSession.updated_at.desc()).first()
if session:
website_analysis = db.query(WebsiteAnalysis).filter(
WebsiteAnalysis.session_id == session.id
).order_by(WebsiteAnalysis.updated_at.desc()).first()
if website_analysis:
site_url = website_analysis.website_url
except Exception as e:
logger.warning(f"Could not get site URL for Bing analytics: {e}")
analytics_summary = {}
if site_url:
try:
analytics_summary = bing_storage.get_analytics_summary(user_id, site_url, days=30)
except Exception as e:
logger.warning(f"Could not get Bing analytics summary: {e}")
if bing_data and bing_data.get('status') != 'disconnected' and not bing_data.get('error'):
logger.info(f"Retrieved Bing analytics for user {user_id}")
return {
'data': bing_data.get('data', {}),
'metrics': bing_data.get('metrics', {}),
'summary': analytics_summary,
'date_range': bing_data.get('date_range', {}),
'data_freshness': 1.0, # Bing data is typically fresh
'confidence_level': 0.9
}
elif analytics_summary and not analytics_summary.get('error'):
# Use stored analytics if available even if API is disconnected
logger.info(f"Retrieved Bing analytics from storage for user {user_id}")
return {
'data': {},
'metrics': {},
'summary': analytics_summary,
'date_range': {},
'data_freshness': 0.8, # Stored data might be slightly older
'confidence_level': 0.85
}
else:
logger.warning(f"No Bing analytics found or not connected for user {user_id}")
return {}
except Exception as e:
logger.error(f"Error getting Bing analytics for user {user_id}: {str(e)}")
return {}
def _get_fallback_data(self) -> Dict[str, Any]:
"""Get fallback data when processing fails."""
return {
@@ -362,6 +615,10 @@ class OnboardingDataIntegrationService:
'research_preferences': {},
'api_keys_data': {},
'onboarding_session': {},
'persona_data': {},
'competitor_analysis': [],
'gsc_analytics': {},
'bing_analytics': {},
'data_quality': {
'overall_score': 0.0,
'completeness': 0.0,