Onboarding Manager and Router Manager refactored, analytics and background jobs added, database setup updated, environment setup updated, frontend updated, backend updated. Critical onboarding database migration implemented.
571 lines
23 KiB
Python
571 lines
23 KiB
Python
"""
|
|
Bing Analytics Storage Service
|
|
|
|
Handles storage, retrieval, and analysis of Bing Webmaster Tools analytics data.
|
|
Provides methods for data persistence, trend analysis, and alert management.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from sqlalchemy import create_engine, func, desc, and_, or_
|
|
from sqlalchemy.orm import sessionmaker, Session
|
|
from sqlalchemy.exc import SQLAlchemyError
|
|
|
|
from models.bing_analytics_models import (
|
|
BingQueryStats, BingDailyMetrics, BingTrendAnalysis,
|
|
BingAlertRules, BingAlertHistory, BingSitePerformance
|
|
)
|
|
from services.integrations.bing_oauth import BingOAuthService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BingAnalyticsStorageService:
|
|
"""Service for managing Bing analytics data storage and analysis"""
|
|
|
|
def __init__(self, database_url: str):
|
|
"""Initialize the storage service with database connection"""
|
|
# Configure engine with minimal pooling to prevent connection exhaustion
|
|
engine_kwargs = {}
|
|
if 'sqlite' in database_url:
|
|
engine_kwargs = {
|
|
'pool_size': 1, # Minimal pool size
|
|
'max_overflow': 2, # Minimal overflow
|
|
'pool_pre_ping': False, # Disable pre-ping to reduce overhead
|
|
'pool_recycle': 300, # Recycle connections every 5 minutes
|
|
'connect_args': {'timeout': 10} # Shorter timeout
|
|
}
|
|
|
|
self.engine = create_engine(database_url, **engine_kwargs)
|
|
self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)
|
|
self.bing_service = BingOAuthService()
|
|
|
|
# Create tables if they don't exist
|
|
self._create_tables()
|
|
|
|
def _create_tables(self):
|
|
"""Create database tables if they don't exist"""
|
|
try:
|
|
from models.bing_analytics_models import Base
|
|
Base.metadata.create_all(bind=self.engine)
|
|
logger.info("Bing analytics database tables created/verified successfully")
|
|
except Exception as e:
|
|
logger.error(f"Error creating Bing analytics tables: {e}")
|
|
|
|
def _get_db_session(self) -> Session:
|
|
"""Get database session"""
|
|
return self.SessionLocal()
|
|
|
|
def _with_db_session(self, func):
|
|
"""Context manager for database sessions"""
|
|
db = None
|
|
try:
|
|
db = self._get_db_session()
|
|
return func(db)
|
|
finally:
|
|
if db:
|
|
db.close()
|
|
|
|
def store_raw_query_data(self, user_id: str, site_url: str, query_data: List[Dict[str, Any]]) -> bool:
|
|
"""
|
|
Store raw query statistics data from Bing API
|
|
|
|
Args:
|
|
user_id: User identifier
|
|
site_url: Site URL
|
|
query_data: List of query statistics from Bing API
|
|
|
|
Returns:
|
|
bool: True if successful, False otherwise
|
|
"""
|
|
try:
|
|
db = self._get_db_session()
|
|
|
|
# Process and store each query
|
|
stored_count = 0
|
|
for query_item in query_data:
|
|
try:
|
|
# Parse date from Bing format
|
|
query_date = self._parse_bing_date(query_item.get('Date', ''))
|
|
|
|
# Calculate CTR
|
|
clicks = query_item.get('Clicks', 0)
|
|
impressions = query_item.get('Impressions', 0)
|
|
ctr = (clicks / impressions * 100) if impressions > 0 else 0
|
|
|
|
# Determine if brand query
|
|
is_brand = self._is_brand_query(query_item.get('Query', ''), site_url)
|
|
|
|
# Categorize query
|
|
category = self._categorize_query(query_item.get('Query', ''))
|
|
|
|
# Create query stats record
|
|
query_stats = BingQueryStats(
|
|
user_id=user_id,
|
|
site_url=site_url,
|
|
query=query_item.get('Query', ''),
|
|
clicks=clicks,
|
|
impressions=impressions,
|
|
avg_click_position=query_item.get('AvgClickPosition', -1),
|
|
avg_impression_position=query_item.get('AvgImpressionPosition', -1),
|
|
ctr=ctr,
|
|
query_date=query_date,
|
|
query_length=len(query_item.get('Query', '')),
|
|
is_brand_query=is_brand,
|
|
category=category
|
|
)
|
|
|
|
db.add(query_stats)
|
|
stored_count += 1
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing individual query: {e}")
|
|
continue
|
|
|
|
db.commit()
|
|
db.close()
|
|
|
|
logger.info(f"Successfully stored {stored_count} Bing query records for {site_url}")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error storing Bing query data: {e}")
|
|
if 'db' in locals():
|
|
db.rollback()
|
|
db.close()
|
|
return False
|
|
|
|
def generate_daily_metrics(self, user_id: str, site_url: str, target_date: datetime = None) -> bool:
|
|
"""
|
|
Generate and store daily aggregated metrics
|
|
|
|
Args:
|
|
user_id: User identifier
|
|
site_url: Site URL
|
|
target_date: Date to generate metrics for (defaults to yesterday)
|
|
|
|
Returns:
|
|
bool: True if successful, False otherwise
|
|
"""
|
|
try:
|
|
if target_date is None:
|
|
target_date = datetime.now() - timedelta(days=1)
|
|
|
|
# Get date range for the day
|
|
start_date = target_date.replace(hour=0, minute=0, second=0, microsecond=0)
|
|
end_date = start_date + timedelta(days=1)
|
|
|
|
db = self._get_db_session()
|
|
|
|
# Get raw data for the day
|
|
daily_queries = db.query(BingQueryStats).filter(
|
|
and_(
|
|
BingQueryStats.user_id == user_id,
|
|
BingQueryStats.site_url == site_url,
|
|
BingQueryStats.query_date >= start_date,
|
|
BingQueryStats.query_date < end_date
|
|
)
|
|
).all()
|
|
|
|
if not daily_queries:
|
|
logger.warning(f"No query data found for {site_url} on {target_date.date()}")
|
|
db.close()
|
|
return False
|
|
|
|
# Calculate aggregated metrics
|
|
total_clicks = sum(q.clicks for q in daily_queries)
|
|
total_impressions = sum(q.impressions for q in daily_queries)
|
|
total_queries = len(daily_queries)
|
|
avg_ctr = (total_clicks / total_impressions * 100) if total_impressions > 0 else 0
|
|
avg_position = sum(q.avg_click_position for q in daily_queries if q.avg_click_position > 0) / len([q for q in daily_queries if q.avg_click_position > 0]) if any(q.avg_click_position > 0 for q in daily_queries) else 0
|
|
|
|
# Get top performing queries
|
|
top_queries = sorted(daily_queries, key=lambda x: x.clicks, reverse=True)[:10]
|
|
top_clicks = [{'query': q.query, 'clicks': q.clicks, 'impressions': q.impressions, 'ctr': q.ctr} for q in top_queries]
|
|
top_impressions = sorted(daily_queries, key=lambda x: x.impressions, reverse=True)[:10]
|
|
top_impressions_data = [{'query': q.query, 'clicks': q.clicks, 'impressions': q.impressions, 'ctr': q.ctr} for q in top_impressions]
|
|
|
|
# Calculate changes from previous day
|
|
prev_day_metrics = self._get_previous_day_metrics(db, user_id, site_url, target_date)
|
|
clicks_change = self._calculate_percentage_change(total_clicks, prev_day_metrics.get('total_clicks', 0))
|
|
impressions_change = self._calculate_percentage_change(total_impressions, prev_day_metrics.get('total_impressions', 0))
|
|
ctr_change = self._calculate_percentage_change(avg_ctr, prev_day_metrics.get('avg_ctr', 0))
|
|
|
|
# Create daily metrics record
|
|
daily_metrics = BingDailyMetrics(
|
|
user_id=user_id,
|
|
site_url=site_url,
|
|
metric_date=start_date,
|
|
total_clicks=total_clicks,
|
|
total_impressions=total_impressions,
|
|
total_queries=total_queries,
|
|
avg_ctr=avg_ctr,
|
|
avg_position=avg_position,
|
|
top_queries=json.dumps(top_clicks),
|
|
top_clicks=json.dumps(top_clicks),
|
|
top_impressions=json.dumps(top_impressions_data),
|
|
clicks_change=clicks_change,
|
|
impressions_change=impressions_change,
|
|
ctr_change=ctr_change
|
|
)
|
|
|
|
# Check if record already exists and update or create
|
|
existing = db.query(BingDailyMetrics).filter(
|
|
and_(
|
|
BingDailyMetrics.user_id == user_id,
|
|
BingDailyMetrics.site_url == site_url,
|
|
BingDailyMetrics.metric_date == start_date
|
|
)
|
|
).first()
|
|
|
|
if existing:
|
|
# Update existing record
|
|
for key, value in daily_metrics.__dict__.items():
|
|
if not key.startswith('_') and key != 'id':
|
|
setattr(existing, key, value)
|
|
else:
|
|
# Create new record
|
|
db.add(daily_metrics)
|
|
|
|
db.commit()
|
|
db.close()
|
|
|
|
logger.info(f"Successfully generated daily metrics for {site_url} on {target_date.date()}")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error generating daily metrics: {e}")
|
|
if 'db' in locals():
|
|
db.rollback()
|
|
db.close()
|
|
return False
|
|
|
|
def get_analytics_summary(self, user_id: str, site_url: str, days: int = 30) -> Dict[str, Any]:
|
|
"""
|
|
Get analytics summary for a site over a specified period
|
|
|
|
Args:
|
|
user_id: User identifier
|
|
site_url: Site URL
|
|
days: Number of days to include in summary
|
|
|
|
Returns:
|
|
Dict containing analytics summary
|
|
"""
|
|
try:
|
|
db = self._get_db_session()
|
|
|
|
# Date range
|
|
end_date = datetime.now()
|
|
start_date = end_date - timedelta(days=days)
|
|
|
|
# Get daily metrics for the period
|
|
daily_metrics = db.query(BingDailyMetrics).filter(
|
|
and_(
|
|
BingDailyMetrics.user_id == user_id,
|
|
BingDailyMetrics.site_url == site_url,
|
|
BingDailyMetrics.metric_date >= start_date,
|
|
BingDailyMetrics.metric_date <= end_date
|
|
)
|
|
).order_by(BingDailyMetrics.metric_date).all()
|
|
|
|
if not daily_metrics:
|
|
return {'error': 'No analytics data found for the specified period'}
|
|
|
|
# Calculate summary statistics
|
|
total_clicks = sum(m.total_clicks for m in daily_metrics)
|
|
total_impressions = sum(m.total_impressions for m in daily_metrics)
|
|
total_queries = sum(m.total_queries for m in daily_metrics)
|
|
avg_ctr = (total_clicks / total_impressions * 100) if total_impressions > 0 else 0
|
|
|
|
# Get top performing queries for the period
|
|
top_queries = []
|
|
for metric in daily_metrics:
|
|
if metric.top_queries:
|
|
try:
|
|
queries = json.loads(metric.top_queries)
|
|
top_queries.extend(queries)
|
|
except:
|
|
continue
|
|
|
|
# Aggregate and sort top queries
|
|
query_aggregates = {}
|
|
for query in top_queries:
|
|
q = query['query']
|
|
if q not in query_aggregates:
|
|
query_aggregates[q] = {'clicks': 0, 'impressions': 0, 'count': 0}
|
|
query_aggregates[q]['clicks'] += query['clicks']
|
|
query_aggregates[q]['impressions'] += query['impressions']
|
|
query_aggregates[q]['count'] += 1
|
|
|
|
# Sort by clicks and get top 10
|
|
top_performing = sorted(
|
|
[{'query': k, **v} for k, v in query_aggregates.items()],
|
|
key=lambda x: x['clicks'],
|
|
reverse=True
|
|
)[:10]
|
|
|
|
# Calculate trends
|
|
recent_metrics = daily_metrics[-7:] if len(daily_metrics) >= 7 else daily_metrics
|
|
older_metrics = daily_metrics[:-7] if len(daily_metrics) >= 14 else daily_metrics
|
|
|
|
recent_avg_ctr = sum(m.avg_ctr for m in recent_metrics) / len(recent_metrics) if recent_metrics else 0
|
|
older_avg_ctr = sum(m.avg_ctr for m in older_metrics) / len(older_metrics) if older_metrics else 0
|
|
ctr_trend = self._calculate_percentage_change(recent_avg_ctr, older_avg_ctr)
|
|
|
|
db.close()
|
|
|
|
return {
|
|
'period_days': days,
|
|
'total_clicks': total_clicks,
|
|
'total_impressions': total_impressions,
|
|
'total_queries': total_queries,
|
|
'avg_ctr': round(avg_ctr, 2),
|
|
'ctr_trend': round(ctr_trend, 2),
|
|
'top_queries': top_performing,
|
|
'daily_metrics_count': len(daily_metrics),
|
|
'data_quality': 'good' if len(daily_metrics) >= days * 0.8 else 'partial'
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting analytics summary: {e}")
|
|
if 'db' in locals():
|
|
db.close()
|
|
return {'error': str(e)}
|
|
|
|
def get_top_queries(self, user_id: str, site_url: str, days: int = 30, limit: int = 50) -> List[Dict[str, Any]]:
|
|
"""
|
|
Get top performing queries for a site over a specified period
|
|
|
|
Args:
|
|
user_id: User identifier
|
|
site_url: Site URL
|
|
days: Number of days to analyze
|
|
limit: Maximum number of queries to return
|
|
|
|
Returns:
|
|
List of top queries with performance data
|
|
"""
|
|
try:
|
|
db = self._get_db_session()
|
|
|
|
# Calculate date range
|
|
end_date = datetime.now()
|
|
start_date = end_date - timedelta(days=days)
|
|
|
|
# Query top queries from the database
|
|
query_stats = db.query(BingQueryStats).filter(
|
|
BingQueryStats.user_id == user_id,
|
|
BingQueryStats.site_url == site_url,
|
|
BingQueryStats.query_date >= start_date,
|
|
BingQueryStats.query_date <= end_date
|
|
).order_by(BingQueryStats.clicks.desc()).limit(limit).all()
|
|
|
|
# Convert to list of dictionaries
|
|
top_queries = []
|
|
for stat in query_stats:
|
|
top_queries.append({
|
|
'query': stat.query,
|
|
'clicks': stat.clicks,
|
|
'impressions': stat.impressions,
|
|
'ctr': stat.ctr,
|
|
'position': stat.avg_click_position,
|
|
'date': stat.query_date.isoformat()
|
|
})
|
|
|
|
db.close()
|
|
return top_queries
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting top queries: {e}")
|
|
if 'db' in locals():
|
|
db.close()
|
|
return []
|
|
|
|
def get_daily_metrics(self, user_id: str, site_url: str, days: int = 30) -> List[Dict[str, Any]]:
|
|
"""
|
|
Get daily metrics for a site over a specified period
|
|
"""
|
|
try:
|
|
db = self._get_db_session()
|
|
|
|
end_date = datetime.now()
|
|
start_date = end_date - timedelta(days=days)
|
|
|
|
daily_metrics = db.query(BingDailyMetrics).filter(
|
|
BingDailyMetrics.user_id == user_id,
|
|
BingDailyMetrics.site_url == site_url,
|
|
BingDailyMetrics.metric_date >= start_date,
|
|
BingDailyMetrics.metric_date <= end_date
|
|
).order_by(BingDailyMetrics.metric_date.desc()).all()
|
|
|
|
metrics_list = []
|
|
for metric in daily_metrics:
|
|
metrics_list.append({
|
|
'date': metric.metric_date.isoformat(),
|
|
'total_clicks': metric.total_clicks,
|
|
'total_impressions': metric.total_impressions,
|
|
'total_queries': metric.total_queries,
|
|
'avg_ctr': metric.avg_ctr,
|
|
'avg_position': metric.avg_position,
|
|
'clicks_change': metric.clicks_change,
|
|
'impressions_change': metric.impressions_change,
|
|
'ctr_change': metric.ctr_change
|
|
})
|
|
|
|
db.close()
|
|
return metrics_list
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting daily metrics: {e}")
|
|
if 'db' in locals():
|
|
db.close()
|
|
return []
|
|
|
|
def collect_and_store_data(self, user_id: str, site_url: str, days_back: int = 30) -> bool:
|
|
"""
|
|
Collect fresh data from Bing API and store it
|
|
|
|
Args:
|
|
user_id: User identifier
|
|
site_url: Site URL
|
|
days_back: How many days back to collect data for
|
|
|
|
Returns:
|
|
bool: True if successful, False otherwise
|
|
"""
|
|
try:
|
|
# Calculate date range
|
|
end_date = datetime.now()
|
|
start_date = end_date - timedelta(days=days_back)
|
|
|
|
# Get query stats from Bing API
|
|
query_data = self.bing_service.get_query_stats(
|
|
user_id=user_id,
|
|
site_url=site_url,
|
|
start_date=start_date.strftime('%Y-%m-%d'),
|
|
end_date=end_date.strftime('%Y-%m-%d'),
|
|
page=0
|
|
)
|
|
|
|
if 'error' in query_data:
|
|
logger.error(f"Bing API error: {query_data['error']}")
|
|
return False
|
|
|
|
# Extract queries from response
|
|
queries = self._extract_queries_from_response(query_data)
|
|
if not queries:
|
|
logger.warning(f"No queries found in Bing API response for {site_url}")
|
|
return False
|
|
|
|
# Store raw data
|
|
if not self.store_raw_query_data(user_id, site_url, queries):
|
|
logger.error("Failed to store raw query data")
|
|
return False
|
|
|
|
# Generate daily metrics for each day
|
|
current_date = start_date
|
|
while current_date < end_date:
|
|
if not self.generate_daily_metrics(user_id, site_url, current_date):
|
|
logger.warning(f"Failed to generate daily metrics for {current_date.date()}")
|
|
current_date += timedelta(days=1)
|
|
|
|
logger.info(f"Successfully collected and stored Bing data for {site_url}")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error collecting and storing Bing data: {e}")
|
|
return False
|
|
|
|
def _parse_bing_date(self, date_str: str) -> datetime:
|
|
"""Parse Bing API date format"""
|
|
try:
|
|
# Bing uses /Date(timestamp-0700)/ format
|
|
if date_str.startswith('/Date(') and date_str.endswith(')/'):
|
|
timestamp_str = date_str[6:-2].split('-')[0]
|
|
timestamp = int(timestamp_str) / 1000 # Convert from milliseconds
|
|
return datetime.fromtimestamp(timestamp)
|
|
else:
|
|
return datetime.now()
|
|
except:
|
|
return datetime.now()
|
|
|
|
def _is_brand_query(self, query: str, site_url: str) -> bool:
|
|
"""Determine if a query is a brand query"""
|
|
# Extract domain from site URL
|
|
domain = site_url.replace('https://', '').replace('http://', '').split('/')[0]
|
|
brand_terms = domain.split('.')
|
|
|
|
# Check if query contains brand terms
|
|
query_lower = query.lower()
|
|
for term in brand_terms:
|
|
if len(term) > 3 and term in query_lower:
|
|
return True
|
|
return False
|
|
|
|
def _categorize_query(self, query: str) -> str:
|
|
"""Categorize a query based on keywords"""
|
|
query_lower = query.lower()
|
|
|
|
if any(term in query_lower for term in ['ai', 'artificial intelligence', 'machine learning']):
|
|
return 'ai'
|
|
elif any(term in query_lower for term in ['story', 'narrative', 'tale', 'fiction']):
|
|
return 'story_writing'
|
|
elif any(term in query_lower for term in ['business', 'plan', 'strategy', 'company']):
|
|
return 'business'
|
|
elif any(term in query_lower for term in ['letter', 'email', 'correspondence']):
|
|
return 'letter_writing'
|
|
elif any(term in query_lower for term in ['blog', 'article', 'content', 'post']):
|
|
return 'content_writing'
|
|
elif any(term in query_lower for term in ['free', 'generator', 'tool', 'online']):
|
|
return 'tools'
|
|
else:
|
|
return 'general'
|
|
|
|
def _extract_queries_from_response(self, response_data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Extract queries from Bing API response"""
|
|
try:
|
|
if isinstance(response_data, dict) and 'd' in response_data:
|
|
d_data = response_data['d']
|
|
if isinstance(d_data, dict) and 'results' in d_data:
|
|
return d_data['results']
|
|
elif isinstance(d_data, list):
|
|
return d_data
|
|
elif isinstance(response_data, list):
|
|
return response_data
|
|
return []
|
|
except Exception as e:
|
|
logger.error(f"Error extracting queries from response: {e}")
|
|
return []
|
|
|
|
def _get_previous_day_metrics(self, db: Session, user_id: str, site_url: str, current_date: datetime) -> Dict[str, float]:
|
|
"""Get metrics from the previous day for comparison"""
|
|
try:
|
|
prev_date = current_date - timedelta(days=1)
|
|
prev_metrics = db.query(BingDailyMetrics).filter(
|
|
and_(
|
|
BingDailyMetrics.user_id == user_id,
|
|
BingDailyMetrics.site_url == site_url,
|
|
BingDailyMetrics.metric_date == prev_date.replace(hour=0, minute=0, second=0, microsecond=0)
|
|
)
|
|
).first()
|
|
|
|
if prev_metrics:
|
|
return {
|
|
'total_clicks': prev_metrics.total_clicks,
|
|
'total_impressions': prev_metrics.total_impressions,
|
|
'avg_ctr': prev_metrics.avg_ctr
|
|
}
|
|
return {}
|
|
except Exception as e:
|
|
logger.error(f"Error getting previous day metrics: {e}")
|
|
return {}
|
|
|
|
def _calculate_percentage_change(self, current: float, previous: float) -> float:
|
|
"""Calculate percentage change between two values"""
|
|
if previous == 0:
|
|
return 100.0 if current > 0 else 0.0
|
|
return ((current - previous) / previous) * 100
|