# File: ALwrity/backend/services/analytics/handlers/gsc_handler.py
# (file-viewer metadata: 423 lines, 20 KiB, Python)

"""
Google Search Console Analytics Handler
Handles GSC analytics data retrieval and processing.
"""
from typing import Dict, Any
from datetime import datetime, timedelta
from loguru import logger
from services.gsc_service import GSCService
from ...analytics_cache_service import analytics_cache
from ..models.analytics_data import AnalyticsData
from ..models.platform_types import PlatformType
from .base_handler import BaseAnalyticsHandler
class GSCAnalyticsHandler(BaseAnalyticsHandler):
    """Handler for Google Search Console (GSC) analytics.

    Retrieves search analytics through ``GSCService``, caches responses via
    ``analytics_cache`` and converts the raw rows into a standardized metrics
    dictionary (totals, top queries, top pages, cannibalization candidates).
    """

    # Cache namespace used for every read/write performed by this handler.
    _CACHE_NAMESPACE = 'gsc_analytics'
    # Error responses are cached only briefly so recovery is quick.
    _ERROR_CACHE_TTL_SECONDS = 30
    # Size of the default reporting window when no start date is supplied.
    _DEFAULT_WINDOW_DAYS = 30

    def __init__(self):
        """Register the handler for the GSC platform and create its service client."""
        super().__init__(PlatformType.GSC)
        self.gsc_service = GSCService()

    async def get_analytics(self, user_id: str, target_url: str = None, start_date: str = None, end_date: str = None, **kwargs) -> AnalyticsData:
        """Get Google Search Console analytics data with caching.

        Args:
            user_id: User ID to get analytics for.
            target_url: Optional URL to prefer when selecting the GSC site.
            start_date: Optional start of the date range. Defaults to 30 days
                before now, formatted as ``YYYY-MM-DD``.
            end_date: Optional end of the date range. Defaults to today,
                formatted as ``YYYY-MM-DD``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The object built by ``create_success_response`` carrying the
            processed metrics (clicks, impressions, CTR, position data) and
            the date range, or the object built by ``create_error_response``
            when the user has no sites or any step raises.
        """
        self.log_analytics_request(user_id, "get_analytics")

        # Check cache first - GSC API calls can be expensive.
        # Include target_url and date range in cache key if provided.
        # NOTE(review): absent parts are skipped rather than left blank, so a
        # start-only and an end-only request with the same date share a key.
        cache_key_parts = [user_id]
        if target_url:
            cache_key_parts.append(str(target_url))
        if start_date:
            cache_key_parts.append(str(start_date))
        if end_date:
            cache_key_parts.append(str(end_date))
        # Bump cache version to include page insights (v2)
        cache_key = "_".join(cache_key_parts + ['v2pages'])

        cached_data = analytics_cache.get(self._CACHE_NAMESPACE, cache_key)
        if cached_data:
            logger.info("Using cached GSC analytics for user {user_id}", user_id=user_id)
            return AnalyticsData(**cached_data)

        logger.info("Fetching fresh GSC analytics for user {user_id}", user_id=user_id)
        try:
            # Get user's sites
            sites = self.gsc_service.get_site_list(user_id)
            logger.info(f"GSC Sites found for user {user_id}: {sites}")
            if not sites:
                logger.warning(f"No GSC sites found for user {user_id}")
                return self.create_error_response('No GSC sites found')

            # Select site: prefer target_url match, otherwise first site.
            selected_site = self._select_site(sites, target_url)
            site_url = selected_site['siteUrl']
            logger.info(f"Using GSC site URL: {site_url}")

            # Determine date range (defaults to last 30 days)
            now = datetime.now()
            if not end_date:
                end_date = now.strftime('%Y-%m-%d')
            if not start_date:
                start_date = (now - timedelta(days=self._DEFAULT_WINDOW_DAYS)).strftime('%Y-%m-%d')
            logger.info(f"GSC Date range: {start_date} to {end_date}")

            search_analytics = self.gsc_service.get_search_analytics(
                user_id=user_id,
                site_url=site_url,
                start_date=start_date,
                end_date=end_date
            )
            logger.info(f"GSC Search analytics retrieved for user {user_id}")

            # Process GSC data into standardized format
            processed_metrics = self._process_gsc_metrics(search_analytics)
            result = self.create_success_response(
                metrics=processed_metrics,
                date_range={'start': start_date, 'end': end_date}
            )

            # Cache the result to avoid expensive API calls
            analytics_cache.set(self._CACHE_NAMESPACE, cache_key, result.__dict__)
            logger.info("Cached GSC analytics data for user {user_id}", user_id=user_id)
            return result
        except Exception as e:
            self.log_analytics_error(user_id, "get_analytics", e)
            error_result = self.create_error_response(str(e))
            # Cache error result briefly to avoid repeated failures but allow quick recovery
            analytics_cache.set(
                self._CACHE_NAMESPACE,
                cache_key,
                error_result.__dict__,
                ttl_override=self._ERROR_CACHE_TTL_SECONDS
            )
            return error_result

    def get_connection_status(self, user_id: str) -> Dict[str, Any]:
        """Get GSC connection status.

        Args:
            user_id: User whose GSC site list is queried.

        Returns:
            Dict with ``connected`` (True when at least one site exists),
            ``sites_count``, ``sites`` (first three entries) and ``error``
            (``None`` on success, the exception text on failure).
        """
        self.log_analytics_request(user_id, "get_connection_status")
        try:
            # A missing site list is reported as "not connected", not as an error.
            sites = self.gsc_service.get_site_list(user_id) or []
            return {
                'connected': len(sites) > 0,
                'sites_count': len(sites),
                'sites': sites[:3],  # Show first 3 sites
                'error': None
            }
        except Exception as e:
            self.log_analytics_error(user_id, "get_connection_status", e)
            return {
                'connected': False,
                'sites_count': 0,
                'sites': [],
                'error': str(e)
            }

    @staticmethod
    def _normalize_site_url(url: str) -> str:
        """Strip scheme, the ``sc-domain:`` property prefix and trailing slashes."""
        text = str(url or '')
        # 'sc-domain:' marks GSC Domain properties; without removing it a
        # target such as 'www.example.com/blog' never matches the property.
        for prefix in ('sc-domain:', 'https://', 'http://'):
            text = text.replace(prefix, '')
        return text.rstrip('/')

    def _select_site(self, sites: list, target_url: str = None) -> Dict[str, Any]:
        """Return the site entry matching *target_url*, else the first site.

        A site matches when its normalized URL contains, or is contained in,
        the normalized target URL. *sites* must be non-empty.
        """
        if not target_url:
            return sites[0]

        logger.info(f"Attempting to match target URL: {target_url}")
        normalized_target = self._normalize_site_url(target_url)
        if not normalized_target:
            # An empty string is a substring of everything; fall back explicitly.
            return sites[0]

        for site in sites:
            site_url = site.get('siteUrl', '')
            normalized_site = self._normalize_site_url(site_url)
            if not normalized_site:
                continue
            if normalized_target in normalized_site or normalized_site in normalized_target:
                logger.info(f"Found matching GSC site: {site_url}")
                return site
        return sites[0]

    def _process_gsc_metrics(self, search_analytics: Dict[str, Any]) -> Dict[str, Any]:
        """Process GSC raw data into standardized metrics.

        Two payload shapes are handled: the newer one carrying
        ``overall_metrics`` / ``query_data`` sections (position averaged
        weighted by impressions) and the legacy one with a flat ``rows`` list
        (plain average of row positions).

        Args:
            search_analytics: Raw payload returned by the GSC service.

        Returns:
            Dict with ``connection_status``, ``connected_sites``,
            ``total_clicks``, ``total_impressions``, ``avg_ctr``,
            ``avg_position``, ``total_queries``, ``top_queries``,
            ``top_pages`` and ``cannibalization``. If processing fails the
            same keys are returned zeroed/empty, ``connection_status`` is
            ``'error'`` and an ``error`` message is added.
        """
        try:
            # The full payload can be very large, so it is logged at debug level only.
            logger.debug(f"GSC Raw search analytics structure: {search_analytics}")
            logger.info(f"GSC Raw search analytics keys: {list(search_analytics.keys())}")

            if 'overall_metrics' in search_analytics:
                # New structure from updated GSC service.
                overall_rows = self._section_rows(search_analytics, 'overall_metrics')
                query_rows = self._section_rows(search_analytics, 'query_data')
                # Overall rows are preferred for totals (they include
                # anonymized queries); fall back to per-query rows otherwise.
                calc_rows = overall_rows or query_rows
                top_queries_source = query_rows
                weight_by_impressions = True
            else:
                # Legacy structure: one flat list feeds both totals and queries.
                calc_rows = top_queries_source = search_analytics.get('rows') or []
                weight_by_impressions = False

            total_clicks, total_impressions, avg_ctr, avg_position = self._aggregate_totals(
                calc_rows, weight_by_impressions
            )

            return {
                'connection_status': 'connected',
                'connected_sites': 1,
                'total_clicks': total_clicks,
                'total_impressions': total_impressions,
                'avg_ctr': round(avg_ctr, 2),
                'avg_position': round(avg_position, 2),
                'total_queries': len(top_queries_source),
                'top_queries': self._build_top_queries(top_queries_source),
                'top_pages': self._build_top_pages(search_analytics),
                'cannibalization': self._detect_cannibalization(search_analytics)
            }
        except Exception as e:
            logger.error(f"Error processing GSC metrics: {e}")
            return {
                'connection_status': 'error',
                'connected_sites': 0,
                'total_clicks': 0,
                'total_impressions': 0,
                'avg_ctr': 0,
                'avg_position': 0,
                'total_queries': 0,
                'top_queries': [],
                'top_pages': [],
                'cannibalization': [],
                'error': str(e)
            }

    @staticmethod
    def _section_rows(search_analytics: Dict[str, Any], section: str) -> list:
        """Return ``search_analytics[section]['rows']``, or ``[]`` when absent or empty."""
        return (search_analytics.get(section) or {}).get('rows') or []

    @staticmethod
    def _key_text(key: Any) -> str:
        """Return the text of one ``keys`` entry (plain value or ``{'keys': [...]}`` dict)."""
        return key['keys'][0] if isinstance(key, dict) else str(key)

    @staticmethod
    def _ctr_percent(row: Dict[str, Any], clicks: float, impressions: float) -> float:
        """Return the row CTR as an unrounded percentage.

        Uses the row's own ``ctr`` fraction when present, otherwise derives it
        from clicks / impressions (0.0 when there are no impressions).
        """
        raw_ctr = row.get('ctr')
        if raw_ctr is not None:
            return float(raw_ctr) * 100
        return (clicks / impressions) * 100 if impressions > 0 else 0.0

    @staticmethod
    def _aggregate_totals(rows: list, weight_by_impressions: bool) -> tuple:
        """Sum clicks/impressions and average CTR and position over *rows*.

        Args:
            rows: Row dicts read via ``clicks``, ``impressions``, ``position``.
            weight_by_impressions: When True each position is weighted by the
                row's impressions; otherwise every positioned row counts once.

        Returns:
            ``(total_clicks, total_impressions, avg_ctr_percent, avg_position)``.
        """
        total_clicks = 0
        total_impressions = 0
        position_sum = 0
        position_weight = 0
        for row in rows:
            clicks = row.get('clicks', 0) or 0
            impressions = row.get('impressions', 0) or 0
            position = row.get('position', 0) or 0
            total_clicks += clicks
            total_impressions += impressions
            if position > 0:
                weight = impressions if weight_by_impressions else 1
                position_sum += position * weight
                # Only rows that contributed a position count in the divisor.
                position_weight += weight

        avg_ctr = (total_clicks / total_impressions * 100) if total_impressions > 0 else 0
        avg_position = position_sum / position_weight if position_weight > 0 else 0
        return total_clicks, total_impressions, avg_ctr, avg_position

    def _build_top_queries(self, query_rows: list) -> list:
        """Return the ten rows with the most clicks as query summaries."""
        ranked = sorted(query_rows, key=lambda r: r.get('clicks', 0) or 0, reverse=True)[:10]
        top_queries = []
        for row in ranked:
            clicks_val = row.get('clicks', 0) or 0
            impr_val = row.get('impressions', 0) or 0
            top_queries.append({
                'query': self._extract_query_from_row(row),
                'clicks': clicks_val,
                'impressions': impr_val,
                'ctr': round(self._ctr_percent(row, clicks_val, impr_val), 2),
                'position': round(row.get('position', 0) or 0, 2)
            })
        return top_queries

    def _build_top_pages(self, search_analytics: Dict[str, Any]) -> list:
        """Return the ten most-clicked pages, each with its top five queries.

        Best effort: a failure is logged as a warning and whatever was built
        up to that point is returned.
        """
        top_pages = []
        try:
            page_rows = self._section_rows(search_analytics, 'page_data')
            qp_rows = self._section_rows(search_analytics, 'query_page_data')

            # Build queries-by-page map; keys are read as [query, page].
            queries_by_page: Dict[str, list] = {}
            for row in qp_rows:
                keys = row.get('keys', [])
                if not keys or len(keys) < 2:
                    continue
                query_key = self._key_text(keys[0])
                page_key = self._key_text(keys[1])
                clicks_val = row.get('clicks', 0) or 0
                impr_val = row.get('impressions', 0) or 0
                queries_by_page.setdefault(page_key, []).append({
                    'query': query_key,
                    'clicks': clicks_val,
                    'impressions': impr_val,
                    'ctr': round(self._ctr_percent(row, clicks_val, impr_val), 2),
                })

            ranked_pages = sorted(page_rows, key=lambda r: r.get('clicks', 0) or 0, reverse=True)[:10]
            for row in ranked_pages:
                clicks_val = row.get('clicks', 0) or 0
                impr_val = row.get('impressions', 0) or 0
                page_url = self._extract_page_from_row(row)
                # Attach top queries pointing to this page, sorted by clicks.
                page_queries = sorted(
                    queries_by_page.get(page_url, []),
                    key=lambda q: q.get('clicks', 0),
                    reverse=True
                )[:5]
                top_pages.append({
                    'page': page_url,
                    'clicks': clicks_val,
                    'impressions': impr_val,
                    'ctr': round(self._ctr_percent(row, clicks_val, impr_val), 2),
                    'position': round(row.get('position', 0) or 0, 2) if 'position' in row else None,
                    'queries': page_queries
                })
        except Exception as e:
            logger.warning(f"Failed processing top_pages: {e}")
        return top_pages

    @staticmethod
    def _cannibalization_min_clicks(search_analytics: Dict[str, Any]) -> int:
        """Return the click threshold for the payload's reporting window.

        The window is taken from ``startDate`` / ``endDate`` (30 days when
        missing or unparseable): 10 clicks up to 7 days, 30 up to 30 days,
        60 beyond that.
        """
        window_days = 30
        start_s = search_analytics.get('startDate')
        end_s = search_analytics.get('endDate')
        if start_s and end_s:
            try:
                window_start = datetime.strptime(start_s, "%Y-%m-%d")
                window_end = datetime.strptime(end_s, "%Y-%m-%d")
                window_days = max((window_end - window_start).days + 1, 1)
            except (TypeError, ValueError):
                logger.debug(f"Unparseable GSC date window {start_s!r}..{end_s!r}; assuming 30 days")
        if window_days <= 7:
            return 10
        return 30 if window_days <= 30 else 60

    def _detect_cannibalization(self, search_analytics: Dict[str, Any]) -> list:
        """Detect cannibalization (one query mapping to multiple pages).

        A query is reported when at least two pages received traffic for it,
        its summed clicks reach the window threshold, and its overall position
        (when known) lies between 3 and 20. At most ten entries are returned,
        ordered by total clicks.

        Best effort: a failure is logged as a warning and whatever was built
        up to that point is returned.
        """
        cannibalization = []
        try:
            qp_rows = self._section_rows(search_analytics, 'query_page_data')
            if not qp_rows:
                return cannibalization

            min_clicks = self._cannibalization_min_clicks(search_analytics)

            # Build map: query -> { page -> metrics }
            by_query: Dict[str, Dict[str, Dict[str, float]]] = {}
            for row in qp_rows:
                keys = row.get('keys', [])
                if not keys or len(keys) < 2:
                    continue
                query_key = self._key_text(keys[0])
                page_key = self._key_text(keys[1])
                clicks_val = float(row.get('clicks', 0) or 0)
                impr_val = float(row.get('impressions', 0) or 0)
                ctr_percent = self._ctr_percent(row, clicks_val, impr_val)
                pos_val = float(row.get('position', 0) or 0)

                agg = by_query.setdefault(query_key, {}).setdefault(page_key, {
                    "clicks": 0.0,
                    "impressions": 0.0,
                    "ctr": 0.0,
                    "position_sum": 0.0,
                    "position_count": 0.0
                })
                agg["clicks"] += clicks_val
                agg["impressions"] += impr_val
                agg["ctr"] = max(agg["ctr"], ctr_percent)
                if pos_val > 0:
                    agg["position_sum"] += pos_val
                    agg["position_count"] += 1

            # Overall per-query position provides the ranking context.
            query_positions = {
                self._extract_query_from_row(row): float(row.get('position', 0) or 0)
                for row in self._section_rows(search_analytics, 'query_data')
            }

            for query_key, pages_map in by_query.items():
                if len(pages_map) < 2:
                    continue
                # Local name on purpose: this per-query sum must never leak
                # into the site-wide click total reported to the caller.
                query_clicks = sum(m["clicks"] for m in pages_map.values())
                if query_clicks < min_clicks:
                    continue
                query_position = query_positions.get(query_key, 0.0)
                if query_position != 0.0 and not (3.0 <= query_position <= 20.0):
                    # Skip queries already ranking very well or very poorly (if pos present)
                    continue

                pages_list = []
                for page_key, m in pages_map.items():
                    avg_pos = (m["position_sum"] / m["position_count"]) if m["position_count"] > 0 else 0.0
                    pages_list.append({
                        "page": page_key,
                        "clicks": round(m["clicks"], 0),
                        "impressions": round(m["impressions"], 0),
                        "ctr": round(m["ctr"], 2),
                        "position": round(avg_pos, 2) if avg_pos > 0 else None
                    })
                pages_list.sort(key=lambda p: p.get("clicks", 0), reverse=True)

                cannibalization.append({
                    "query": query_key,
                    "total_clicks": int(round(query_clicks)),
                    # The page already winning the most clicks is the suggested target.
                    "recommended_target_page": pages_list[0]["page"],
                    "pages": pages_list[:3]
                })

            # Sort by impact and keep the ten biggest.
            cannibalization.sort(key=lambda item: item.get("total_clicks", 0), reverse=True)
            del cannibalization[10:]
        except Exception as e:
            logger.warning(f"Failed computing cannibalization: {e}")
        return cannibalization

    def _extract_first_key(self, row: Dict[str, Any], default: str, label: str) -> str:
        """Return the first ``keys`` entry of *row* as text, or *default*.

        Entries may be plain values or ``{'keys': [...]}`` dicts. *label* only
        names the field in the error log.
        """
        try:
            keys = row.get('keys', [])
            if not keys:
                return default
            first_key = keys[0]
            if isinstance(first_key, dict):
                return first_key.get('keys', [default])[0]
            return str(first_key)
        except (AttributeError, IndexError, KeyError, TypeError) as e:
            logger.error(f"Error extracting {label} from row: {e}")
            return default

    def _extract_query_from_row(self, row: Dict[str, Any]) -> str:
        """Extract query text from GSC API row data (``'Unknown'`` when unavailable)."""
        return self._extract_first_key(row, 'Unknown', 'query')

    def _extract_page_from_row(self, row: Dict[str, Any]) -> str:
        """Extract page URL from GSC API row data (``''`` when unavailable)."""
        return self._extract_first_key(row, '', 'page')