"""Google Search Console Service for ALwrity."""

import json
import os
import secrets
import sqlite3
from contextlib import closing
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple

from dotenv import load_dotenv
from google.auth.transport.requests import Request as GoogleRequest
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import Flow
from googleapiclient.discovery import build
from loguru import logger

from services.database import get_user_db_path


class GSCService:
    """Service for Google Search Console integration.

    Handles the OAuth flow, stores per-user credentials and a short-lived
    analytics cache in each user's SQLite database (resolved through
    ``get_user_db_path``), and wraps the Search Console API calls.
    """

    # Redirect URI used when GSC_REDIRECT_URI is not set in the environment.
    _DEFAULT_REDIRECT_URI = 'http://localhost:8000/gsc/callback'

    # Tables holding per-user rows that must be wiped on revoke/clear.
    _USER_TABLES = ('gsc_credentials', 'gsc_data_cache', 'gsc_oauth_states')

    def __init__(self, db_path: Optional[str] = None):
        """Initialize GSC service.

        Args:
            db_path: Deprecated. Kept for backward compatibility; database
                paths are resolved per user via ``_get_db_path``.
        """
        # db_path is deprecated in favor of dynamic user_id based paths
        self.db_path = db_path

        # Resolve credentials file robustly: env override or project-relative default
        env_credentials_path = os.getenv("GSC_CREDENTIALS_FILE")
        if env_credentials_path:
            self.credentials_file = env_credentials_path
        else:
            # Default to <parent of this file's directory>/gsc_credentials.json
            # regardless of CWD
            services_dir = os.path.dirname(__file__)
            backend_dir = os.path.abspath(os.path.join(services_dir, os.pardir))
            self.credentials_file = os.path.join(backend_dir, "gsc_credentials.json")

        # Load client config from file or environment variables
        self.client_config = self._load_client_config()
        if self.client_config:
            logger.info("GSC client configuration loaded successfully")
        else:
            logger.warning(f"GSC credentials not found in {self.credentials_file} or environment variables")

        self.scopes = ['https://www.googleapis.com/auth/webmasters.readonly']
        # Note: Tables are initialized lazily per user
        logger.info("GSC Service initialized successfully")

    # ------------------------------------------------------------------
    # Configuration helpers
    # ------------------------------------------------------------------

    def _redirect_uri(self) -> str:
        """Return the OAuth redirect URI (env override or local default)."""
        return os.getenv('GSC_REDIRECT_URI', self._DEFAULT_REDIRECT_URI)

    def _load_client_config(self) -> Optional[Dict[str, Any]]:
        """Load Google client configuration from environment variables or file.

        Returns:
            The client config dict in the shape expected by
            ``Flow.from_client_config``, or ``None`` when neither the
            environment variables nor the credentials file provide one.
        """
        # Reload environment variables to catch any runtime changes (e.g. .env updates)
        load_dotenv(override=True)

        # 1. Check Environment Variables (Priority)
        client_id = os.getenv("GOOGLE_CLIENT_ID")
        client_secret = os.getenv("GOOGLE_CLIENT_SECRET")
        if client_id and client_secret:
            redirect_uri = self._redirect_uri()
            logger.info("Loading GSC credentials from environment variables")
            # Construct the config dictionary expected by google_auth_oauthlib
            return {
                "web": {
                    "client_id": client_id,
                    "client_secret": client_secret,
                    "project_id": os.getenv("GOOGLE_PROJECT_ID", "alwrity"),
                    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                    "token_uri": "https://oauth2.googleapis.com/token",
                    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
                    "redirect_uris": [
                        "http://localhost:5173/onboarding",
                        redirect_uri,
                    ],
                    "javascript_origins": [
                        "http://localhost:5173",
                        "http://localhost:8000",
                    ],
                }
            }

        # 2. Fallback to File
        if os.path.exists(self.credentials_file):
            try:
                with open(self.credentials_file, 'r', encoding="utf-8") as f:
                    config = json.load(f)
                logger.info(f"Loading GSC credentials from file: {self.credentials_file}")
                return config
            except Exception as e:
                logger.warning(f"Failed to load GSC credentials from file: {e}")
        return None

    # ------------------------------------------------------------------
    # Database helpers
    # ------------------------------------------------------------------

    def _get_db_path(self, user_id: str) -> str:
        """Return the SQLite database path for ``user_id``."""
        return get_user_db_path(user_id)

    @staticmethod
    def _table_columns(cursor: sqlite3.Cursor, table: str) -> set:
        """Return the column names of ``table`` (empty set if it does not exist).

        ``table`` must be a trusted identifier; it is interpolated into the
        PRAGMA statement because PRAGMA arguments cannot be bound.
        """
        cursor.execute(f"PRAGMA table_info({table})")
        return {row[1] for row in cursor.fetchall()}

    def _init_gsc_tables(self, user_id: str):
        """Initialize GSC-related database tables.

        Creates the credentials, cache and OAuth-state tables if missing and
        migrates older cache tables by adding the ``cache_key`` column.

        Raises:
            Exception: Any error is logged and re-raised to the caller.
        """
        try:
            db_path = self._get_db_path(user_id)
            parent_dir = os.path.dirname(db_path)
            if parent_dir:  # os.makedirs('') raises for bare file names
                os.makedirs(parent_dir, exist_ok=True)

            # closing(): sqlite3's own context manager only commits/rolls
            # back, it does not close the connection.
            with closing(sqlite3.connect(db_path)) as conn:
                cursor = conn.cursor()

                # GSC credentials table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS gsc_credentials (
                        user_id TEXT PRIMARY KEY,
                        credentials_json TEXT NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')

                # GSC data cache table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS gsc_data_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        user_id TEXT NOT NULL,
                        site_url TEXT NOT NULL,
                        data_type TEXT NOT NULL,
                        data_json TEXT NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        expires_at TIMESTAMP NOT NULL,
                        cache_key TEXT,
                        FOREIGN KEY (user_id) REFERENCES gsc_credentials (user_id)
                    )
                ''')

                # Migrate cache tables created before cache_key existed.
                if 'cache_key' not in self._table_columns(cursor, 'gsc_data_cache'):
                    try:
                        cursor.execute('ALTER TABLE gsc_data_cache ADD COLUMN cache_key TEXT')
                    except sqlite3.OperationalError:
                        # Another connection may have migrated concurrently;
                        # only fail if the column is still missing.
                        if 'cache_key' not in self._table_columns(cursor, 'gsc_data_cache'):
                            raise

                # GSC OAuth states table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS gsc_oauth_states (
                        state TEXT PRIMARY KEY,
                        user_id TEXT NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')

                conn.commit()
        except Exception as e:
            logger.error(f"Error initializing GSC tables for user {user_id}: {e}")
            raise

    def _delete_user_rows(self, user_id: str) -> None:
        """Delete the user's credentials, cache and OAuth-state rows.

        A missing database or missing tables are treated as "nothing to
        delete" rather than as an error.
        """
        db_path = self._get_db_path(user_id)
        if not os.path.exists(db_path):
            return

        with closing(sqlite3.connect(db_path)) as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            existing_tables = {row[0] for row in cursor.fetchall()}
            for table in self._USER_TABLES:
                if table in existing_tables:
                    # Table names come from the class constant, never from input.
                    cursor.execute(f'DELETE FROM {table} WHERE user_id = ?', (user_id,))
            conn.commit()

    # ------------------------------------------------------------------
    # Credentials
    # ------------------------------------------------------------------

    def save_user_credentials(self, user_id: str, credentials: Credentials) -> bool:
        """Save user's GSC credentials to database.

        Returns:
            ``True`` on success, ``False`` when the client configuration is
            missing or the write fails (the error is logged).
        """
        try:
            self._init_gsc_tables(user_id)
            db_path = self._get_db_path(user_id)

            if not self.client_config:
                logger.error("Cannot save credentials: Client configuration not loaded")
                return False

            web_config = self.client_config.get('web', {})
            # Persist the expiry (same format as Credentials.to_json) so the
            # token is not treated as expired on every subsequent load.
            expiry = credentials.expiry.isoformat() + 'Z' if credentials.expiry else None
            credentials_json = json.dumps({
                'token': credentials.token,
                'refresh_token': credentials.refresh_token,
                'token_uri': credentials.token_uri or web_config.get('token_uri'),
                'client_id': credentials.client_id or web_config.get('client_id'),
                'client_secret': credentials.client_secret or web_config.get('client_secret'),
                'scopes': credentials.scopes,
                'expiry': expiry,
            })

            with closing(sqlite3.connect(db_path)) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO gsc_credentials
                    (user_id, credentials_json, updated_at)
                    VALUES (?, ?, CURRENT_TIMESTAMP)
                ''', (user_id, credentials_json))
                conn.commit()

            logger.info(f"GSC credentials saved for user: {user_id}")
            return True
        except Exception as e:
            logger.error(f"Error saving GSC credentials for user {user_id}: {e}")
            return False

    def load_user_credentials(self, user_id: str) -> Optional[Credentials]:
        """Load user's GSC credentials from database.

        Expired credentials are refreshed and re-saved when a refresh token
        is available; otherwise (or if the refresh fails) the stored rows
        are cleared and ``None`` is returned.

        Returns:
            Usable credentials, or ``None`` when the user has none stored,
            the stored record is incomplete, or any error occurs.
        """
        try:
            db_path = self._get_db_path(user_id)
            if not os.path.exists(db_path):
                return None

            with closing(sqlite3.connect(db_path)) as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='gsc_credentials'")
                if not cursor.fetchone():
                    return None

                cursor.execute('''
                    SELECT credentials_json FROM gsc_credentials
                    WHERE user_id = ?
                ''', (user_id,))
                result = cursor.fetchone()
            # The read connection is closed here, before any refresh/save/
            # clear path below opens its own writing connection.

            if not result:
                return None

            credentials_data = json.loads(result[0])

            required_fields = ['token_uri', 'client_id', 'client_secret']
            missing_fields = [field for field in required_fields if not credentials_data.get(field)]
            if missing_fields:
                logger.warning(f"GSC credentials for user {user_id} missing required fields: {missing_fields}")
                return None

            credentials = Credentials.from_authorized_user_info(credentials_data, self.scopes)

            if credentials.expired:
                if credentials.refresh_token:
                    try:
                        credentials.refresh(GoogleRequest())
                        self.save_user_credentials(user_id, credentials)
                    except Exception as e:
                        logger.error(f"Failed to refresh GSC token for user {user_id}: {e}")
                        self.clear_incomplete_credentials(user_id)
                        return None
                else:
                    logger.warning(f"GSC token expired for user {user_id} but no refresh token available - user needs to re-authorize")
                    self.clear_incomplete_credentials(user_id)
                    return None

            return credentials
        except Exception as e:
            logger.error(f"Error loading GSC credentials for user {user_id}: {e}")
            return None

    # ------------------------------------------------------------------
    # OAuth flow
    # ------------------------------------------------------------------

    def get_oauth_url(self, user_id: str) -> str:
        """Get OAuth authorization URL for GSC.

        The generated ``state`` (``"<user_id>:<random token>"``) is stored in
        the user's database for verification in the callback.

        Raises:
            FileNotFoundError: No client configuration could be loaded.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            logger.info(f"Generating OAuth URL for user: {user_id}")

            # Retry loading config if missing (in case .env was added later)
            if not self.client_config:
                self.client_config = self._load_client_config()
            if not self.client_config:
                raise FileNotFoundError("GSC credentials not found in file or environment variables.")

            flow = Flow.from_client_config(
                self.client_config,
                scopes=self.scopes,
                redirect_uri=self._redirect_uri(),
                autogenerate_code_verifier=False,
            )

            random_state = secrets.token_urlsafe(32)
            state = f"{user_id}:{random_state}"

            authorization_url, _ = flow.authorization_url(
                access_type='offline',
                include_granted_scopes='true',
                prompt='consent',
                state=state
            )

            # Store state for verification in the user-specific DB
            self._init_gsc_tables(user_id)
            db_path = self._get_db_path(user_id)
            with closing(sqlite3.connect(db_path)) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO gsc_oauth_states (state, user_id)
                    VALUES (?, ?)
                ''', (state, user_id))
                conn.commit()

            logger.info(f"OAuth URL generated successfully for user: {user_id}")
            return authorization_url
        except Exception as e:
            logger.error(f"Error generating OAuth URL for user {user_id}: {e}")
            raise

    def handle_oauth_callback(self, authorization_code: str, state: str) -> bool:
        """Handle OAuth callback and save credentials.

        Args:
            authorization_code: The ``code`` query parameter from Google.
            state: The ``state`` query parameter, expected in the
                ``"<user_id>:<random token>"`` form built by ``get_oauth_url``.

        Returns:
            ``True`` when the code was exchanged and credentials were saved,
            ``False`` otherwise (all errors are logged, none are raised).
        """
        user_id = None  # known only once the state has been parsed
        try:
            logger.info(f"Handling GSC OAuth callback with state: {state[:20]}...")

            if ':' not in state:
                logger.error(f"Invalid GSC state format: {state}")
                return False

            # The random suffix (token_urlsafe) never contains ':', so split
            # from the right to keep user ids that themselves contain ':'.
            user_id = state.rsplit(':', 1)[0]
            db_path = self._get_db_path(user_id)

            if not os.path.exists(db_path):
                logger.error(f"User database not found for user {user_id}")
                return False

            # Verify state in user's DB (best effort — if missing, attempt code exchange anyway)
            # NOTE(security): proceeding with an unverified state removes the
            # CSRF protection the OAuth `state` parameter exists for; a forged
            # state could bind another Google account to this user. Kept
            # lenient to preserve existing behaviour — review before
            # production hardening.
            state_valid = False
            try:
                with closing(sqlite3.connect(db_path)) as conn:
                    cursor = conn.cursor()
                    cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
                    state_valid = cursor.fetchone() is not None
            except Exception as state_err:
                logger.warning(f"State verification query failed, proceeding anyway: {state_err}")

            if not state_valid:
                logger.warning(f"GSC OAuth state not found in DB for user {user_id} — will attempt code exchange without state verification")

            if not self.client_config:
                logger.error("Cannot handle callback: Client configuration not loaded")
                return False

            flow = Flow.from_client_config(
                self.client_config,
                scopes=self.scopes,
                redirect_uri=self._redirect_uri(),
                autogenerate_code_verifier=False,
            )
            flow.fetch_token(code=authorization_code)
            credentials = flow.credentials

            if not credentials or not credentials.token:
                logger.error(f"Token exchange returned empty credentials for user {user_id}")
                return False

            # Clean up state if it was valid
            if state_valid:
                try:
                    with closing(sqlite3.connect(db_path)) as conn:
                        cursor = conn.cursor()
                        cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
                        conn.commit()
                except Exception as cleanup_err:
                    logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")

            result = self.save_user_credentials(user_id, credentials)
            if result:
                logger.info(f"GSC OAuth callback succeeded for user {user_id} (state_valid={state_valid})")
            else:
                logger.error(f"GSC OAuth callback: token exchange succeeded but failed to save credentials for user {user_id}")
            return result
        except Exception as e:
            logger.error(f"Error handling GSC OAuth callback for user {user_id or 'unknown'}: {e}")
            return False

    # ------------------------------------------------------------------
    # API access
    # ------------------------------------------------------------------

    def get_authenticated_service(self, user_id: str):
        """Get authenticated GSC service for user.

        Raises:
            ValueError: The user has no valid stored credentials (expected
                for unconnected users, therefore not logged here).
            Exception: Any other failure is logged and re-raised.
        """
        try:
            credentials = self.load_user_credentials(user_id)
            if not credentials:
                raise ValueError("No valid credentials found")

            # Disable discovery file cache (suppress oauth2client file_cache warnings) with safe fallback
            try:
                service = build('searchconsole', 'v1', credentials=credentials, cache_discovery=False)
            except TypeError:
                service = build('searchconsole', 'v1', credentials=credentials)

            logger.info(f"Authenticated GSC service created for user: {user_id}")
            return service
        except ValueError:
            # Expected for unconnected users: propagate without logging.
            raise
        except Exception as e:
            logger.error(f"Error creating authenticated GSC service for user {user_id}: {e}")
            raise

    def get_site_list(self, user_id: str) -> List[Dict[str, Any]]:
        """Get list of sites from GSC.

        Returns:
            A list of ``{'siteUrl', 'permissionLevel'}`` dicts. An empty list
            is returned when the user is not connected or on any error.
        """
        try:
            try:
                service = self.get_authenticated_service(user_id)
            except ValueError:
                # User not connected or credentials invalid
                return []
            except Exception as e:
                logger.warning(f"Failed to get authenticated service for {user_id}: {e}")
                return []

            if not service:
                return []

            sites = service.sites().list().execute()
            site_list = [
                {
                    'siteUrl': site.get('siteUrl'),
                    'permissionLevel': site.get('permissionLevel'),
                }
                for site in sites.get('siteEntry', [])
            ]

            logger.info(f"Retrieved {len(site_list)} sites for user: {user_id}")
            return site_list
        except Exception as e:
            logger.error(f"Error getting site list for user {user_id}: {e}")
            # Return empty list instead of raising to prevent frontend 500s
            return []

    def _calculate_previous_period(self, start_date: str, end_date: str) -> Tuple[Optional[str], Optional[str]]:
        """Calculate previous period date window matching current range length.

        Args:
            start_date: Current window start, ``YYYY-MM-DD``.
            end_date: Current window end (inclusive), ``YYYY-MM-DD``.

        Returns:
            ``(prev_start, prev_end)`` as ``YYYY-MM-DD`` strings for the
            window of equal length ending the day before ``start_date``, or
            ``(None, None)`` if the inputs cannot be parsed.
        """
        try:
            start_dt = datetime.strptime(start_date, "%Y-%m-%d")
            end_dt = datetime.strptime(end_date, "%Y-%m-%d")
            # Inclusive day count; clamp to 1 so inverted ranges still work.
            window_days = max((end_dt - start_dt).days + 1, 1)
            prev_end = start_dt - timedelta(days=1)
            prev_start = prev_end - timedelta(days=window_days - 1)
            return prev_start.strftime("%Y-%m-%d"), prev_end.strftime("%Y-%m-%d")
        except (TypeError, ValueError, OverflowError):
            return None, None

    @staticmethod
    def _error_result(message: str) -> Dict[str, Any]:
        """Build the empty analytics payload returned on early failures."""
        return {'error': message, 'rows': [], 'rowCount': 0}

    def _run_query(self, service, user_id: str, site_url: str, start_date: str,
                   end_date: str, dimensions: List[str], label: Optional[str] = None,
                   row_limit: Optional[int] = 1000) -> Dict[str, Any]:
        """Run one Search Analytics query and return the raw API response.

        Args:
            label: When given, the request body and response are logged as
                ``"GSC <label> request/response for user ..."``.
            row_limit: ``rowLimit`` to send; ``None`` omits the field.

        Raises:
            Exception: Whatever the API client raises; callers decide how to
                degrade.
        """
        body: Dict[str, Any] = {
            'startDate': start_date,
            'endDate': end_date,
            'dimensions': dimensions,
        }
        if row_limit is not None:
            body['rowLimit'] = row_limit

        if label:
            logger.info(f"GSC {label} request for user {user_id}: {body}")
        response = service.searchanalytics().query(siteUrl=site_url, body=body).execute()
        if label:
            logger.info(f"GSC {label} response for user {user_id}: {response}")
        return response

    def get_search_analytics(self, user_id: str, site_url: str,
                             start_date: str = None, end_date: str = None) -> Dict[str, Any]:
        """Get search analytics data from GSC.

        Args:
            start_date: ``YYYY-MM-DD``; defaults to 30 days ago.
            end_date: ``YYYY-MM-DD``; defaults to today.

        Returns:
            On success a dict with ``overall_metrics``, ``query_data``,
            ``page_data``, ``query_page_data``, ``previous_period``,
            ``verification_data``, ``startDate``, ``endDate`` and
            ``siteUrl``. If the query-level step fails, the same shape with
            empty detail sections plus a ``warning`` key. On early failures
            (not connected, no data, API error) a dict with ``error``,
            ``rows`` and ``rowCount``.

        Raises:
            Exception: Unexpected errors are logged and re-raised.
        """
        try:
            # Set default date range (last 30 days)
            if not end_date:
                end_date = datetime.now().strftime('%Y-%m-%d')
            if not start_date:
                start_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')

            # Check cache first (only return cached data with non-empty query rows)
            cache_key = f"{user_id}_{site_url}_{start_date}_{end_date}"
            cached_data = self._get_cached_data(user_id, site_url, 'analytics', cache_key)
            if cached_data and isinstance(cached_data, dict):
                cached_pages = cached_data.get('page_data')
                cached_queries = cached_data.get('query_data')
                if isinstance(cached_pages, dict) and isinstance(cached_queries, dict):
                    cached_query_rows = cached_queries.get('rows', [])
                    if cached_query_rows:
                        logger.info(f"Returning cached analytics data for user: {user_id} (includes page_data, {len(cached_query_rows)} query rows)")
                        return cached_data

            try:
                service = self.get_authenticated_service(user_id)
            except ValueError:
                logger.warning(f"User {user_id} not connected to GSC. Returning empty analytics.")
                return self._error_result('User not connected to GSC')

            if not service:
                logger.error(f"Failed to get authenticated GSC service for user: {user_id}")
                return self._error_result('Authentication failed')

            # Step 1: Verify data presence first (date dimension only, no rowLimit)
            try:
                verification_response = self._run_query(
                    service, user_id, site_url, start_date, end_date,
                    ['date'], label='Data verification', row_limit=None,
                )
                verification_rows = verification_response.get('rows', [])
                if not verification_rows:
                    logger.warning(f"No GSC data available for user {user_id} in date range {start_date} to {end_date}")
                    return self._error_result('No data available for this date range')
                logger.info(f"GSC Data verification successful - found {len(verification_rows)} days with data")
            except Exception as verification_error:
                logger.error(f"GSC Data verification failed for user {user_id}: {verification_error}")
                return self._error_result(f'Data verification failed: {str(verification_error)}')

            # Step 2: Get daily metrics for charting (time-series by date)
            try:
                response = self._run_query(
                    service, user_id, site_url, start_date, end_date,
                    ['date'], label='API',
                )
            except Exception as api_error:
                logger.error(f"GSC API call failed for user {user_id}: {api_error}")
                return self._error_result(str(api_error))

            overall_metrics = {
                'rows': response.get('rows', []),
                'rowCount': response.get('rowCount', 0),
            }
            verification_data = {
                'rows': verification_rows,
                'rowCount': len(verification_rows),
            }

            try:
                # Step 3: Get query-level data for insights. A failure here
                # triggers the overall-metrics-only fallback below.
                query_response = self._run_query(
                    service, user_id, site_url, start_date, end_date,
                    ['query'], label='Query-level',
                )

                # Step 4: Get page-level data for top pages insights (optional)
                page_rows: List[Any] = []
                page_row_count = 0
                try:
                    page_response = self._run_query(
                        service, user_id, site_url, start_date, end_date,
                        ['page'], label='Page-level',
                    )
                    page_rows = page_response.get('rows', [])
                    page_row_count = page_response.get('rowCount', 0)
                except Exception as page_error:
                    logger.warning(f"GSC Page-level request failed for user {user_id}: {page_error}")
                    page_rows = []
                    page_row_count = 0

                # Step 5: Get query+page combined data for mapping queries to pages (optional)
                qp_rows: List[Any] = []
                qp_row_count = 0
                try:
                    qp_response = self._run_query(
                        service, user_id, site_url, start_date, end_date,
                        ['query', 'page'], label='Query+Page',
                    )
                    qp_rows = qp_response.get('rows', [])
                    qp_row_count = qp_response.get('rowCount', 0)
                except Exception as qp_error:
                    logger.warning(f"GSC Query+Page request failed for user {user_id}: {qp_error}")
                    qp_rows = []
                    qp_row_count = 0

                # Optional previous-period windows for opportunity trend detection
                prev_query_rows: List[Any] = []
                prev_page_rows: List[Any] = []
                prev_start_date, prev_end_date = self._calculate_previous_period(start_date, end_date)
                if prev_start_date and prev_end_date:
                    try:
                        prev_query_response = self._run_query(
                            service, user_id, site_url, prev_start_date, prev_end_date, ['query'],
                        )
                        prev_query_rows = prev_query_response.get('rows', [])
                    except Exception as prev_query_error:
                        logger.warning(f"GSC previous query request failed for user {user_id}: {prev_query_error}")

                    try:
                        prev_page_response = self._run_query(
                            service, user_id, site_url, prev_start_date, prev_end_date, ['page'],
                        )
                        prev_page_rows = prev_page_response.get('rows', [])
                    except Exception as prev_page_error:
                        logger.warning(f"GSC previous page request failed for user {user_id}: {prev_page_error}")

                # Combine overall, query, page and query+page data
                query_rows = query_response.get('rows', [])
                analytics_data = {
                    'overall_metrics': overall_metrics,
                    'query_data': {
                        'rows': query_rows,
                        'rowCount': query_response.get('rowCount', 0)
                    },
                    'page_data': {
                        'rows': page_rows,
                        'rowCount': page_row_count
                    },
                    'query_page_data': {
                        'rows': qp_rows,
                        'rowCount': qp_row_count
                    },
                    'previous_period': {
                        'startDate': prev_start_date,
                        'endDate': prev_end_date,
                        'query_data': {'rows': prev_query_rows, 'rowCount': len(prev_query_rows)},
                        'page_data': {'rows': prev_page_rows, 'rowCount': len(prev_page_rows)}
                    },
                    'verification_data': verification_data,
                    'startDate': start_date,
                    'endDate': end_date,
                    'siteUrl': site_url
                }

                # Only cache useful payloads so an empty result is retried.
                if query_rows:
                    self._cache_data(user_id, site_url, 'analytics', analytics_data, cache_key)
                    logger.info(f"Analytics data cached for user: {user_id}, site: {site_url} ({len(query_rows)} query rows)")
                else:
                    logger.info(f"Skipping cache for user: {user_id} — empty query_data rows; next request will retry fresh")

                logger.info(f"Retrieved comprehensive analytics data for user: {user_id}, site: {site_url}")
                return analytics_data

            except Exception as query_error:
                logger.error(f"GSC Query-level request failed for user {user_id}: {query_error}")
                # Fall back to overall metrics only
                analytics_data = {
                    'overall_metrics': overall_metrics,
                    'query_data': {'rows': [], 'rowCount': 0},
                    'page_data': {'rows': [], 'rowCount': 0},
                    'query_page_data': {'rows': [], 'rowCount': 0},
                    'previous_period': {
                        'startDate': None,
                        'endDate': None,
                        'query_data': {'rows': [], 'rowCount': 0},
                        'page_data': {'rows': [], 'rowCount': 0}
                    },
                    'verification_data': verification_data,
                    'startDate': start_date,
                    'endDate': end_date,
                    'siteUrl': site_url,
                    'warning': f'Query-level data unavailable: {str(query_error)}'
                }
                logger.info(f"Query-level data unavailable for user {user_id}; fallback analytics returned (not cached)")
                return analytics_data

        except Exception as e:
            logger.error(f"Error getting search analytics for user {user_id}: {e}")
            raise

    def get_sitemaps(self, user_id: str, site_url: str) -> List[Dict[str, Any]]:
        """Get sitemaps from GSC.

        Returns:
            A list of ``{'path', 'lastSubmitted', 'contents'}`` dicts.

        Raises:
            Exception: Any failure (including missing credentials) is logged
                and re-raised.
        """
        try:
            service = self.get_authenticated_service(user_id)
            response = service.sitemaps().list(siteUrl=site_url).execute()

            sitemaps = [
                {
                    'path': sitemap.get('path'),
                    'lastSubmitted': sitemap.get('lastSubmitted'),
                    'contents': sitemap.get('contents', []),
                }
                for sitemap in response.get('sitemap', [])
            ]

            logger.info(f"Retrieved {len(sitemaps)} sitemaps for user: {user_id}, site: {site_url}")
            return sitemaps
        except Exception as e:
            logger.error(f"Error getting sitemaps for user {user_id}: {e}")
            raise

    # ------------------------------------------------------------------
    # Revocation / cleanup
    # ------------------------------------------------------------------

    def revoke_user_access(self, user_id: str) -> bool:
        """Revoke user's GSC access.

        Deletes the user's stored credentials, cached data and OAuth states
        from the local database.

        Returns:
            ``True`` on success (including when nothing was stored),
            ``False`` if the deletion failed.
        """
        try:
            self._delete_user_rows(user_id)
            logger.info(f"GSC access revoked for user: {user_id}")
            return True
        except Exception as e:
            logger.error(f"Error revoking GSC access for user {user_id}: {e}")
            return False

    def clear_incomplete_credentials(self, user_id: str) -> bool:
        """Clear incomplete GSC credentials that are missing required fields.

        Removes the same rows as ``revoke_user_access``; kept as a separate
        entry point for its distinct log messages.

        Returns:
            ``True`` on success (including when nothing was stored),
            ``False`` if the deletion failed.
        """
        try:
            self._delete_user_rows(user_id)
            logger.info(f"Cleared incomplete GSC credentials for user: {user_id}")
            return True
        except Exception as e:
            logger.error(f"Error clearing incomplete credentials for user {user_id}: {e}")
            return False

    # ------------------------------------------------------------------
    # Cache
    # ------------------------------------------------------------------

    def _get_cached_data(self, user_id: str, site_url: str, data_type: str, cache_key: str) -> Optional[Dict]:
        """Get cached data if not expired.

        Only an entry stored under the same ``cache_key`` is returned, so a
        payload cached for one date range is never served for another.

        Returns:
            The decoded payload, or ``None`` on a miss or any error.
        """
        try:
            db_path = self._get_db_path(user_id)
            if not os.path.exists(db_path):
                return None

            with closing(sqlite3.connect(db_path)) as conn:
                cursor = conn.cursor()
                # Missing table or pre-migration schema: nothing usable cached.
                if 'cache_key' not in self._table_columns(cursor, 'gsc_data_cache'):
                    return None

                cursor.execute('''
                    SELECT data_json FROM gsc_data_cache
                    WHERE user_id = ? AND site_url = ? AND data_type = ?
                      AND cache_key = ? AND expires_at > CURRENT_TIMESTAMP
                    ORDER BY id DESC
                    LIMIT 1
                ''', (user_id, site_url, data_type, cache_key))
                result = cursor.fetchone()

            if result:
                return json.loads(result[0])
            return None
        except Exception as e:
            logger.error(f"Error getting cached data: {e}")
            return None

    def _cache_data(self, user_id: str, site_url: str, data_type: str, data: Dict, cache_key: str):
        """Cache data with expiration.

        Stores ``data`` under ``cache_key`` for one hour. Best effort:
        failures are logged and never raised.
        """
        try:
            self._init_gsc_tables(user_id)
            db_path = self._get_db_path(user_id)
            payload = json.dumps(data)

            with closing(sqlite3.connect(db_path)) as conn:
                cursor = conn.cursor()
                # The table has no unique key besides the autoincrement id, so
                # remove superseded, legacy (no key) and expired rows by hand.
                cursor.execute('''
                    DELETE FROM gsc_data_cache
                    WHERE user_id = ?
                      AND (expires_at <= CURRENT_TIMESTAMP
                           OR cache_key IS NULL
                           OR (site_url = ? AND data_type = ? AND cache_key = ?))
                ''', (user_id, site_url, data_type, cache_key))
                # Expiry is computed by SQLite in UTC so it is comparable with
                # CURRENT_TIMESTAMP in the lookup. Cache for 1 hour.
                cursor.execute('''
                    INSERT INTO gsc_data_cache
                    (user_id, site_url, data_type, cache_key, data_json, expires_at)
                    VALUES (?, ?, ?, ?, ?, datetime('now', '+1 hour'))
                ''', (user_id, site_url, data_type, cache_key, payload))
                conn.commit()

            logger.info(f"Data cached for user: {user_id}, type: {data_type}")
        except Exception as e:
            logger.error(f"Error caching data: {e}")