import asyncio
import json
import os
import tempfile
from datetime import datetime, timedelta, timezone
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
from loguru import logger


class AdvertoolsService:
    """
    Centralized service for leveraging the Advertools library for deep SEO intelligence.
    Provides functions for sitemap analysis, content auditing, and link extraction.

    Every public coroutine returns a dict with a boolean ``"success"`` key.
    Failures are reported as ``{"success": False, "error": <message>}`` rather
    than raised, so callers never need to catch exceptions from this service.
    """

    # Domains treated as social profiles; subdomains (www., m., ...) also match.
    _SOCIAL_DOMAINS = (
        'twitter.com', 'x.com', 'linkedin.com', 'facebook.com',
        'instagram.com', 'youtube.com', 'github.com',
    )

    def __init__(self):
        self.logger = logger.bind(service="AdvertoolsService")

    @staticmethod
    def _timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        Produces the same format as ``datetime.utcnow().isoformat()`` without
        calling the deprecated ``utcnow``.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    @staticmethod
    def _extract_pillar(url: Any) -> str:
        """Map a URL to its content pillar: the first two path segments.

        Returns ``"home"`` for a root/empty path and ``"other"`` when the value
        cannot be parsed as a URL (e.g. a missing ``loc`` cell).
        """
        try:
            parts = urlparse(url).path.strip('/').split('/')
        except (AttributeError, TypeError, ValueError):
            return "other"
        if not parts or not parts[0]:
            return "home"
        return "/".join(parts[:2])  # Capture top 2 segments

    @classmethod
    def _is_social_host(cls, netloc: Any) -> bool:
        """Return True when ``netloc`` is a social domain or one of its subdomains."""
        if not isinstance(netloc, str) or not netloc:
            return False
        # Drop any userinfo and port so only the host name is compared.
        host = netloc.lower().rsplit("@", 1)[-1].split(":", 1)[0]
        return any(
            host == domain or host.endswith("." + domain)
            for domain in cls._SOCIAL_DOMAINS
        )

    async def _crawl_to_dataframe(
        self, url_list: List[str], page_limit: int
    ) -> Optional[pd.DataFrame]:
        """Crawl ``url_list`` without following links and load the output.

        A unique temporary file is used per call so concurrent crawls do not
        share output; it is always removed afterwards.

        Args:
            url_list: URLs to fetch.
            page_limit: Value passed as ``CLOSESPIDER_PAGECOUNT`` (guardrail).

        Returns:
            The crawl output as a DataFrame, or ``None`` when the crawl wrote
            no output.
        """
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name
        try:
            # advertools crawl is blocking, so keep it off the event loop.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, lambda: adv.crawl(
                url_list=url_list,
                output_file=temp_file,
                follow_links=False,
                custom_settings={
                    'LOG_LEVEL': 'WARNING',
                    'CLOSESPIDER_PAGECOUNT': page_limit,  # Guardrail: max pages
                    'DOWNLOAD_TIMEOUT': 30  # Guardrail: 30s timeout per page
                }
            ))
            if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
                return None
            return pd.read_json(temp_file, lines=True)
        finally:
            if os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except Exception as e:
                    self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")

    async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
        """
        Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.

        Args:
            sitemap_url: URL of the sitemap to download and parse.

        Returns:
            On success, ``{"success": True, "metrics": {...}, "timestamp": ...}``
            where ``metrics`` holds ``total_urls``, ``publishing_velocity``,
            ``stale_content_count``, ``stale_content_percentage``,
            ``top_pillars`` and ``audit_sample_urls``. On failure,
            ``{"success": False, "error": <message>}``.
        """
        try:
            self.logger.info(f"Analyzing sitemap: {sitemap_url}")

            # advertools sitemap_to_df is blocking, run in executor
            loop = asyncio.get_running_loop()
            df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))

            if df is None or df.empty:
                return {"success": False, "error": "Sitemap is empty or could not be parsed."}

            # Convert lastmod to datetime; unparseable values become NaT.
            has_dates = False
            if 'lastmod' in df.columns:
                df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
                has_dates = not df['lastmod'].isna().all()

            total_urls = len(df)

            if has_dates:
                # lastmod was parsed with utc=True, so compare against UTC "now".
                now = datetime.now(timezone.utc)
                recent_count = int((df['lastmod'] > now - timedelta(days=30)).sum())
                stale_count = int((df['lastmod'] < now - timedelta(days=180)).sum())
                # The 30-day window is treated as roughly four weeks.
                publishing_velocity = recent_count / 4.0  # URLs per week
            else:
                publishing_velocity = 0
                stale_count = 0

            # Content pillars: most common top-2-segment path prefixes.
            pillars = (
                df['loc'].apply(self._extract_pillar).value_counts().head(15).to_dict()
            )

            # Return a sample of URLs for auditing (top 15 most recent if available)
            if has_dates:
                audit_urls = (
                    df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
                )
            else:
                audit_urls = df['loc'].head(15).tolist()

            return {
                "success": True,
                "metrics": {
                    "total_urls": total_urls,
                    "publishing_velocity": round(publishing_velocity, 2),
                    "stale_content_count": stale_count,
                    "stale_content_percentage": (
                        round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0
                    ),
                    "top_pillars": pillars,
                    "audit_sample_urls": audit_urls
                },
                "timestamp": self._timestamp()
            }
        except Exception as e:
            self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
            return {"success": False, "error": str(e)}

    async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Performs a shallow crawl and theme analysis using word frequency.
        Uses unique temporary files for thread safety.

        Args:
            url_list: URLs to crawl (at most 15 pages are fetched).

        Returns:
            On success, ``{"success": True, "themes": [...], "page_count": int,
            "avg_word_count": float, "timestamp": ...}``. On failure,
            ``{"success": False, "error": <message>}``.
        """
        try:
            self.logger.info(f"Auditing content for {len(url_list)} URLs")

            if not url_list:
                # Nothing to fetch; avoid starting a crawl with no targets.
                return {"success": False, "error": "No URLs provided."}

            crawl_df = await self._crawl_to_dataframe(url_list, page_limit=15)
            if crawl_df is None:
                return {"success": False, "error": "Crawl failed to generate output or output is empty."}

            # Extract themes using word frequency
            text_columns = [col for col in ['body_text', 'h1', 'h2', 'title'] if col in crawl_df.columns]
            if not text_columns:
                return {"success": False, "error": "No text content found to analyze."}

            # Cast to str so a column that read_json inferred as numeric
            # cannot break the join.
            text_cells = crawl_df[text_columns].fillna("").astype(str).values.flatten()
            all_text = " ".join(text_cells)
            if not all_text.strip():
                return {"success": False, "error": "Extracted text is empty."}

            # NOTE(review): the `rm_stopwords` keyword was removed because
            # advertools documents stopword filtering via `rm_words` (default:
            # English stopwords); confirm against the installed version.
            loop = asyncio.get_running_loop()
            word_freq = await loop.run_in_executor(
                None, lambda: adv.word_frequency([all_text])
            )
            top_themes = word_freq.head(20).to_dict(orient='records')

            # Additional metric: average word count of the page bodies.
            avg_word_count = 0
            if 'body_text' in crawl_df.columns:
                word_counts = crawl_df['body_text'].fillna("").astype(str).str.split().str.len()
                avg_word_count = float(word_counts.mean())

            return {
                "success": True,
                "themes": top_themes,
                "page_count": len(crawl_df),
                "avg_word_count": round(avg_word_count, 1),
                "timestamp": self._timestamp()
            }
        except Exception as e:
            self.logger.error(f"Failed to audit content: {str(e)}")
            return {"success": False, "error": str(e)}

    async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Analyzes linking patterns and social media presence using unique temporary files.

        Args:
            url_list: URLs to crawl (at most 10 pages are fetched).

        Returns:
            On success, ``{"success": True, "social_links": [...],
            "link_stats": {"total_links_found": int, "unique_domains": int},
            "timestamp": ...}``. On failure,
            ``{"success": False, "error": <message>}``.
        """
        try:
            self.logger.info(f"Extracting communication style for {len(url_list)} URLs")

            if not url_list:
                # Nothing to fetch; avoid starting a crawl with no targets.
                return {"success": False, "error": "No URLs provided."}

            crawl_df = await self._crawl_to_dataframe(url_list, page_limit=10)
            if crawl_df is None:
                return {"success": False, "error": "Link extraction crawl failed."}

            # Collect every link; a cell is either an "@@"-joined string or a list.
            all_links: List[str] = []
            if 'links_url' in crawl_df.columns:
                for links in crawl_df['links_url'].dropna():
                    if isinstance(links, str):
                        candidates = links.split("@@")
                    elif isinstance(links, list):
                        candidates = links
                    else:
                        continue
                    all_links.extend(
                        link for link in candidates if isinstance(link, str) and link
                    )

            if not all_links:
                return {
                    "success": True,
                    "social_links": [],
                    "link_stats": {"total_links_found": 0, "unique_domains": 0},
                    "timestamp": self._timestamp()
                }

            # Analyze links
            link_df = adv.url_to_df(all_links)
            has_netloc = not link_df.empty and 'netloc' in link_df.columns

            social_links = []
            unique_domains = 0
            if has_netloc:
                # Match subdomains too: most profile links use www./m. hosts.
                is_social = link_df['netloc'].map(self._is_social_host).astype(bool)
                social_links = link_df[is_social]['url'].unique().tolist()
                unique_domains = int(link_df['netloc'].nunique())

            return {
                "success": True,
                "social_links": social_links,
                "link_stats": {
                    "total_links_found": len(all_links),
                    "unique_domains": unique_domains
                },
                "timestamp": self._timestamp()
            }
        except Exception as e:
            self.logger.error(f"Failed to extract communication style: {str(e)}")
            return {"success": False, "error": str(e)}