222 lines
9.4 KiB
Python
222 lines
9.4 KiB
Python
import asyncio
import json
import os
import tempfile
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
from loguru import logger
|
|
|
|
class AdvertoolsService:
    """
    Centralized service for leveraging the Advertools library for deep SEO intelligence.

    Provides functions for sitemap analysis, content auditing, and link extraction.

    Every public coroutine returns a plain dict with a boolean ``success`` key.
    Failures are reported as ``{"success": False, "error": <message>}`` instead of
    being raised, so callers never need to catch exceptions from this service.
    """

    # Hosts treated as social-media profiles when classifying crawled links.
    SOCIAL_DOMAINS = (
        'twitter.com', 'x.com', 'linkedin.com', 'facebook.com',
        'instagram.com', 'youtube.com', 'github.com',
    )

    # Crawl guardrails.
    AUDIT_PAGE_LIMIT = 15           # max pages fetched by audit_content
    STYLE_PAGE_LIMIT = 10           # max pages fetched by extract_communication_style
    DOWNLOAD_TIMEOUT_SECONDS = 30   # per-page download timeout

    # Number of sitemap URLs / pillars reported back to the caller.
    SAMPLE_SIZE = 15

    def __init__(self):
        self.logger = logger.bind(service="AdvertoolsService")

    # ------------------------------------------------------------------ #
    # Sitemap analysis
    # ------------------------------------------------------------------ #

    async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
        """
        Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.

        Args:
            sitemap_url: URL of the sitemap (or sitemap index) to download.

        Returns:
            On success ``{"success": True, "metrics": {...}, "timestamp": <iso str>}``
            where ``metrics`` holds ``total_urls``, ``publishing_velocity``,
            ``stale_content_count``, ``stale_content_percentage``, ``top_pillars``
            and ``audit_sample_urls``. On failure
            ``{"success": False, "error": <message>}``.
        """
        try:
            self.logger.info(f"Analyzing sitemap: {sitemap_url}")

            # advertools sitemap_to_df is blocking, run in executor
            loop = asyncio.get_running_loop()
            df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))

            if df is None or df.empty:
                return {"success": False, "error": "Sitemap is empty or could not be parsed."}

            return {
                "success": True,
                "metrics": self._summarize_sitemap(df),
                "timestamp": self._timestamp(),
            }
        except Exception as e:
            self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
            return {"success": False, "error": str(e)}

    @classmethod
    def _summarize_sitemap(cls, df: pd.DataFrame, now: Optional[datetime] = None) -> Dict[str, Any]:
        """
        Compute freshness, velocity and pillar metrics from a sitemap DataFrame.

        Args:
            df: Sitemap rows. Must contain a ``loc`` column; ``lastmod`` is optional.
                The frame is not modified.
            now: Timezone-aware reference time; defaults to the current UTC time.

        Returns:
            The ``metrics`` dict embedded in the ``analyze_sitemap`` response.
        """
        total_urls = len(df)

        # Parse lastmod once; treat an all-NaT column the same as a missing column.
        lastmod = None
        if 'lastmod' in df.columns:
            parsed = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
            if not parsed.isna().all():
                lastmod = parsed

        if lastmod is None:
            publishing_velocity = 0
            stale_count = 0
            audit_urls = df['loc'].head(cls.SAMPLE_SIZE).tolist()
        else:
            if now is None:
                now = pd.Timestamp.now(tz="UTC")
            # NaT compares False on both sides, so undated URLs are neither recent nor stale.
            recent_count = int((lastmod > now - timedelta(days=30)).sum())
            stale_count = int((lastmod < now - timedelta(days=180)).sum())
            # URLs per week, approximating the 30-day window as 4 weeks.
            publishing_velocity = recent_count / 4.0
            # Most recently modified URLs first; undated URLs sort last.
            audit_urls = (
                df.assign(lastmod=lastmod)
                .sort_values('lastmod', ascending=False)
                .head(cls.SAMPLE_SIZE)['loc']
                .tolist()
            )

        # Content pillars: most common top-2 path segments.
        pillars = (
            df['loc'].apply(cls._extract_pillar)
            .value_counts()
            .head(cls.SAMPLE_SIZE)
            .to_dict()
        )

        return {
            "total_urls": total_urls,
            "publishing_velocity": round(publishing_velocity, 2),
            "stale_content_count": stale_count,
            "stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
            "top_pillars": pillars,
            "audit_sample_urls": audit_urls,
        }

    @staticmethod
    def _extract_pillar(url: Any) -> str:
        """
        Map a URL to its content pillar: the first two path segments.

        Returns ``"home"`` for a URL with an empty path and ``"other"`` for values
        that are not strings or cannot be parsed as a URL.
        """
        if not isinstance(url, str):
            return "other"
        try:
            path = urlparse(url).path
        except ValueError:
            # urlparse rejects malformed netlocs (e.g. an unbalanced IPv6 bracket).
            return "other"
        parts = path.strip('/').split('/')
        if not parts[0]:
            return "home"
        return "/".join(parts[:2])  # Capture top 2 segments

    # ------------------------------------------------------------------ #
    # Content audit
    # ------------------------------------------------------------------ #

    async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Performs a shallow crawl and theme analysis using word frequency.

        Uses unique temporary files for thread safety.

        Args:
            url_list: Pages to crawl (links are not followed; at most
                ``AUDIT_PAGE_LIMIT`` pages are fetched).

        Returns:
            On success ``{"success": True, "themes": [...], "page_count": int,
            "avg_word_count": float, "timestamp": <iso str>}``; on failure
            ``{"success": False, "error": <message>}``.
        """
        try:
            self.logger.info(f"Auditing content for {len(url_list)} URLs")

            if not url_list:
                # Nothing to crawl: skip the crawl entirely.
                return {"success": False, "error": "No URLs provided to audit."}

            crawl_df = await self._crawl_to_dataframe(url_list, self.AUDIT_PAGE_LIMIT)
            if crawl_df is None:
                return {"success": False, "error": "Crawl failed to generate output or output is empty."}

            # Extract themes using word frequency
            text_columns = [col for col in ['body_text', 'h1', 'h2', 'title'] if col in crawl_df.columns]
            if not text_columns:
                return {"success": False, "error": "No text content found to analyze."}

            # Cast to str so a numeric-looking cell cannot break the join.
            all_text = " ".join(crawl_df[text_columns].fillna("").astype(str).values.flatten())
            if not all_text.strip():
                return {"success": False, "error": "Extracted text is empty."}

            # word_frequency drops English stop words by default (its rm_words argument).
            loop = asyncio.get_running_loop()
            word_freq = await loop.run_in_executor(None, lambda: adv.word_frequency([all_text]))
            top_themes = word_freq.head(20).to_dict(orient='records')

            return {
                "success": True,
                "themes": top_themes,
                "page_count": len(crawl_df),
                "avg_word_count": round(self._average_word_count(crawl_df), 1),
                "timestamp": self._timestamp(),
            }
        except Exception as e:
            self.logger.error(f"Failed to audit content: {str(e)}")
            return {"success": False, "error": str(e)}

    @staticmethod
    def _average_word_count(crawl_df: pd.DataFrame) -> float:
        """Mean whitespace-separated word count of ``body_text``; 0.0 when unavailable."""
        if 'body_text' not in crawl_df.columns:
            return 0.0
        word_counts = crawl_df['body_text'].fillna("").astype(str).str.split().str.len()
        mean_count = word_counts.mean()
        return 0.0 if pd.isna(mean_count) else float(mean_count)

    # ------------------------------------------------------------------ #
    # Link / communication-style extraction
    # ------------------------------------------------------------------ #

    async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
        """
        Analyzes linking patterns and social media presence using unique temporary files.

        Args:
            url_list: Pages to crawl (links are not followed; at most
                ``STYLE_PAGE_LIMIT`` pages are fetched).

        Returns:
            On success ``{"success": True, "social_links": [...],
            "link_stats": {"total_links_found": int, "unique_domains": int},
            "timestamp": <iso str>}``; on failure
            ``{"success": False, "error": <message>}``.
        """
        try:
            self.logger.info(f"Extracting communication style for {len(url_list)} URLs")

            if not url_list:
                # Nothing to crawl: skip the crawl entirely.
                return {"success": False, "error": "No URLs provided for link extraction."}

            crawl_df = await self._crawl_to_dataframe(url_list, self.STYLE_PAGE_LIMIT)
            if crawl_df is None:
                return {"success": False, "error": "Link extraction crawl failed."}

            # Extract social links and internal/external stats
            all_links = self._collect_links(crawl_df)
            if not all_links:
                return {
                    "success": True,
                    "social_links": [],
                    "link_stats": {"total_links_found": 0, "unique_domains": 0},
                    "timestamp": self._timestamp(),
                }

            # Analyze links
            link_df = adv.url_to_df(all_links)

            social_links = []
            unique_domains = 0
            if not link_df.empty and 'netloc' in link_df.columns:
                social_mask = link_df['netloc'].apply(self._is_social_host)
                social_links = link_df.loc[social_mask, 'url'].unique().tolist()
                unique_domains = int(link_df['netloc'].nunique())

            return {
                "success": True,
                "social_links": social_links,
                "link_stats": {
                    "total_links_found": len(all_links),
                    "unique_domains": unique_domains,
                },
                "timestamp": self._timestamp(),
            }
        except Exception as e:
            self.logger.error(f"Failed to extract communication style: {str(e)}")
            return {"success": False, "error": str(e)}

    @staticmethod
    def _collect_links(crawl_df: pd.DataFrame) -> List[str]:
        """
        Flatten the ``links_url`` column of a crawl DataFrame into one list.

        Each cell may be a ``"@@"``-delimited string or a list; other cell types
        and missing values are skipped. Returns ``[]`` when the column is absent.
        """
        collected: List[str] = []
        if 'links_url' not in crawl_df.columns:
            return collected
        for cell in crawl_df['links_url'].dropna():
            if isinstance(cell, str):
                collected.extend(cell.split("@@"))
            elif isinstance(cell, list):
                collected.extend(cell)
        return collected

    @classmethod
    def _is_social_host(cls, netloc: Any) -> bool:
        """
        Return True when ``netloc`` is one of ``SOCIAL_DOMAINS`` or a subdomain of one.

        Matching is case-insensitive and ignores userinfo and port, so
        ``www.linkedin.com`` and ``m.facebook.com:443`` both match, while
        ``netflix.com`` does not match ``x.com``.
        """
        if not isinstance(netloc, str) or not netloc:
            return False
        host = netloc.lower().rsplit("@", 1)[-1].split(":", 1)[0]
        return any(host == domain or host.endswith("." + domain) for domain in cls.SOCIAL_DOMAINS)

    # ------------------------------------------------------------------ #
    # Shared helpers
    # ------------------------------------------------------------------ #

    async def _crawl_to_dataframe(self, url_list: List[str], page_limit: int) -> Optional[pd.DataFrame]:
        """
        Crawl ``url_list`` (without following links) and load the result.

        The crawl writes to a uniquely named temporary ``.jsonl`` file, which is
        always removed before returning.

        Returns:
            The crawl output as a DataFrame, or ``None`` when the crawl produced
            no output file or an empty one.
        """
        # Reserve a unique path; the handle is closed so the crawler can write to it.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as handle:
            output_path = handle.name

        try:
            # advertools crawl is blocking
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, lambda: adv.crawl(
                url_list=url_list,
                output_file=output_path,
                follow_links=False,
                custom_settings={
                    'LOG_LEVEL': 'WARNING',
                    'CLOSESPIDER_PAGECOUNT': page_limit,  # Guardrail: max pages
                    'DOWNLOAD_TIMEOUT': self.DOWNLOAD_TIMEOUT_SECONDS,  # Guardrail: per-page timeout
                },
            ))

            if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
                return None

            return pd.read_json(output_path, lines=True)
        finally:
            try:
                os.remove(output_path)
            except FileNotFoundError:
                pass
            except OSError as e:
                # Best effort: a leftover temp file must not fail the request.
                self.logger.warning(f"Failed to remove temp file {output_path}: {e}")

    @staticmethod
    def _timestamp() -> str:
        """Current UTC time as a naive ISO-8601 string (the format callers already receive)."""
        return datetime.utcnow().isoformat()
|