Files
ALwrity/backend/services/seo/advertools_service.py

885 lines
40 KiB
Python

import advertools as adv
import pandas as pd
import asyncio
from typing import Dict, Any, List, Optional, Tuple
from datetime import datetime, timedelta
from loguru import logger
import json
import os
import tempfile
from urllib.parse import urlparse
from collections import Counter
import urllib.request
import urllib.error
import socket
import re
class AdvertoolsService:
"""
Centralized service for leveraging the Advertools library for deep SEO intelligence.
Provides functions for sitemap analysis, content auditing, and link extraction.
"""
def __init__(self):
    """Create the service and attach a logger bound with ``service="AdvertoolsService"``."""
    bound_logger = logger.bind(service="AdvertoolsService")
    self.logger = bound_logger
async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
    """
    Analyze a sitemap for freshness, URL structure and top content pillars.

    The sitemap is loaded with ``adv.sitemap_to_df`` and the URL table with
    ``adv.url_to_df``; both run in the default executor so the blocking work
    does not stall the event loop.

    Args:
        sitemap_url: Sitemap URL handed to ``adv.sitemap_to_df``.

    Returns:
        ``{"success": True, "metrics": {...}, "timestamp": ...}`` on success,
        otherwise ``{"success": False, "error": <message>}``. Exceptions are
        logged and turned into the failure payload instead of being raised.
    """
    if not sitemap_url:
        return {"success": False, "error": "No sitemap URL provided."}
    try:
        self.logger.info(f"Analyzing sitemap: {sitemap_url}")
        loop = asyncio.get_running_loop()
        df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
        if df is None or df.empty:
            return {"success": False, "error": "Sitemap is empty or could not be parsed."}
        if 'loc' not in df.columns:
            # Every metric below is derived from the URL column.
            return {"success": False, "error": "Sitemap contains no URL (loc) entries."}
        if 'lastmod' in df.columns:
            df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
        total_urls = len(df)

        # --- Content Freshness Scoring ---
        freshness = self._compute_freshness(df)

        # --- URL Structure Analysis ---
        url_structure = await self._analyze_url_structure(df['loc'].tolist())

        # --- Content Pillars via url_to_df ---
        def build_pillars() -> Dict[str, int]:
            """Count URLs per leading directory path (up to three levels)."""
            url_df = adv.url_to_df(df['loc'])
            if url_df is None or url_df.empty:
                return {}
            dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
            if not dir_cols:
                return {}
            pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
            for col in dir_cols[1:3]:
                segment = url_df[col].astype(str)
                present = url_df[col].notna() & (segment != 'nan') & (segment.str.strip() != '')
                # Append "/<dir>" only where the level exists; adding the
                # separator unconditionally produced keys such as "blog//".
                pillar_series = pillar_series + ("/" + segment).where(present, "")
            return pillar_series.value_counts().head(15).to_dict()

        def build_fallback_pillars() -> Dict[str, int]:
            """Derive pillars from the first two path segments using urlparse only."""
            def extract_hierarchy(url: str) -> str:
                try:
                    parts = urlparse(url).path.strip('/').split('/')
                except Exception:
                    return "other"
                if not parts or not parts[0]:
                    return "home"
                return "/".join(parts[:2])
            return df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict()

        try:
            pillars = await loop.run_in_executor(None, build_pillars)
        except Exception as pillar_err:
            self.logger.warning(f"url_to_df pillar extraction failed, using fallback: {pillar_err}")
            pillars = build_fallback_pillars()

        # Sample URLs for auditing (top 15 most recent when dates are available)
        if 'lastmod' in df.columns and not df['lastmod'].isna().all():
            audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
        else:
            audit_urls = df['loc'].head(15).tolist()

        return {
            "success": True,
            "metrics": {
                "total_urls": total_urls,
                "publishing_velocity": freshness.get("publishing_velocity"),
                "stale_content_count": freshness.get("stale_count"),
                "stale_content_percentage": freshness.get("stale_percentage"),
                "freshness_score": freshness.get("freshness_score"),
                "publishing_recency": freshness.get("publishing_recency"),
                "publishing_trend": freshness.get("publishing_trend"),
                "top_pillars": pillars,
                "url_structure": url_structure,
                "audit_sample_urls": audit_urls
            },
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
        return {"success": False, "error": str(e)}
def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]:
    """
    Derive freshness, staleness and publishing-cadence metrics from ``df['lastmod']``.

    Returns a dict with ``publishing_velocity`` (URLs modified in the last 90 days
    divided by 13 weeks), ``stale_count`` / ``stale_percentage`` (older than 180
    days, relative to all rows), a 0-100 ``freshness_score``, per-window counts in
    ``publishing_recency`` and a ``publishing_trend`` label. When there is no
    usable ``lastmod`` data, zeroed defaults with trend ``"unknown"`` are returned.
    """
    defaults = {
        "publishing_velocity": 0,
        "stale_count": 0,
        "stale_percentage": 0,
        "freshness_score": 0,
        "publishing_recency": {},
        "publishing_trend": "unknown"
    }
    if 'lastmod' not in df.columns:
        return defaults
    stamps = df['lastmod']
    dated = stamps.dropna()
    if dated.empty:
        return defaults

    # "now" carries the same timezone as the column so comparisons are valid.
    now = datetime.now(dated.dt.tz)

    def cutoff(days: int) -> datetime:
        return now - timedelta(days=days)

    def modified_after(days: int) -> int:
        # Rows without a date compare as False and are therefore not counted.
        return int((stamps > cutoff(days)).sum())

    total_urls = len(df)
    count_30d = modified_after(30)
    count_90d = modified_after(90)
    stale_count = int((stamps < cutoff(180)).sum())

    if total_urls > 0:
        stale_percentage = round((stale_count / total_urls) * 100, 2)
    else:
        stale_percentage = 0

    # Publishing velocity: URLs per week over the last 90 days (13 weeks)
    publishing_velocity = round(count_90d / 13.0, 2) if count_90d else 0

    # Freshness score (0-100): 50% non-stale share, 30% recent share, 20% velocity
    non_stale_ratio = 1.0 - (stale_percentage / 100.0)
    recency_ratio = count_30d / max(total_urls, 1)
    velocity_score = min(publishing_velocity / 10.0, 1.0)
    freshness_score = round((non_stale_ratio * 50 + recency_ratio * 30 + velocity_score * 20), 1)

    publishing_recency = {
        "last_24h": modified_after(1),
        "last_7d": modified_after(7),
        "last_30d": count_30d,
        "last_90d": count_90d,
    }

    # Trend: last 30 days versus the 30 days before that, with a 10% dead band
    prior_count = int(((stamps <= cutoff(30)) & (stamps > cutoff(60))).sum())
    if count_30d > prior_count * 1.1:
        publishing_trend = "increasing"
    elif count_30d < prior_count * 0.9:
        publishing_trend = "decreasing"
    else:
        publishing_trend = "stable"

    return {
        "publishing_velocity": publishing_velocity,
        "stale_count": stale_count,
        "stale_percentage": stale_percentage,
        "freshness_score": freshness_score,
        "publishing_recency": publishing_recency,
        "publishing_trend": publishing_trend
    }
async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]:
    """
    Analyze URL patterns for parameter bloat, directory depth and host/protocol mix.

    Args:
        urls: URLs to split into components with ``adv.url_to_df`` (run in the
            default executor).

    Returns:
        A dict with ``total_urls_analyzed``, ``parameter_usage``,
        ``directory_depth``, ``protocols`` and ``subdomains``; an empty dict when
        there is nothing to analyze or the analysis fails (the failure is logged).
    """
    if not urls:
        return {}
    try:
        loop = asyncio.get_running_loop()
        url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls))
        if url_df is None or url_df.empty:
            return {}
        total = len(url_df)

        # Query param analysis
        if 'query' in url_df.columns:
            has_query = url_df['query'].notna() & (url_df['query'] != '')
        else:
            has_query = pd.Series(False, index=url_df.index)
        param_count = int(has_query.sum())
        param_percentage = round((param_count / total) * 100, 2) if total > 0 else 0

        # Parameter names, counted once per distinct query string
        param_frequency = {}
        if param_count > 0:
            all_params = []
            for q in url_df.loc[has_query, 'query'].dropna().unique():
                for pair in str(q).split('&'):
                    all_params.append(pair.split('=')[0])
            param_frequency = dict(Counter(all_params).most_common(10))

        # Directory depth analysis: depth is the index of the first missing dir_N column
        dir_cols = [c for c in url_df.columns if c.startswith('dir_')]

        def count_depth(row):
            for i, col in enumerate(dir_cols):
                val = row[col]
                if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '':
                    return i
            return len(dir_cols)

        depths = url_df.apply(count_depth, axis=1)
        avg_depth = round(float(depths.mean()), 1) if not depths.empty else 0
        max_depth = int(depths.max()) if not depths.empty else 0
        depth_distribution = depths.value_counts().sort_index().head(10).to_dict()
        depth_distribution = {str(k): int(v) for k, v in depth_distribution.items()}

        # Protocol consistency
        schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {}

        # Subdomain analysis
        unique_subdomains = 0
        primary_domain = ""
        if 'netloc' in url_df.columns:
            netloc_counts = url_df['netloc'].value_counts()
            # value_counts() has one row per distinct host, so its length is the
            # host count. Calling nunique() on it would count distinct
            # *frequencies* instead (two hosts with 5 URLs each would report 1).
            unique_subdomains = int(len(netloc_counts))
            if not netloc_counts.empty:
                primary_domain = netloc_counts.index[0]

        return {
            "total_urls_analyzed": total,
            "parameter_usage": {
                "urls_with_params": param_count,
                "percentage_with_params": param_percentage,
                "top_parameters": param_frequency
            },
            "directory_depth": {
                "average_depth": avg_depth,
                "max_depth": max_depth,
                "distribution": depth_distribution
            },
            "protocols": {str(k): int(v) for k, v in schemes.items()},
            "subdomains": {
                "primary": primary_domain,
                "unique_count": unique_subdomains
            }
        }
    except Exception as e:
        self.logger.warning(f"URL structure analysis failed: {e}")
        return {}
async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
    """
    Perform a shallow crawl (no link following) and extract themes via word frequency.

    The crawl output is written to a unique temporary ``.jsonl`` file that is
    removed afterwards, so concurrent audits do not share a file.

    Args:
        url_list: Pages to crawl. The crawl is capped at 15 pages with a 30s
            download timeout per page.

    Returns:
        ``{"success": True, "themes": [...], "page_count": int,
        "avg_word_count": float, "timestamp": ...}`` or
        ``{"success": False, "error": <message>}``. Never raises.
    """
    if not url_list:
        return {"success": False, "error": "No URLs provided for content audit."}
    temp_file = None
    try:
        self.logger.info(f"Auditing content for {len(url_list)} URLs")
        # Reserve a unique file name; the crawler writes to it by path.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name
        # advertools crawl is blocking
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=url_list,
            output_file=temp_file,
            follow_links=False,
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 15,  # Guardrail: Max 15 pages
                'DOWNLOAD_TIMEOUT': 30  # Guardrail: 30s timeout per page
            }
        ))
        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Crawl failed to generate output or output is empty."}
        crawl_df = pd.read_json(temp_file, lines=True)

        # Extract themes using word frequency
        text_columns = [col for col in ['body_text', 'h1', 'h2', 'title'] if col in crawl_df.columns]
        if not text_columns:
            return {"success": False, "error": "No text content found to analyze."}
        # astype(str) keeps join() from failing on non-string cells (e.g. numeric titles).
        text_cells = crawl_df[text_columns].fillna("").astype(str).values.flatten()
        # Multi-valued crawl fields (several h2s on one page, ...) are joined with
        # "@@", as this file also assumes when splitting 'links_url'. Split them so
        # adjacent headings do not fuse into one bogus token.
        all_text = " ".join(text_cells).replace("@@", " ")
        if not all_text.strip():
            return {"success": False, "error": "Extracted text is empty."}
        word_freq = await loop.run_in_executor(
            None, lambda: adv.word_frequency([all_text], rm_stopwords=True)
        )
        top_themes = word_freq.head(20).to_dict(orient='records')

        # Additional metric: average word count of the page body
        avg_word_count = 0.0
        if 'body_text' in crawl_df.columns:
            word_counts = crawl_df['body_text'].fillna("").astype(str).str.split().str.len()
            mean_words = word_counts.mean()
            if not pd.isna(mean_words):
                avg_word_count = float(mean_words)

        return {
            "success": True,
            "themes": top_themes,
            "page_count": len(crawl_df),
            "avg_word_count": round(avg_word_count, 1),
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        self.logger.error(f"Failed to audit content: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]:
    """
    Crawl pages with link following and summarise link, redirect, image and page health.

    Args:
        url_list: Seed URLs for ``adv.crawl`` (max 50 pages, depth 3).
        site_domain: Optional host name. When given it restricts the crawl via
            ``allowed_domains`` and, regex-escaped, classifies links as internal.

    Returns:
        ``{"success": True, "page_count": ..., "link_health": {...},
        "redirect_audit": {...}, "image_seo": {...}, ...}`` or
        ``{"success": False, "error": <message>}``. A failing sub-analysis is
        reported as ``{"error": ...}`` under its own key instead of aborting.
    """
    if not url_list:
        return {"success": False, "error": "No URLs provided for site structure analysis."}
    temp_file = None
    try:
        self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}")
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=url_list,
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[site_domain] if site_domain else None,
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 50,
                'DOWNLOAD_TIMEOUT': 30,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
                'DEPTH_LIMIT': 3,
            }
        ))
        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Site structure crawl produced no output."}
        crawl_df = pd.read_json(temp_file, lines=True)
        page_count = len(crawl_df)
        result = {"success": True, "page_count": page_count}

        # --- Link Health via crawlytics ---
        try:
            # site_domain is a literal host, not a pattern: escape it so "." does
            # not match arbitrary characters (e.g. "example.com" vs "exampleXcom").
            internal_regex = re.escape(site_domain) if site_domain else None
            link_df = adv.crawlytics.links(crawl_df, internal_url_regex=internal_regex)
            if link_df is not None and not link_df.empty:
                total_links = len(link_df)
                has_internal = 'internal' in link_df.columns
                internal_links = int(link_df['internal'].sum()) if has_internal else 0
                external_links = total_links - internal_links
                nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_df.columns else 0
                # Count links per page (rows are grouped by their index label)
                links_per_page = link_df.groupby(level=0).size()
                avg_links_per_page = round(float(links_per_page.mean()), 1) if not links_per_page.empty else 0
                # Most common anchor words (internal links only, words longer than 2 chars)
                anchor_words = []
                if 'text' in link_df.columns and has_internal:
                    internal_anchors = link_df.loc[link_df['internal'].eq(True), 'text'].dropna()
                    for anchor in internal_anchors:
                        if isinstance(anchor, str) and anchor.strip():
                            anchor_words.extend(w.strip() for w in anchor.split() if len(w.strip()) > 2)
                top_anchors = dict(Counter(anchor_words).most_common(15)) if anchor_words else {}
                result["link_health"] = {
                    "total_links_found": total_links,
                    "internal_link_count": internal_links,
                    "external_link_count": external_links,
                    "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0,
                    "nofollow_link_count": nofollow_links,
                    "avg_links_per_page": avg_links_per_page,
                    "top_anchor_words": top_anchors
                }
            else:
                result["link_health"] = {"error": "No links found in crawl data"}
        except Exception as e:
            self.logger.warning(f"Link analysis failed: {e}")
            result["link_health"] = {"error": str(e)}

        # --- Redirect Chain Audit via crawlytics ---
        try:
            redirect_df = adv.crawlytics.redirects(crawl_df)
            if redirect_df is not None and not redirect_df.empty:
                has_times = 'redirect_times' in redirect_df.columns
                total_redirects = len(redirect_df)
                # NOTE(review): this counts distinct redirect_times values, which is
                # presumably not the number of distinct chains — confirm the intended
                # metric against the crawlytics.redirects output before relying on it.
                redirect_chains = redirect_df['redirect_times'].nunique() if has_times else 0
                redirect_statuses = redirect_df['status'].value_counts().to_dict() if 'status' in redirect_df.columns else {}
                multi_hop = redirect_df[redirect_df['redirect_times'] > 1] if has_times else pd.DataFrame()
                result["redirect_audit"] = {
                    "total_redirects": int(total_redirects),
                    "unique_chains": int(redirect_chains),
                    "status_distribution": {str(k): int(v) for k, v in redirect_statuses.items()},
                    "multi_hop_chains": int(len(multi_hop)),
                    "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else []
                }
            else:
                result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"}
        except Exception as e:
            self.logger.warning(f"Redirect analysis failed: {e}")
            result["redirect_audit"] = {"error": str(e)}

        # --- Image SEO overview via crawlytics ---
        try:
            img_df = adv.crawlytics.images(crawl_df)
            if img_df is not None and not img_df.empty:
                total_images = len(img_df)
                missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0
                alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1) if total_images > 0 else 0
                result["image_seo"] = {
                    "total_images": total_images,
                    "missing_alt_count": missing_alt,
                    "alt_coverage_percentage": alt_coverage
                }
            else:
                # Always emit the key, like link_health / redirect_audit do.
                result["image_seo"] = {"total_images": 0, "note": "No images detected"}
        except Exception as e:
            self.logger.warning(f"Image analysis failed: {e}")
            result["image_seo"] = {"error": str(e)}

        # --- Page-level metrics ---
        if 'status' in crawl_df.columns:
            status_dist = crawl_df['status'].value_counts().to_dict()
            result["page_status"] = {str(k): int(v) for k, v in status_dist.items()}
        if 'title' in crawl_df.columns:
            result["missing_titles"] = int(crawl_df['title'].isna().sum())
        if 'meta_desc' in crawl_df.columns:
            result["missing_descriptions"] = int(crawl_df['meta_desc'].isna().sum())
        result["timestamp"] = datetime.utcnow().isoformat()
        return result
    except Exception as e:
        self.logger.error(f"Failed to analyze site structure: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]:
    """
    Fetch and analyze a site's robots.txt for compliance issues.

    The file is loaded with ``adv.robotstxt_to_df``; if that fails or yields a
    frame without recognisable directive/value columns, the manual parser
    (``_parse_robots_txt_manual``) is used instead.

    Args:
        website_url: Any URL on the site, or a bare host such as ``example.com``
            (``https://`` is assumed when no scheme is present).

    Returns:
        A dict with ``success``, ``url``, ``accessible``, ``total_directives``,
        ``user_agents_found``, ``has_sitemap_directive``, ``sitemap_urls``,
        ``has_crawl_delay``, ``disallow_rules``, ``issues`` and a 0-100
        ``compliance_score`` (critical -30, warning -15, info -5). On an
        unexpected error: ``{"success": False, "error": ..., "url": ...}``.
    """
    robots_url = website_url  # reported in the error payload if URL building fails
    try:
        self.logger.info(f"Analyzing robots.txt for {website_url}")
        # urlparse only fills netloc when a scheme is present, so a bare host
        # would otherwise produce the unusable URL ":///robots.txt".
        target = website_url.strip()
        if "://" not in target:
            target = f"https://{target}"
        parsed = urlparse(target)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        robots_url = f"{base_url}/robots.txt"
        result = {
            "success": True,
            "url": robots_url,
            "accessible": True,
            "total_directives": 0,
            "user_agents_found": [],
            "has_sitemap_directive": False,
            "sitemap_urls": [],
            "has_crawl_delay": False,
            "disallow_rules": [],
            "issues": [],
            "compliance_score": 100,
        }

        # 'rule'/'value' come from the manual parser; 'directive'/'content' are
        # the column names advertools documents for robotstxt_to_df.
        rule_candidates = ('rule', 'directive')
        value_candidates = ('value', 'directive_value', 'content')

        def first_present(frame, candidates):
            return next((name for name in candidates if name in frame.columns), None)

        def is_usable(frame) -> bool:
            return (
                frame is not None
                and not frame.empty
                and first_present(frame, rule_candidates) is not None
                and first_present(frame, value_candidates) is not None
            )

        loop = asyncio.get_running_loop()
        try:
            robots_df = await loop.run_in_executor(
                None, lambda: adv.robotstxt_to_df(robots_url)
            )
            if not is_usable(robots_df):
                raise ValueError("Empty or unrecognised result from robotstxt_to_df")
        except Exception as adv_err:
            self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}")
            robots_df = await loop.run_in_executor(
                None, lambda: self._parse_robots_txt_manual(robots_url)
            )
        if not is_usable(robots_df):
            result["success"] = False
            result["error"] = "Could not fetch or parse robots.txt"
            result["accessible"] = False
            return result

        result["total_directives"] = len(robots_df)
        rule_col = first_present(robots_df, rule_candidates)
        value_col = first_present(robots_df, value_candidates)
        has_ua_col = 'user_agent' in robots_df.columns

        # Normalise both frame layouts into (agents, rule, value) records. Frames
        # without a user_agent column carry "User-agent" as ordinary rows, so the
        # current group of agents is tracked while walking the rows in order.
        records = []
        agents_seen = []
        group_agents = ["*"]
        in_agent_run = False
        for _, row in robots_df.iterrows():
            raw_rule = row[rule_col]
            raw_value = row[value_col]
            if pd.isna(raw_rule):
                continue
            rule = str(raw_rule).strip().lower()
            value = "" if pd.isna(raw_value) else str(raw_value).strip()
            if rule == 'user-agent':
                if not in_agent_run:
                    # Consecutive User-agent lines form one group sharing the rules below.
                    group_agents = []
                    in_agent_run = True
                if value:
                    group_agents.append(value)
                    if value not in agents_seen:
                        agents_seen.append(value)
                continue
            in_agent_run = False
            if has_ua_col and not pd.isna(row['user_agent']):
                agents = [str(row['user_agent'])]
            else:
                agents = group_agents or ["*"]
            records.append((agents, rule, value))

        if has_ua_col:
            result["user_agents_found"] = robots_df['user_agent'].dropna().unique().tolist()
        else:
            result["user_agents_found"] = agents_seen

        rules_present = {rule for _, rule, _ in records}
        result["has_sitemap_directive"] = 'sitemap' in rules_present
        result["has_crawl_delay"] = 'crawl-delay' in rules_present

        has_disallow_all = False
        for agents, rule, value in records:
            if rule == 'disallow' and value:
                for ua in agents:
                    result["disallow_rules"].append({"user_agent": ua, "path": value})
                # Only a blanket rule for the wildcard agent blocks everyone;
                # "Disallow: /" for one named bot is not a site-wide block.
                if value == '/' and '*' in agents:
                    has_disallow_all = True
            elif rule == 'sitemap' and value and value not in result["sitemap_urls"]:
                result["sitemap_urls"].append(value)

        if has_disallow_all:
            result["issues"].append({
                "severity": "critical", "code": "DISALLOW_ALL",
                "detail": "robots.txt disallows all user agents from all paths (Disallow: /)"
            })
        if not result["has_sitemap_directive"]:
            result["issues"].append({
                "severity": "warning", "code": "NO_SITEMAP",
                "detail": "No Sitemap directive found — search engines may miss pages"
            })
        if not result["has_crawl_delay"]:
            result["issues"].append({
                "severity": "info", "code": "NO_CRAWL_DELAY",
                "detail": "No Crawl-delay directive set — not critical for most sites"
            })

        penalties = {"critical": 30, "warning": 15, "info": 5}
        for issue in result["issues"]:
            result["compliance_score"] -= penalties.get(issue["severity"], 0)
        result["compliance_score"] = max(result["compliance_score"], 0)
        return result
    except Exception as e:
        self.logger.error(f"Robots.txt analysis failed: {e}")
        return {"success": False, "error": str(e), "url": robots_url}
def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame:
    """
    Fallback: fetch robots.txt with urllib and parse it line by line.

    Args:
        url: Full URL of the robots.txt file.

    Returns:
        A DataFrame with one row per directive and the columns ``user_agent``
        (the most recent ``User-agent`` value, ``"*"`` before any is seen),
        ``rule`` and ``value``. An empty DataFrame is returned when the fetch
        fails (the error is logged) or no directive is found.
    """
    records = []
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            # utf-8-sig drops a leading byte-order mark, which would otherwise
            # glue itself to the first directive name and hide it.
            content = resp.read().decode("utf-8-sig", errors="replace")
        current_ua = "*"
        for raw_line in content.splitlines():
            # "#" starts a comment anywhere on a line, not only at the beginning.
            line = raw_line.split("#", 1)[0].strip()
            if not line:
                continue
            if line.lower().startswith("user-agent"):
                _, sep, agent = line.partition(":")
                current_ua = agent.strip() if sep else "*"
                continue
            directive, sep, value = line.partition(":")
            if not sep:
                continue
            records.append({
                "user_agent": current_ua,
                "rule": directive.strip(),
                "value": value.strip(),
            })
    except Exception as e:
        self.logger.warning(f"Manual robots.txt fetch failed: {e}")
    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records)
async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]:
    """
    Compare the sitemap inventory against a small live crawl to estimate crawl budget use.

    Args:
        sitemap_url: Sitemap URL loaded with ``adv.sitemap_to_df``.
        site_domain: Host name or full URL of the site. A full URL is used as
            the crawl start URL and reduced to its host for ``allowed_domains``.

    Returns:
        ``{"success": True, "sitemap_total_urls": ..., "pages_crawled": ...,
        "crawl_coverage_percentage": ..., "status_distribution": ...,
        "wasted_crawl_requests": ..., "waste_percentage": ...,
        "depth_distribution": ..., "urls_with_parameters": ...,
        "optimization_score": ...}`` or ``{"success": False, "error": ...}``.
        Responses outside the 2xx range count as wasted requests.
    """
    if not sitemap_url or not site_domain:
        return {"success": False, "error": "Both sitemap_url and site_domain are required."}
    temp_file = None
    try:
        self.logger.info(f"Analyzing crawl budget for {site_domain}")
        loop = asyncio.get_running_loop()
        sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
        sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0

        # The domain filter needs a bare host name; a value such as
        # "https://example.com" is only valid as the start URL.
        if site_domain.startswith("http"):
            start_url = site_domain
            crawl_domain = urlparse(site_domain).netloc or site_domain
        else:
            start_url = f"https://{site_domain}"
            crawl_domain = site_domain.split("/", 1)[0]

        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=[start_url],
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[crawl_domain],
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 30,
                'DOWNLOAD_TIMEOUT': 15,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
                'DEPTH_LIMIT': 2,
            }
        ))
        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Crawl produced no output"}
        crawl_df = pd.read_json(temp_file, lines=True)
        crawled_count = len(crawl_df)

        status_dist = {}
        wasted = 0
        if 'status' in crawl_df.columns:
            # Coerce first: a column containing NaN is read as float, and
            # int("200.0") on the stringified key would raise.
            codes = pd.to_numeric(crawl_df['status'], errors='coerce').dropna().astype(int)
            status_dist = {str(k): int(v) for k, v in codes.value_counts().to_dict().items()}
            wasted = int(((codes >= 300) | (codes < 200)).sum())

        budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3)
        waste_ratio = round(wasted / max(crawled_count, 1), 3)

        depth_dist = {}
        if 'depth' in crawl_df.columns:
            raw_depths = crawl_df['depth'].value_counts().sort_index().to_dict()
            depth_dist = {str(k): int(v) for k, v in raw_depths.items()}

        param_count = 0
        url_col = 'url' if 'url' in crawl_df.columns else 'response_url' if 'response_url' in crawl_df.columns else None
        if url_col:
            # regex=False: "?" on its own is an invalid regular expression, so the
            # default regex matching raised and failed the whole analysis.
            param_count = int(crawl_df[url_col].astype(str).str.contains('?', regex=False).sum())

        # NOTE(review): a higher sitemap coverage ratio lowers this score —
        # confirm that penalising coverage is the intended formula.
        optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1))
        return {
            "success": True,
            "sitemap_total_urls": sitemap_total,
            "pages_crawled": crawled_count,
            "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1),
            "status_distribution": status_dist,
            "wasted_crawl_requests": int(wasted),
            "waste_percentage": round(waste_ratio * 100, 1),
            "depth_distribution": depth_dist,
            "urls_with_parameters": int(param_count),
            "optimization_score": optimization_score,
        }
    except Exception as e:
        self.logger.error(f"Crawl budget analysis failed: {e}")
        return {"success": False, "error": str(e)}
    finally:
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]:
    """
    Compare two sitemaps for competitive content gap analysis.

    Both sitemaps are downloaded concurrently in the default executor. The
    comparison covers URL counts, directory "pillars" (leading path segments),
    which pillars are shared or unique, and how many URLs changed recently.

    Args:
        sitemap_a: URL of the first sitemap.
        sitemap_b: URL of the second sitemap.

    Returns:
        ``{"success": True, "sitemap_a": ..., "sitemap_b": ..., "url_count_diff": ...,
        "ratio": ..., "pillars_a": ..., "pillars_b": ..., "shared_pillars": ...,
        "unique_to_a": ..., "unique_to_b": ..., "freshness_comparison": ...,
        "overlap_score": ...}``. If either sitemap is empty only the counts are
        filled in. On error: ``{"success": False, "error": ...}``.
    """
    if not sitemap_a or not sitemap_b:
        return {"success": False, "error": "Two sitemap URLs are required for comparison."}
    try:
        self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}")
        loop = asyncio.get_running_loop()
        # The two downloads are independent, so fetch them in parallel.
        df_a, df_b = await asyncio.gather(
            loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_a)),
            loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_b)),
        )
        total_a = len(df_a) if df_a is not None and not df_a.empty else 0
        total_b = len(df_b) if df_b is not None and not df_b.empty else 0
        result = {
            "success": True,
            "sitemap_a": {"url": sitemap_a, "total_urls": total_a},
            "sitemap_b": {"url": sitemap_b, "total_urls": total_b},
            "url_count_diff": total_a - total_b,
            "ratio": round(total_a / max(total_b, 1), 2),
            "pillars_a": {},
            "pillars_b": {},
            "shared_pillars": [],
            "unique_to_a": [],
            "unique_to_b": [],
            "freshness_comparison": {},
            "overlap_score": 0,
        }
        if total_a == 0 or total_b == 0:
            return result

        def extract_pillars(df: pd.DataFrame) -> Dict[str, int]:
            """Count URLs per leading directory path; fall back to urlparse on failure."""
            if 'loc' not in df.columns:
                return {}
            pillars = {}
            try:
                url_df = adv.url_to_df(df['loc'])
                if url_df is not None and not url_df.empty:
                    dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
                    if dir_cols:
                        pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
                        for col in dir_cols[1:3]:
                            segment = url_df[col].astype(str)
                            present = url_df[col].notna() & (segment != 'nan') & (segment.str.strip() != '')
                            # Append "/<dir>" only where the level exists, so
                            # shallow URLs do not yield keys such as "blog//".
                            pillar_series = pillar_series + ("/" + segment).where(present, "")
                        pillars = pillar_series.value_counts().head(20).to_dict()
            except Exception as pillar_err:
                self.logger.warning(f"url_to_df pillar extraction failed, using fallback: {pillar_err}")
            if not pillars:
                # Fallback: first path segment only.
                counts = Counter()
                for url in df['loc'].dropna():
                    parts = urlparse(url).path.strip('/').split('/')
                    counts[parts[0] if parts and parts[0] else "home"] += 1
                pillars = dict(counts.most_common(20))
            return pillars

        pillars_a = await loop.run_in_executor(None, extract_pillars, df_a)
        pillars_b = await loop.run_in_executor(None, extract_pillars, df_b)
        result["pillars_a"] = pillars_a
        result["pillars_b"] = pillars_b

        set_a = set(pillars_a)
        set_b = set(pillars_b)
        shared = set_a & set_b
        result["shared_pillars"] = sorted(shared)
        result["unique_to_a"] = sorted(set_a - set_b)
        result["unique_to_b"] = sorted(set_b - set_a)
        # Overlap: shared pillars as a percentage of all distinct pillars
        total_keys = max(len(set_a | set_b), 1)
        result["overlap_score"] = round((len(shared) / total_keys) * 100, 1)

        def compute_freshness_stats(df: pd.DataFrame) -> dict:
            """Count dated URLs and those modified within the last 30 days."""
            stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0}
            if 'lastmod' in df.columns:
                lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna()
                if not lm.empty:
                    stats["has_lastmod"] = True
                    stats["total_with_dates"] = int(len(lm))
                    stats["recent_30d"] = int((lm > (datetime.now(lm.dt.tz) - timedelta(days=30))).sum())
            return stats

        result["freshness_comparison"] = {
            "a": compute_freshness_stats(df_a),
            "b": compute_freshness_stats(df_b),
        }
        return result
    except Exception as e:
        self.logger.error(f"Sitemap comparison failed: {e}")
        return {"success": False, "error": str(e)}
async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]:
    """
    Diff two crawl analysis result dicts (``result_b`` minus ``result_a``).

    Reports the change in ``page_count``, per-status-code changes from
    ``page_status``, and numeric changes inside ``link_health`` and
    ``redirect_audit`` (keys prefixed with ``link_`` / ``redirect_``). Entries
    whose value is non-numeric on either side, and unchanged entries, are left
    out. ``new_issues`` and ``resolved_issues`` are always returned empty.
    On error the exception is logged and ``{"success": False, "error": ...}``
    is returned.
    """
    try:
        def numeric_deltas(before: dict, after: dict, prefix: str) -> dict:
            # A key missing on one side is treated as 0 on that side.
            deltas = {}
            for key in set(before.keys()).union(after.keys()):
                old_val = before.get(key, 0)
                new_val = after.get(key, 0)
                if not isinstance(old_val, (int, float)):
                    continue
                if not isinstance(new_val, (int, float)):
                    continue
                delta = round(new_val - old_val, 2)
                if delta != 0:
                    deltas[f"{prefix}_{key}"] = delta
            return deltas

        page_delta = result_b.get("page_count", 0) - result_a.get("page_count", 0)

        status_before = result_a.get("page_status", {})
        status_after = result_b.get("page_status", {})
        status_changes = {}
        for code in sorted(set(status_before.keys()).union(status_after.keys())):
            delta = status_after.get(code, 0) - status_before.get(code, 0)
            if delta != 0:
                status_changes[code] = delta

        link_changes = numeric_deltas(
            result_a.get("link_health", {}), result_b.get("link_health", {}), "link"
        )
        redirect_changes = numeric_deltas(
            result_a.get("redirect_audit", {}), result_b.get("redirect_audit", {}), "redirect"
        )

        return {
            "success": True,
            "page_count_change": page_delta,
            "status_distribution_changes": status_changes,
            "link_health_changes": link_changes,
            "redirect_changes": redirect_changes,
            "new_issues": [],
            "resolved_issues": [],
        }
    except Exception as e:
        self.logger.error(f"Crawl comparison failed: {e}")
        return {"success": False, "error": str(e)}
async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
    """
    Crawl pages (no link following) and summarise outgoing links and social media presence.

    The crawl output goes to a unique temporary ``.jsonl`` file that is removed
    afterwards.

    Args:
        url_list: Pages to crawl (capped at 10 pages, 30s timeout per page).

    Returns:
        ``{"success": True, "social_links": [...], "link_stats":
        {"total_links_found": int, "unique_domains": int}, "timestamp": ...}``
        or ``{"success": False, "error": <message>}``. A link counts as social
        when its host is one of the known social domains or a subdomain of one
        (``www.``, ``m.``, ...).
    """
    if not url_list:
        return {"success": False, "error": "No URLs provided for link extraction."}
    temp_file = None
    try:
        self.logger.info(f"Extracting communication style for {len(url_list)} URLs")
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=url_list,
            output_file=temp_file,
            follow_links=False,
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 10,
                'DOWNLOAD_TIMEOUT': 30
            }
        ))
        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Link extraction crawl failed."}
        crawl_df = pd.read_json(temp_file, lines=True)

        # Collect every link; string cells hold several links separated by "@@"
        all_links = []
        if 'links_url' in crawl_df.columns:
            for links in crawl_df['links_url'].dropna():
                if isinstance(links, str):
                    candidates = links.split("@@")
                elif isinstance(links, list):
                    candidates = links
                else:
                    continue
                # Skip blanks and non-string entries that cannot be parsed as URLs.
                all_links.extend(
                    link.strip() for link in candidates
                    if isinstance(link, str) and link.strip()
                )
        if not all_links:
            return {
                "success": True,
                "social_links": [],
                "link_stats": {"total_links_found": 0, "unique_domains": 0},
                "timestamp": datetime.utcnow().isoformat()
            }

        # Analyze links (url_to_df is blocking, so keep it off the event loop)
        link_df = await loop.run_in_executor(None, lambda: adv.url_to_df(all_links))
        social_domains = ['twitter.com', 'x.com', 'linkedin.com', 'facebook.com', 'instagram.com', 'youtube.com', 'github.com']
        social_links = []
        unique_domains = 0
        if link_df is not None and not link_df.empty and 'netloc' in link_df.columns:
            hosts = link_df['netloc'].fillna("").astype(str).str.lower()

            def is_social(host: str) -> bool:
                # An exact comparison missed "www.linkedin.com", "m.facebook.com", etc.
                return any(host == d or host.endswith("." + d) for d in social_domains)

            social_mask = hosts.apply(is_social)
            if 'url' in link_df.columns:
                social_links = link_df.loc[social_mask, 'url'].unique().tolist()
            unique_domains = int(link_df['netloc'].nunique())

        return {
            "success": True,
            "social_links": social_links,
            "link_stats": {
                "total_links_found": len(all_links),
                "unique_domains": unique_domains
            },
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        self.logger.error(f"Failed to extract communication style: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")