chore: bulk commit of local changes across blog writer, SEO dashboard, scheduler, docs-site, and frontend

This commit is contained in:
ajaysi
2026-06-05 12:40:04 +05:30
parent b894bc0abb
commit e54aaa7a3e
74 changed files with 5667 additions and 996 deletions

View File

@@ -1,12 +1,18 @@
import advertools as adv
import pandas as pd
import asyncio
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, Tuple
from datetime import datetime, timedelta
from loguru import logger
import json
import os
import tempfile
from urllib.parse import urlparse
from collections import Counter
import urllib.request
import urllib.error
import socket
import re
class AdvertoolsService:
"""
@@ -19,51 +25,58 @@ class AdvertoolsService:
async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
"""
Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.
Analyzes a website's sitemap to extract metrics on publishing velocity, freshness,
URL structure patterns, and topic distribution.
"""
try:
self.logger.info(f"Analyzing sitemap: {sitemap_url}")
# advertools sitemap_to_df is blocking, run in executor
loop = asyncio.get_event_loop()
df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
if df is None or df.empty:
return {"success": False, "error": "Sitemap is empty or could not be parsed."}
# Convert lastmod to datetime
if 'lastmod' in df.columns:
df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
total_urls = len(df)
# Handle potential empty datetime columns
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
now = datetime.now(df['lastmod'].dt.tz)
thirty_days_ago = now - timedelta(days=30)
recent_urls = df[df['lastmod'] > thirty_days_ago]
six_months_ago = now - timedelta(days=180)
stale_urls = df[df['lastmod'] < six_months_ago]
publishing_velocity = len(recent_urls) / 4.0 # URLs per week
stale_count = len(stale_urls)
else:
publishing_velocity = 0
stale_count = 0
# --- Content Freshness Scoring ---
freshness = self._compute_freshness(df)
# Enhanced Content Pillars (Top folder patterns - 3 levels deep)
def extract_hierarchy(url: str):
try:
parts = urlparse(url).path.strip('/').split('/')
if not parts or not parts[0]: return "home"
return "/".join(parts[:2]) # Capture top 2 segments
except:
return "other"
# --- URL Structure Analysis ---
url_structure = {}
if 'loc' in df.columns:
url_structure = await self._analyze_url_structure(df['loc'].tolist())
# --- Content Pillars via url_to_df ---
pillars = {}
url_df = None
try:
url_df = adv.url_to_df(df['loc'])
if url_df is not None and not url_df.empty:
dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
if dir_cols:
pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
for col in dir_cols[1:3]:
mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
pillar_series = pillar_series + "/" + url_df[col].where(mask, "")
pillars = pillar_series.value_counts().head(15).to_dict()
except Exception:
fallback_pillars = {}
if 'loc' in df.columns:
def extract_hierarchy(url: str):
try:
parts = urlparse(url).path.strip('/').split('/')
if not parts or not parts[0]: return "home"
return "/".join(parts[:2])
except:
return "other"
fallback_pillars = df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict()
pillars = fallback_pillars
df['pillar'] = df['loc'].apply(extract_hierarchy)
pillars = df['pillar'].value_counts().head(15).to_dict()
# Return a sample of URLs for auditing (top 15 most recent if available)
# Sample URLs for auditing (top 15 most recent)
audit_urls = []
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
@@ -74,10 +87,14 @@ class AdvertoolsService:
"success": True,
"metrics": {
"total_urls": total_urls,
"publishing_velocity": round(publishing_velocity, 2),
"stale_content_count": stale_count,
"stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
"publishing_velocity": freshness.get("publishing_velocity"),
"stale_content_count": freshness.get("stale_count"),
"stale_content_percentage": freshness.get("stale_percentage"),
"freshness_score": freshness.get("freshness_score"),
"publishing_recency": freshness.get("publishing_recency"),
"publishing_trend": freshness.get("publishing_trend"),
"top_pillars": pillars,
"url_structure": url_structure,
"audit_sample_urls": audit_urls
},
"timestamp": datetime.utcnow().isoformat()
@@ -86,6 +103,146 @@ class AdvertoolsService:
self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
return {"success": False, "error": str(e)}
def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]:
    """
    Summarise how fresh a sitemap is, based on its ``lastmod`` column.

    Args:
        df: Sitemap frame. ``lastmod`` is expected to already be a
            datetime column (its ``.dt.tz`` is used to build "now").

    Returns:
        Dict with ``publishing_velocity`` (URLs per week over the last
        90 days), ``stale_count`` / ``stale_percentage`` (older than 180
        days), ``freshness_score`` (0-100), ``publishing_recency``
        (counts for 1/7/30/90 day windows) and ``publishing_trend``.
        Zeroed defaults with trend ``"unknown"`` are returned when no
        usable ``lastmod`` values exist.
    """
    defaults = {
        "publishing_velocity": 0,
        "stale_count": 0,
        "stale_percentage": 0,
        "freshness_score": 0,
        "publishing_recency": {},
        "publishing_trend": "unknown"
    }

    if 'lastmod' not in df.columns:
        return defaults
    stamps = df['lastmod']
    if stamps.isna().all():
        return defaults
    dated = stamps.dropna()
    if dated.empty:
        return defaults

    # One reference instant for every window so the buckets are consistent.
    now = datetime.now(dated.dt.tz)

    def cutoff(days: int):
        return now - timedelta(days=days)

    def newer_than(days: int) -> int:
        # Rows without a date compare False and are therefore not counted.
        return int((stamps > cutoff(days)).sum())

    total_urls = len(df)
    last_30 = newer_than(30)
    last_90 = newer_than(90)

    stale_count = int((stamps < cutoff(180)).sum())
    if total_urls > 0:
        stale_percentage = round((stale_count / total_urls) * 100, 2)
    else:
        stale_percentage = 0

    # 90 days is treated as 13 weeks.
    publishing_velocity = round(last_90 / 13.0, 2) if last_90 else 0

    # Weighted blend: 50% non-stale share, 30% recent share, 20% velocity
    # (velocity saturates at 10 URLs/week).
    non_stale_ratio = 1.0 - (stale_percentage / 100.0)
    recency_ratio = last_30 / max(total_urls, 1)
    velocity_score = min(publishing_velocity / 10.0, 1.0)
    freshness_score = round((non_stale_ratio * 50 + recency_ratio * 30 + velocity_score * 20), 1)

    # Trend: last 30 days against the 30 days before that, with a 10% dead band.
    in_prior_window = (stamps <= cutoff(30)) & (stamps > cutoff(60))
    prior_30 = int(in_prior_window.sum())
    if last_30 > prior_30 * 1.1:
        publishing_trend = "increasing"
    elif last_30 < prior_30 * 0.9:
        publishing_trend = "decreasing"
    else:
        publishing_trend = "stable"

    return {
        "publishing_velocity": publishing_velocity,
        "stale_count": stale_count,
        "stale_percentage": stale_percentage,
        "freshness_score": freshness_score,
        "publishing_recency": {
            "last_24h": newer_than(1),
            "last_7d": newer_than(7),
            "last_30d": last_30,
            "last_90d": last_90,
        },
        "publishing_trend": publishing_trend
    }
async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]:
    """
    Analyze URL patterns for parameter bloat, directory depth, and host usage.

    Args:
        urls: Absolute URLs (typically the sitemap ``loc`` column).

    Returns:
        Dict with ``total_urls_analyzed``, ``parameter_usage``,
        ``directory_depth``, ``protocols`` and ``subdomains``. An empty
        dict is returned when nothing could be parsed or the analysis
        fails (the failure is logged as a warning).
    """
    try:
        loop = asyncio.get_event_loop()
        # url_to_df is synchronous; keep it off the event loop.
        url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls))
        if url_df is None or url_df.empty:
            return {}

        total = len(url_df)

        # --- Query parameter usage ---
        if 'query' in url_df.columns:
            has_query = url_df['query'].notna() & (url_df['query'] != '')
        else:
            # No query column at all: treat every URL as parameter-free.
            has_query = pd.Series(False, index=url_df.index)
        param_count = int(has_query.sum())
        param_percentage = round((param_count / total) * 100, 2) if total > 0 else 0

        param_frequency = {}
        if param_count > 0:
            # Frequencies are counted over distinct query strings.
            key_counts = Counter(
                pair.split('=', 1)[0]
                for query in url_df.loc[has_query, 'query'].unique()
                for pair in str(query).split('&')
                if pair  # "a=1&&b=2" would otherwise yield an empty key
            )
            param_frequency = dict(key_counts.most_common(10))

        # --- Directory depth: index of the first empty dir_N column ---
        dir_cols = [c for c in url_df.columns if c.startswith('dir_')]

        def count_depth(row):
            for i, col in enumerate(dir_cols):
                val = row[col]
                if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '':
                    return i
            return len(dir_cols)

        depths = url_df.apply(count_depth, axis=1)
        avg_depth = round(depths.mean(), 1) if not depths.empty else 0
        max_depth = int(depths.max()) if not depths.empty else 0
        depth_distribution = {
            str(k): int(v)
            for k, v in depths.value_counts().sort_index().head(10).items()
        }

        # --- Protocol consistency ---
        schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {}

        # --- Host analysis ---
        # value_counts() has one entry per distinct host, so its length is
        # the number of hosts; nunique() on it would count distinct
        # *frequencies* instead.
        unique_subdomains = 0
        primary_domain = ""
        if 'netloc' in url_df.columns:
            netloc_counts = url_df['netloc'].value_counts()
            unique_subdomains = int(len(netloc_counts))
            if not netloc_counts.empty:
                primary_domain = netloc_counts.index[0]

        return {
            "total_urls_analyzed": total,
            "parameter_usage": {
                "urls_with_params": param_count,
                "percentage_with_params": param_percentage,
                "top_parameters": param_frequency
            },
            "directory_depth": {
                "average_depth": avg_depth,
                "max_depth": max_depth,
                "distribution": depth_distribution
            },
            "protocols": {str(k): int(v) for k, v in schemes.items()},
            "subdomains": {
                "primary": primary_domain,
                "unique_count": unique_subdomains
            }
        }
    except Exception as e:
        self.logger.warning(f"URL structure analysis failed: {e}")
        return {}
async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
"""
Performs a shallow crawl and theme analysis using word frequency.
@@ -153,6 +310,512 @@ class AdvertoolsService:
except Exception as e:
self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]:
    """
    Crawl a set of pages (following links) and summarise site structure.

    The crawl output is analysed with ``adv.crawlytics`` for link
    distribution, redirect chains and image alt-text coverage, plus
    page-level status / title / meta-description counts.

    Args:
        url_list: Seed URLs for the crawl.
        site_domain: Optional domain used both to restrict the crawl and
            to classify links as internal.

    Returns:
        On success a dict with ``success``, ``page_count``,
        ``link_health``, ``redirect_audit``, ``image_seo``, optional
        ``page_status`` / ``missing_titles`` / ``missing_descriptions``
        and ``timestamp``. Each section degrades to ``{"error": ...}``
        independently. On overall failure:
        ``{"success": False, "error": ...}``.
    """
    temp_file = None
    try:
        self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}")

        # Reserve a unique output path; the handle is closed before the crawl runs.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name

        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=url_list,
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[site_domain] if site_domain else None,
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 50,
                'DOWNLOAD_TIMEOUT': 30,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
                'DEPTH_LIMIT': 3,
            }
        ))

        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Site structure crawl produced no output."}

        crawl_df = pd.read_json(temp_file, lines=True)
        result = {"success": True, "page_count": len(crawl_df)}

        # --- Link health via crawlytics ---
        try:
            # The domain is a literal, not a pattern: escape it so "." does
            # not match arbitrary characters (e.g. "example-com.evil.io").
            internal_regex = re.escape(site_domain) if site_domain else None
            link_df = adv.crawlytics.links(crawl_df, internal_url_regex=internal_regex)
            if link_df is not None and not link_df.empty:
                total_links = len(link_df)
                has_internal = 'internal' in link_df.columns
                internal_links = int(link_df['internal'].sum()) if has_internal else 0
                external_links = total_links - internal_links
                nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_df.columns else 0

                # Rows are grouped by index level 0 to count links per page.
                links_per_page = link_df.groupby(level=0).size()
                avg_links_per_page = round(links_per_page.mean(), 1) if not links_per_page.empty else 0

                # Most common anchor words (internal links only, words > 2 chars).
                anchor_words = []
                if has_internal and 'text' in link_df.columns:
                    internal_anchors = link_df.loc[link_df['internal'].eq(True), 'text'].dropna()
                    anchor_words = [
                        word
                        for text in internal_anchors
                        if isinstance(text, str) and text.strip()
                        for word in text.split()
                        if len(word) > 2
                    ]
                top_anchors = dict(Counter(anchor_words).most_common(15)) if anchor_words else {}

                result["link_health"] = {
                    "total_links_found": total_links,
                    "internal_link_count": internal_links,
                    "external_link_count": external_links,
                    "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0,
                    "nofollow_link_count": nofollow_links,
                    "avg_links_per_page": avg_links_per_page,
                    "top_anchor_words": top_anchors
                }
            else:
                result["link_health"] = {"error": "No links found in crawl data"}
        except Exception as e:
            self.logger.warning(f"Link analysis failed: {e}")
            result["link_health"] = {"error": str(e)}

        # --- Redirect chain audit via crawlytics ---
        try:
            redirect_df = adv.crawlytics.redirects(crawl_df)
            if redirect_df is not None and not redirect_df.empty:
                has_times = 'redirect_times' in redirect_df.columns
                redirect_chains = redirect_df['redirect_times'].nunique() if has_times else 0
                redirect_statuses = redirect_df['status'].value_counts().to_dict() if 'status' in redirect_df.columns else {}
                multi_hop = redirect_df[redirect_df['redirect_times'] > 1] if has_times else pd.DataFrame()

                result["redirect_audit"] = {
                    "total_redirects": int(len(redirect_df)),
                    "unique_chains": int(redirect_chains),
                    "status_distribution": {str(k): int(v) for k, v in redirect_statuses.items()},
                    "multi_hop_chains": int(len(multi_hop)),
                    "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else []
                }
            else:
                result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"}
        except Exception as e:
            self.logger.warning(f"Redirect analysis failed: {e}")
            result["redirect_audit"] = {"error": str(e)}

        # --- Image SEO overview via crawlytics ---
        # Like the sections above, always emit the key so consumers can
        # tell "no images" and "analysis failed" apart from "not run".
        try:
            img_df = adv.crawlytics.images(crawl_df)
            if img_df is not None and not img_df.empty:
                total_images = len(img_df)
                missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0
                alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1) if total_images > 0 else 0
                result["image_seo"] = {
                    "total_images": total_images,
                    "missing_alt_count": missing_alt,
                    "alt_coverage_percentage": alt_coverage
                }
            else:
                result["image_seo"] = {"total_images": 0, "note": "No images detected"}
        except Exception as e:
            self.logger.warning(f"Image analysis failed: {e}")
            result["image_seo"] = {"error": str(e)}

        # --- Page-level metrics ---
        if 'status' in crawl_df.columns:
            status_dist = crawl_df['status'].value_counts().to_dict()
            result["page_status"] = {str(k): int(v) for k, v in status_dist.items()}

        if 'title' in crawl_df.columns:
            result["missing_titles"] = int(crawl_df['title'].isna().sum())

        if 'meta_desc' in crawl_df.columns:
            result["missing_descriptions"] = int(crawl_df['meta_desc'].isna().sum())

        result["timestamp"] = datetime.utcnow().isoformat()
        return result

    except Exception as e:
        self.logger.error(f"Failed to analyze site structure: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        # delete=False above means cleanup is our responsibility.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]:
    """
    Fetch and analyze robots.txt for compliance issues.

    Checks directives, sitemap declaration, crawl-delay, and a site-wide
    ``Disallow: /``. Parsing is attempted with ``adv.robotstxt_to_df``
    first and falls back to ``_parse_robots_txt_manual``.

    Two frame layouts are understood:
      * ``user_agent`` / ``rule`` / ``value`` columns (manual fallback);
      * ``directive`` plus ``content`` (or ``directive_value``) columns,
        where user agents appear as ``User-agent`` rows and are tracked
        per group while scanning.

    Args:
        website_url: Any URL on the site; a bare host is assumed HTTPS.

    Returns:
        Dict with ``success``, ``url``, ``accessible``,
        ``total_directives``, ``user_agents_found``, sitemap / crawl-delay
        flags, ``disallow_rules``, ``issues`` and a 0-100
        ``compliance_score``. On unexpected failure:
        ``{"success": False, "error": ..., "url": ...}``.
    """
    # Bound up front so the error payload can always report a URL.
    robots_url = website_url
    try:
        self.logger.info(f"Analyzing robots.txt for {website_url}")

        # urlparse yields an empty scheme/netloc for "example.com".
        target = website_url if "://" in website_url else f"https://{website_url}"
        parsed = urlparse(target)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

        result = {
            "success": True,
            "url": robots_url,
            "accessible": True,
            "total_directives": 0,
            "user_agents_found": [],
            "has_sitemap_directive": False,
            "sitemap_urls": [],
            "has_crawl_delay": False,
            "disallow_rules": [],
            "issues": [],
            "compliance_score": 100,
        }

        loop = asyncio.get_event_loop()

        try:
            robots_df = await loop.run_in_executor(
                None, lambda: adv.robotstxt_to_df(robots_url)
            )
            if robots_df is None or robots_df.empty:
                raise ValueError("Empty result from robotstxt_to_df")
        except Exception as adv_err:
            self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}")
            robots_df = await loop.run_in_executor(
                None, lambda: self._parse_robots_txt_manual(robots_url)
            )

        if robots_df is None or robots_df.empty:
            result["success"] = False
            result["error"] = "Could not fetch or parse robots.txt"
            result["accessible"] = False
            return result

        result["total_directives"] = len(robots_df)

        columns = robots_df.columns
        rule_col = next((c for c in ('rule', 'directive') if c in columns), None)
        # 'content' is the value column produced by advertools.
        value_col = next((c for c in ('value', 'directive_value', 'content') if c in columns), None)
        ua_series = robots_df['user_agent'] if 'user_agent' in columns else None

        if ua_series is not None:
            result["user_agents_found"] = ua_series.dropna().unique().tolist()

        has_disallow_all = False
        if rule_col and value_col:
            rules = robots_df[rule_col].astype(str).str.strip().str.lower()
            group_agents = ["*"]      # agents the current rule group applies to
            in_group_header = False   # True while reading consecutive User-agent rows
            agents_seen = []

            for pos, (rule, raw_value) in enumerate(zip(rules, robots_df[value_col])):
                # NaN must not become the literal string "nan".
                value = "" if pd.isna(raw_value) else str(raw_value).strip()

                if rule == "user-agent":
                    if not in_group_header:
                        group_agents = []
                    in_group_header = True
                    agent = value or "*"
                    group_agents.append(agent)
                    if agent not in agents_seen:
                        agents_seen.append(agent)
                    continue
                in_group_header = False

                if ua_series is not None:
                    raw_agent = ua_series.iloc[pos]
                    agents = ["*" if pd.isna(raw_agent) else str(raw_agent)]
                else:
                    agents = group_agents

                if rule == "disallow":
                    if value:
                        for agent in agents:
                            result["disallow_rules"].append({"user_agent": agent, "path": value})
                    # Only a wildcard group blocks *all* crawlers; a
                    # bot-specific "Disallow: /" is not a site-wide block.
                    if value == "/" and "*" in agents:
                        has_disallow_all = True
                elif rule == "sitemap":
                    result["has_sitemap_directive"] = True
                    if value and value not in result["sitemap_urls"]:
                        result["sitemap_urls"].append(value)
                elif rule == "crawl-delay":
                    result["has_crawl_delay"] = True

            if ua_series is None:
                result["user_agents_found"] = agents_seen

        if has_disallow_all:
            result["issues"].append({
                "severity": "critical", "code": "DISALLOW_ALL",
                "detail": "robots.txt disallows all user agents from all paths (Disallow: /)"
            })

        if not result["has_sitemap_directive"]:
            result["issues"].append({
                "severity": "warning", "code": "NO_SITEMAP",
                "detail": "No Sitemap directive found — search engines may miss pages"
            })

        if not result["has_crawl_delay"]:
            result["issues"].append({
                "severity": "info", "code": "NO_CRAWL_DELAY",
                "detail": "No Crawl-delay directive set — not critical for most sites"
            })

        # Each issue deducts a fixed penalty by severity; the score floors at 0.
        penalties = {"critical": 30, "warning": 15, "info": 5}
        deducted = sum(penalties.get(issue["severity"], 0) for issue in result["issues"])
        result["compliance_score"] = max(100 - deducted, 0)

        return result

    except Exception as e:
        self.logger.error(f"Robots.txt analysis failed: {e}")
        return {"success": False, "error": str(e), "url": robots_url}
def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame:
    """
    Fallback: manually fetch and parse robots.txt.

    Args:
        url: Location of the robots.txt file.

    Returns:
        DataFrame with one row per non-``User-agent`` directive and the
        columns ``user_agent`` (the most recent ``User-agent`` value,
        ``"*"`` before any is seen), ``rule`` and ``value``. An empty
        DataFrame is returned when the fetch fails or nothing parses;
        fetch errors are logged as warnings, not raised.
    """
    records = []
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            # utf-8-sig drops a leading BOM, which would otherwise glue
            # itself onto the first directive name and hide a User-agent line.
            content = resp.read().decode("utf-8-sig", errors="replace")

        current_ua = "*"
        for raw_line in content.splitlines():
            # "#" starts a comment anywhere on the line, not only at column 0.
            line = raw_line.split("#", 1)[0].strip()
            if not line:
                continue
            # Split on the first colon only so URL values keep theirs.
            directive, sep, value = line.partition(":")
            if not sep:
                continue
            directive = directive.strip()
            value = value.strip()
            if directive.lower() == "user-agent":
                current_ua = value or "*"
                continue
            records.append({
                "user_agent": current_ua,
                "rule": directive,
                "value": value,
            })
    except Exception as e:
        self.logger.warning(f"Manual robots.txt fetch failed: {e}")
    # An empty record list yields an empty DataFrame.
    return pd.DataFrame(records)
async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]:
    """
    Analyze crawl budget by comparing sitemap inventory against a sample crawl.

    Estimates budget utilization, waste from non-2xx responses, and an
    optimization score.

    Args:
        sitemap_url: Sitemap used as the URL inventory.
        site_domain: Domain (or full ``http(s)://`` URL) to crawl.

    Returns:
        On success a dict with ``sitemap_total_urls``, ``pages_crawled``,
        ``crawl_coverage_percentage``, ``status_distribution``,
        ``wasted_crawl_requests``, ``waste_percentage``,
        ``depth_distribution``, ``urls_with_parameters`` and
        ``optimization_score``. On failure:
        ``{"success": False, "error": ...}``.
    """
    temp_file = None
    try:
        self.logger.info(f"Analyzing crawl budget for {site_domain}")

        loop = asyncio.get_event_loop()
        sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
        sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0

        start_url = site_domain if site_domain.startswith("http") else f"https://{site_domain}"
        # allowed_domains takes bare host names; strip scheme/path when the
        # caller passed a full URL.
        crawl_host = urlparse(start_url).netloc or site_domain

        # Reserve a unique output path; the handle is closed before the crawl runs.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name

        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=[start_url],
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[crawl_host],
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 30,
                'DOWNLOAD_TIMEOUT': 15,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
                'DEPTH_LIMIT': 2,
            }
        ))

        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Crawl produced no output"}

        crawl_df = pd.read_json(temp_file, lines=True)
        crawled_count = len(crawl_df)

        # --- Status distribution and wasted (non-2xx) requests ---
        status_dist = {}
        wasted = 0
        if 'status' in crawl_df.columns:
            for status, count in crawl_df['status'].value_counts().items():
                status_dist[str(status)] = int(count)
                # Convert the raw value, not its string form: a float
                # column gives "200.0", which int() cannot parse.
                try:
                    code = int(status)
                except (TypeError, ValueError):
                    continue  # unclassifiable status: left out of the waste count
                if code < 200 or code >= 300:
                    wasted += int(count)

        budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3)
        waste_ratio = round(wasted / max(crawled_count, 1), 3)

        depth_dist = {}
        if 'depth' in crawl_df.columns:
            depth_counts = crawl_df['depth'].value_counts().sort_index()
            depth_dist = {str(k): int(v) for k, v in depth_counts.items()}

        # --- Parameterised URLs ---
        param_count = 0
        url_col = next((c for c in ('url', 'response_url') if c in crawl_df.columns), None)
        if url_col:
            # "?" is a regex quantifier; match it literally or pandas raises.
            param_count = int(crawl_df[url_col].astype(str).str.contains('?', regex=False).sum())

        # NOTE(review): the coverage term lowers the score as coverage
        # rises — confirm that weighting is intended.
        optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1))

        return {
            "success": True,
            "sitemap_total_urls": sitemap_total,
            "pages_crawled": crawled_count,
            "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1),
            "status_distribution": status_dist,
            "wasted_crawl_requests": int(wasted),
            "waste_percentage": round(waste_ratio * 100, 1),
            "depth_distribution": depth_dist,
            "urls_with_parameters": int(param_count),
            "optimization_score": optimization_score,
        }
    except Exception as e:
        self.logger.error(f"Crawl budget analysis failed: {e}")
        return {"success": False, "error": str(e)}
    finally:
        # delete=False above means cleanup is our responsibility.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]:
    """
    Compare two sitemaps for competitive content gap analysis.

    Analyzes URL count, freshness and directory pillars, and identifies
    pillars unique to each sitemap.

    Args:
        sitemap_a: URL of the first sitemap.
        sitemap_b: URL of the second sitemap.

    Returns:
        Dict with per-sitemap totals, ``url_count_diff``, ``ratio``,
        ``pillars_a`` / ``pillars_b`` (top 20 path prefixes, up to three
        segments deep), ``shared_pillars``, ``unique_to_a``,
        ``unique_to_b``, ``freshness_comparison`` and ``overlap_score``
        (shared pillars as a percentage of all pillars). When either
        sitemap is empty only the totals are filled in. On failure:
        ``{"success": False, "error": ...}``.
    """
    try:
        self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}")
        loop = asyncio.get_event_loop()

        df_a = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_a))
        df_b = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_b))

        total_a = len(df_a) if df_a is not None and not df_a.empty else 0
        total_b = len(df_b) if df_b is not None and not df_b.empty else 0

        result = {
            "success": True,
            "sitemap_a": {"url": sitemap_a, "total_urls": total_a},
            "sitemap_b": {"url": sitemap_b, "total_urls": total_b},
            "url_count_diff": total_a - total_b,
            "ratio": round(total_a / max(total_b, 1), 2),
            "pillars_a": {},
            "pillars_b": {},
            "shared_pillars": [],
            "unique_to_a": [],
            "unique_to_b": [],
            "freshness_comparison": {},
            "overlap_score": 0,
        }

        if total_a == 0 or total_b == 0:
            return result

        def extract_pillars(df: pd.DataFrame) -> dict:
            """Count URLs per leading path prefix (top 20)."""
            if 'loc' not in df.columns:
                return {}

            pillars = {}
            try:
                url_df = adv.url_to_df(df['loc'])
                if url_df is not None and not url_df.empty:
                    dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
                    if dir_cols:
                        pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
                        for col in dir_cols[1:3]:
                            segment = url_df[col].astype(str)
                            present = url_df[col].notna() & (segment != 'nan') & (segment != '')
                            # Add the separator only with a real segment, so
                            # shallow URLs stay "blog" rather than "blog//".
                            pillar_series = pillar_series + ("/" + segment).where(present, "")
                        pillars = pillar_series.value_counts().head(20).to_dict()
            except Exception as e:
                self.logger.warning(f"url_to_df pillar extraction failed, using fallback: {e}")

            if not pillars:
                # Fallback: first path segment only.
                def top_segment(url) -> str:
                    parts = urlparse(str(url)).path.strip('/').split('/')
                    return parts[0] if parts and parts[0] else "home"

                pillars = dict(Counter(top_segment(u) for u in df['loc'].dropna()).most_common(20))
            return pillars

        # URL parsing is synchronous; keep it off the event loop.
        pillars_a = await loop.run_in_executor(None, extract_pillars, df_a)
        pillars_b = await loop.run_in_executor(None, extract_pillars, df_b)

        result["pillars_a"] = pillars_a
        result["pillars_b"] = pillars_b

        set_a = set(pillars_a)
        set_b = set(pillars_b)
        shared = set_a & set_b
        result["shared_pillars"] = sorted(shared)
        result["unique_to_a"] = sorted(set_a - set_b)
        result["unique_to_b"] = sorted(set_b - set_a)

        total_keys = max(len(set_a | set_b), 1)
        result["overlap_score"] = round((len(shared) / total_keys) * 100, 1)

        def compute_freshness_stats(df: pd.DataFrame) -> dict:
            """Count dated URLs and those modified in the last 30 days."""
            stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0}
            if 'lastmod' in df.columns:
                lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna()
                if not lm.empty:
                    cutoff = datetime.now(lm.dt.tz) - timedelta(days=30)
                    stats["has_lastmod"] = True
                    stats["total_with_dates"] = int(len(lm))
                    stats["recent_30d"] = int((lm > cutoff).sum())
            return stats

        result["freshness_comparison"] = {
            "a": compute_freshness_stats(df_a),
            "b": compute_freshness_stats(df_b),
        }

        return result

    except Exception as e:
        self.logger.error(f"Sitemap comparison failed: {e}")
        return {"success": False, "error": str(e)}
async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]:
    """
    Diff two crawl-analysis result dicts (``result_b`` minus ``result_a``).

    Args:
        result_a: Earlier analysis result.
        result_b: Later analysis result.

    Returns:
        Dict with ``page_count_change``, non-zero per-status deltas in
        ``status_distribution_changes``, non-zero numeric deltas of the
        ``link_health`` and ``redirect_audit`` sections (keys prefixed
        ``link_`` / ``redirect_``), and ``new_issues`` /
        ``resolved_issues`` lists, which this method leaves empty. On
        failure: ``{"success": False, "error": ...}``.
    """
    try:
        def numeric_deltas(before: dict, after: dict, prefix: str) -> dict:
            # Only keys whose values are numeric on both sides are compared;
            # a key missing from one side counts as 0.
            deltas = {}
            for key in set(before) | set(after):
                old = before.get(key, 0)
                new = after.get(key, 0)
                if not (isinstance(old, (int, float)) and isinstance(new, (int, float))):
                    continue
                delta = round(new - old, 2)
                if delta != 0:
                    deltas[f"{prefix}_{key}"] = delta
            return deltas

        page_delta = result_b.get("page_count", 0) - result_a.get("page_count", 0)

        status_before = result_a.get("page_status", {})
        status_after = result_b.get("page_status", {})
        status_deltas = {}
        for code in sorted(set(status_before) | set(status_after)):
            delta = status_after.get(code, 0) - status_before.get(code, 0)
            if delta != 0:
                status_deltas[code] = delta

        link_deltas = numeric_deltas(
            result_a.get("link_health", {}), result_b.get("link_health", {}), "link"
        )
        redirect_deltas = numeric_deltas(
            result_a.get("redirect_audit", {}), result_b.get("redirect_audit", {}), "redirect"
        )

        return {
            "success": True,
            "page_count_change": page_delta,
            "status_distribution_changes": status_deltas,
            "link_health_changes": link_deltas,
            "redirect_changes": redirect_deltas,
            "new_issues": [],
            "resolved_issues": [],
        }

    except Exception as e:
        self.logger.error(f"Crawl comparison failed: {e}")
        return {"success": False, "error": str(e)}
async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
"""
Analyzes linking patterns and social media presence using unique temporary files.

View File

@@ -454,14 +454,12 @@ class SEODashboardService:
def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
"""Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks."""
try:
# 1. Get augmented persona themes from WebsiteAnalysis
session = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
if not session:
return {}
analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
# 2. Get latest tasks status
tasks = self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all()
audit_status = "pending"
@@ -479,6 +477,14 @@ class SEODashboardService:
return {
"augmented_themes": brand_analysis.get('augmented_themes', []),
"link_health": brand_analysis.get('link_health', {}),
"redirect_audit": brand_analysis.get('redirect_audit', {}),
"image_seo": brand_analysis.get('image_seo', {}),
"page_status": brand_analysis.get('page_status', {}),
"url_structure": brand_analysis.get('url_structure', {}),
"freshness": brand_analysis.get('freshness', {}),
"robots_txt": brand_analysis.get('robots_txt', {}),
"crawl_budget": brand_analysis.get('crawl_budget', {}),
"last_audit": brand_analysis.get('last_advertools_audit'),
"site_health": seo_audit.get('site_health', {}),
"last_health_check": seo_audit.get('last_advertools_health_check'),