chore: bulk commit of local changes across blog writer, SEO dashboard, scheduler, docs-site, and frontend
This commit is contained in:
@@ -1,12 +1,18 @@
|
||||
import advertools as adv
|
||||
import pandas as pd
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from urllib.parse import urlparse
|
||||
from collections import Counter
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import socket
|
||||
import re
|
||||
|
||||
class AdvertoolsService:
|
||||
"""
|
||||
@@ -19,51 +25,58 @@ class AdvertoolsService:
|
||||
|
||||
async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyzes a website's sitemap to extract metrics on publishing velocity and freshness.
|
||||
Analyzes a website's sitemap to extract metrics on publishing velocity, freshness,
|
||||
URL structure patterns, and topic distribution.
|
||||
"""
|
||||
try:
|
||||
self.logger.info(f"Analyzing sitemap: {sitemap_url}")
|
||||
|
||||
# advertools sitemap_to_df is blocking, run in executor
|
||||
loop = asyncio.get_event_loop()
|
||||
df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
|
||||
|
||||
if df is None or df.empty:
|
||||
return {"success": False, "error": "Sitemap is empty or could not be parsed."}
|
||||
|
||||
# Convert lastmod to datetime
|
||||
if 'lastmod' in df.columns:
|
||||
df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True)
|
||||
|
||||
total_urls = len(df)
|
||||
|
||||
# Handle potential empty datetime columns
|
||||
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
|
||||
now = datetime.now(df['lastmod'].dt.tz)
|
||||
thirty_days_ago = now - timedelta(days=30)
|
||||
recent_urls = df[df['lastmod'] > thirty_days_ago]
|
||||
six_months_ago = now - timedelta(days=180)
|
||||
stale_urls = df[df['lastmod'] < six_months_ago]
|
||||
|
||||
publishing_velocity = len(recent_urls) / 4.0 # URLs per week
|
||||
stale_count = len(stale_urls)
|
||||
else:
|
||||
publishing_velocity = 0
|
||||
stale_count = 0
|
||||
# --- Content Freshness Scoring ---
|
||||
freshness = self._compute_freshness(df)
|
||||
|
||||
# Enhanced Content Pillars (Top folder patterns - 3 levels deep)
|
||||
def extract_hierarchy(url: str):
|
||||
try:
|
||||
parts = urlparse(url).path.strip('/').split('/')
|
||||
if not parts or not parts[0]: return "home"
|
||||
return "/".join(parts[:2]) # Capture top 2 segments
|
||||
except:
|
||||
return "other"
|
||||
# --- URL Structure Analysis ---
|
||||
url_structure = {}
|
||||
if 'loc' in df.columns:
|
||||
url_structure = await self._analyze_url_structure(df['loc'].tolist())
|
||||
|
||||
# --- Content Pillars via url_to_df ---
|
||||
pillars = {}
|
||||
url_df = None
|
||||
try:
|
||||
url_df = adv.url_to_df(df['loc'])
|
||||
if url_df is not None and not url_df.empty:
|
||||
dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
|
||||
if dir_cols:
|
||||
pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
|
||||
for col in dir_cols[1:3]:
|
||||
mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
|
||||
pillar_series = pillar_series + "/" + url_df[col].where(mask, "")
|
||||
pillars = pillar_series.value_counts().head(15).to_dict()
|
||||
except Exception:
|
||||
fallback_pillars = {}
|
||||
if 'loc' in df.columns:
|
||||
def extract_hierarchy(url: str):
|
||||
try:
|
||||
parts = urlparse(url).path.strip('/').split('/')
|
||||
if not parts or not parts[0]: return "home"
|
||||
return "/".join(parts[:2])
|
||||
except:
|
||||
return "other"
|
||||
fallback_pillars = df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict()
|
||||
pillars = fallback_pillars
|
||||
|
||||
df['pillar'] = df['loc'].apply(extract_hierarchy)
|
||||
pillars = df['pillar'].value_counts().head(15).to_dict()
|
||||
|
||||
# Return a sample of URLs for auditing (top 15 most recent if available)
|
||||
# Sample URLs for auditing (top 15 most recent)
|
||||
audit_urls = []
|
||||
if 'lastmod' in df.columns and not df['lastmod'].isna().all():
|
||||
audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist()
|
||||
@@ -74,10 +87,14 @@ class AdvertoolsService:
|
||||
"success": True,
|
||||
"metrics": {
|
||||
"total_urls": total_urls,
|
||||
"publishing_velocity": round(publishing_velocity, 2),
|
||||
"stale_content_count": stale_count,
|
||||
"stale_content_percentage": round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0,
|
||||
"publishing_velocity": freshness.get("publishing_velocity"),
|
||||
"stale_content_count": freshness.get("stale_count"),
|
||||
"stale_content_percentage": freshness.get("stale_percentage"),
|
||||
"freshness_score": freshness.get("freshness_score"),
|
||||
"publishing_recency": freshness.get("publishing_recency"),
|
||||
"publishing_trend": freshness.get("publishing_trend"),
|
||||
"top_pillars": pillars,
|
||||
"url_structure": url_structure,
|
||||
"audit_sample_urls": audit_urls
|
||||
},
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
@@ -86,6 +103,146 @@ class AdvertoolsService:
|
||||
self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}")
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]:
|
||||
"""Compute content freshness, publishing velocity, and staleness metrics."""
|
||||
result = {
|
||||
"publishing_velocity": 0,
|
||||
"stale_count": 0,
|
||||
"stale_percentage": 0,
|
||||
"freshness_score": 0,
|
||||
"publishing_recency": {},
|
||||
"publishing_trend": "unknown"
|
||||
}
|
||||
|
||||
if 'lastmod' not in df.columns or df['lastmod'].isna().all():
|
||||
return result
|
||||
|
||||
lastmod = df['lastmod'].dropna()
|
||||
if lastmod.empty:
|
||||
return result
|
||||
|
||||
now = datetime.now(lastmod.dt.tz)
|
||||
thirty_days_ago = now - timedelta(days=30)
|
||||
ninety_days_ago = now - timedelta(days=90)
|
||||
six_months_ago = now - timedelta(days=180)
|
||||
|
||||
recent_urls = df[df['lastmod'] > thirty_days_ago]
|
||||
stale_urls = df[df['lastmod'] < six_months_ago]
|
||||
|
||||
total_urls = len(df)
|
||||
stale_count = len(stale_urls)
|
||||
stale_percentage = round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0
|
||||
|
||||
# Publishing velocity: URLs per week over last 90 days
|
||||
recent_90 = df[df['lastmod'] > ninety_days_ago]
|
||||
publishing_velocity = round(len(recent_90) / 13.0, 2) if not recent_90.empty else 0
|
||||
|
||||
# Freshness score (0-100): weighted combination of metrics
|
||||
non_stale_ratio = 1.0 - (stale_percentage / 100.0)
|
||||
recency_ratio = len(recent_urls) / max(total_urls, 1)
|
||||
velocity_score = min(publishing_velocity / 10.0, 1.0)
|
||||
freshness_score = round((non_stale_ratio * 50 + recency_ratio * 30 + velocity_score * 20), 1)
|
||||
|
||||
# Publishing recency: URLs published in last 1d, 7d, 30d, 90d
|
||||
publishing_recency = {
|
||||
"last_24h": int(len(df[df['lastmod'] > (now - timedelta(days=1))])),
|
||||
"last_7d": int(len(df[df['lastmod'] > (now - timedelta(days=7))])),
|
||||
"last_30d": int(len(recent_urls)),
|
||||
"last_90d": int(len(recent_90)),
|
||||
}
|
||||
|
||||
# Publishing trend: compare recent 30d vs prior 30d
|
||||
prior_30 = df[(df['lastmod'] <= thirty_days_ago) & (df['lastmod'] > (now - timedelta(days=60)))]
|
||||
recent_count = len(recent_urls)
|
||||
prior_count = len(prior_30)
|
||||
if recent_count > prior_count * 1.1:
|
||||
publishing_trend = "increasing"
|
||||
elif recent_count < prior_count * 0.9:
|
||||
publishing_trend = "decreasing"
|
||||
else:
|
||||
publishing_trend = "stable"
|
||||
|
||||
return {
|
||||
"publishing_velocity": publishing_velocity,
|
||||
"stale_count": stale_count,
|
||||
"stale_percentage": stale_percentage,
|
||||
"freshness_score": freshness_score,
|
||||
"publishing_recency": publishing_recency,
|
||||
"publishing_trend": publishing_trend
|
||||
}
|
||||
|
||||
async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]:
    """Analyze URL patterns for parameter usage, directory depth, protocols and hosts.

    The URLs are split with ``adv.url_to_df`` in the default executor, then
    summarised.

    Args:
        urls: URLs to analyze.

    Returns:
        A dict with ``total_urls_analyzed``, ``parameter_usage``,
        ``directory_depth``, ``protocols`` and ``subdomains``. An empty dict
        is returned when nothing could be parsed or any step raised (the
        failure is logged as a warning).
    """
    try:
        loop = asyncio.get_event_loop()
        url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls))

        if url_df is None or url_df.empty:
            return {}

        total = len(url_df)

        # --- Query parameter analysis ---
        if 'query' in url_df.columns:
            has_query = url_df['query'].notna() & (url_df['query'] != '')
        else:
            has_query = pd.Series(False, index=url_df.index)
        param_count = int(has_query.sum())
        param_percentage = round((param_count / total) * 100, 2) if total > 0 else 0

        # Frequency of parameter names across the distinct query strings.
        param_frequency = {}
        if param_count > 0:
            name_counts = Counter()
            for query in url_df.loc[has_query, 'query'].dropna().unique():
                for pair in str(query).split('&'):
                    name = pair.split('=')[0]
                    if name:  # skip empty fragments such as "a=1&&b=2"
                        name_counts[name] += 1
            param_frequency = dict(name_counts.most_common(10))

        # --- Directory depth analysis ---
        dir_cols = [c for c in url_df.columns if c.startswith('dir_')]

        def count_depth(row):
            # Depth is the index of the first empty dir_N column.
            for i, col in enumerate(dir_cols):
                val = row[col]
                if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '':
                    return i
            return len(dir_cols)

        depths = url_df.apply(count_depth, axis=1)
        avg_depth = float(round(depths.mean(), 1)) if not depths.empty else 0
        max_depth = int(depths.max()) if not depths.empty else 0
        depth_counts = depths.value_counts().sort_index().head(10).to_dict()
        depth_distribution = {str(k): int(v) for k, v in depth_counts.items()}

        # --- Protocol consistency ---
        schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {}

        # --- Host analysis ---
        # value_counts() has one row per distinct netloc, so its length is the
        # number of distinct hosts (nunique() would count distinct frequencies).
        netloc_counts = url_df['netloc'].value_counts() if 'netloc' in url_df.columns else None
        if netloc_counts is not None and not netloc_counts.empty:
            unique_subdomains = int(len(netloc_counts))
            primary_domain = str(netloc_counts.index[0])
        else:
            unique_subdomains = 0
            primary_domain = ""

        return {
            "total_urls_analyzed": total,
            "parameter_usage": {
                "urls_with_params": param_count,
                "percentage_with_params": param_percentage,
                "top_parameters": param_frequency
            },
            "directory_depth": {
                "average_depth": avg_depth,
                "max_depth": max_depth,
                "distribution": depth_distribution
            },
            "protocols": {str(k): int(v) for k, v in schemes.items()},
            "subdomains": {
                "primary": primary_domain,
                "unique_count": unique_subdomains
            }
        }
    except Exception as e:
        self.logger.warning(f"URL structure analysis failed: {e}")
        return {}
|
||||
|
||||
async def audit_content(self, url_list: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Performs a shallow crawl and theme analysis using word frequency.
|
||||
@@ -153,6 +310,512 @@ class AdvertoolsService:
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
|
||||
|
||||
async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]:
    """
    Crawl a set of pages (following links) and summarise structural SEO signals.

    The crawl runs in the default executor and writes to a temporary JSONL
    file that is removed before returning. Link, redirect and image metrics
    are derived with ``adv.crawlytics``; each section is best-effort, so a
    failure there is logged and does not abort the whole analysis.

    Args:
        url_list: Seed URLs for the crawl.
        site_domain: Optional domain used both to restrict the crawl and to
            classify links as internal.

    Returns:
        On success a dict with ``success``, ``page_count``, ``link_health``,
        ``redirect_audit``, optionally ``image_seo``, ``page_status``,
        ``missing_titles``, ``missing_descriptions`` and ``timestamp``.
        On failure ``{"success": False, "error": ...}``.
    """
    temp_file = None
    try:
        self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}")

        # Reserve a unique path; the crawler writes to it after this handle closes.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name

        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=url_list,
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[site_domain] if site_domain else None,
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 50,
                'DOWNLOAD_TIMEOUT': 30,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
                'DEPTH_LIMIT': 3,
            }
        ))

        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Site structure crawl produced no output."}

        crawl_df = pd.read_json(temp_file, lines=True)
        result = {"success": True, "page_count": len(crawl_df)}

        # --- Link health via crawlytics ---
        try:
            # The domain is a literal, not a pattern: escape it so "." does
            # not match arbitrary characters when classifying internal links.
            internal_regex = re.escape(site_domain) if site_domain else None
            link_df = adv.crawlytics.links(crawl_df, internal_url_regex=internal_regex)
            if link_df is not None and not link_df.empty:
                total_links = len(link_df)
                internal_links = int(link_df['internal'].sum()) if 'internal' in link_df.columns else 0
                external_links = total_links - internal_links
                nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_df.columns else 0

                # Rows are grouped by index level 0 to count links per page.
                links_per_page = link_df.groupby(level=0).size()
                avg_links_per_page = round(links_per_page.mean(), 1) if not links_per_page.empty else 0

                # Most common words (longer than 2 chars) in internal anchor text.
                anchor_words = Counter()
                if 'text' in link_df.columns and 'internal' in link_df.columns:
                    internal_anchors = link_df[link_df['internal'] == True]['text'].dropna()
                    for text in internal_anchors:
                        if isinstance(text, str) and text.strip():
                            anchor_words.update(w for w in text.split() if len(w) > 2)
                top_anchors = dict(anchor_words.most_common(15))

                result["link_health"] = {
                    "total_links_found": total_links,
                    "internal_link_count": internal_links,
                    "external_link_count": external_links,
                    "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0,
                    "nofollow_link_count": nofollow_links,
                    "avg_links_per_page": avg_links_per_page,
                    "top_anchor_words": top_anchors
                }
            else:
                result["link_health"] = {"error": "No links found in crawl data"}
        except Exception as e:
            self.logger.warning(f"Link analysis failed: {e}")
            result["link_health"] = {"error": str(e)}

        # --- Redirect chain audit via crawlytics ---
        try:
            redirect_df = adv.crawlytics.redirects(crawl_df)
            if redirect_df is not None and not redirect_df.empty:
                has_times = 'redirect_times' in redirect_df.columns
                # NOTE(review): this counts distinct redirect_times values, not
                # distinct chains -- confirm that is the intended metric.
                redirect_chains = redirect_df['redirect_times'].nunique() if has_times else 0
                redirect_statuses = redirect_df['status'].value_counts().to_dict() if 'status' in redirect_df.columns else {}
                multi_hop = redirect_df[redirect_df['redirect_times'] > 1] if has_times else pd.DataFrame()

                result["redirect_audit"] = {
                    "total_redirects": int(len(redirect_df)),
                    "unique_chains": int(redirect_chains),
                    "status_distribution": {str(k): int(v) for k, v in redirect_statuses.items()},
                    "multi_hop_chains": int(len(multi_hop)),
                    "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else []
                }
            else:
                result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"}
        except Exception as e:
            self.logger.warning(f"Redirect analysis failed: {e}")
            result["redirect_audit"] = {"error": str(e)}

        # --- Image SEO overview via crawlytics ---
        try:
            img_df = adv.crawlytics.images(crawl_df)
            if img_df is not None and not img_df.empty:
                total_images = len(img_df)
                missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0
                alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1) if total_images > 0 else 0
                result["image_seo"] = {
                    "total_images": total_images,
                    "missing_alt_count": missing_alt,
                    "alt_coverage_percentage": alt_coverage
                }
        except Exception as e:
            self.logger.warning(f"Image analysis failed: {e}")

        # --- Page-level metrics ---
        if 'status' in crawl_df.columns:
            status_dist = crawl_df['status'].value_counts().to_dict()
            result["page_status"] = {str(k): int(v) for k, v in status_dist.items()}
        if 'title' in crawl_df.columns:
            result["missing_titles"] = int(crawl_df['title'].isna().sum())
        if 'meta_desc' in crawl_df.columns:
            result["missing_descriptions"] = int(crawl_df['meta_desc'].isna().sum())

        result["timestamp"] = datetime.utcnow().isoformat()
        return result

    except Exception as e:
        self.logger.error(f"Failed to analyze site structure: {str(e)}")
        return {"success": False, "error": str(e)}
    finally:
        # Always remove the crawl output, even when analysis failed.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
|
||||
|
||||
async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]:
    """
    Fetch and analyze robots.txt for compliance issues.

    ``adv.robotstxt_to_df`` is tried first; if it raises or returns nothing,
    ``self._parse_robots_txt_manual`` is used instead. Both column layouts
    are understood: advertools' ``directive``/``content`` and the manual
    parser's ``user_agent``/``rule``/``value``.

    Args:
        website_url: Any URL on the site; only its scheme and host are used.

    Returns:
        A dict with ``success``, ``url``, ``accessible``,
        ``total_directives``, ``user_agents_found``,
        ``has_sitemap_directive``, ``sitemap_urls``, ``has_crawl_delay``,
        ``disallow_rules``, ``issues`` and ``compliance_score`` (starts at
        100; minus 30/15/5 per critical/warning/info issue, floored at 0).
        On an unexpected error ``{"success": False, "error": ..., "url": ...}``.
    """
    # Initialised up front so the error path can always report a URL.
    robots_url = website_url
    try:
        self.logger.info(f"Analyzing robots.txt for {website_url}")
        parsed = urlparse(website_url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        robots_url = f"{base_url}/robots.txt"
        result = {
            "success": True,
            "url": robots_url,
            "accessible": True,
            "total_directives": 0,
            "user_agents_found": [],
            "has_sitemap_directive": False,
            "sitemap_urls": [],
            "has_crawl_delay": False,
            "disallow_rules": [],
            "issues": [],
            "compliance_score": 100,
        }
        loop = asyncio.get_event_loop()
        try:
            robots_df = await loop.run_in_executor(
                None, lambda: adv.robotstxt_to_df(robots_url)
            )
            if robots_df is None or robots_df.empty:
                raise ValueError("Empty result from robotstxt_to_df")
        except Exception as adv_err:
            self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}")
            robots_df = await loop.run_in_executor(
                None, lambda: self._parse_robots_txt_manual(robots_url)
            )
            if robots_df is None or robots_df.empty:
                result["success"] = False
                result["error"] = "Could not fetch or parse robots.txt"
                result["accessible"] = False
                return result

        result["total_directives"] = len(robots_df)

        # Resolve column names across the two supported layouts.
        columns = robots_df.columns
        rule_col = next((c for c in ('rule', 'directive') if c in columns), None)
        value_col = next((c for c in ('value', 'directive_value', 'content') if c in columns), None)
        has_user_agent_col = 'user_agent' in columns

        if has_user_agent_col:
            result["user_agents_found"] = robots_df['user_agent'].dropna().unique().tolist()

        if not (rule_col and value_col):
            # Without recognisable columns no finding can be asserted.
            self.logger.warning(f"Unrecognised robots.txt columns: {list(columns)}")
            return result

        rules_lower = robots_df[rule_col].astype(str).str.strip().str.lower()
        values = robots_df[value_col]

        if has_user_agent_col:
            agents = robots_df['user_agent'].fillna('*').astype(str)
        else:
            # User-agent lines are ordinary rows here: carry the most recent
            # one forward so each rule is attributed to its group.
            ua_rows = rules_lower == 'user-agent'
            result["user_agents_found"] = (
                values[ua_rows].dropna().astype(str).str.strip().unique().tolist()
            )
            agents = values.where(ua_rows).ffill().fillna('*').astype(str).str.strip()

        result["has_sitemap_directive"] = bool((rules_lower == 'sitemap').any())
        result["has_crawl_delay"] = bool((rules_lower == 'crawl-delay').any())

        disallow_mask = rules_lower == 'disallow'
        for agent, raw_path in zip(agents[disallow_mask], values[disallow_mask]):
            # Skip missing values instead of recording the string "nan".
            path = '' if pd.isna(raw_path) else str(raw_path).strip()
            if path:
                result["disallow_rules"].append({"user_agent": agent, "path": path})

        sitemap_mask = rules_lower == 'sitemap'
        if sitemap_mask.any():
            result["sitemap_urls"] = values[sitemap_mask].dropna().unique().tolist()

        # Only a wildcard group blocking "/" shuts out every crawler;
        # blocking one named bot is not a site-wide problem.
        has_disallow_all = any(
            rule["user_agent"] == '*' and rule["path"] == '/'
            for rule in result["disallow_rules"]
        )
        if has_disallow_all:
            result["issues"].append({
                "severity": "critical", "code": "DISALLOW_ALL",
                "detail": "robots.txt disallows all user agents from all paths (Disallow: /)"
            })
        if not result["has_sitemap_directive"]:
            result["issues"].append({
                "severity": "warning", "code": "NO_SITEMAP",
                "detail": "No Sitemap directive found — search engines may miss pages"
            })
        if not result["has_crawl_delay"]:
            result["issues"].append({
                "severity": "info", "code": "NO_CRAWL_DELAY",
                "detail": "No Crawl-delay directive set — not critical for most sites"
            })

        penalties = {"critical": 30, "warning": 15, "info": 5}
        for issue in result["issues"]:
            result["compliance_score"] -= penalties.get(issue["severity"], 0)
        result["compliance_score"] = max(result["compliance_score"], 0)

        return result

    except Exception as e:
        self.logger.error(f"Robots.txt analysis failed: {e}")
        return {"success": False, "error": str(e), "url": robots_url}
|
||||
|
||||
def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame:
|
||||
"""Fallback: manually fetch and parse robots.txt."""
|
||||
records = []
|
||||
try:
|
||||
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
content = resp.read().decode("utf-8", errors="replace")
|
||||
current_ua = "*"
|
||||
for line in content.splitlines():
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
if line.lower().startswith("user-agent"):
|
||||
parts = line.split(":", 1)
|
||||
current_ua = parts[1].strip() if len(parts) > 1 else "*"
|
||||
continue
|
||||
if ":" in line:
|
||||
directive, _, value = line.partition(":")
|
||||
records.append({
|
||||
"user_agent": current_ua,
|
||||
"rule": directive.strip(),
|
||||
"value": value.strip(),
|
||||
})
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Manual robots.txt fetch failed: {e}")
|
||||
if not records:
|
||||
return pd.DataFrame()
|
||||
return pd.DataFrame(records)
|
||||
|
||||
async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]:
    """
    Analyze crawl budget by comparing sitemap inventory against a sample crawl.

    The sitemap is loaded with ``adv.sitemap_to_df`` and a bounded crawl is
    run with ``adv.crawl`` (both in the default executor). The crawl output
    goes to a temporary JSONL file that is removed before returning.

    Args:
        sitemap_url: URL of the sitemap to count.
        site_domain: Bare domain, or a full ``http(s)://`` URL, to crawl.

    Returns:
        On success a dict with ``sitemap_total_urls``, ``pages_crawled``,
        ``crawl_coverage_percentage``, ``status_distribution``,
        ``wasted_crawl_requests`` (non-2xx responses), ``waste_percentage``,
        ``depth_distribution``, ``urls_with_parameters`` and
        ``optimization_score``. On failure
        ``{"success": False, "error": ...}``.
    """
    temp_file = None
    try:
        self.logger.info(f"Analyzing crawl budget for {site_domain}")
        loop = asyncio.get_event_loop()

        sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url))
        sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0

        # Accept a bare domain or a full URL, but always hand the crawler a
        # bare host for allowed_domains (a URL there is not a valid domain).
        if site_domain.startswith(("http://", "https://")):
            start_url = site_domain
            crawl_domain = urlparse(site_domain).netloc or site_domain
        else:
            start_url = f"https://{site_domain}"
            crawl_domain = site_domain

        # Reserve a unique path; the crawler writes to it after this handle closes.
        with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf:
            temp_file = tf.name

        await loop.run_in_executor(None, lambda: adv.crawl(
            url_list=[start_url],
            output_file=temp_file,
            follow_links=True,
            allowed_domains=[crawl_domain],
            custom_settings={
                'LOG_LEVEL': 'WARNING',
                'CLOSESPIDER_PAGECOUNT': 30,
                'DOWNLOAD_TIMEOUT': 15,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
                'DEPTH_LIMIT': 2,
            }
        ))

        if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0:
            return {"success": False, "error": "Crawl produced no output"}

        crawl_df = pd.read_json(temp_file, lines=True)
        crawled_count = len(crawl_df)

        # Coerce to integers first: a column containing NaN is float, and
        # keys such as "200.0" cannot be converted back with int().
        status_dist = {}
        wasted = 0
        if 'status' in crawl_df.columns:
            status_codes = pd.to_numeric(crawl_df['status'], errors='coerce').dropna().astype(int)
            status_dist = {str(code): int(count) for code, count in status_codes.value_counts().items()}
            # Anything outside 2xx (redirects, errors) counts as wasted budget.
            wasted = int(((status_codes < 200) | (status_codes >= 300)).sum())

        budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3)
        waste_ratio = round(wasted / max(crawled_count, 1), 3)

        depth_dist = {}
        if 'depth' in crawl_df.columns:
            raw_depths = crawl_df['depth'].value_counts().sort_index().to_dict()
            depth_dist = {str(k): int(v) for k, v in raw_depths.items()}

        param_count = 0
        url_col = 'url' if 'url' in crawl_df.columns else 'response_url' if 'response_url' in crawl_df.columns else None
        if url_col:
            # regex=False: "?" alone is an invalid pattern and would raise.
            param_count = int(crawl_df[url_col].astype(str).str.contains('?', regex=False).sum())

        optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1))

        return {
            "success": True,
            "sitemap_total_urls": sitemap_total,
            "pages_crawled": crawled_count,
            "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1),
            "status_distribution": status_dist,
            "wasted_crawl_requests": int(wasted),
            "waste_percentage": round(waste_ratio * 100, 1),
            "depth_distribution": depth_dist,
            "urls_with_parameters": int(param_count),
            "optimization_score": optimization_score,
        }

    except Exception as e:
        self.logger.error(f"Crawl budget analysis failed: {e}")
        return {"success": False, "error": str(e)}
    finally:
        # Always remove the crawl output; report (but do not raise) failures.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")
|
||||
|
||||
async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]:
    """
    Compare two sitemaps for competitive content gap analysis.

    Both sitemaps are loaded with ``adv.sitemap_to_df`` in the default
    executor, then compared on URL count, top directory "pillars" and
    recency of ``lastmod`` dates.

    Args:
        sitemap_a: URL of the first sitemap.
        sitemap_b: URL of the second sitemap.

    Returns:
        A dict with per-sitemap totals, ``url_count_diff``, ``ratio``,
        ``pillars_a`` / ``pillars_b`` (top 20 directory prefixes with
        counts), ``shared_pillars``, ``unique_to_a``, ``unique_to_b``,
        ``freshness_comparison`` and ``overlap_score`` (shared pillars as a
        percentage of all pillars). When either sitemap is empty, only the
        totals are populated. On failure
        ``{"success": False, "error": ...}``.
    """
    try:
        self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}")
        loop = asyncio.get_event_loop()

        df_a = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_a))
        df_b = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_b))

        total_a = len(df_a) if df_a is not None and not df_a.empty else 0
        total_b = len(df_b) if df_b is not None and not df_b.empty else 0
        result = {
            "success": True,
            "sitemap_a": {"url": sitemap_a, "total_urls": total_a},
            "sitemap_b": {"url": sitemap_b, "total_urls": total_b},
            "url_count_diff": total_a - total_b,
            "ratio": round(total_a / max(total_b, 1), 2),
            "pillars_a": {},
            "pillars_b": {},
            "shared_pillars": [],
            "unique_to_a": [],
            "unique_to_b": [],
            "freshness_comparison": {},
            "overlap_score": 0,
        }

        if total_a == 0 or total_b == 0:
            return result

        def extract_pillars(df: pd.DataFrame) -> Dict[str, int]:
            """Return the top 20 directory prefixes of ``df['loc']`` with their counts."""
            if 'loc' not in df.columns:
                # Nothing to derive pillars from; avoid a KeyError below.
                return {}

            pillars = {}
            try:
                url_df = adv.url_to_df(df['loc'])
                if url_df is not None and not url_df.empty:
                    dir_cols = [c for c in url_df.columns if c.startswith('dir_')]
                    if dir_cols:
                        # Join up to the first three directory levels.
                        pillar_series = url_df[dir_cols[0]].fillna("home").astype(str)
                        for col in dir_cols[1:3]:
                            mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan')
                            pillar_series = pillar_series + "/" + url_df[col].where(mask, "")
                        pillars = pillar_series.value_counts().head(20).to_dict()
            except Exception as exc:
                self.logger.warning(f"url_to_df pillar extraction failed, using fallback: {exc}")

            if not pillars:
                # Fallback: count by first path segment only.
                first_segments = Counter()
                for url in df['loc'].dropna():
                    segment = urlparse(url).path.strip('/').split('/')[0]
                    first_segments[segment or "home"] += 1
                pillars = dict(first_segments.most_common(20))

            return pillars

        def compute_freshness_stats(df: pd.DataFrame) -> dict:
            """Count dated entries and those modified within the last 30 days."""
            stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0}
            if 'lastmod' in df.columns:
                lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna()
                if not lm.empty:
                    cutoff = datetime.now(lm.dt.tz) - timedelta(days=30)
                    stats["has_lastmod"] = True
                    stats["total_with_dates"] = int(len(lm))
                    stats["recent_30d"] = int((lm > cutoff).sum())
            return stats

        pillars_a = extract_pillars(df_a)
        pillars_b = extract_pillars(df_b)
        result["pillars_a"] = pillars_a
        result["pillars_b"] = pillars_b

        set_a = set(pillars_a)
        set_b = set(pillars_b)
        shared = set_a & set_b
        result["shared_pillars"] = sorted(shared)
        result["unique_to_a"] = sorted(set_a - set_b)
        result["unique_to_b"] = sorted(set_b - set_a)

        # Shared pillars as a percentage of all distinct pillars.
        total_keys = max(len(set_a | set_b), 1)
        result["overlap_score"] = round((len(shared) / total_keys) * 100, 1)

        result["freshness_comparison"] = {
            "a": compute_freshness_stats(df_a),
            "b": compute_freshness_stats(df_b),
        }

        return result

    except Exception as e:
        self.logger.error(f"Sitemap comparison failed: {e}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]:
    """
    Diff two crawl analysis result dicts (``result_b`` minus ``result_a``).

    Args:
        result_a: The earlier analysis result.
        result_b: The later analysis result.

    Returns:
        A dict with ``page_count_change``, ``status_distribution_changes``
        (per status code, non-zero deltas only), ``link_health_changes`` and
        ``redirect_changes`` (numeric fields only, keys prefixed with
        ``link_`` / ``redirect_``), plus ``new_issues`` and
        ``resolved_issues``, which this method leaves empty. On failure
        ``{"success": False, "error": ...}``.
    """
    try:
        def numeric_delta(before: dict, after: dict, prefix: str) -> dict:
            """Return non-zero numeric differences, keyed as ``<prefix>_<field>``."""
            delta = {}
            for field in {*before.keys(), *after.keys()}:
                old_value = before.get(field, 0)
                new_value = after.get(field, 0)
                both_numeric = isinstance(old_value, (int, float)) and isinstance(new_value, (int, float))
                if not both_numeric:
                    continue
                moved = round(new_value - old_value, 2)
                if moved != 0:
                    delta[f"{prefix}_{field}"] = moved
            return delta

        pages_before = result_a.get("page_count", 0)
        pages_after = result_b.get("page_count", 0)
        page_count_change = pages_after - pages_before

        status_before = result_a.get("page_status", {})
        status_after = result_b.get("page_status", {})
        status_changes = {}
        for code in sorted({*status_before.keys(), *status_after.keys()}):
            moved = status_after.get(code, 0) - status_before.get(code, 0)
            if moved != 0:
                status_changes[code] = moved

        link_changes = numeric_delta(
            result_a.get("link_health", {}), result_b.get("link_health", {}), "link"
        )
        redirect_changes = numeric_delta(
            result_a.get("redirect_audit", {}), result_b.get("redirect_audit", {}), "redirect"
        )

        return {
            "success": True,
            "page_count_change": page_count_change,
            "status_distribution_changes": status_changes,
            "link_health_changes": link_changes,
            "redirect_changes": redirect_changes,
            "new_issues": [],
            "resolved_issues": [],
        }

    except Exception as e:
        self.logger.error(f"Crawl comparison failed: {e}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyzes linking patterns and social media presence using unique temporary files.
|
||||
|
||||
@@ -454,14 +454,12 @@ class SEODashboardService:
|
||||
def _get_advertools_insights(self, user_id: str, site_url: str) -> Dict[str, Any]:
|
||||
"""Fetch Advertools-based insights from WebsiteAnalysis and AdvertoolsTasks."""
|
||||
try:
|
||||
# 1. Get augmented persona themes from WebsiteAnalysis
|
||||
session = self.db.query(OnboardingSession).filter(OnboardingSession.user_id == user_id).first()
|
||||
if not session:
|
||||
return {}
|
||||
|
||||
analysis = self.db.query(WebsiteAnalysis).filter(WebsiteAnalysis.session_id == session.id).first()
|
||||
|
||||
# 2. Get latest tasks status
|
||||
tasks = self.db.query(AdvertoolsTask).filter(AdvertoolsTask.user_id == user_id).all()
|
||||
|
||||
audit_status = "pending"
|
||||
@@ -479,6 +477,14 @@ class SEODashboardService:
|
||||
|
||||
return {
|
||||
"augmented_themes": brand_analysis.get('augmented_themes', []),
|
||||
"link_health": brand_analysis.get('link_health', {}),
|
||||
"redirect_audit": brand_analysis.get('redirect_audit', {}),
|
||||
"image_seo": brand_analysis.get('image_seo', {}),
|
||||
"page_status": brand_analysis.get('page_status', {}),
|
||||
"url_structure": brand_analysis.get('url_structure', {}),
|
||||
"freshness": brand_analysis.get('freshness', {}),
|
||||
"robots_txt": brand_analysis.get('robots_txt', {}),
|
||||
"crawl_budget": brand_analysis.get('crawl_budget', {}),
|
||||
"last_audit": brand_analysis.get('last_advertools_audit'),
|
||||
"site_health": seo_audit.get('site_health', {}),
|
||||
"last_health_check": seo_audit.get('last_advertools_health_check'),
|
||||
|
||||
Reference in New Issue
Block a user