import advertools as adv import pandas as pd import asyncio from typing import Dict, Any, List, Optional, Tuple from datetime import datetime, timedelta from loguru import logger import json import os import tempfile from urllib.parse import urlparse from collections import Counter import urllib.request import urllib.error import socket import re class AdvertoolsService: """ Centralized service for leveraging the Advertools library for deep SEO intelligence. Provides functions for sitemap analysis, content auditing, and link extraction. """ def __init__(self): self.logger = logger.bind(service="AdvertoolsService") async def analyze_sitemap(self, sitemap_url: str) -> Dict[str, Any]: """ Analyzes a website's sitemap to extract metrics on publishing velocity, freshness, URL structure patterns, and topic distribution. """ try: self.logger.info(f"Analyzing sitemap: {sitemap_url}") loop = asyncio.get_event_loop() df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url)) if df is None or df.empty: return {"success": False, "error": "Sitemap is empty or could not be parsed."} if 'lastmod' in df.columns: df['lastmod'] = pd.to_datetime(df['lastmod'], errors='coerce', utc=True) total_urls = len(df) # --- Content Freshness Scoring --- freshness = self._compute_freshness(df) # --- URL Structure Analysis --- url_structure = {} if 'loc' in df.columns: url_structure = await self._analyze_url_structure(df['loc'].tolist()) # --- Content Pillars via url_to_df --- pillars = {} url_df = None try: url_df = adv.url_to_df(df['loc']) if url_df is not None and not url_df.empty: dir_cols = [c for c in url_df.columns if c.startswith('dir_')] if dir_cols: pillar_series = url_df[dir_cols[0]].fillna("home").astype(str) for col in dir_cols[1:3]: mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan') pillar_series = pillar_series + "/" + url_df[col].where(mask, "") pillars = pillar_series.value_counts().head(15).to_dict() except Exception: fallback_pillars = {} if 'loc' in df.columns: def extract_hierarchy(url: str): try: parts = urlparse(url).path.strip('/').split('/') if not parts or not parts[0]: return "home" return "/".join(parts[:2]) except: return "other" fallback_pillars = df['loc'].apply(extract_hierarchy).value_counts().head(15).to_dict() pillars = fallback_pillars # Sample URLs for auditing (top 15 most recent) audit_urls = [] if 'lastmod' in df.columns and not df['lastmod'].isna().all(): audit_urls = df.sort_values('lastmod', ascending=False).head(15)['loc'].tolist() else: audit_urls = df['loc'].head(15).tolist() return { "success": True, "metrics": { "total_urls": total_urls, "publishing_velocity": freshness.get("publishing_velocity"), "stale_content_count": freshness.get("stale_count"), "stale_content_percentage": freshness.get("stale_percentage"), "freshness_score": freshness.get("freshness_score"), "publishing_recency": freshness.get("publishing_recency"), "publishing_trend": freshness.get("publishing_trend"), "top_pillars": pillars, "url_structure": url_structure, "audit_sample_urls": audit_urls }, "timestamp": datetime.utcnow().isoformat() } except Exception as e: self.logger.error(f"Failed to analyze sitemap {sitemap_url}: {str(e)}") return {"success": False, "error": str(e)} def _compute_freshness(self, df: pd.DataFrame) -> Dict[str, Any]: """Compute content freshness, publishing velocity, and staleness metrics.""" result = { "publishing_velocity": 0, "stale_count": 0, "stale_percentage": 0, "freshness_score": 0, "publishing_recency": {}, "publishing_trend": "unknown" } if 'lastmod' not in df.columns or df['lastmod'].isna().all(): return result lastmod = df['lastmod'].dropna() if lastmod.empty: return result now = datetime.now(lastmod.dt.tz) thirty_days_ago = now - timedelta(days=30) ninety_days_ago = now - timedelta(days=90) six_months_ago = now - timedelta(days=180) recent_urls = df[df['lastmod'] > thirty_days_ago] stale_urls = df[df['lastmod'] < six_months_ago] total_urls = len(df) stale_count = len(stale_urls) stale_percentage = round((stale_count / total_urls) * 100, 2) if total_urls > 0 else 0 # Publishing velocity: URLs per week over last 90 days recent_90 = df[df['lastmod'] > ninety_days_ago] publishing_velocity = round(len(recent_90) / 13.0, 2) if not recent_90.empty else 0 # Freshness score (0-100): weighted combination of metrics non_stale_ratio = 1.0 - (stale_percentage / 100.0) recency_ratio = len(recent_urls) / max(total_urls, 1) velocity_score = min(publishing_velocity / 10.0, 1.0) freshness_score = round((non_stale_ratio * 50 + recency_ratio * 30 + velocity_score * 20), 1) # Publishing recency: URLs published in last 1d, 7d, 30d, 90d publishing_recency = { "last_24h": int(len(df[df['lastmod'] > (now - timedelta(days=1))])), "last_7d": int(len(df[df['lastmod'] > (now - timedelta(days=7))])), "last_30d": int(len(recent_urls)), "last_90d": int(len(recent_90)), } # Publishing trend: compare recent 30d vs prior 30d prior_30 = df[(df['lastmod'] <= thirty_days_ago) & (df['lastmod'] > (now - timedelta(days=60)))] recent_count = len(recent_urls) prior_count = len(prior_30) if recent_count > prior_count * 1.1: publishing_trend = "increasing" elif recent_count < prior_count * 0.9: publishing_trend = "decreasing" else: publishing_trend = "stable" return { "publishing_velocity": publishing_velocity, "stale_count": stale_count, "stale_percentage": stale_percentage, "freshness_score": freshness_score, "publishing_recency": publishing_recency, "publishing_trend": publishing_trend } async def _analyze_url_structure(self, urls: List[str]) -> Dict[str, Any]: """Analyze URL patterns for parameter bloat, directory depth, and path patterns.""" try: loop = asyncio.get_event_loop() url_df = await loop.run_in_executor(None, lambda: adv.url_to_df(urls)) if url_df is None or url_df.empty: return {} total = len(url_df) # Query param analysis has_query = url_df['query'].notna() & (url_df['query'] != '') param_count = has_query.sum() param_percentage = round((param_count / total) * 100, 2) if total > 0 else 0 # Extract individual parameters all_params = [] param_frequency = {} if param_count > 0: for q in url_df.loc[has_query, 'query'].dropna().unique(): for pair in q.split('&'): key = pair.split('=')[0] if '=' in pair else pair all_params.append(key) from collections import Counter param_frequency = dict(Counter(all_params).most_common(10)) # Directory depth analysis dir_cols = [c for c in url_df.columns if c.startswith('dir_')] def count_depth(row): for i, col in enumerate(dir_cols): val = row[col] if pd.isna(val) or str(val) == 'nan' or str(val).strip() == '': return i return len(dir_cols) depths = url_df.apply(count_depth, axis=1) avg_depth = round(depths.mean(), 1) if not depths.empty else 0 max_depth = int(depths.max()) if not depths.empty else 0 depth_distribution = depths.value_counts().sort_index().head(10).to_dict() depth_distribution = {str(k): int(v) for k, v in depth_distribution.items()} # Protocol consistency schemes = url_df['scheme'].value_counts().to_dict() if 'scheme' in url_df.columns else {} # Subdomain analysis netloc_counts = url_df['netloc'].value_counts() if 'netloc' in url_df.columns else None unique_subdomains = int(netloc_counts.nunique()) if netloc_counts is not None else 0 primary_domain = netloc_counts.index[0] if netloc_counts is not None and not netloc_counts.empty else "" return { "total_urls_analyzed": total, "parameter_usage": { "urls_with_params": int(param_count), "percentage_with_params": param_percentage, "top_parameters": param_frequency }, "directory_depth": { "average_depth": avg_depth, "max_depth": max_depth, "distribution": depth_distribution }, "protocols": {str(k): int(v) for k, v in schemes.items()}, "subdomains": { "primary": primary_domain, "unique_count": unique_subdomains } } except Exception as e: self.logger.warning(f"URL structure analysis failed: {e}") return {} async def audit_content(self, url_list: List[str]) -> Dict[str, Any]: """ Performs a shallow crawl and theme analysis using word frequency. Uses unique temporary files for thread safety. """ temp_file = None try: self.logger.info(f"Auditing content for {len(url_list)} URLs") # Create a unique temporary file with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf: temp_file = tf.name # advertools crawl is blocking loop = asyncio.get_event_loop() await loop.run_in_executor(None, lambda: adv.crawl( url_list=url_list, output_file=temp_file, follow_links=False, custom_settings={ 'LOG_LEVEL': 'WARNING', 'CLOSESPIDER_PAGECOUNT': 15, # Guardrail: Max 15 pages 'DOWNLOAD_TIMEOUT': 30 # Guardrail: 30s timeout per page } )) if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0: return {"success": False, "error": "Crawl failed to generate output or output is empty."} crawl_df = pd.read_json(temp_file, lines=True) # Extract themes using word frequency text_columns = [col for col in ['body_text', 'h1', 'h2', 'title'] if col in crawl_df.columns] if not text_columns: return {"success": False, "error": "No text content found to analyze."} all_text = " ".join(crawl_df[text_columns].fillna("").values.flatten()) if not all_text.strip(): return {"success": False, "error": "Extracted text is empty."} word_freq = await loop.run_in_executor(None, lambda: adv.word_frequency([all_text], rm_stopwords=True)) top_themes = word_freq.head(20).to_dict(orient='records') # Additional metrics: Readability, word count avg_word_count = 0 if 'body_text' in crawl_df.columns: crawl_df['word_count'] = crawl_df['body_text'].fillna("").str.split().str.len() avg_word_count = crawl_df['word_count'].mean() return { "success": True, "themes": top_themes, "page_count": len(crawl_df), "avg_word_count": round(avg_word_count, 1), "timestamp": datetime.utcnow().isoformat() } except Exception as e: self.logger.error(f"Failed to audit content: {str(e)}") return {"success": False, "error": str(e)} finally: if temp_file and os.path.exists(temp_file): try: os.remove(temp_file) except Exception as e: self.logger.warning(f"Failed to remove temp file {temp_file}: {e}") async def analyze_site_structure(self, url_list: List[str], site_domain: Optional[str] = None) -> Dict[str, Any]: """ Crawls a set of pages with link following to analyze internal link health, redirect chains, and page-level SEO elements. Extracts metrics via crawlytics: link distribution, redirect chains, image SEO. """ temp_file = None try: self.logger.info(f"Analyzing site structure for {len(url_list)} URLs, domain={site_domain}") with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf: temp_file = tf.name loop = asyncio.get_event_loop() await loop.run_in_executor(None, lambda: adv.crawl( url_list=url_list, output_file=temp_file, follow_links=True, allowed_domains=[site_domain] if site_domain else None, custom_settings={ 'LOG_LEVEL': 'WARNING', 'CLOSESPIDER_PAGECOUNT': 50, 'DOWNLOAD_TIMEOUT': 30, 'CONCURRENT_REQUESTS_PER_DOMAIN': 3, 'DEPTH_LIMIT': 3, } )) if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0: return {"success": False, "error": "Site structure crawl produced no output."} crawl_df = pd.read_json(temp_file, lines=True) page_count = len(crawl_df) result = {"success": True, "page_count": page_count} # --- Link Health via crawlytics --- try: internal_regex = site_domain if site_domain else None link_df = adv.crawlytics.links(crawl_df, internal_url_regex=internal_regex) if link_df is not None and not link_df.empty: total_links = len(link_df) internal_links = int(link_df['internal'].sum()) if 'internal' in link_df.columns else 0 external_links = total_links - internal_links nofollow_links = int(link_df['nofollow'].sum()) if 'nofollow' in link_df.columns else 0 # Count links per page links_per_page = link_df.groupby(level=0).size() avg_links_per_page = round(links_per_page.mean(), 1) if not links_per_page.empty else 0 # Most common anchor text (internal links only) anchor_texts = [] if 'text' in link_df.columns and 'internal' in link_df.columns: internal_anchors = link_df[link_df['internal'] == True]['text'].dropna() for t in internal_anchors: if isinstance(t, str) and t.strip(): anchor_texts.extend([w.strip() for w in t.split() if len(w.strip()) > 2]) from collections import Counter top_anchors = dict(Counter(anchor_texts).most_common(15)) if anchor_texts else {} result["link_health"] = { "total_links_found": total_links, "internal_link_count": internal_links, "external_link_count": external_links, "internal_link_percentage": round((internal_links / total_links) * 100, 1) if total_links > 0 else 0, "nofollow_link_count": nofollow_links, "avg_links_per_page": avg_links_per_page, "top_anchor_words": top_anchors } else: result["link_health"] = {"error": "No links found in crawl data"} except Exception as e: self.logger.warning(f"Link analysis failed: {e}") result["link_health"] = {"error": str(e)} # --- Redirect Chain Audit via crawlytics --- try: redirect_df = adv.crawlytics.redirects(crawl_df) if redirect_df is not None and not redirect_df.empty: total_redirects = len(redirect_df) redirect_chains = redirect_df['redirect_times'].nunique() if 'redirect_times' in redirect_df.columns else 0 redirect_statuses = redirect_df['status'].value_counts().to_dict() if 'status' in redirect_df.columns else {} multi_hop = redirect_df[redirect_df['redirect_times'] > 1] if 'redirect_times' in redirect_df.columns else pd.DataFrame() result["redirect_audit"] = { "total_redirects": int(total_redirects), "unique_chains": int(redirect_chains), "status_distribution": {str(k): int(v) for k, v in redirect_statuses.items()}, "multi_hop_chains": int(len(multi_hop)), "affected_pages": multi_hop.index.unique().tolist() if not multi_hop.empty else [] } else: result["redirect_audit"] = {"total_redirects": 0, "note": "No redirects detected"} except Exception as e: self.logger.warning(f"Redirect analysis failed: {e}") result["redirect_audit"] = {"error": str(e)} # --- Image SEO overview via crawlytics --- try: img_df = adv.crawlytics.images(crawl_df) if img_df is not None and not img_df.empty: total_images = len(img_df) missing_alt = int(img_df['img_alt'].isna().sum()) if 'img_alt' in img_df.columns else 0 alt_coverage = round(((total_images - missing_alt) / total_images) * 100, 1) if total_images > 0 else 0 result["image_seo"] = { "total_images": total_images, "missing_alt_count": missing_alt, "alt_coverage_percentage": alt_coverage } except Exception as e: self.logger.warning(f"Image analysis failed: {e}") # --- Page-level metrics --- if 'status' in crawl_df.columns: status_dist = crawl_df['status'].value_counts().to_dict() result["page_status"] = {str(k): int(v) for k, v in status_dist.items()} if 'title' in crawl_df.columns: missing_titles = int(crawl_df['title'].isna().sum()) result["missing_titles"] = missing_titles if 'meta_desc' in crawl_df.columns: missing_descriptions = int(crawl_df['meta_desc'].isna().sum()) result["missing_descriptions"] = missing_descriptions result["timestamp"] = datetime.utcnow().isoformat() return result except Exception as e: self.logger.error(f"Failed to analyze site structure: {str(e)}") return {"success": False, "error": str(e)} finally: if temp_file and os.path.exists(temp_file): try: os.remove(temp_file) except Exception as e: self.logger.warning(f"Failed to remove temp file {temp_file}: {e}") async def analyze_robots_txt(self, website_url: str) -> Dict[str, Any]: """ Fetch and analyze robots.txt for compliance issues. Checks directives, sitemap declaration, crawl-delay, and common problems. """ try: self.logger.info(f"Analyzing robots.txt for {website_url}") parsed = urlparse(website_url) base_url = f"{parsed.scheme}://{parsed.netloc}" robots_url = f"{base_url}/robots.txt" result = { "success": True, "url": robots_url, "accessible": True, "total_directives": 0, "user_agents_found": [], "has_sitemap_directive": False, "sitemap_urls": [], "has_crawl_delay": False, "disallow_rules": [], "issues": [], "compliance_score": 100, } loop = asyncio.get_event_loop() try: robots_df = await loop.run_in_executor( None, lambda: adv.robotstxt_to_df(robots_url) ) if robots_df is None or robots_df.empty: raise ValueError("Empty result from robotstxt_to_df") except Exception as adv_err: self.logger.warning(f"adv.robotstxt_to_df failed, using manual fallback: {adv_err}") robots_df = await loop.run_in_executor( None, lambda: self._parse_robots_txt_manual(robots_url) ) if robots_df is None or robots_df.empty: result["success"] = False result["error"] = "Could not fetch or parse robots.txt" result["accessible"] = False return result result["total_directives"] = len(robots_df) if 'user_agent' in robots_df.columns: result["user_agents_found"] = robots_df['user_agent'].dropna().unique().tolist() rule_col = 'rule' if 'rule' in robots_df.columns else 'directive' if 'directive' in robots_df.columns else None value_col = 'value' if 'value' in robots_df.columns else 'directive_value' if 'directive_value' in robots_df.columns else None if rule_col and value_col: rules_lower = robots_df[rule_col].astype(str).str.lower() result["has_sitemap_directive"] = 'sitemap' in rules_lower.values result["has_crawl_delay"] = 'crawl-delay' in rules_lower.values has_disallow_all = any( str(row.get(value_col, '')).strip() == '/' for _, row in robots_df[robots_df[rule_col].astype(str).str.lower() == 'disallow'].iterrows() ) if 'disallow' in rules_lower.values else False disallow_mask = rules_lower == 'disallow' if disallow_mask.any(): for _, row in robots_df[disallow_mask].iterrows(): val = str(row.get(value_col, '')) ua = str(row.get('user_agent', '*')) if val: result["disallow_rules"].append({"user_agent": ua, "path": val}) sitemap_mask = rules_lower == 'sitemap' if sitemap_mask.any(): result["sitemap_urls"] = robots_df.loc[sitemap_mask, value_col].dropna().unique().tolist() if has_disallow_all: result["issues"].append({ "severity": "critical", "code": "DISALLOW_ALL", "detail": "robots.txt disallows all user agents from all paths (Disallow: /)" }) if not result["has_sitemap_directive"]: result["issues"].append({ "severity": "warning", "code": "NO_SITEMAP", "detail": "No Sitemap directive found — search engines may miss pages" }) if not result["has_crawl_delay"]: result["issues"].append({ "severity": "info", "code": "NO_CRAWL_DELAY", "detail": "No Crawl-delay directive set — not critical for most sites" }) for issue in result["issues"]: sev = issue["severity"] if sev == "critical": result["compliance_score"] -= 30 elif sev == "warning": result["compliance_score"] -= 15 elif sev == "info": result["compliance_score"] -= 5 result["compliance_score"] = max(result["compliance_score"], 0) return result except Exception as e: self.logger.error(f"Robots.txt analysis failed: {e}") return {"success": False, "error": str(e), "url": robots_url if 'robots_url' in locals() else website_url} def _parse_robots_txt_manual(self, url: str) -> pd.DataFrame: """Fallback: manually fetch and parse robots.txt.""" records = [] try: req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) with urllib.request.urlopen(req, timeout=15) as resp: content = resp.read().decode("utf-8", errors="replace") current_ua = "*" for line in content.splitlines(): line = line.strip() if not line or line.startswith("#"): continue if line.lower().startswith("user-agent"): parts = line.split(":", 1) current_ua = parts[1].strip() if len(parts) > 1 else "*" continue if ":" in line: directive, _, value = line.partition(":") records.append({ "user_agent": current_ua, "rule": directive.strip(), "value": value.strip(), }) except Exception as e: self.logger.warning(f"Manual robots.txt fetch failed: {e}") if not records: return pd.DataFrame() return pd.DataFrame(records) async def analyze_crawl_budget(self, sitemap_url: str, site_domain: str) -> Dict[str, Any]: """ Analyze crawl budget by comparing sitemap inventory against actual crawl results. Estimates budget utilization, waste from redirects/errors, and optimization score. """ temp_file = None try: self.logger.info(f"Analyzing crawl budget for {site_domain}") loop = asyncio.get_event_loop() sitemap_df = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_url)) sitemap_total = len(sitemap_df) if sitemap_df is not None and not sitemap_df.empty else 0 start_url = f"https://{site_domain}" if not site_domain.startswith("http") else site_domain with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf: temp_file = tf.name await loop.run_in_executor(None, lambda: adv.crawl( url_list=[start_url], output_file=temp_file, follow_links=True, allowed_domains=[site_domain], custom_settings={ 'LOG_LEVEL': 'WARNING', 'CLOSESPIDER_PAGECOUNT': 30, 'DOWNLOAD_TIMEOUT': 15, 'CONCURRENT_REQUESTS_PER_DOMAIN': 5, 'DEPTH_LIMIT': 2, } )) if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0: return {"success": False, "error": "Crawl produced no output"} crawl_df = pd.read_json(temp_file, lines=True) crawled_count = len(crawl_df) status_dist = {} if 'status' in crawl_df.columns: raw = crawl_df['status'].value_counts().to_dict() status_dist = {str(k): int(v) for k, v in raw.items()} wasted = 0 for code_s in status_dist: code = int(code_s) if code >= 300 or code < 200: wasted += status_dist[code_s] budget_usage_ratio = round(crawled_count / max(sitemap_total, 1), 3) waste_ratio = round(wasted / max(crawled_count, 1), 3) depth_dist = {} if 'depth' in crawl_df.columns: raw = crawl_df['depth'].value_counts().sort_index().to_dict() depth_dist = {str(k): int(v) for k, v in raw.items()} param_count = 0 url_col = 'url' if 'url' in crawl_df.columns else 'response_url' if 'response_url' in crawl_df.columns else None if url_col: param_count = int(crawl_df[url_col].astype(str).str.contains('?').sum()) optimization_score = max(0, round(100 - (waste_ratio * 100) - (budget_usage_ratio * 20), 1)) return { "success": True, "sitemap_total_urls": sitemap_total, "pages_crawled": crawled_count, "crawl_coverage_percentage": round(budget_usage_ratio * 100, 1), "status_distribution": status_dist, "wasted_crawl_requests": int(wasted), "waste_percentage": round(waste_ratio * 100, 1), "depth_distribution": depth_dist, "urls_with_parameters": int(param_count), "optimization_score": optimization_score, } except Exception as e: self.logger.error(f"Crawl budget analysis failed: {e}") return {"success": False, "error": str(e)} finally: if temp_file and os.path.exists(temp_file): try: os.remove(temp_file) except Exception: pass async def sitemap_compare(self, sitemap_a: str, sitemap_b: str) -> Dict[str, Any]: """ Compare two sitemaps for competitive content gap analysis. Analyzes URL count, freshness, directory pillars, and identifies patterns unique to each sitemap. """ try: self.logger.info(f"Comparing sitemaps: {sitemap_a} vs {sitemap_b}") loop = asyncio.get_event_loop() df_a = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_a)) df_b = await loop.run_in_executor(None, lambda: adv.sitemap_to_df(sitemap_b)) total_a = len(df_a) if df_a is not None and not df_a.empty else 0 total_b = len(df_b) if df_b is not None and not df_b.empty else 0 result = { "success": True, "sitemap_a": {"url": sitemap_a, "total_urls": total_a}, "sitemap_b": {"url": sitemap_b, "total_urls": total_b}, "url_count_diff": total_a - total_b, "ratio": round(total_a / max(total_b, 1), 2), "pillars_a": {}, "pillars_b": {}, "shared_pillars": [], "unique_to_a": [], "unique_to_b": [], "freshness_comparison": {}, "overlap_score": 0, } if total_a == 0 or total_b == 0: return result def extract_pillars(df: pd.DataFrame, label: str) -> Tuple[dict, list]: pillars = {} if 'loc' in df.columns: try: url_df = adv.url_to_df(df['loc']) if url_df is not None and not url_df.empty: dir_cols = [c for c in url_df.columns if c.startswith('dir_')] if dir_cols: pillar_series = url_df[dir_cols[0]].fillna("home").astype(str) for col in dir_cols[1:3]: mask = url_df[col].notna() & (url_df[col].astype(str) != 'nan') pillar_series = pillar_series + "/" + url_df[col].where(mask, "") pillars = pillar_series.value_counts().head(20).to_dict() except Exception: pass if not pillars: seen = {} for url in df['loc'].dropna(): parts = urlparse(url).path.strip('/').split('/') key = parts[0] if parts and parts[0] else "home" seen[key] = seen.get(key, 0) + 1 pillars = dict(sorted(seen.items(), key=lambda x: x[1], reverse=True)[:20]) pillar_keys = list(pillars.keys()) if pillars else [] return pillars, pillar_keys pillars_a, keys_a = extract_pillars(df_a, "a") pillars_b, keys_b = extract_pillars(df_b, "b") result["pillars_a"] = pillars_a result["pillars_b"] = pillars_b set_a = set(keys_a) set_b = set(keys_b) shared = set_a & set_b result["shared_pillars"] = sorted(shared) result["unique_to_a"] = sorted(set_a - set_b) result["unique_to_b"] = sorted(set_b - set_a) total_keys = max(len(set_a | set_b), 1) overlap_count = len(shared) result["overlap_score"] = round((overlap_count / total_keys) * 100, 1) def compute_freshness_stats(df: pd.DataFrame) -> dict: stats = {"has_lastmod": False, "recent_30d": 0, "total_with_dates": 0} if 'lastmod' in df.columns: lm = pd.to_datetime(df['lastmod'], errors='coerce', utc=True).dropna() if not lm.empty: stats["has_lastmod"] = True stats["total_with_dates"] = int(len(lm)) stats["recent_30d"] = int((lm > (datetime.now(lm.dt.tz) - timedelta(days=30))).sum()) return stats result["freshness_comparison"] = { "a": compute_freshness_stats(df_a), "b": compute_freshness_stats(df_b), } return result except Exception as e: self.logger.error(f"Sitemap comparison failed: {e}") return {"success": False, "error": str(e)} async def compare_crawl_results(self, result_a: Dict[str, Any], result_b: Dict[str, Any]) -> Dict[str, Any]: """ Compare two crawl analysis result dicts to surface changes over time. Useful for tracking SEO improvements between scheduled executions. """ try: diff = { "success": True, "page_count_change": 0, "status_distribution_changes": {}, "link_health_changes": {}, "redirect_changes": {}, "new_issues": [], "resolved_issues": [], } pc_a = result_a.get("page_count", 0) pc_b = result_b.get("page_count", 0) diff["page_count_change"] = pc_b - pc_a sd_a = result_a.get("page_status", {}) sd_b = result_b.get("page_status", {}) all_codes = set(list(sd_a.keys()) + list(sd_b.keys())) for c in sorted(all_codes): va = sd_a.get(c, 0) vb = sd_b.get(c, 0) change = vb - va if change != 0: diff["status_distribution_changes"][c] = change def _safe_diff(d_a: dict, d_b: dict, prefix: str) -> dict: changes = {} all_keys = set(list(d_a.keys()) + list(d_b.keys())) for k in all_keys: va = d_a.get(k, 0) vb = d_b.get(k, 0) if isinstance(va, (int, float)) and isinstance(vb, (int, float)): change = round(vb - va, 2) if change != 0: changes[f"{prefix}_{k}"] = change return changes lh_a = result_a.get("link_health", {}) lh_b = result_b.get("link_health", {}) diff["link_health_changes"] = _safe_diff(lh_a, lh_b, "link") rd_a = result_a.get("redirect_audit", {}) rd_b = result_b.get("redirect_audit", {}) diff["redirect_changes"] = _safe_diff(rd_a, rd_b, "redirect") return diff except Exception as e: self.logger.error(f"Crawl comparison failed: {e}") return {"success": False, "error": str(e)} async def extract_communication_style(self, url_list: List[str]) -> Dict[str, Any]: """ Analyzes linking patterns and social media presence using unique temporary files. """ temp_file = None try: self.logger.info(f"Extracting communication style for {len(url_list)} URLs") with tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False) as tf: temp_file = tf.name loop = asyncio.get_event_loop() await loop.run_in_executor(None, lambda: adv.crawl( url_list=url_list, output_file=temp_file, follow_links=False, custom_settings={ 'LOG_LEVEL': 'WARNING', 'CLOSESPIDER_PAGECOUNT': 10, 'DOWNLOAD_TIMEOUT': 30 } )) if not os.path.exists(temp_file) or os.path.getsize(temp_file) == 0: return {"success": False, "error": "Link extraction crawl failed."} crawl_df = pd.read_json(temp_file, lines=True) # Extract social links and internal/external stats all_links = [] if 'links_url' in crawl_df.columns: for links in crawl_df['links_url'].dropna(): if isinstance(links, str): all_links.extend(links.split("@@")) elif isinstance(links, list): all_links.extend(links) if not all_links: return {"success": True, "social_links": [], "link_stats": {"total_links_found": 0, "unique_domains": 0}} # Analyze links link_df = adv.url_to_df(all_links) social_domains = ['twitter.com', 'x.com', 'linkedin.com', 'facebook.com', 'instagram.com', 'youtube.com', 'github.com'] social_links = [] if not link_df.empty and 'netloc' in link_df.columns: social_links = link_df[link_df['netloc'].isin(social_domains)]['url'].unique().tolist() return { "success": True, "social_links": social_links, "link_stats": { "total_links_found": len(all_links), "unique_domains": link_df['netloc'].nunique() if not link_df.empty else 0 }, "timestamp": datetime.utcnow().isoformat() } except Exception as e: self.logger.error(f"Failed to extract communication style: {str(e)}") return {"success": False, "error": str(e)} finally: if temp_file and os.path.exists(temp_file): try: os.remove(temp_file) except Exception as e: self.logger.warning(f"Failed to remove temp file {temp_file}: {e}")