Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts
This commit is contained in:
@@ -5,10 +5,19 @@ AI-powered content strategy analyzer that provides insights into
|
||||
content gaps, opportunities, and competitive positioning.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
import json
|
||||
import re
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
import statistics
|
||||
from loguru import logger
|
||||
|
||||
from ..llm_providers.main_text_generation import llm_text_gen
|
||||
from middleware.logging_middleware import seo_logger
|
||||
|
||||
from .sitemap_service import SitemapService
|
||||
|
||||
class ContentStrategyService:
|
||||
"""Service for AI-powered content strategy analysis"""
|
||||
|
||||
@@ -22,30 +31,540 @@ class ContentStrategyService:
|
||||
website_url: str,
|
||||
competitors: List[str] = None,
|
||||
target_keywords: List[str] = None,
|
||||
custom_parameters: Dict[str, Any] = None
|
||||
custom_parameters: Dict[str, Any] = None,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Analyze content strategy and opportunities"""
|
||||
# Placeholder implementation
|
||||
return {
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
competitors = competitors or []
|
||||
target_keywords = target_keywords or []
|
||||
custom_parameters = custom_parameters or {}
|
||||
|
||||
sitemap_service = SitemapService()
|
||||
|
||||
discovered_user_sitemap = await sitemap_service.discover_sitemap_url(website_url)
|
||||
user_sitemap_result = None
|
||||
if discovered_user_sitemap:
|
||||
user_sitemap_result = await sitemap_service.analyze_sitemap(
|
||||
sitemap_url=discovered_user_sitemap,
|
||||
analyze_content_trends=True,
|
||||
analyze_publishing_patterns=True,
|
||||
include_ai_insights=False
|
||||
)
|
||||
|
||||
competitor_sitemaps: Dict[str, Optional[str]] = {}
|
||||
competitor_results: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
for competitor_url in competitors[:5]:
|
||||
sitemap_url = await sitemap_service.discover_sitemap_url(competitor_url)
|
||||
competitor_sitemaps[competitor_url] = sitemap_url
|
||||
if sitemap_url:
|
||||
try:
|
||||
competitor_results[competitor_url] = await sitemap_service.analyze_sitemap(
|
||||
sitemap_url=sitemap_url,
|
||||
analyze_content_trends=True,
|
||||
analyze_publishing_patterns=True,
|
||||
include_ai_insights=False
|
||||
)
|
||||
except Exception as e:
|
||||
competitor_results[competitor_url] = {"error": str(e)}
|
||||
|
||||
deterministic = self._build_deterministic_insights(
|
||||
website_url=website_url,
|
||||
user_sitemap_url=discovered_user_sitemap,
|
||||
user_sitemap_result=user_sitemap_result,
|
||||
competitor_sitemaps=competitor_sitemaps,
|
||||
competitor_results=competitor_results,
|
||||
target_keywords=target_keywords
|
||||
)
|
||||
|
||||
ai_strategy = None
|
||||
ai_error = None
|
||||
if user_id:
|
||||
try:
|
||||
prompt = self._build_ai_prompt(
|
||||
website_url=website_url,
|
||||
target_keywords=target_keywords,
|
||||
custom_parameters=custom_parameters,
|
||||
deterministic_summary=deterministic
|
||||
)
|
||||
ai_response = llm_text_gen(
|
||||
prompt=prompt,
|
||||
system_prompt=self._get_system_prompt(),
|
||||
user_id=user_id
|
||||
)
|
||||
ai_strategy = self._parse_json_response(ai_response)
|
||||
|
||||
await seo_logger.log_ai_analysis(
|
||||
tool_name=self.service_name,
|
||||
prompt=prompt,
|
||||
response=ai_response,
|
||||
model_used="gemini-2.0-flash-001"
|
||||
)
|
||||
except Exception as e:
|
||||
ai_error = str(e)
|
||||
|
||||
execution_time = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
result = {
|
||||
"website_url": website_url,
|
||||
"analysis_type": "content_strategy",
|
||||
"competitors_analyzed": len(competitors) if competitors else 0,
|
||||
"content_gaps": [
|
||||
{"topic": "SEO best practices", "opportunity_score": 85, "difficulty": "Medium"},
|
||||
{"topic": "Content marketing", "opportunity_score": 78, "difficulty": "Low"}
|
||||
],
|
||||
"opportunities": [
|
||||
{"type": "Trending topics", "count": 15, "potential_traffic": "High"},
|
||||
{"type": "Long-tail keywords", "count": 45, "potential_traffic": "Medium"}
|
||||
],
|
||||
"content_performance": {"top_performing": 12, "underperforming": 8},
|
||||
"recommendations": [
|
||||
"Create content around trending SEO topics",
|
||||
"Optimize existing content for long-tail keywords",
|
||||
"Develop content series for better engagement"
|
||||
],
|
||||
"competitive_analysis": {"content_leadership": "moderate", "gaps_identified": 8}
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"execution_time": execution_time,
|
||||
"inputs": {
|
||||
"competitors": competitors[:5],
|
||||
"target_keywords": target_keywords,
|
||||
"custom_parameters": custom_parameters
|
||||
},
|
||||
"data_sources": {
|
||||
"user_sitemap_url": discovered_user_sitemap,
|
||||
"competitor_sitemaps": competitor_sitemaps
|
||||
},
|
||||
"deterministic_insights": deterministic,
|
||||
"ai_strategy": ai_strategy,
|
||||
"ai_error": ai_error
|
||||
}
|
||||
|
||||
await seo_logger.log_tool_usage(
|
||||
tool_name=self.service_name,
|
||||
input_data={
|
||||
"website_url": website_url,
|
||||
"competitors_count": len(competitors),
|
||||
"target_keywords_count": len(target_keywords),
|
||||
"has_user_sitemap": bool(discovered_user_sitemap)
|
||||
},
|
||||
output_data={
|
||||
"website_url": website_url,
|
||||
"has_ai_strategy": bool(ai_strategy),
|
||||
"has_ai_error": bool(ai_error),
|
||||
"execution_time": execution_time
|
||||
},
|
||||
success=True if (ai_strategy is not None or deterministic is not None) else False
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
async def analyze_competitive_sitemap_benchmarking(
    self,
    website_url: str,
    competitors: List[str],
    max_competitors: Optional[int] = None,
    user_id: Optional[str] = None
) -> Dict[str, Any]:
    """Benchmark the user's sitemap against competitor sitemaps.

    Discovers and analyses the sitemap of ``website_url``, then discovers
    and analyses the competitor sitemaps concurrently with
    ``asyncio.gather``. Each successful result is condensed with
    ``_summarize_sitemap`` and the summaries are compared by
    ``_build_competitive_sitemap_benchmark``.

    Args:
        website_url: Site whose sitemap is benchmarked.
        competitors: Competitor URLs. ``None`` is tolerated; entries that
            are not non-blank strings are dropped.
        max_competitors: When truthy, only the first
            ``max(0, int(max_competitors))`` competitors are kept.
            ``None`` and ``0`` apply no limit.
        user_id: Forwarded to ``SitemapService.analyze_sitemap``.

    Returns:
        A dict with the keys ``analysis_type``, ``timestamp``,
        ``execution_time``, ``inputs``, ``data_sources``, ``user``,
        ``competitors`` and ``benchmark``. A failed user sitemap analysis is
        reported in ``user.error``; failed competitor discovery or analysis
        is reported per URL in ``competitors.errors``.
    """
    start_time = datetime.utcnow()
    # Using WARNING level to ensure visibility in production logs as requested by user
    # `competitors or []` keeps this log line from raising TypeError when the
    # caller passes None, which the normalisation just below explicitly allows.
    logger.warning(f"🚀 [START] Competitive sitemap benchmarking for {website_url} with {len(competitors or [])} competitors")

    competitors = [c for c in (competitors or []) if isinstance(c, str) and c.strip()]
    if max_competitors:
        competitors = competitors[: max(0, int(max_competitors))]

    if not competitors:
        # Not fatal: the user's own sitemap is still analysed below.
        logger.warning(f"No competitors provided for benchmarking {website_url}")

    sitemap_service = SitemapService()

    logger.warning(f"🔍 [PROGRESS] Discovering user sitemap for {website_url}")
    discovered_user_sitemap = await sitemap_service.discover_sitemap_url(website_url)
    user_sitemap_result = None
    user_error = None
    if discovered_user_sitemap:
        try:
            logger.warning(f"⚡ [PROGRESS] Analyzing user sitemap: {discovered_user_sitemap}")
            user_sitemap_result = await sitemap_service.analyze_sitemap(
                sitemap_url=discovered_user_sitemap,
                analyze_content_trends=True,
                analyze_publishing_patterns=True,
                include_ai_insights=False,
                user_id=user_id
            )
        except Exception as e:
            user_error = str(e)
            logger.error(f"Error analyzing user sitemap {discovered_user_sitemap}: {e}")
    else:
        user_error = "No sitemap discovered for your website. Please ensure your site has a valid sitemap.xml."
        logger.warning(f"⚠️ No sitemap found for user website {website_url}")

    competitor_sitemaps: Dict[str, Optional[str]] = {}
    competitor_results: Dict[str, Dict[str, Any]] = {}
    competitor_errors: Dict[str, str] = {}

    logger.warning(f"🔍 [PROGRESS] Discovering sitemaps for {len(competitors)} competitors")
    discovery_tasks = [sitemap_service.discover_sitemap_url(u) for u in competitors]
    # return_exceptions=True: one failing competitor must not abort the others.
    discovery_results = await asyncio.gather(*discovery_tasks, return_exceptions=True)
    for url, res in zip(competitors, discovery_results):
        if isinstance(res, Exception):
            competitor_sitemaps[url] = None
            competitor_errors[url] = str(res)
            logger.warning(f"Error discovering sitemap for competitor {url}: {res}")
        else:
            competitor_sitemaps[url] = res
            if not res:
                competitor_errors[url] = "No sitemap found"
                logger.info(f"ℹ️ No sitemap found for competitor {url}")
            else:
                logger.info(f"✅ Found sitemap for competitor {url}: {res}")

    to_analyze = [(url, competitor_sitemaps.get(url)) for url in competitors if competitor_sitemaps.get(url)]
    logger.warning(f"⚡ [PROGRESS] Analyzing {len(to_analyze)} competitor sitemaps")

    # Error texts that indicate an unusable sitemap rather than a real failure;
    # these are logged at WARNING instead of ERROR.
    invalid_sitemap_markers = (
        "URL returned a webpage",
        "Failed to parse sitemap XML",
        "no element found",
    )

    def is_invalid_sitemap_error(message: str) -> bool:
        return any(marker in message for marker in invalid_sitemap_markers)

    # Helper for safe analysis with timeout
    async def analyze_with_timeout(url, sm):
        """Analyse one competitor sitemap; return the result or the exception."""
        try:
            logger.warning(f"🕒 [START] Analyzing {url} with 300s timeout")
            # 5 minute timeout per competitor to prevent total blocking
            result = await asyncio.wait_for(
                sitemap_service.analyze_sitemap(
                    sitemap_url=sm,
                    analyze_content_trends=True,
                    analyze_publishing_patterns=True,
                    include_ai_insights=False,
                    user_id=user_id
                ),
                timeout=300.0
            )
            logger.warning(f"✅ [DONE] Analysis finished for {url}")
            return result
        except asyncio.TimeoutError:
            logger.error(f"⏱️ Analysis timed out for competitor {url} (limit: 300s)")
            return TimeoutError("Analysis timed out after 300s")
        except Exception as e:
            msg = str(e)
            if is_invalid_sitemap_error(msg):
                logger.warning(f"⚠️ Analysis skipped for {url}: Invalid sitemap ({msg})")
            else:
                logger.error(f"❌ Analysis failed for {url}: {e}")
            return e

    analysis_tasks = [
        analyze_with_timeout(url, sm)
        for (url, sm) in to_analyze
    ]
    analysis_results = await asyncio.gather(*analysis_tasks, return_exceptions=True)
    for (url, _), res in zip(to_analyze, analysis_results):
        if isinstance(res, Exception):
            competitor_errors[url] = str(res)
            if not is_invalid_sitemap_error(str(res)):
                logger.error(f"Error analyzing sitemap for competitor {url}: {res}")
        else:
            competitor_results[url] = res

    user_summary = self._summarize_sitemap(user_sitemap_result)
    competitor_summaries: Dict[str, Dict[str, Any]] = {}
    for competitor_url, result in competitor_results.items():
        if result and isinstance(result, dict) and "error" not in result:
            competitor_summaries[competitor_url] = self._summarize_sitemap(result)

    benchmark = self._build_competitive_sitemap_benchmark(
        website_url=website_url,
        user_summary=user_summary,
        competitor_summaries=competitor_summaries
    )

    execution_time = (datetime.utcnow() - start_time).total_seconds()

    return {
        "analysis_type": "competitive_sitemap_benchmarking",
        "timestamp": datetime.utcnow().isoformat(),
        "execution_time": execution_time,
        "inputs": {
            "website_url": website_url,
            "competitors": competitors,
            "max_competitors": max_competitors
        },
        "data_sources": {
            "user_sitemap_url": discovered_user_sitemap,
            "competitor_sitemaps": competitor_sitemaps
        },
        "user": {
            "summary": user_summary,
            "error": user_error
        },
        "competitors": {
            "summaries": competitor_summaries,
            "errors": competitor_errors
        },
        "benchmark": benchmark
    }
|
||||
|
||||
def _safe_ratio(self, numerator: Any, denominator: Any) -> Optional[float]:
|
||||
try:
|
||||
num = float(numerator)
|
||||
den = float(denominator)
|
||||
if den <= 0:
|
||||
return None
|
||||
return round(num / den, 4)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _as_float(self, value: Any) -> Optional[float]:
|
||||
try:
|
||||
if value is None:
|
||||
return None
|
||||
return float(value)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _median(self, values: List[Optional[float]]) -> Optional[float]:
|
||||
cleaned = [v for v in values if isinstance(v, (int, float))]
|
||||
if not cleaned:
|
||||
return None
|
||||
try:
|
||||
return float(statistics.median(cleaned))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _build_competitive_sitemap_benchmark(
|
||||
self,
|
||||
website_url: str,
|
||||
user_summary: Dict[str, Any],
|
||||
competitor_summaries: Dict[str, Dict[str, Any]]
|
||||
) -> Dict[str, Any]:
|
||||
user_patterns = user_summary.get("top_url_patterns") or {}
|
||||
user_sections = set(user_patterns.keys())
|
||||
|
||||
competitor_section_stats: Dict[str, Dict[str, Any]] = {}
|
||||
competitor_metrics: List[Dict[str, Any]] = []
|
||||
|
||||
for competitor_url, summary in competitor_summaries.items():
|
||||
patterns = summary.get("top_url_patterns") or {}
|
||||
total_urls = summary.get("total_urls") or 0
|
||||
span_days = (summary.get("date_range") or {}).get("span_days")
|
||||
competitor_metrics.append({
|
||||
"competitor_url": competitor_url,
|
||||
"total_urls": summary.get("total_urls"),
|
||||
"sections_count": len(patterns.keys()),
|
||||
"average_path_depth": summary.get("average_path_depth"),
|
||||
"max_path_depth": summary.get("max_path_depth"),
|
||||
"publishing_velocity": summary.get("publishing_velocity"),
|
||||
"lastmod_coverage": self._safe_ratio(summary.get("total_dated_urls"), total_urls) if isinstance(summary.get("total_dated_urls"), (int, float)) else None,
|
||||
"span_days": span_days
|
||||
})
|
||||
|
||||
for section, count in patterns.items():
|
||||
if not section:
|
||||
continue
|
||||
if section not in competitor_section_stats:
|
||||
competitor_section_stats[section] = {
|
||||
"competitor_presence": 0,
|
||||
"total_url_count": 0
|
||||
}
|
||||
competitor_section_stats[section]["competitor_presence"] += 1
|
||||
competitor_section_stats[section]["total_url_count"] += int(count or 0)
|
||||
|
||||
competitor_count = len(competitor_summaries)
|
||||
missing_sections = []
|
||||
for section, stats in sorted(
|
||||
competitor_section_stats.items(),
|
||||
key=lambda x: (x[1].get("competitor_presence", 0), x[1].get("total_url_count", 0)),
|
||||
reverse=True
|
||||
):
|
||||
# Filter out known non-content patterns:
|
||||
# 1. Sections present in user site
|
||||
# 2. Short sections <= 3 chars (likely language codes like /en, /es, /fr)
|
||||
# 3. Common technical paths (wp-content, wp-includes, cgi-bin)
|
||||
if section in user_sections:
|
||||
continue
|
||||
|
||||
if len(section) <= 3: # e.g., /es, /fr, /pt
|
||||
continue
|
||||
|
||||
if any(tech in section.lower() for tech in ['wp-content', 'wp-includes', 'cgi-bin', 'assets', 'static']):
|
||||
continue
|
||||
|
||||
if competitor_count > 0 and stats.get("competitor_presence", 0) >= max(2, int(round(0.4 * competitor_count))):
|
||||
missing_sections.append({
|
||||
"section": section,
|
||||
# Ensure presence is a normalized ratio (0.0 - 1.0)
|
||||
"competitor_presence": self._safe_ratio(stats.get("competitor_presence", 0), competitor_count) or 0,
|
||||
"competitor_count": stats.get("competitor_presence"),
|
||||
"total_url_count": stats.get("total_url_count", 0)
|
||||
})
|
||||
missing_sections = missing_sections[:15]
|
||||
|
||||
velocity_values = [self._as_float(s.get("publishing_velocity")) for s in competitor_summaries.values()]
|
||||
depth_values = [self._as_float(s.get("average_path_depth")) for s in competitor_summaries.values()]
|
||||
competitor_velocity_median = self._median(velocity_values)
|
||||
competitor_depth_median = self._median(depth_values)
|
||||
|
||||
user_velocity = self._as_float(user_summary.get("publishing_velocity"))
|
||||
user_depth = self._as_float(user_summary.get("average_path_depth"))
|
||||
user_total_urls = user_summary.get("total_urls") or 0
|
||||
|
||||
opportunities = []
|
||||
# Note: 'missing_sections' opportunity removed to avoid duplication with 'Competitor Content Strategy Patterns' section
|
||||
|
||||
# Insight 1: Content Volume Gap
|
||||
competitor_total_urls_list = [m["total_urls"] for m in competitor_metrics if m.get("total_urls")]
|
||||
competitor_urls_median = self._median(competitor_total_urls_list)
|
||||
|
||||
if competitor_urls_median and user_total_urls < competitor_urls_median * 0.8:
|
||||
opportunities.append({
|
||||
"type": "content_volume_gap",
|
||||
"title": "Competitors have significantly more content",
|
||||
"metrics": {
|
||||
"user_total_pages": user_total_urls,
|
||||
"competitor_median_total_pages": int(competitor_urls_median)
|
||||
}
|
||||
})
|
||||
|
||||
# Insight 2: Publishing Velocity Gap
|
||||
if competitor_velocity_median is not None and user_velocity is not None:
|
||||
if user_velocity < competitor_velocity_median * 0.75:
|
||||
opportunities.append({
|
||||
"type": "publishing_velocity_gap",
|
||||
"title": "Competitors appear to publish more frequently",
|
||||
"metrics": {
|
||||
"user_publishing_velocity": user_velocity,
|
||||
"competitor_median_publishing_velocity": competitor_velocity_median
|
||||
}
|
||||
})
|
||||
|
||||
# Insight 3: Architecture Depth Gap
|
||||
if competitor_depth_median is not None and user_depth is not None:
|
||||
if user_depth < competitor_depth_median - 0.5:
|
||||
opportunities.append({
|
||||
"type": "architecture_depth_gap",
|
||||
"title": "Competitors have deeper site structure",
|
||||
"metrics": {
|
||||
"user_average_path_depth": user_depth,
|
||||
"competitor_median_average_path_depth": competitor_depth_median
|
||||
}
|
||||
})
|
||||
|
||||
competitor_metrics_sorted = sorted(
|
||||
competitor_metrics,
|
||||
key=lambda x: (x.get("total_urls") or 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
return {
|
||||
"website_url": website_url,
|
||||
"competitors_analyzed": competitor_count,
|
||||
"user_sections_count": len(user_sections),
|
||||
"competitor_section_leaders": competitor_metrics_sorted[:10],
|
||||
"gaps": {
|
||||
"missing_sections": missing_sections
|
||||
},
|
||||
"opportunities": opportunities
|
||||
}
|
||||
|
||||
def _summarize_sitemap(self, sitemap_result: Optional[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
if not sitemap_result or not isinstance(sitemap_result, dict):
|
||||
return {}
|
||||
structure = sitemap_result.get("structure_analysis") or {}
|
||||
trends = sitemap_result.get("content_trends") or {}
|
||||
patterns = sitemap_result.get("publishing_patterns") or {}
|
||||
return {
|
||||
"total_urls": sitemap_result.get("total_urls"),
|
||||
"top_url_patterns": structure.get("url_patterns") or {},
|
||||
"file_types": structure.get("file_types") or {},
|
||||
"average_path_depth": structure.get("average_path_depth"),
|
||||
"max_path_depth": structure.get("max_path_depth"),
|
||||
"publishing_velocity": trends.get("publishing_velocity"),
|
||||
"date_range": trends.get("date_range") or {},
|
||||
"total_dated_urls": trends.get("total_dated_urls"),
|
||||
"priority_distribution": patterns.get("priority_distribution") or {},
|
||||
"changefreq_distribution": patterns.get("changefreq_distribution") or {},
|
||||
}
|
||||
|
||||
def _build_deterministic_insights(
|
||||
self,
|
||||
website_url: str,
|
||||
user_sitemap_url: Optional[str],
|
||||
user_sitemap_result: Optional[Dict[str, Any]],
|
||||
competitor_sitemaps: Dict[str, Optional[str]],
|
||||
competitor_results: Dict[str, Dict[str, Any]],
|
||||
target_keywords: List[str]
|
||||
) -> Dict[str, Any]:
|
||||
user_summary = self._summarize_sitemap(user_sitemap_result)
|
||||
competitor_summaries: Dict[str, Dict[str, Any]] = {}
|
||||
for competitor_url, result in competitor_results.items():
|
||||
if result and isinstance(result, dict) and "error" not in result:
|
||||
competitor_summaries[competitor_url] = self._summarize_sitemap(result)
|
||||
|
||||
user_sections = set((user_summary.get("top_url_patterns") or {}).keys())
|
||||
competitor_section_union: Dict[str, int] = {}
|
||||
for comp_summary in competitor_summaries.values():
|
||||
patterns = comp_summary.get("top_url_patterns") or {}
|
||||
for k, v in patterns.items():
|
||||
competitor_section_union[k] = competitor_section_union.get(k, 0) + int(v or 0)
|
||||
|
||||
missing_vs_competitors = []
|
||||
for section, count in sorted(competitor_section_union.items(), key=lambda x: x[1], reverse=True):
|
||||
if section not in user_sections and section:
|
||||
missing_vs_competitors.append({"section": section, "competitor_url_count": count})
|
||||
missing_vs_competitors = missing_vs_competitors[:10]
|
||||
|
||||
keyword_hints = []
|
||||
if target_keywords:
|
||||
user_pattern_text = " ".join(sorted(user_sections))
|
||||
for kw in target_keywords[:25]:
|
||||
kw_clean = (kw or "").strip()
|
||||
if not kw_clean:
|
||||
continue
|
||||
hit = kw_clean.lower() in user_pattern_text.lower()
|
||||
keyword_hints.append({"keyword": kw_clean, "seen_in_url_patterns": hit})
|
||||
|
||||
return {
|
||||
"website_url": website_url,
|
||||
"sitemap_found": bool(user_sitemap_url),
|
||||
"user_sitemap_summary": user_summary,
|
||||
"competitor_sitemap_summaries": competitor_summaries,
|
||||
"gaps_vs_competitors": {
|
||||
"missing_sections": missing_vs_competitors
|
||||
},
|
||||
"keyword_hints": keyword_hints
|
||||
}
|
||||
|
||||
def _get_system_prompt(self) -> str:
|
||||
return (
|
||||
"You are an SEO and content strategy expert for non-technical content creators, "
|
||||
"digital marketers, and solopreneurs. Return ONLY valid minified JSON."
|
||||
)
|
||||
|
||||
def _build_ai_prompt(
|
||||
self,
|
||||
website_url: str,
|
||||
target_keywords: List[str],
|
||||
custom_parameters: Dict[str, Any],
|
||||
deterministic_summary: Dict[str, Any]
|
||||
) -> str:
|
||||
required_schema = {
|
||||
"positioning_summary": "",
|
||||
"content_gaps": [],
|
||||
"topic_clusters": [],
|
||||
"publishing_recommendations": {},
|
||||
"quick_wins": [],
|
||||
"risks": [],
|
||||
"meta": {"confidence": 0.0, "inputs_used": []}
|
||||
}
|
||||
|
||||
return (
|
||||
"RULES:\n"
|
||||
"- Return ONE single-line MINIFIED JSON object only.\n"
|
||||
"- No markdown, code fences, or prose.\n"
|
||||
"- Use EXACTLY the top-level keys from this schema: "
|
||||
f"{list(required_schema.keys())}.\n"
|
||||
"- For arrays of objects, keep objects small and consistent.\n\n"
|
||||
f"WEBSITE: {website_url}\n"
|
||||
f"TARGET_KEYWORDS: {target_keywords[:25]}\n"
|
||||
f"CUSTOM_PARAMETERS: {custom_parameters}\n\n"
|
||||
f"SITEMAP_DERIVED_DATA (compact): {json.dumps(deterministic_summary, ensure_ascii=False)[:12000]}\n\n"
|
||||
"Now produce the strategy JSON."
|
||||
)
|
||||
|
||||
def _parse_json_response(self, text: str) -> Dict[str, Any]:
|
||||
cleaned = text.strip()
|
||||
cleaned = cleaned.replace("```json", "").replace("```", "").strip()
|
||||
|
||||
match = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)
|
||||
if match:
|
||||
cleaned = match.group(0)
|
||||
|
||||
return json.loads(cleaned)
|
||||
|
||||
async def health_check(self) -> Dict[str, Any]:
|
||||
"""Health check for the content strategy service"""
|
||||
@@ -53,4 +572,4 @@ class ContentStrategyService:
|
||||
"status": "operational",
|
||||
"service": self.service_name,
|
||||
"last_check": datetime.utcnow().isoformat()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,8 @@ class MetaDescriptionService:
|
||||
tone: str = "General",
|
||||
search_intent: str = "Informational Intent",
|
||||
language: str = "English",
|
||||
custom_prompt: Optional[str] = None
|
||||
custom_prompt: Optional[str] = None,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate AI-powered meta descriptions based on keywords and parameters
|
||||
@@ -65,7 +66,8 @@ class MetaDescriptionService:
|
||||
|
||||
ai_response = llm_text_gen(
|
||||
prompt=prompt,
|
||||
system_prompt=self._get_system_prompt(language)
|
||||
system_prompt=self._get_system_prompt(language),
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
# Parse and structure the response
|
||||
@@ -417,4 +419,4 @@ Focus on creating descriptions that will improve click-through rates for content
|
||||
"service": self.service_name,
|
||||
"error": str(e),
|
||||
"last_check": datetime.utcnow().isoformat()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,9 +5,13 @@ Comprehensive on-page SEO analyzer with AI-enhanced insights
|
||||
for content optimization and technical improvements.
|
||||
"""
|
||||
|
||||
import aiohttp
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class OnPageSEOService:
|
||||
"""Service for comprehensive on-page SEO analysis"""
|
||||
@@ -17,6 +21,155 @@ class OnPageSEOService:
|
||||
self.service_name = "on_page_seo_analyzer"
|
||||
logger.info(f"Initialized {self.service_name}")
|
||||
|
||||
async def _fetch_page(self, url: str) -> tuple[Optional[str], int]:
    """Fetch ``url`` and return ``(html, status)``.

    Returns:
        ``(body_text, 200)`` for an HTTP 200 response, ``(None, status)``
        for any other status, and ``(None, 500)`` when the request raises
        (the error is logged). A 500 here can therefore mean either a real
        server error or a local failure such as a timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +https://alwrity.com)'
    }
    # ClientTimeout is the documented timeout type; total=10 keeps the same
    # 10-second overall limit the bare number expressed.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, timeout=timeout) as response:
                if response.status == 200:
                    return await response.text(), 200
                return None, response.status
    except Exception as e:
        logger.error(f"Error fetching {url}: {str(e)}")
        return None, 500
|
||||
|
||||
def _analyze_meta_tags(self, soup: BeautifulSoup) -> Dict[str, Any]:
    """Score the page's title, meta description, viewport and social tags.

    The score starts at 100 and loses 20 for a missing title, description
    or viewport tag, and 10 for a title outside 30-60 characters or a
    description outside 70-160 characters. It is floored at 0.

    Returns:
        Dict with ``title_length``, ``meta_description_length``,
        ``has_viewport``, ``charset``, ``robots_meta``, ``og_tags``,
        ``twitter_card``, ``score`` and ``issues``.
    """
    def attr_value(tag, attr, default=None):
        # Tag.get() returns None for an absent attribute, whereas tag[attr]
        # raises KeyError (e.g. <meta name="description"> with no content).
        if tag is None:
            return default
        value = tag.get(attr)
        return default if value is None else value

    # get_text() also covers titles whose .string is None because the tag
    # holds more than one child node; strip() keeps surrounding whitespace
    # out of the length check.
    title = soup.title.get_text().strip() if soup.title else None
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    viewport = soup.find('meta', attrs={'name': 'viewport'})
    robots = soup.find('meta', attrs={'name': 'robots'})
    charset = soup.find('meta', attrs={'charset': True})

    # Social Tags
    og_title = soup.find('meta', property='og:title')
    og_desc = soup.find('meta', property='og:description')
    og_image = soup.find('meta', property='og:image')
    twitter_card = soup.find('meta', attrs={'name': 'twitter:card'})

    issues = []
    score = 100

    # Title Analysis
    if not title:
        issues.append("Missing title tag")
        score -= 20
    elif len(title) < 30 or len(title) > 60:
        issues.append(f"Title length ({len(title)} chars) should be 30-60 chars")
        score -= 10

    # Description Analysis
    desc_content = attr_value(meta_desc, 'content')
    if not desc_content:
        issues.append("Missing meta description")
        score -= 20
    elif len(desc_content) < 70 or len(desc_content) > 160:
        issues.append(f"Description length ({len(desc_content)} chars) should be 70-160 chars")
        score -= 10

    # Viewport
    if not viewport:
        issues.append("Missing viewport meta tag")
        score -= 20

    og_found = list(filter(None, ['Title' if og_title else '', 'Desc' if og_desc else '', 'Image' if og_image else '']))

    return {
        "title_length": f"{len(title)} chars" if title else "Missing",
        "meta_description_length": f"{len(desc_content)} chars" if desc_content else "Missing",
        "has_viewport": bool(viewport),
        "charset": attr_value(charset, 'charset', "Missing"),
        "robots_meta": attr_value(robots, 'content', "Missing (Default: index, follow)"),
        "og_tags": f"Found: {', '.join(og_found)}" if og_found else "None",
        "twitter_card": attr_value(twitter_card, 'content', "Missing"),
        "score": max(0, score),
        "issues": issues
    }
|
||||
|
||||
def _analyze_technical(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Score canonical tag, structured data and H1 usage.

    The score starts at 100 and loses 10 for a missing canonical tag, 20
    for no H1 and 10 for more than one H1. ``url`` is accepted but not
    read by this method.

    Returns:
        Dict with ``canonical_tag``, ``schema_markup``, ``h1_count``,
        ``score`` and ``issues``.
    """
    canonical = soup.find('link', attrs={'rel': 'canonical'})
    schema = soup.find_all('script', type='application/ld+json')

    # .get() avoids the KeyError that canonical['href'] raises for a
    # <link rel="canonical"> without an href; such a tag counts as missing.
    canonical_href = canonical.get('href') if canonical is not None else None

    issues = []
    score = 100

    if canonical_href is None:
        issues.append("Missing canonical tag")
        score -= 10

    # Check H1
    h1_tags = soup.find_all('h1')
    if len(h1_tags) == 0:
        issues.append("Missing H1 tag")
        score -= 20
    elif len(h1_tags) > 1:
        issues.append(f"Multiple H1 tags found ({len(h1_tags)})")
        score -= 10

    return {
        "canonical_tag": canonical_href if canonical_href is not None else "Missing",
        "schema_markup": f"Found {len(schema)} schema objects",
        "h1_count": len(h1_tags),
        "score": max(0, score),
        "issues": issues
    }
|
||||
|
||||
def _analyze_content(self, soup: BeautifulSoup) -> Dict[str, Any]:
    """Score word count and image alt-text coverage.

    The score starts at 100 and loses 20 when the page has fewer than 300
    words and 10 when any image has no (or an empty) ``alt`` attribute.

    Note: ``<script>`` and ``<style>`` elements are removed from ``soup``
    in place before the text is extracted, so the caller's tree is changed.

    Returns:
        Dict with ``word_count``, ``total_images``, ``images_without_alt``,
        ``readability``, ``score`` and ``issues``.
    """
    min_words = 300

    # Remove scripts and styles
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    words = len(re.findall(r'\w+', text))

    images = soup.find_all('img')
    images_without_alt = sum(1 for img in images if not img.get('alt'))

    issues = []
    score = 100

    # One threshold drives both the issue and the readability label, so a
    # page with exactly 300 words is no longer "Needs Improvement" while
    # raising no issue.
    is_thin = words < min_words
    if is_thin:
        issues.append(f"Low word count ({words} words)")
        score -= 20

    if images_without_alt > 0:
        issues.append(f"{images_without_alt} images missing alt text")
        score -= 10

    return {
        "word_count": words,
        "total_images": len(images),
        "images_without_alt": images_without_alt,
        "readability": "Needs Improvement" if is_thin else "Good",  # Placeholder for readability algo
        "score": max(0, score),
        "issues": issues
    }
|
||||
|
||||
def _analyze_url_structure(self, url: str) -> Dict[str, Any]:
|
||||
parsed = urlparse(url)
|
||||
return {
|
||||
"protocol": parsed.scheme,
|
||||
"domain": parsed.netloc,
|
||||
"path_depth": len(parsed.path.strip('/').split('/')) if parsed.path else 0,
|
||||
"is_https": parsed.scheme == 'https'
|
||||
}
|
||||
|
||||
def _calculate_overall_score(self, *analyses) -> int:
|
||||
total = sum(a.get('score', 0) for a in analyses)
|
||||
return round(total / len(analyses))
|
||||
|
||||
def _generate_summary(self, *analyses) -> Dict[str, Any]:
|
||||
critical_issues = []
|
||||
for a in analyses:
|
||||
for issue in a.get('issues', []):
|
||||
critical_issues.append({"message": issue, "severity": "critical", "category": "SEO"})
|
||||
return {"critical_issues": critical_issues}
|
||||
|
||||
async def analyze_on_page_seo(
|
||||
self,
|
||||
url: str,
|
||||
@@ -25,18 +178,53 @@ class OnPageSEOService:
|
||||
analyze_content_quality: bool = True
|
||||
) -> Dict[str, Any]:
|
||||
"""Analyze on-page SEO factors"""
|
||||
# Placeholder implementation
|
||||
return {
|
||||
"url": url,
|
||||
"overall_score": 75,
|
||||
"title_analysis": {"score": 80, "issues": [], "recommendations": []},
|
||||
"meta_description": {"score": 70, "issues": [], "recommendations": []},
|
||||
"heading_structure": {"score": 85, "issues": [], "recommendations": []},
|
||||
"content_analysis": {"score": 75, "word_count": 1500, "readability": "Good"},
|
||||
"keyword_analysis": {"target_keywords": target_keywords or [], "optimization": "Moderate"},
|
||||
"image_analysis": {"total_images": 10, "missing_alt": 2} if analyze_images else {},
|
||||
"recommendations": ["Optimize meta description", "Add more target keywords"]
|
||||
}
|
||||
try:
|
||||
# Add protocol if missing
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = 'https://' + url
|
||||
|
||||
html_content, status_code = await self._fetch_page(url)
|
||||
|
||||
if not html_content:
|
||||
# Return error structure
|
||||
return {
|
||||
"url": url,
|
||||
"overall_score": 0,
|
||||
"summary": {"critical_issues": [{"message": f"Failed to fetch URL (Status: {status_code})", "severity": "critical", "category": "Connectivity"}]},
|
||||
"meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {}
|
||||
}
|
||||
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Run Analyses
|
||||
meta_analysis = self._analyze_meta_tags(soup)
|
||||
technical_analysis = self._analyze_technical(soup, url)
|
||||
content_analysis = self._analyze_content(soup)
|
||||
url_analysis = self._analyze_url_structure(url)
|
||||
|
||||
result = {
|
||||
"url": url,
|
||||
"overall_score": self._calculate_overall_score(meta_analysis, technical_analysis, content_analysis),
|
||||
"meta": meta_analysis,
|
||||
"technical": technical_analysis,
|
||||
"content_health": content_analysis,
|
||||
"url_structure": url_analysis,
|
||||
"performance": {"load_time": "Real-time check pending"},
|
||||
"accessibility": {"images_without_alt": content_analysis["images_without_alt"]},
|
||||
"ux": {"viewport": meta_analysis["has_viewport"], "mobile_friendly": bool(meta_analysis["has_viewport"])},
|
||||
"summary": self._generate_summary(meta_analysis, technical_analysis, content_analysis)
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error analyzing {url}: {str(e)}")
|
||||
return {
|
||||
"url": url,
|
||||
"overall_score": 0,
|
||||
"summary": {"critical_issues": [{"message": str(e), "severity": "critical", "category": "System"}]},
|
||||
"meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {}
|
||||
}
|
||||
|
||||
async def health_check(self) -> Dict[str, Any]:
|
||||
"""Health check for the on-page SEO service"""
|
||||
@@ -44,4 +232,4 @@ class OnPageSEOService:
|
||||
"status": "operational",
|
||||
"service": self.service_name,
|
||||
"last_check": datetime.utcnow().isoformat()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +31,8 @@ class PageSpeedService:
|
||||
url: str,
|
||||
strategy: str = "DESKTOP",
|
||||
locale: str = "en",
|
||||
categories: List[str] = None
|
||||
categories: List[str] = None,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze website performance using Google PageSpeed Insights
|
||||
@@ -70,7 +71,7 @@ class PageSpeedService:
|
||||
structured_results = self._structure_pagespeed_results(pagespeed_data)
|
||||
|
||||
# Generate AI-enhanced insights
|
||||
ai_insights = await self._generate_ai_insights(structured_results, url, strategy)
|
||||
ai_insights = await self._generate_ai_insights(structured_results, url, strategy, user_id=user_id)
|
||||
|
||||
# Calculate optimization priority
|
||||
optimization_plan = self._create_optimization_plan(structured_results)
|
||||
@@ -281,7 +282,8 @@ class PageSpeedService:
|
||||
self,
|
||||
structured_results: Dict[str, Any],
|
||||
url: str,
|
||||
strategy: str
|
||||
strategy: str,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate AI-powered insights and recommendations"""
|
||||
|
||||
@@ -299,7 +301,8 @@ class PageSpeedService:
|
||||
# Generate AI insights
|
||||
ai_response = llm_text_gen(
|
||||
prompt=prompt,
|
||||
system_prompt=self._get_system_prompt()
|
||||
system_prompt=self._get_system_prompt(),
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
# Parse AI response
|
||||
@@ -598,4 +601,4 @@ Focus on practical advice that content creators and digital marketers can unders
|
||||
"service": self.service_name,
|
||||
"error": str(e),
|
||||
"last_check": datetime.utcnow().isoformat()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,12 +8,14 @@ content distribution, and publishing patterns for SEO optimization.
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import re
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
import xml.etree.ElementTree as ET
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import pandas as pd
|
||||
import gzip
|
||||
|
||||
from ..llm_providers.main_text_generation import llm_text_gen
|
||||
from middleware.logging_middleware import seo_logger
|
||||
@@ -52,7 +54,9 @@ class SitemapService:
|
||||
self,
|
||||
sitemap_url: str,
|
||||
analyze_content_trends: bool = True,
|
||||
analyze_publishing_patterns: bool = True
|
||||
analyze_publishing_patterns: bool = True,
|
||||
include_ai_insights: bool = True,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze website sitemap for structure and patterns
|
||||
@@ -92,10 +96,11 @@ class SitemapService:
|
||||
if analyze_publishing_patterns and sitemap_data.get("urls"):
|
||||
publishing_patterns = self._analyze_publishing_patterns(sitemap_data["urls"])
|
||||
|
||||
# Generate AI insights
|
||||
ai_insights = await self._generate_ai_insights(
|
||||
structure_analysis, content_trends, publishing_patterns, sitemap_url
|
||||
)
|
||||
ai_insights = {}
|
||||
if include_ai_insights:
|
||||
ai_insights = await self._generate_ai_insights(
|
||||
structure_analysis, content_trends, publishing_patterns, sitemap_url, user_id=user_id
|
||||
)
|
||||
|
||||
execution_time = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
@@ -119,7 +124,8 @@ class SitemapService:
|
||||
input_data={
|
||||
"sitemap_url": sitemap_url,
|
||||
"analyze_content_trends": analyze_content_trends,
|
||||
"analyze_publishing_patterns": analyze_publishing_patterns
|
||||
"analyze_publishing_patterns": analyze_publishing_patterns,
|
||||
"include_ai_insights": include_ai_insights
|
||||
},
|
||||
output_data=result,
|
||||
success=True
|
||||
@@ -145,19 +151,88 @@ class SitemapService:
|
||||
|
||||
raise
|
||||
|
||||
async def _fetch_sitemap_data(self, sitemap_url: str) -> Dict[str, Any]:
|
||||
async def _fetch_sitemap_data(self, sitemap_url: str, depth: int = 0, session: aiohttp.ClientSession = None) -> Dict[str, Any]:
|
||||
"""Fetch and parse sitemap data"""
|
||||
|
||||
# Reduced max depth from 3 to 2 to prevent infinite recursion/hanging on massive sites
|
||||
if depth > 2:
|
||||
logger.info(f"🛑 Max recursion depth (2) reached for sitemap {sitemap_url}")
|
||||
return {"urls": [], "sitemaps": [], "total_urls": 0}
|
||||
|
||||
# Use passed session or create a new local one if it's the top-level call
|
||||
local_session = False
|
||||
if session is None:
|
||||
local_session = True
|
||||
# Limit pool size and set strict timeouts
|
||||
connector = aiohttp.TCPConnector(limit_per_host=5, force_close=True)
|
||||
# Increased total timeout to 60s for slow sitemaps, but kept connect/read strict
|
||||
timeout = aiohttp.ClientTimeout(total=60, connect=10, sock_read=30)
|
||||
session = aiohttp.ClientSession(connector=connector, timeout=timeout)
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(sitemap_url, timeout=aiohttp.ClientTimeout(total=30)) as response:
|
||||
logger.info(f"🔍 Fetching sitemap: {sitemap_url} (depth={depth})")
|
||||
# 10MB limit for sitemaps
|
||||
MAX_SITEMAP_SIZE = 10 * 1024 * 1024
|
||||
|
||||
try:
|
||||
async with session.get(sitemap_url) as response:
|
||||
if response.status != 200:
|
||||
raise Exception(f"Failed to fetch sitemap: HTTP {response.status}")
|
||||
|
||||
content = await response.text()
|
||||
|
||||
# Parse XML
|
||||
root = ET.fromstring(content)
|
||||
# Check Content-Type header
|
||||
content_type = response.headers.get("Content-Type", "").lower()
|
||||
if "text/html" in content_type:
|
||||
raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
|
||||
|
||||
# Check Content-Length header if available
|
||||
content_length = response.headers.get("Content-Length")
|
||||
if content_length and int(content_length) > MAX_SITEMAP_SIZE:
|
||||
raise Exception(f"Sitemap too large: {content_length} bytes")
|
||||
|
||||
# Read with size limit (safe read)
|
||||
raw = await response.content.read(MAX_SITEMAP_SIZE + 1)
|
||||
if len(raw) > MAX_SITEMAP_SIZE:
|
||||
raise Exception(f"Sitemap size exceeds limit of {MAX_SITEMAP_SIZE} bytes")
|
||||
|
||||
if sitemap_url.lower().endswith(".gz") or (len(raw) >= 2 and raw[0] == 0x1F and raw[1] == 0x8B):
|
||||
try:
|
||||
raw = gzip.decompress(raw)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
content = raw.decode(response.charset or "utf-8", errors="replace")
|
||||
except Exception:
|
||||
content = raw.decode("utf-8", errors="replace")
|
||||
|
||||
content_stripped = content.lstrip()
|
||||
|
||||
if not content_stripped.startswith("<"):
|
||||
urls = []
|
||||
# Limit text sitemaps to 50k lines
|
||||
lines = content.splitlines()[:50000]
|
||||
for line in lines:
|
||||
line_clean = (line or "").strip()
|
||||
if not line_clean or line_clean.startswith("#"):
|
||||
continue
|
||||
if line_clean.startswith("http://") or line_clean.startswith("https://"):
|
||||
urls.append({"loc": line_clean})
|
||||
return {
|
||||
"urls": urls,
|
||||
"sitemaps": [],
|
||||
"total_urls": len(urls)
|
||||
}
|
||||
|
||||
# Check for HTML content disguised as XML
|
||||
if content.strip().lower().startswith(("<!doctype html", "<html")):
|
||||
raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
|
||||
|
||||
# Use defusedxml for safety if available, otherwise standard ET
|
||||
try:
|
||||
import defusedxml.ElementTree as DET
|
||||
root = DET.fromstring(content)
|
||||
except ImportError:
|
||||
root = ET.fromstring(content)
|
||||
|
||||
# Handle different sitemap formats
|
||||
urls = []
|
||||
@@ -172,17 +247,28 @@ class SitemapService:
|
||||
if loc is not None:
|
||||
sitemaps.append(loc.text)
|
||||
|
||||
# Fetch and parse nested sitemaps
|
||||
for nested_url in sitemaps[:10]: # Limit to 10 sitemaps
|
||||
try:
|
||||
nested_data = await self._fetch_sitemap_data(nested_url)
|
||||
urls.extend(nested_data.get("urls", []))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to fetch nested sitemap {nested_url}: {e}")
|
||||
# Fetch and parse nested sitemaps in parallel
|
||||
nested_tasks = []
|
||||
# Reduced nested limit from 10 to 5 to prevent fan-out explosion
|
||||
for nested_url in sitemaps[:5]:
|
||||
nested_tasks.append(self._fetch_sitemap_data(nested_url, depth + 1, session))
|
||||
|
||||
if nested_tasks:
|
||||
nested_results = await asyncio.gather(*nested_tasks, return_exceptions=True)
|
||||
for res in nested_results:
|
||||
if isinstance(res, Exception):
|
||||
logger.warning(f"Failed to fetch nested sitemap: {res}")
|
||||
elif isinstance(res, dict):
|
||||
urls.extend(res.get("urls", []))
|
||||
|
||||
else:
|
||||
# Regular sitemap with URLs
|
||||
# Limit to first 10k URLs per sitemap file to prevent memory issues
|
||||
url_count = 0
|
||||
for url_element in root:
|
||||
if url_count >= 10000:
|
||||
break
|
||||
|
||||
if url_element.tag.endswith('url'):
|
||||
url_data = {}
|
||||
|
||||
@@ -192,18 +278,42 @@ class SitemapService:
|
||||
|
||||
if 'loc' in url_data:
|
||||
urls.append(url_data)
|
||||
url_count += 1
|
||||
|
||||
return {
|
||||
"urls": urls,
|
||||
"sitemaps": sitemaps,
|
||||
"total_urls": len(urls)
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
# Re-raise to be caught by outer try/except
|
||||
raise e
|
||||
|
||||
except ET.ParseError as e:
|
||||
# Check if content is empty
|
||||
if not content or not content.strip():
|
||||
logger.warning(f"Sitemap is empty: {sitemap_url}")
|
||||
return {"urls": [], "sitemaps": [], "total_urls": 0}
|
||||
|
||||
# Check if content looks like HTML to give a better error message
|
||||
try:
|
||||
if "content" in locals() and ("<html" in content.lower() or "<body" in content.lower() or "<div" in content.lower()):
|
||||
raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logger.warning(f"Failed to parse sitemap XML: {e}")
|
||||
raise Exception(f"Failed to parse sitemap XML: {e}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching sitemap data: {e}")
|
||||
if "no element found" in str(e) or "not a valid XML sitemap" in str(e):
|
||||
logger.warning(f"⚠️ Sitemap parsing failed for {sitemap_url}: {e}")
|
||||
else:
|
||||
logger.error(f"Error fetching sitemap data for {sitemap_url}: {e}")
|
||||
raise
|
||||
finally:
|
||||
# Only close the session if we created it
|
||||
if local_session and session:
|
||||
await session.close()
|
||||
|
||||
def _analyze_sitemap_structure(self, sitemap_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Analyze the structure of the sitemap"""
|
||||
@@ -239,14 +349,60 @@ class SitemapService:
|
||||
# Calculate statistics
|
||||
avg_path_depth = sum(path_levels) / len(path_levels) if path_levels else 0
|
||||
|
||||
# Enhancement: Keyword Clustering & Strategic Pillar Mapping
|
||||
keyword_clusters = self._cluster_keywords_from_urls(urls)
|
||||
strategic_pillars = self._map_strategic_pillars(urls)
|
||||
|
||||
return {
|
||||
"total_urls": len(urls),
|
||||
"url_patterns": dict(sorted(url_patterns.items(), key=lambda x: x[1], reverse=True)[:10]),
|
||||
"file_types": dict(sorted(file_types.items(), key=lambda x: x[1], reverse=True)),
|
||||
"average_path_depth": round(avg_path_depth, 2),
|
||||
"max_path_depth": max(path_levels) if path_levels else 0,
|
||||
"keyword_clusters": keyword_clusters,
|
||||
"strategic_pillars": strategic_pillars,
|
||||
"structure_quality": self._assess_structure_quality(url_patterns, avg_path_depth)
|
||||
}
|
||||
|
||||
def _cluster_keywords_from_urls(self, urls: List[Dict[str, Any]]) -> Dict[str, int]:
|
||||
"""Extract and cluster keywords from URL slugs to identify content strategy focus."""
|
||||
stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'from', 'category', 'tag', 'blog', 'posts', 'archive'}
|
||||
keywords: Dict[str, int] = {}
|
||||
|
||||
for u in urls[:1000]: # Sample 1000 for performance
|
||||
path = urlparse(u.get('loc', '')).path
|
||||
# Split by non-alphanumeric and underscores
|
||||
parts = re.split(r'[^a-zA-Z0-9]', path)
|
||||
for part in parts:
|
||||
p = part.lower()
|
||||
if len(p) > 3 and p not in stop_words and not p.isdigit():
|
||||
keywords[p] = keywords.get(p, 0) + 1
|
||||
|
||||
# Return top 15 clusters
|
||||
return dict(sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:15])
|
||||
|
||||
def _map_strategic_pillars(self, urls: List[Dict[str, Any]]) -> Dict[str, int]:
|
||||
"""Categorize URLs into strategic content pillars based on common path patterns."""
|
||||
pillars = {
|
||||
"Educational": ["blog", "guides", "how-to", "learn", "academy", "resource", "documentation", "docs"],
|
||||
"Transactional": ["product", "features", "pricing", "plans", "solutions", "buy", "checkout", "cart"],
|
||||
"Comparison": ["vs", "alternative", "comparison", "reviews", "best-of"],
|
||||
"Company": ["about", "careers", "press", "contact", "team", "legal", "privacy", "terms"],
|
||||
"Tools": ["calculator", "tool", "generator", "checker", "analyzer"]
|
||||
}
|
||||
|
||||
results = {k: 0 for k in pillars}
|
||||
for u in urls:
|
||||
loc = u.get('loc', '').lower()
|
||||
found = False
|
||||
for pillar, tokens in pillars.items():
|
||||
if any(token in loc for token in tokens):
|
||||
results[pillar] += 1
|
||||
found = True
|
||||
break
|
||||
# Optional: Add "Other" category if needed
|
||||
|
||||
return results
|
||||
|
||||
def _analyze_content_trends(self, urls: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
"""Analyze content publishing trends"""
|
||||
@@ -334,7 +490,9 @@ class SitemapService:
|
||||
competitors: List[str] = None,
|
||||
industry_context: str = None,
|
||||
analyze_content_trends: bool = True,
|
||||
analyze_publishing_patterns: bool = True
|
||||
analyze_publishing_patterns: bool = True,
|
||||
include_ai_insights: bool = True,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Enhanced sitemap analysis specifically for onboarding Step 3 competitive analysis"""
|
||||
|
||||
@@ -343,7 +501,9 @@ class SitemapService:
|
||||
analysis_result = await self.analyze_sitemap(
|
||||
sitemap_url=sitemap_url,
|
||||
analyze_content_trends=analyze_content_trends,
|
||||
analyze_publishing_patterns=analyze_publishing_patterns
|
||||
analyze_publishing_patterns=analyze_publishing_patterns,
|
||||
include_ai_insights=include_ai_insights,
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
# Enhance with onboarding-specific insights
|
||||
@@ -351,7 +511,8 @@ class SitemapService:
|
||||
analysis_result,
|
||||
user_url,
|
||||
competitors,
|
||||
industry_context
|
||||
industry_context,
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
# Combine results
|
||||
@@ -374,7 +535,8 @@ class SitemapService:
|
||||
analysis_result: Dict[str, Any],
|
||||
user_url: str,
|
||||
competitors: List[str] = None,
|
||||
industry_context: str = None
|
||||
industry_context: str = None,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate onboarding-specific insights for competitive analysis"""
|
||||
|
||||
@@ -389,10 +551,37 @@ class SitemapService:
|
||||
user_url, competitors, industry_context
|
||||
)
|
||||
|
||||
# Define JSON schema for structured output
|
||||
json_struct = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"competitive_positioning": {"type": "string"},
|
||||
"content_gaps": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"growth_opportunities": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"industry_benchmarks": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"strategic_recommendations": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
},
|
||||
"required": ["competitive_positioning", "content_gaps", "growth_opportunities", "industry_benchmarks", "strategic_recommendations"]
|
||||
}
|
||||
|
||||
# Generate AI insights
|
||||
ai_response = llm_text_gen(
|
||||
prompt=prompt,
|
||||
system_prompt=self._get_onboarding_system_prompt()
|
||||
system_prompt=self._get_onboarding_system_prompt(),
|
||||
json_struct=json_struct,
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
# Parse and structure insights
|
||||
@@ -402,7 +591,7 @@ class SitemapService:
|
||||
await seo_logger.log_ai_analysis(
|
||||
tool_name=f"{self.service_name}_onboarding",
|
||||
prompt=prompt,
|
||||
response=ai_response,
|
||||
response=ai_response if isinstance(ai_response, str) else str(ai_response),
|
||||
model_used="gemini-2.0-flash-001"
|
||||
)
|
||||
|
||||
@@ -422,7 +611,8 @@ class SitemapService:
|
||||
structure_analysis: Dict[str, Any],
|
||||
content_trends: Dict[str, Any],
|
||||
publishing_patterns: Dict[str, Any],
|
||||
sitemap_url: str
|
||||
sitemap_url: str,
|
||||
user_id: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate AI-powered insights for sitemap analysis"""
|
||||
|
||||
@@ -435,7 +625,8 @@ class SitemapService:
|
||||
# Generate AI insights
|
||||
ai_response = llm_text_gen(
|
||||
prompt=prompt,
|
||||
system_prompt=self._get_system_prompt()
|
||||
system_prompt=self._get_system_prompt(),
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
# Parse and structure insights
|
||||
@@ -697,7 +888,12 @@ Focus on actionable insights for content creators and digital marketing professi
|
||||
try:
|
||||
# Test with a simple sitemap
|
||||
test_url = "https://www.google.com/sitemap.xml"
|
||||
result = await self.analyze_sitemap(test_url, False, False)
|
||||
result = await self.analyze_sitemap(
|
||||
sitemap_url=test_url,
|
||||
analyze_content_trends=False,
|
||||
analyze_publishing_patterns=False,
|
||||
include_ai_insights=False
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "operational",
|
||||
@@ -731,7 +927,7 @@ Focus on actionable insights for content creators and digital marketing professi
|
||||
|
||||
competitor_info = ""
|
||||
if competitors:
|
||||
competitor_info = f"\nCompetitors to consider: {', '.join(competitors[:5])}"
|
||||
competitor_info = f"\nCompetitors to consider: {', '.join(competitors)}"
|
||||
|
||||
industry_info = ""
|
||||
if industry_context:
|
||||
@@ -753,12 +949,12 @@ Content Publishing Patterns:
|
||||
- Publishing Rate: {publishing_velocity:.2f} pages per day
|
||||
- Content Categories: {len(url_patterns)} main categories identified
|
||||
|
||||
Please provide competitive analysis insights focusing on:
|
||||
Please provide competitive analysis insights focusing on the following sections:
|
||||
|
||||
1. **COMPETITIVE POSITIONING**: How does this site's content structure compare to industry standards?
|
||||
2. **CONTENT GAPS**: What content categories or topics are missing based on the URL structure?
|
||||
3. **GROWTH OPPORTUNITIES**: Specific content expansion opportunities to compete better
|
||||
4. **INDUSTRY BENCHMARKS**: How does publishing frequency and content depth compare to competitors?
|
||||
1. **COMPETITIVE POSITIONING**: How does this site's content structure compare to industry standards? (Provide a brief paragraph)
|
||||
2. **CONTENT GAPS**: What content categories or topics are missing based on the URL structure? (List 3-5 specific gaps)
|
||||
3. **GROWTH OPPORTUNITIES**: Specific content expansion opportunities to compete better (List 3-5 opportunities)
|
||||
4. **INDUSTRY BENCHMARKS**: How does publishing frequency and content depth compare to competitors? (List 3 key comparisons)
|
||||
5. **STRATEGIC RECOMMENDATIONS**: 3-5 actionable steps for content strategy improvement
|
||||
|
||||
Focus on actionable insights that help content creators understand their competitive position and identify growth opportunities.
|
||||
@@ -783,69 +979,61 @@ Provide practical, data-driven insights that help content creators make informed
|
||||
|
||||
Format your response as structured insights that can be easily parsed and displayed in a user interface."""
|
||||
|
||||
def _parse_onboarding_insights(self, ai_response: str) -> Dict[str, Any]:
|
||||
def _parse_onboarding_insights(self, ai_response: Any) -> Dict[str, Any]:
|
||||
"""Parse AI response for onboarding-specific insights"""
|
||||
|
||||
try:
|
||||
# Initialize structured response
|
||||
insights = {
|
||||
"competitive_positioning": "Analysis in progress...",
|
||||
"content_gaps": [],
|
||||
"growth_opportunities": [],
|
||||
"industry_benchmarks": [],
|
||||
"strategic_recommendations": []
|
||||
insights = {}
|
||||
|
||||
# If it's already a dict (structured output), use it
|
||||
if isinstance(ai_response, dict):
|
||||
insights = ai_response
|
||||
elif isinstance(ai_response, str):
|
||||
# Try to parse JSON string
|
||||
try:
|
||||
insights = json.loads(ai_response)
|
||||
except json.JSONDecodeError:
|
||||
# Try to extract JSON from markdown block
|
||||
json_match = re.search(r'```json\s*(.*?)\s*```', ai_response, re.DOTALL)
|
||||
if json_match:
|
||||
try:
|
||||
insights = json.loads(json_match.group(1))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Ensure all required keys exist
|
||||
required_keys = [
|
||||
"competitive_positioning",
|
||||
"content_gaps",
|
||||
"growth_opportunities",
|
||||
"industry_benchmarks",
|
||||
"strategic_recommendations"
|
||||
]
|
||||
|
||||
# Validate and fill missing keys
|
||||
validated_insights = {
|
||||
"competitive_positioning": insights.get("competitive_positioning", "Analysis in progress..."),
|
||||
"content_gaps": insights.get("content_gaps", []),
|
||||
"growth_opportunities": insights.get("growth_opportunities", []),
|
||||
"industry_benchmarks": insights.get("industry_benchmarks", []),
|
||||
"strategic_recommendations": insights.get("strategic_recommendations", [])
|
||||
}
|
||||
|
||||
# Simple parsing logic - look for structured sections
|
||||
lines = ai_response.split('\n')
|
||||
current_section = None
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Detect sections
|
||||
if any(keyword in line.lower() for keyword in ['competitive positioning', 'market position']):
|
||||
current_section = 'competitive_positioning'
|
||||
insights[current_section] = line
|
||||
elif any(keyword in line.lower() for keyword in ['content gaps', 'missing content']):
|
||||
current_section = 'content_gaps'
|
||||
elif any(keyword in line.lower() for keyword in ['growth opportunities', 'expansion']):
|
||||
current_section = 'growth_opportunities'
|
||||
elif any(keyword in line.lower() for keyword in ['industry benchmarks', 'benchmarks']):
|
||||
current_section = 'industry_benchmarks'
|
||||
elif any(keyword in line.lower() for keyword in ['strategic recommendations', 'recommendations']):
|
||||
current_section = 'strategic_recommendations'
|
||||
elif line.startswith('-') or line.startswith('•'):
|
||||
# This is a list item
|
||||
if current_section and current_section in insights:
|
||||
if isinstance(insights[current_section], str):
|
||||
insights[current_section] = [insights[current_section]]
|
||||
insights[current_section].append(line[1:].strip())
|
||||
elif current_section == 'competitive_positioning':
|
||||
# Append to competitive positioning text
|
||||
if insights[current_section] == "Analysis in progress...":
|
||||
insights[current_section] = line
|
||||
# Ensure lists are actually lists
|
||||
for key in required_keys[1:]:
|
||||
if not isinstance(validated_insights[key], list):
|
||||
if isinstance(validated_insights[key], str):
|
||||
validated_insights[key] = [validated_insights[key]]
|
||||
else:
|
||||
insights[current_section] += " " + line
|
||||
|
||||
# Fallback: if no structured parsing worked, use the full response
|
||||
if insights["competitive_positioning"] == "Analysis in progress...":
|
||||
insights["competitive_positioning"] = ai_response[:500] + "..." if len(ai_response) > 500 else ai_response
|
||||
|
||||
# Ensure lists are properly formatted
|
||||
for key in ['content_gaps', 'growth_opportunities', 'industry_benchmarks', 'strategic_recommendations']:
|
||||
if isinstance(insights[key], str):
|
||||
insights[key] = [insights[key]] if insights[key] else []
|
||||
|
||||
return insights
|
||||
validated_insights[key] = []
|
||||
|
||||
return validated_insights
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing onboarding insights: {e}")
|
||||
return {
|
||||
"competitive_positioning": ai_response[:300] + "..." if len(ai_response) > 300 else ai_response,
|
||||
"content_gaps": ["Analysis parsing error - see full response above"],
|
||||
"competitive_positioning": "Analysis unavailable",
|
||||
"content_gaps": [],
|
||||
"growth_opportunities": [],
|
||||
"industry_benchmarks": [],
|
||||
"strategic_recommendations": []
|
||||
@@ -889,6 +1077,48 @@ Format your response as structured insights that can be easily parsed and displa
|
||||
logger.error(f"Error discovering sitemap for {website_url}: {e}")
|
||||
return None
|
||||
|
||||
async def _find_sitemap_on_homepage(self, base_url: str) -> Optional[str]:
    """
    Look for a sitemap link in the homepage HTML.

    Fetches ``base_url``, scans the markup for ``href`` values that contain
    "sitemap"/"Sitemap" followed by ".xml", resolves relative links against
    ``base_url`` and returns the first candidate accepted by
    ``self._check_sitemap_url``.

    Args:
        base_url: Base URL of the website

    Returns:
        Sitemap URL if found on homepage, None otherwise (including when the
        homepage does not answer with HTTP 200 or any error occurs)
    """
    try:
        logger.debug(f"Checking homepage for sitemap links: {base_url}")

        request_timeout = aiohttp.ClientTimeout(total=15)
        request_headers = {"User-Agent": "ALwrity-SEO-Bot/1.0"}

        async with aiohttp.ClientSession() as session:
            async with session.get(base_url, timeout=request_timeout, headers=request_headers) as response:
                if response.status != 200:
                    return None

                html = await response.text()

                # href="...sitemap.xml..." or href='...sitemap.xml...';
                # a deliberately simple pattern for the common variations.
                hrefs = re.findall(r'href=["\']([^"\']*[sS]itemap[^"\']*\.xml[^"\']*)["\']', html)

                for href in hrefs:
                    candidate = href.strip()

                    # Relative links are resolved against the homepage URL
                    if not candidate.startswith(('http://', 'https://')):
                        candidate = urljoin(base_url, candidate)

                    logger.debug(f"Found potential sitemap link on homepage: {candidate}")

                    # Only return links that pass the accessibility check
                    if await self._check_sitemap_url(candidate, "homepage link"):
                        return candidate

        return None

    except Exception as e:
        logger.debug(f"Error checking homepage for sitemap: {e}")
        return None
|
||||
|
||||
async def _find_sitemap_in_robots_txt(self, base_url: str) -> Optional[str]:
|
||||
"""
|
||||
Check robots.txt for sitemap directives.
|
||||
@@ -1027,4 +1257,4 @@ Format your response as structured insights that can be easily parsed and displa
|
||||
return response.status == 200
|
||||
|
||||
except Exception:
|
||||
return False
|
||||
return False
|
||||
|
||||
@@ -5,8 +5,12 @@ Comprehensive technical SEO crawler and analyzer with AI-enhanced
|
||||
insights for website optimization and search engine compatibility.
|
||||
"""
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import time
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
|
||||
class TechnicalSEOService:
|
||||
@@ -16,6 +20,9 @@ class TechnicalSEOService:
|
||||
"""Initialize the technical SEO service"""
|
||||
self.service_name = "technical_seo_analyzer"
|
||||
logger.info(f"Initialized {self.service_name}")
|
||||
self.headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +http://alwrity.com/bot)'
|
||||
}
|
||||
|
||||
async def analyze_technical_seo(
|
||||
self,
|
||||
@@ -25,20 +32,115 @@ class TechnicalSEOService:
|
||||
analyze_performance: bool = True
|
||||
) -> Dict[str, Any]:
|
||||
"""Analyze technical SEO factors"""
|
||||
# Placeholder implementation
|
||||
return {
|
||||
"url": url,
|
||||
"pages_crawled": 25,
|
||||
"crawl_depth": crawl_depth,
|
||||
"technical_issues": [
|
||||
{"type": "Missing robots.txt", "severity": "Medium", "pages_affected": 1},
|
||||
{"type": "Slow loading pages", "severity": "High", "pages_affected": 3}
|
||||
],
|
||||
"site_structure": {"internal_links": 150, "external_links": 25 if include_external_links else 0},
|
||||
"performance_metrics": {"avg_load_time": 2.5, "largest_contentful_paint": 1.8} if analyze_performance else {},
|
||||
"recommendations": ["Implement robots.txt", "Optimize page load speed"],
|
||||
"crawl_summary": {"successful": 23, "errors": 2, "redirects": 5}
|
||||
}
|
||||
try:
|
||||
start_time = time.time()
|
||||
async with aiohttp.ClientSession(headers=self.headers) as session:
|
||||
async with session.get(url, timeout=30) as response:
|
||||
load_time = time.time() - start_time
|
||||
status_code = response.status
|
||||
content = await response.text()
|
||||
headers = response.headers
|
||||
|
||||
# Basic parsing
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
|
||||
# 1. Meta Tags Analysis
|
||||
title = soup.title.string if soup.title else None
|
||||
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
||||
meta_desc_content = meta_desc['content'] if meta_desc else None
|
||||
|
||||
# 2. Heading Structure
|
||||
h1_tags = soup.find_all('h1')
|
||||
h2_tags = soup.find_all('h2')
|
||||
h3_tags = soup.find_all('h3')
|
||||
|
||||
# 3. Image Analysis
|
||||
images = soup.find_all('img')
|
||||
images_without_alt = [img['src'] for img in images if not img.get('alt')]
|
||||
|
||||
# 4. Link Analysis
|
||||
links = soup.find_all('a')
|
||||
internal_links = []
|
||||
external_links = []
|
||||
domain = urlparse(url).netloc
|
||||
|
||||
for link in links:
|
||||
href = link.get('href')
|
||||
if not href:
|
||||
continue
|
||||
if href.startswith('http'):
|
||||
if domain in href:
|
||||
internal_links.append(href)
|
||||
else:
|
||||
external_links.append(href)
|
||||
elif href.startswith('/'):
|
||||
internal_links.append(urljoin(url, href))
|
||||
|
||||
# 5. Technical Issues Detection
|
||||
issues = []
|
||||
|
||||
# Status Code Issues
|
||||
if status_code != 200:
|
||||
issues.append({"type": f"Status Code {status_code}", "severity": "High", "pages_affected": 1})
|
||||
|
||||
# Performance Issues
|
||||
if load_time > 2.0:
|
||||
issues.append({"type": "Slow Server Response", "severity": "Medium", "pages_affected": 1})
|
||||
|
||||
# Meta Issues
|
||||
if not title:
|
||||
issues.append({"type": "Missing Title Tag", "severity": "High", "pages_affected": 1})
|
||||
elif len(title) > 60:
|
||||
issues.append({"type": "Title Tag Too Long", "severity": "Low", "pages_affected": 1})
|
||||
|
||||
if not meta_desc_content:
|
||||
issues.append({"type": "Missing Meta Description", "severity": "High", "pages_affected": 1})
|
||||
|
||||
# Content Structure Issues
|
||||
if not h1_tags:
|
||||
issues.append({"type": "Missing H1 Tag", "severity": "High", "pages_affected": 1})
|
||||
elif len(h1_tags) > 1:
|
||||
issues.append({"type": "Multiple H1 Tags", "severity": "Medium", "pages_affected": 1})
|
||||
|
||||
# Image Issues
|
||||
if images_without_alt:
|
||||
issues.append({"type": "Images Missing Alt Text", "severity": "Medium", "pages_affected": len(images_without_alt)})
|
||||
|
||||
# Security Issues
|
||||
if url.startswith('http:'):
|
||||
issues.append({"type": "Insecure Protocol (HTTP)", "severity": "High", "pages_affected": 1})
|
||||
|
||||
return {
|
||||
"url": url,
|
||||
"pages_crawled": 1, # Currently single page
|
||||
"crawl_depth": 1,
|
||||
"technical_issues": issues,
|
||||
"site_structure": {
|
||||
"internal_links": len(internal_links),
|
||||
"external_links": len(external_links) if include_external_links else 0,
|
||||
"h1_count": len(h1_tags),
|
||||
"h2_count": len(h2_tags),
|
||||
"h3_count": len(h3_tags)
|
||||
},
|
||||
"performance_metrics": {
|
||||
"response_time": round(load_time, 3),
|
||||
"content_size": len(content)
|
||||
} if analyze_performance else {},
|
||||
"recommendations": [issue['type'] for issue in issues],
|
||||
"crawl_summary": {
|
||||
"successful": 1 if status_code == 200 else 0,
|
||||
"errors": 1 if status_code >= 400 else 0,
|
||||
"redirects": 1 if 300 <= status_code < 400 else 0
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in technical SEO analysis: {e}")
|
||||
return {
|
||||
"url": url,
|
||||
"error": str(e),
|
||||
"technical_issues": [{"type": "Crawl Failed", "severity": "High", "pages_affected": 1}]
|
||||
}
|
||||
|
||||
async def health_check(self) -> Dict[str, Any]:
|
||||
"""Health check for the technical SEO service"""
|
||||
|
||||
Reference in New Issue
Block a user