Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts

This commit is contained in:
ajaysi
2026-02-08 13:56:57 +05:30
parent 1db10ccd0f
commit e404a86502
333 changed files with 42223 additions and 10875 deletions

View File

@@ -5,10 +5,19 @@ AI-powered content strategy analyzer that provides insights into
content gaps, opportunities, and competitive positioning.
"""
from typing import Dict, Any, List, Optional
import json
import re
import asyncio
from typing import Dict, Any, List, Optional, Tuple
from datetime import datetime
import statistics
from loguru import logger
from ..llm_providers.main_text_generation import llm_text_gen
from middleware.logging_middleware import seo_logger
from .sitemap_service import SitemapService
class ContentStrategyService:
"""Service for AI-powered content strategy analysis"""
@@ -22,30 +31,540 @@ class ContentStrategyService:
website_url: str,
competitors: List[str] = None,
target_keywords: List[str] = None,
custom_parameters: Dict[str, Any] = None
custom_parameters: Dict[str, Any] = None,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""Analyze content strategy and opportunities"""
# Placeholder implementation
return {
start_time = datetime.utcnow()
competitors = competitors or []
target_keywords = target_keywords or []
custom_parameters = custom_parameters or {}
sitemap_service = SitemapService()
discovered_user_sitemap = await sitemap_service.discover_sitemap_url(website_url)
user_sitemap_result = None
if discovered_user_sitemap:
user_sitemap_result = await sitemap_service.analyze_sitemap(
sitemap_url=discovered_user_sitemap,
analyze_content_trends=True,
analyze_publishing_patterns=True,
include_ai_insights=False
)
competitor_sitemaps: Dict[str, Optional[str]] = {}
competitor_results: Dict[str, Dict[str, Any]] = {}
for competitor_url in competitors[:5]:
sitemap_url = await sitemap_service.discover_sitemap_url(competitor_url)
competitor_sitemaps[competitor_url] = sitemap_url
if sitemap_url:
try:
competitor_results[competitor_url] = await sitemap_service.analyze_sitemap(
sitemap_url=sitemap_url,
analyze_content_trends=True,
analyze_publishing_patterns=True,
include_ai_insights=False
)
except Exception as e:
competitor_results[competitor_url] = {"error": str(e)}
deterministic = self._build_deterministic_insights(
website_url=website_url,
user_sitemap_url=discovered_user_sitemap,
user_sitemap_result=user_sitemap_result,
competitor_sitemaps=competitor_sitemaps,
competitor_results=competitor_results,
target_keywords=target_keywords
)
ai_strategy = None
ai_error = None
if user_id:
try:
prompt = self._build_ai_prompt(
website_url=website_url,
target_keywords=target_keywords,
custom_parameters=custom_parameters,
deterministic_summary=deterministic
)
ai_response = llm_text_gen(
prompt=prompt,
system_prompt=self._get_system_prompt(),
user_id=user_id
)
ai_strategy = self._parse_json_response(ai_response)
await seo_logger.log_ai_analysis(
tool_name=self.service_name,
prompt=prompt,
response=ai_response,
model_used="gemini-2.0-flash-001"
)
except Exception as e:
ai_error = str(e)
execution_time = (datetime.utcnow() - start_time).total_seconds()
result = {
"website_url": website_url,
"analysis_type": "content_strategy",
"competitors_analyzed": len(competitors) if competitors else 0,
"content_gaps": [
{"topic": "SEO best practices", "opportunity_score": 85, "difficulty": "Medium"},
{"topic": "Content marketing", "opportunity_score": 78, "difficulty": "Low"}
],
"opportunities": [
{"type": "Trending topics", "count": 15, "potential_traffic": "High"},
{"type": "Long-tail keywords", "count": 45, "potential_traffic": "Medium"}
],
"content_performance": {"top_performing": 12, "underperforming": 8},
"recommendations": [
"Create content around trending SEO topics",
"Optimize existing content for long-tail keywords",
"Develop content series for better engagement"
],
"competitive_analysis": {"content_leadership": "moderate", "gaps_identified": 8}
"timestamp": datetime.utcnow().isoformat(),
"execution_time": execution_time,
"inputs": {
"competitors": competitors[:5],
"target_keywords": target_keywords,
"custom_parameters": custom_parameters
},
"data_sources": {
"user_sitemap_url": discovered_user_sitemap,
"competitor_sitemaps": competitor_sitemaps
},
"deterministic_insights": deterministic,
"ai_strategy": ai_strategy,
"ai_error": ai_error
}
await seo_logger.log_tool_usage(
tool_name=self.service_name,
input_data={
"website_url": website_url,
"competitors_count": len(competitors),
"target_keywords_count": len(target_keywords),
"has_user_sitemap": bool(discovered_user_sitemap)
},
output_data={
"website_url": website_url,
"has_ai_strategy": bool(ai_strategy),
"has_ai_error": bool(ai_error),
"execution_time": execution_time
},
success=True if (ai_strategy is not None or deterministic is not None) else False
)
return result
async def analyze_competitive_sitemap_benchmarking(
    self,
    website_url: str,
    competitors: List[str],
    max_competitors: Optional[int] = None,
    user_id: Optional[str] = None
) -> Dict[str, Any]:
    """Benchmark the user's sitemap against competitor sitemaps.

    Discovers and analyzes the sitemap of ``website_url``, discovers all
    competitor sitemaps concurrently, analyzes each discovered competitor
    sitemap concurrently (each analysis is capped at 300 seconds), and then
    builds a benchmark with ``_build_competitive_sitemap_benchmark``.

    Competitor discovery/analysis failures and user-sitemap analysis
    failures are recorded in the result instead of being raised.

    Args:
        website_url: The user's website.
        competitors: Competitor URLs; ``None``, non-string and blank entries
            are dropped.
        max_competitors: Optional cap on how many competitors are processed.
        user_id: Forwarded to ``SitemapService.analyze_sitemap``.

    Returns:
        A dict with the keys ``analysis_type``, ``timestamp``,
        ``execution_time``, ``inputs``, ``data_sources``, ``user``,
        ``competitors`` and ``benchmark``.
    """
    start_time = datetime.utcnow()
    # Using WARNING level to ensure visibility in production logs as requested by user.
    # `competitors or []` keeps this log line from raising TypeError when the
    # caller passes None (the normalisation just below already tolerates None).
    logger.warning(f"🚀 [START] Competitive sitemap benchmarking for {website_url} with {len(competitors or [])} competitors")
    competitors = [c for c in (competitors or []) if isinstance(c, str) and c.strip()]
    if max_competitors:
        competitors = competitors[: max(0, int(max_competitors))]
    if not competitors:
        logger.warning(f"No competitors provided for benchmarking {website_url}")

    # Error-message fragments that mean "this URL is not a usable sitemap";
    # these are logged at WARNING instead of ERROR.
    invalid_sitemap_markers = (
        "URL returned a webpage",
        "Failed to parse sitemap XML",
        "no element found",
    )

    def is_invalid_sitemap_error(message: str) -> bool:
        return any(marker in message for marker in invalid_sitemap_markers)

    sitemap_service = SitemapService()

    # --- User sitemap -----------------------------------------------------
    logger.warning(f"🔍 [PROGRESS] Discovering user sitemap for {website_url}")
    discovered_user_sitemap = await sitemap_service.discover_sitemap_url(website_url)
    user_sitemap_result = None
    user_error = None
    if discovered_user_sitemap:
        try:
            logger.warning(f"⚡ [PROGRESS] Analyzing user sitemap: {discovered_user_sitemap}")
            user_sitemap_result = await sitemap_service.analyze_sitemap(
                sitemap_url=discovered_user_sitemap,
                analyze_content_trends=True,
                analyze_publishing_patterns=True,
                include_ai_insights=False,
                user_id=user_id
            )
        except Exception as e:
            user_error = str(e)
            logger.error(f"Error analyzing user sitemap {discovered_user_sitemap}: {e}")
    else:
        user_error = "No sitemap discovered for your website. Please ensure your site has a valid sitemap.xml."
        logger.warning(f"⚠️ No sitemap found for user website {website_url}")

    # --- Competitor sitemap discovery (concurrent) --------------------------
    competitor_sitemaps: Dict[str, Optional[str]] = {}
    competitor_results: Dict[str, Dict[str, Any]] = {}
    competitor_errors: Dict[str, str] = {}
    logger.warning(f"🔍 [PROGRESS] Discovering sitemaps for {len(competitors)} competitors")
    discovery_tasks = [sitemap_service.discover_sitemap_url(u) for u in competitors]
    # gather() returns results in task order, so they line up with `competitors`.
    discovery_results = await asyncio.gather(*discovery_tasks, return_exceptions=True)
    for url, res in zip(competitors, discovery_results):
        if isinstance(res, Exception):
            competitor_sitemaps[url] = None
            competitor_errors[url] = str(res)
            logger.warning(f"Error discovering sitemap for competitor {url}: {res}")
        else:
            competitor_sitemaps[url] = res
            if not res:
                competitor_errors[url] = "No sitemap found"
                logger.info(f" No sitemap found for competitor {url}")
            else:
                logger.info(f"✅ Found sitemap for competitor {url}: {res}")

    # --- Competitor sitemap analysis (concurrent) ---------------------------
    to_analyze = [(url, competitor_sitemaps.get(url)) for url in competitors if competitor_sitemaps.get(url)]
    logger.warning(f"⚡ [PROGRESS] Analyzing {len(to_analyze)} competitor sitemaps")

    # Helper for safe analysis with timeout. It RETURNS exception objects
    # rather than raising so one failing competitor never aborts the batch.
    async def analyze_with_timeout(url, sm):
        try:
            logger.warning(f"🕒 [START] Analyzing {url} with 300s timeout")
            # 5 minute timeout per competitor to prevent total blocking
            result = await asyncio.wait_for(
                sitemap_service.analyze_sitemap(
                    sitemap_url=sm,
                    analyze_content_trends=True,
                    analyze_publishing_patterns=True,
                    include_ai_insights=False,
                    user_id=user_id
                ),
                timeout=300.0
            )
            logger.warning(f"✅ [DONE] Analysis finished for {url}")
            return result
        except asyncio.TimeoutError:
            logger.error(f"⏱️ Analysis timed out for competitor {url} (limit: 300s)")
            return TimeoutError("Analysis timed out after 300s")
        except Exception as e:
            msg = str(e)
            if is_invalid_sitemap_error(msg):
                logger.warning(f"⚠️ Analysis skipped for {url}: Invalid sitemap ({msg})")
            else:
                logger.error(f"❌ Analysis failed for {url}: {e}")
            return e

    analysis_tasks = [
        analyze_with_timeout(url, sm)
        for (url, sm) in to_analyze
    ]
    analysis_results = await asyncio.gather(*analysis_tasks, return_exceptions=True)
    for (url, _), res in zip(to_analyze, analysis_results):
        if isinstance(res, Exception):
            competitor_errors[url] = str(res)
            if not is_invalid_sitemap_error(str(res)):
                logger.error(f"Error analyzing sitemap for competitor {url}: {res}")
        else:
            competitor_results[url] = res

    # --- Summaries and benchmark --------------------------------------------
    user_summary = self._summarize_sitemap(user_sitemap_result)
    competitor_summaries: Dict[str, Dict[str, Any]] = {}
    for competitor_url, result in competitor_results.items():
        if result and isinstance(result, dict) and "error" not in result:
            competitor_summaries[competitor_url] = self._summarize_sitemap(result)
    benchmark = self._build_competitive_sitemap_benchmark(
        website_url=website_url,
        user_summary=user_summary,
        competitor_summaries=competitor_summaries
    )
    execution_time = (datetime.utcnow() - start_time).total_seconds()
    return {
        "analysis_type": "competitive_sitemap_benchmarking",
        "timestamp": datetime.utcnow().isoformat(),
        "execution_time": execution_time,
        "inputs": {
            "website_url": website_url,
            "competitors": competitors,
            "max_competitors": max_competitors
        },
        "data_sources": {
            "user_sitemap_url": discovered_user_sitemap,
            "competitor_sitemaps": competitor_sitemaps
        },
        "user": {
            "summary": user_summary,
            "error": user_error
        },
        "competitors": {
            "summaries": competitor_summaries,
            "errors": competitor_errors
        },
        "benchmark": benchmark
    }
def _safe_ratio(self, numerator: Any, denominator: Any) -> Optional[float]:
try:
num = float(numerator)
den = float(denominator)
if den <= 0:
return None
return round(num / den, 4)
except Exception:
return None
def _as_float(self, value: Any) -> Optional[float]:
try:
if value is None:
return None
return float(value)
except Exception:
return None
def _median(self, values: List[Optional[float]]) -> Optional[float]:
cleaned = [v for v in values if isinstance(v, (int, float))]
if not cleaned:
return None
try:
return float(statistics.median(cleaned))
except Exception:
return None
def _build_competitive_sitemap_benchmark(
    self,
    website_url: str,
    user_summary: Dict[str, Any],
    competitor_summaries: Dict[str, Dict[str, Any]]
) -> Dict[str, Any]:
    """Compare the user's sitemap summary with competitor summaries.

    Produces per-competitor metrics, the URL sections that enough
    competitors have but the user lacks, and a list of "opportunity"
    insights (content volume, publishing velocity, architecture depth).

    Args:
        website_url: Echoed back in the result.
        user_summary: Summary dict for the user's sitemap (as produced by
            ``_summarize_sitemap``); may be empty.
        competitor_summaries: Mapping of competitor URL to its summary dict.

    Returns:
        A dict with ``website_url``, ``competitors_analyzed``,
        ``user_sections_count``, ``competitor_section_leaders`` (top 10 by
        total URLs), ``gaps`` (``missing_sections``, at most 15) and
        ``opportunities``.
    """
    own_sections = set((user_summary.get("top_url_patterns") or {}).keys())

    # Pass 1: per-competitor metrics plus aggregate stats per URL section.
    section_stats: Dict[str, Dict[str, Any]] = {}
    per_competitor: List[Dict[str, Any]] = []
    for comp_url, comp in competitor_summaries.items():
        comp_patterns = comp.get("top_url_patterns") or {}
        comp_total = comp.get("total_urls") or 0
        dated = comp.get("total_dated_urls")
        coverage = None
        if isinstance(dated, (int, float)):
            coverage = self._safe_ratio(dated, comp_total)
        per_competitor.append({
            "competitor_url": comp_url,
            "total_urls": comp.get("total_urls"),
            "sections_count": len(comp_patterns.keys()),
            "average_path_depth": comp.get("average_path_depth"),
            "max_path_depth": comp.get("max_path_depth"),
            "publishing_velocity": comp.get("publishing_velocity"),
            "lastmod_coverage": coverage,
            "span_days": (comp.get("date_range") or {}).get("span_days")
        })
        for section, count in comp_patterns.items():
            if not section:
                continue
            entry = section_stats.setdefault(
                section,
                {"competitor_presence": 0, "total_url_count": 0}
            )
            entry["competitor_presence"] += 1
            entry["total_url_count"] += int(count or 0)

    competitor_count = len(competitor_summaries)

    # Pass 2: sections common among competitors but absent from the user site,
    # ranked by how many competitors have them, then by URL volume.
    ranked_sections = sorted(
        section_stats.items(),
        key=lambda kv: (kv[1].get("competitor_presence", 0), kv[1].get("total_url_count", 0)),
        reverse=True
    )
    technical_markers = ['wp-content', 'wp-includes', 'cgi-bin', 'assets', 'static']
    missing_sections = []
    for section, stats in ranked_sections:
        # Filter out known non-content patterns:
        # 1. Sections present in user site
        if section in own_sections:
            continue
        # 2. Short sections <= 3 chars (likely language codes like /en, /es, /fr)
        if len(section) <= 3:
            continue
        # 3. Common technical paths (wp-content, wp-includes, cgi-bin, ...)
        if any(marker in section.lower() for marker in technical_markers):
            continue
        # A section counts only when at least max(2, ~40% of competitors) have it.
        if competitor_count > 0 and stats.get("competitor_presence", 0) >= max(2, int(round(0.4 * competitor_count))):
            missing_sections.append({
                "section": section,
                # Ensure presence is a normalized ratio (0.0 - 1.0)
                "competitor_presence": self._safe_ratio(stats.get("competitor_presence", 0), competitor_count) or 0,
                "competitor_count": stats.get("competitor_presence"),
                "total_url_count": stats.get("total_url_count", 0)
            })
    missing_sections = missing_sections[:15]

    # Competitor medians versus the user's own figures.
    competitor_velocity_median = self._median(
        [self._as_float(s.get("publishing_velocity")) for s in competitor_summaries.values()]
    )
    competitor_depth_median = self._median(
        [self._as_float(s.get("average_path_depth")) for s in competitor_summaries.values()]
    )
    user_velocity = self._as_float(user_summary.get("publishing_velocity"))
    user_depth = self._as_float(user_summary.get("average_path_depth"))
    user_total_urls = user_summary.get("total_urls") or 0

    opportunities = []
    # Note: 'missing_sections' opportunity removed to avoid duplication with
    # 'Competitor Content Strategy Patterns' section

    # Insight 1: Content Volume Gap
    competitor_urls_median = self._median(
        [m["total_urls"] for m in per_competitor if m.get("total_urls")]
    )
    if competitor_urls_median and user_total_urls < competitor_urls_median * 0.8:
        opportunities.append({
            "type": "content_volume_gap",
            "title": "Competitors have significantly more content",
            "metrics": {
                "user_total_pages": user_total_urls,
                "competitor_median_total_pages": int(competitor_urls_median)
            }
        })

    # Insight 2: Publishing Velocity Gap
    velocity_known = competitor_velocity_median is not None and user_velocity is not None
    if velocity_known and user_velocity < competitor_velocity_median * 0.75:
        opportunities.append({
            "type": "publishing_velocity_gap",
            "title": "Competitors appear to publish more frequently",
            "metrics": {
                "user_publishing_velocity": user_velocity,
                "competitor_median_publishing_velocity": competitor_velocity_median
            }
        })

    # Insight 3: Architecture Depth Gap
    depth_known = competitor_depth_median is not None and user_depth is not None
    if depth_known and user_depth < competitor_depth_median - 0.5:
        opportunities.append({
            "type": "architecture_depth_gap",
            "title": "Competitors have deeper site structure",
            "metrics": {
                "user_average_path_depth": user_depth,
                "competitor_median_average_path_depth": competitor_depth_median
            }
        })

    leaders = sorted(
        per_competitor,
        key=lambda m: (m.get("total_urls") or 0),
        reverse=True
    )
    return {
        "website_url": website_url,
        "competitors_analyzed": competitor_count,
        "user_sections_count": len(own_sections),
        "competitor_section_leaders": leaders[:10],
        "gaps": {
            "missing_sections": missing_sections
        },
        "opportunities": opportunities
    }
def _summarize_sitemap(self, sitemap_result: Optional[Dict[str, Any]]) -> Dict[str, Any]:
if not sitemap_result or not isinstance(sitemap_result, dict):
return {}
structure = sitemap_result.get("structure_analysis") or {}
trends = sitemap_result.get("content_trends") or {}
patterns = sitemap_result.get("publishing_patterns") or {}
return {
"total_urls": sitemap_result.get("total_urls"),
"top_url_patterns": structure.get("url_patterns") or {},
"file_types": structure.get("file_types") or {},
"average_path_depth": structure.get("average_path_depth"),
"max_path_depth": structure.get("max_path_depth"),
"publishing_velocity": trends.get("publishing_velocity"),
"date_range": trends.get("date_range") or {},
"total_dated_urls": trends.get("total_dated_urls"),
"priority_distribution": patterns.get("priority_distribution") or {},
"changefreq_distribution": patterns.get("changefreq_distribution") or {},
}
def _build_deterministic_insights(
    self,
    website_url: str,
    user_sitemap_url: Optional[str],
    user_sitemap_result: Optional[Dict[str, Any]],
    competitor_sitemaps: Dict[str, Optional[str]],
    competitor_results: Dict[str, Dict[str, Any]],
    target_keywords: List[str]
) -> Dict[str, Any]:
    """Derive non-AI insights from the user and competitor sitemap analyses.

    Summarises every usable sitemap result, lists the URL sections that
    competitors have and the user does not (top 10 by competitor URL count),
    and reports for up to 25 target keywords whether each appears in the
    user's URL section names.

    Returns:
        A dict with ``website_url``, ``sitemap_found``,
        ``user_sitemap_summary``, ``competitor_sitemap_summaries``,
        ``gaps_vs_competitors`` and ``keyword_hints``.
    """
    user_summary = self._summarize_sitemap(user_sitemap_result)

    # Only successful dict results (no "error" key) are summarised.
    competitor_summaries: Dict[str, Dict[str, Any]] = {
        comp_url: self._summarize_sitemap(comp_result)
        for comp_url, comp_result in competitor_results.items()
        if comp_result and isinstance(comp_result, dict) and "error" not in comp_result
    }

    user_sections = set((user_summary.get("top_url_patterns") or {}).keys())

    # Total URL count per section across all competitors.
    section_totals: Dict[str, int] = {}
    for summary in competitor_summaries.values():
        for name, url_count in (summary.get("top_url_patterns") or {}).items():
            section_totals[name] = section_totals.get(name, 0) + int(url_count or 0)

    by_volume = sorted(section_totals.items(), key=lambda pair: pair[1], reverse=True)
    missing_vs_competitors = [
        {"section": name, "competitor_url_count": total}
        for name, total in by_volume
        if name not in user_sections and name
    ][:10]

    keyword_hints = []
    if target_keywords:
        # Case-insensitive substring match against the joined section names.
        haystack = " ".join(sorted(user_sections)).lower()
        for raw_keyword in target_keywords[:25]:
            keyword = (raw_keyword or "").strip()
            if keyword:
                keyword_hints.append({
                    "keyword": keyword,
                    "seen_in_url_patterns": keyword.lower() in haystack
                })

    return {
        "website_url": website_url,
        "sitemap_found": bool(user_sitemap_url),
        "user_sitemap_summary": user_summary,
        "competitor_sitemap_summaries": competitor_summaries,
        "gaps_vs_competitors": {
            "missing_sections": missing_vs_competitors
        },
        "keyword_hints": keyword_hints
    }
def _get_system_prompt(self) -> str:
return (
"You are an SEO and content strategy expert for non-technical content creators, "
"digital marketers, and solopreneurs. Return ONLY valid minified JSON."
)
def _build_ai_prompt(
self,
website_url: str,
target_keywords: List[str],
custom_parameters: Dict[str, Any],
deterministic_summary: Dict[str, Any]
) -> str:
required_schema = {
"positioning_summary": "",
"content_gaps": [],
"topic_clusters": [],
"publishing_recommendations": {},
"quick_wins": [],
"risks": [],
"meta": {"confidence": 0.0, "inputs_used": []}
}
return (
"RULES:\n"
"- Return ONE single-line MINIFIED JSON object only.\n"
"- No markdown, code fences, or prose.\n"
"- Use EXACTLY the top-level keys from this schema: "
f"{list(required_schema.keys())}.\n"
"- For arrays of objects, keep objects small and consistent.\n\n"
f"WEBSITE: {website_url}\n"
f"TARGET_KEYWORDS: {target_keywords[:25]}\n"
f"CUSTOM_PARAMETERS: {custom_parameters}\n\n"
f"SITEMAP_DERIVED_DATA (compact): {json.dumps(deterministic_summary, ensure_ascii=False)[:12000]}\n\n"
"Now produce the strategy JSON."
)
def _parse_json_response(self, text: str) -> Dict[str, Any]:
cleaned = text.strip()
cleaned = cleaned.replace("```json", "").replace("```", "").strip()
match = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)
if match:
cleaned = match.group(0)
return json.loads(cleaned)
async def health_check(self) -> Dict[str, Any]:
"""Health check for the content strategy service"""
@@ -53,4 +572,4 @@ class ContentStrategyService:
"status": "operational",
"service": self.service_name,
"last_check": datetime.utcnow().isoformat()
}
}

View File

@@ -27,7 +27,8 @@ class MetaDescriptionService:
tone: str = "General",
search_intent: str = "Informational Intent",
language: str = "English",
custom_prompt: Optional[str] = None
custom_prompt: Optional[str] = None,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""
Generate AI-powered meta descriptions based on keywords and parameters
@@ -65,7 +66,8 @@ class MetaDescriptionService:
ai_response = llm_text_gen(
prompt=prompt,
system_prompt=self._get_system_prompt(language)
system_prompt=self._get_system_prompt(language),
user_id=user_id
)
# Parse and structure the response
@@ -417,4 +419,4 @@ Focus on creating descriptions that will improve click-through rates for content
"service": self.service_name,
"error": str(e),
"last_check": datetime.utcnow().isoformat()
}
}

View File

@@ -5,9 +5,13 @@ Comprehensive on-page SEO analyzer with AI-enhanced insights
for content optimization and technical improvements.
"""
import aiohttp
from bs4 import BeautifulSoup
from typing import Dict, Any, List, Optional
from datetime import datetime
from loguru import logger
import re
from urllib.parse import urlparse
class OnPageSEOService:
"""Service for comprehensive on-page SEO analysis"""
@@ -17,6 +21,155 @@ class OnPageSEOService:
self.service_name = "on_page_seo_analyzer"
logger.info(f"Initialized {self.service_name}")
async def _fetch_page(self, url: str) -> tuple[Optional[str], int]:
    """Fetch ``url`` and return ``(body, status)``.

    Returns:
        ``(html_text, 200)`` on an HTTP 200 response, ``(None, status)`` for
        any other HTTP status, and ``(None, 500)`` when the request itself
        fails (the error is logged).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +https://alwrity.com)'
    }
    # A bare number for `timeout` is deprecated in aiohttp; use ClientTimeout,
    # matching how the sitemap service configures its requests.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers, timeout=timeout) as response:
                if response.status == 200:
                    return await response.text(), 200
                return None, response.status
    except Exception as e:
        logger.error(f"Error fetching {url}: {str(e)}")
        return None, 500
def _analyze_meta_tags(self, soup: BeautifulSoup) -> Dict[str, Any]:
    """Analyze the title, meta description, viewport and social meta tags.

    Scoring starts at 100: a missing title, description or viewport costs 20
    each; a title outside 30-60 chars or a description outside 70-160 chars
    costs 10 each. The score never drops below 0.

    Returns:
        A dict with ``title_length``, ``meta_description_length``,
        ``has_viewport``, ``charset``, ``robots_meta``, ``og_tags``,
        ``twitter_card``, ``score`` and ``issues``.
    """
    title = soup.title.string if soup.title else None
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    viewport = soup.find('meta', attrs={'name': 'viewport'})
    robots = soup.find('meta', attrs={'name': 'robots'})
    charset = soup.find('meta', attrs={'charset': True})
    # Social Tags
    og_title = soup.find('meta', property='og:title')
    og_desc = soup.find('meta', property='og:description')
    og_image = soup.find('meta', property='og:image')
    twitter_card = soup.find('meta', attrs={'name': 'twitter:card'})

    # Use .get(): a meta tag that lacks a `content` attribute must be
    # reported as missing rather than raising KeyError.
    desc_content = meta_desc.get('content') if meta_desc else None
    robots_content = robots.get('content') if robots else None
    twitter_content = twitter_card.get('content') if twitter_card else None

    issues = []
    score = 100
    # Title Analysis
    if not title:
        issues.append("Missing title tag")
        score -= 20
    elif len(title) < 30 or len(title) > 60:
        issues.append(f"Title length ({len(title)} chars) should be 30-60 chars")
        score -= 10
    # Description Analysis
    if not desc_content:
        issues.append("Missing meta description")
        score -= 20
    elif len(desc_content) < 70 or len(desc_content) > 160:
        issues.append(f"Description length ({len(desc_content)} chars) should be 70-160 chars")
        score -= 10
    # Viewport
    if not viewport:
        issues.append("Missing viewport meta tag")
        score -= 20

    og_found = [
        label
        for label, tag in (('Title', og_title), ('Desc', og_desc), ('Image', og_image))
        if tag
    ]
    return {
        "title_length": f"{len(title)} chars" if title else "Missing",
        "meta_description_length": f"{len(desc_content)} chars" if desc_content else "Missing",
        "has_viewport": bool(viewport),
        # `charset` was located by the presence of the attribute, so indexing is safe.
        "charset": charset['charset'] if charset else "Missing",
        "robots_meta": robots_content if robots_content is not None else "Missing (Default: index, follow)",
        "og_tags": f"Found: {', '.join(og_found)}" if og_found else "None",
        "twitter_card": twitter_content if twitter_content is not None else "Missing",
        "score": max(0, score),
        "issues": issues
    }
def _analyze_technical(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Analyze technical SEO elements: canonical link, JSON-LD schema and H1 count.

    Scoring starts at 100: a missing canonical costs 10, no H1 costs 20 and
    more than one H1 costs 10. ``url`` is accepted but not used here.

    Returns:
        A dict with ``canonical_tag``, ``schema_markup``, ``h1_count``,
        ``score`` and ``issues``.
    """
    canonical = soup.find('link', attrs={'rel': 'canonical'})
    schema = soup.find_all('script', type='application/ld+json')
    # Use .get(): a <link rel="canonical"> without an href is treated as a
    # missing canonical instead of raising KeyError.
    canonical_href = canonical.get('href') if canonical else None

    issues = []
    score = 100
    if canonical_href is None:
        issues.append("Missing canonical tag")
        score -= 10
    # Check H1
    h1_tags = soup.find_all('h1')
    if len(h1_tags) == 0:
        issues.append("Missing H1 tag")
        score -= 20
    elif len(h1_tags) > 1:
        issues.append(f"Multiple H1 tags found ({len(h1_tags)})")
        score -= 10
    return {
        "canonical_tag": canonical_href if canonical_href is not None else "Missing",
        "schema_markup": f"Found {len(schema)} schema objects",
        "h1_count": len(h1_tags),
        "score": max(0, score),
        "issues": issues
    }
def _analyze_content(self, soup: BeautifulSoup) -> Dict[str, Any]:
    """Analyze content quality: word count and image alt-text coverage.

    NOTE: ``<script>`` and ``<style>`` elements are removed from ``soup`` in
    place before the text is extracted, so any analysis that needs those
    elements must run before this method.

    Scoring starts at 100: fewer than 300 words costs 20 and any image
    without alt text costs 10.

    Returns:
        A dict with ``word_count``, ``total_images``, ``images_without_alt``,
        ``readability``, ``score`` and ``issues``.
    """
    min_words = 300
    # Remove scripts and styles so their contents are not counted as words
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    words = len(re.findall(r'\w+', text))
    images = soup.find_all('img')
    images_without_alt = sum(1 for img in images if not img.get('alt'))
    issues = []
    score = 100
    if words < min_words:
        issues.append(f"Low word count ({words} words)")
        score -= 20
    if images_without_alt > 0:
        issues.append(f"{images_without_alt} images missing alt text")
        score -= 10
    return {
        "word_count": words,
        "total_images": len(images),
        "images_without_alt": images_without_alt,
        # Placeholder for readability algo. Uses the same threshold as the
        # low-word-count issue so exactly `min_words` words is not flagged
        # here while passing the check above.
        "readability": "Good" if words >= min_words else "Needs Improvement",
        "score": max(0, score),
        "issues": issues
    }
def _analyze_url_structure(self, url: str) -> Dict[str, Any]:
parsed = urlparse(url)
return {
"protocol": parsed.scheme,
"domain": parsed.netloc,
"path_depth": len(parsed.path.strip('/').split('/')) if parsed.path else 0,
"is_https": parsed.scheme == 'https'
}
def _calculate_overall_score(self, *analyses) -> int:
total = sum(a.get('score', 0) for a in analyses)
return round(total / len(analyses))
def _generate_summary(self, *analyses) -> Dict[str, Any]:
critical_issues = []
for a in analyses:
for issue in a.get('issues', []):
critical_issues.append({"message": issue, "severity": "critical", "category": "SEO"})
return {"critical_issues": critical_issues}
async def analyze_on_page_seo(
self,
url: str,
@@ -25,18 +178,53 @@ class OnPageSEOService:
analyze_content_quality: bool = True
) -> Dict[str, Any]:
"""Analyze on-page SEO factors"""
# Placeholder implementation
return {
"url": url,
"overall_score": 75,
"title_analysis": {"score": 80, "issues": [], "recommendations": []},
"meta_description": {"score": 70, "issues": [], "recommendations": []},
"heading_structure": {"score": 85, "issues": [], "recommendations": []},
"content_analysis": {"score": 75, "word_count": 1500, "readability": "Good"},
"keyword_analysis": {"target_keywords": target_keywords or [], "optimization": "Moderate"},
"image_analysis": {"total_images": 10, "missing_alt": 2} if analyze_images else {},
"recommendations": ["Optimize meta description", "Add more target keywords"]
}
try:
# Add protocol if missing
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
html_content, status_code = await self._fetch_page(url)
if not html_content:
# Return error structure
return {
"url": url,
"overall_score": 0,
"summary": {"critical_issues": [{"message": f"Failed to fetch URL (Status: {status_code})", "severity": "critical", "category": "Connectivity"}]},
"meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {}
}
soup = BeautifulSoup(html_content, 'html.parser')
# Run Analyses
meta_analysis = self._analyze_meta_tags(soup)
technical_analysis = self._analyze_technical(soup, url)
content_analysis = self._analyze_content(soup)
url_analysis = self._analyze_url_structure(url)
result = {
"url": url,
"overall_score": self._calculate_overall_score(meta_analysis, technical_analysis, content_analysis),
"meta": meta_analysis,
"technical": technical_analysis,
"content_health": content_analysis,
"url_structure": url_analysis,
"performance": {"load_time": "Real-time check pending"},
"accessibility": {"images_without_alt": content_analysis["images_without_alt"]},
"ux": {"viewport": meta_analysis["has_viewport"], "mobile_friendly": bool(meta_analysis["has_viewport"])},
"summary": self._generate_summary(meta_analysis, technical_analysis, content_analysis)
}
return result
except Exception as e:
logger.error(f"Error analyzing {url}: {str(e)}")
return {
"url": url,
"overall_score": 0,
"summary": {"critical_issues": [{"message": str(e), "severity": "critical", "category": "System"}]},
"meta": {}, "technical": {}, "content_health": {}, "url_structure": {}, "performance": {}, "accessibility": {}, "ux": {}
}
async def health_check(self) -> Dict[str, Any]:
"""Health check for the on-page SEO service"""
@@ -44,4 +232,4 @@ class OnPageSEOService:
"status": "operational",
"service": self.service_name,
"last_check": datetime.utcnow().isoformat()
}
}

View File

@@ -31,7 +31,8 @@ class PageSpeedService:
url: str,
strategy: str = "DESKTOP",
locale: str = "en",
categories: List[str] = None
categories: List[str] = None,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""
Analyze website performance using Google PageSpeed Insights
@@ -70,7 +71,7 @@ class PageSpeedService:
structured_results = self._structure_pagespeed_results(pagespeed_data)
# Generate AI-enhanced insights
ai_insights = await self._generate_ai_insights(structured_results, url, strategy)
ai_insights = await self._generate_ai_insights(structured_results, url, strategy, user_id=user_id)
# Calculate optimization priority
optimization_plan = self._create_optimization_plan(structured_results)
@@ -281,7 +282,8 @@ class PageSpeedService:
self,
structured_results: Dict[str, Any],
url: str,
strategy: str
strategy: str,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""Generate AI-powered insights and recommendations"""
@@ -299,7 +301,8 @@ class PageSpeedService:
# Generate AI insights
ai_response = llm_text_gen(
prompt=prompt,
system_prompt=self._get_system_prompt()
system_prompt=self._get_system_prompt(),
user_id=user_id
)
# Parse AI response
@@ -598,4 +601,4 @@ Focus on practical advice that content creators and digital marketers can unders
"service": self.service_name,
"error": str(e),
"last_check": datetime.utcnow().isoformat()
}
}

View File

@@ -8,12 +8,14 @@ content distribution, and publishing patterns for SEO optimization.
import aiohttp
import asyncio
import re
import json
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from loguru import logger
import xml.etree.ElementTree as ET
from urllib.parse import urlparse, urljoin
import pandas as pd
import gzip
from ..llm_providers.main_text_generation import llm_text_gen
from middleware.logging_middleware import seo_logger
@@ -52,7 +54,9 @@ class SitemapService:
self,
sitemap_url: str,
analyze_content_trends: bool = True,
analyze_publishing_patterns: bool = True
analyze_publishing_patterns: bool = True,
include_ai_insights: bool = True,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""
Analyze website sitemap for structure and patterns
@@ -92,10 +96,11 @@ class SitemapService:
if analyze_publishing_patterns and sitemap_data.get("urls"):
publishing_patterns = self._analyze_publishing_patterns(sitemap_data["urls"])
# Generate AI insights
ai_insights = await self._generate_ai_insights(
structure_analysis, content_trends, publishing_patterns, sitemap_url
)
ai_insights = {}
if include_ai_insights:
ai_insights = await self._generate_ai_insights(
structure_analysis, content_trends, publishing_patterns, sitemap_url, user_id=user_id
)
execution_time = (datetime.utcnow() - start_time).total_seconds()
@@ -119,7 +124,8 @@ class SitemapService:
input_data={
"sitemap_url": sitemap_url,
"analyze_content_trends": analyze_content_trends,
"analyze_publishing_patterns": analyze_publishing_patterns
"analyze_publishing_patterns": analyze_publishing_patterns,
"include_ai_insights": include_ai_insights
},
output_data=result,
success=True
@@ -145,19 +151,88 @@ class SitemapService:
raise
async def _fetch_sitemap_data(self, sitemap_url: str) -> Dict[str, Any]:
async def _fetch_sitemap_data(self, sitemap_url: str, depth: int = 0, session: aiohttp.ClientSession = None) -> Dict[str, Any]:
"""Fetch and parse sitemap data"""
# Reduced max depth from 3 to 2 to prevent infinite recursion/hanging on massive sites
if depth > 2:
logger.info(f"🛑 Max recursion depth (2) reached for sitemap {sitemap_url}")
return {"urls": [], "sitemaps": [], "total_urls": 0}
# Use passed session or create a new local one if it's the top-level call
local_session = False
if session is None:
local_session = True
# Limit pool size and set strict timeouts
connector = aiohttp.TCPConnector(limit_per_host=5, force_close=True)
# Increased total timeout to 60s for slow sitemaps, but kept connect/read strict
timeout = aiohttp.ClientTimeout(total=60, connect=10, sock_read=30)
session = aiohttp.ClientSession(connector=connector, timeout=timeout)
try:
async with aiohttp.ClientSession() as session:
async with session.get(sitemap_url, timeout=aiohttp.ClientTimeout(total=30)) as response:
logger.info(f"🔍 Fetching sitemap: {sitemap_url} (depth={depth})")
# 10MB limit for sitemaps
MAX_SITEMAP_SIZE = 10 * 1024 * 1024
try:
async with session.get(sitemap_url) as response:
if response.status != 200:
raise Exception(f"Failed to fetch sitemap: HTTP {response.status}")
content = await response.text()
# Parse XML
root = ET.fromstring(content)
# Check Content-Type header
content_type = response.headers.get("Content-Type", "").lower()
if "text/html" in content_type:
raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
# Check Content-Length header if available
content_length = response.headers.get("Content-Length")
if content_length and int(content_length) > MAX_SITEMAP_SIZE:
raise Exception(f"Sitemap too large: {content_length} bytes")
# Read with size limit (safe read)
raw = await response.content.read(MAX_SITEMAP_SIZE + 1)
if len(raw) > MAX_SITEMAP_SIZE:
raise Exception(f"Sitemap size exceeds limit of {MAX_SITEMAP_SIZE} bytes")
if sitemap_url.lower().endswith(".gz") or (len(raw) >= 2 and raw[0] == 0x1F and raw[1] == 0x8B):
try:
raw = gzip.decompress(raw)
except Exception:
pass
try:
content = raw.decode(response.charset or "utf-8", errors="replace")
except Exception:
content = raw.decode("utf-8", errors="replace")
content_stripped = content.lstrip()
if not content_stripped.startswith("<"):
urls = []
# Limit text sitemaps to 50k lines
lines = content.splitlines()[:50000]
for line in lines:
line_clean = (line or "").strip()
if not line_clean or line_clean.startswith("#"):
continue
if line_clean.startswith("http://") or line_clean.startswith("https://"):
urls.append({"loc": line_clean})
return {
"urls": urls,
"sitemaps": [],
"total_urls": len(urls)
}
# Check for HTML content disguised as XML
if content.strip().lower().startswith(("<!doctype html", "<html")):
raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
# Use defusedxml for safety if available, otherwise standard ET
try:
import defusedxml.ElementTree as DET
root = DET.fromstring(content)
except ImportError:
root = ET.fromstring(content)
# Handle different sitemap formats
urls = []
@@ -172,17 +247,28 @@ class SitemapService:
if loc is not None:
sitemaps.append(loc.text)
# Fetch and parse nested sitemaps
for nested_url in sitemaps[:10]: # Limit to 10 sitemaps
try:
nested_data = await self._fetch_sitemap_data(nested_url)
urls.extend(nested_data.get("urls", []))
except Exception as e:
logger.warning(f"Failed to fetch nested sitemap {nested_url}: {e}")
# Fetch and parse nested sitemaps in parallel
nested_tasks = []
# Reduced nested limit from 10 to 5 to prevent fan-out explosion
for nested_url in sitemaps[:5]:
nested_tasks.append(self._fetch_sitemap_data(nested_url, depth + 1, session))
if nested_tasks:
nested_results = await asyncio.gather(*nested_tasks, return_exceptions=True)
for res in nested_results:
if isinstance(res, Exception):
logger.warning(f"Failed to fetch nested sitemap: {res}")
elif isinstance(res, dict):
urls.extend(res.get("urls", []))
else:
# Regular sitemap with URLs
# Limit to first 10k URLs per sitemap file to prevent memory issues
url_count = 0
for url_element in root:
if url_count >= 10000:
break
if url_element.tag.endswith('url'):
url_data = {}
@@ -192,18 +278,42 @@ class SitemapService:
if 'loc' in url_data:
urls.append(url_data)
url_count += 1
return {
"urls": urls,
"sitemaps": sitemaps,
"total_urls": len(urls)
}
except Exception as e:
# Re-raise to be caught by outer try/except
raise e
except ET.ParseError as e:
# Check if content is empty
if not content or not content.strip():
logger.warning(f"Sitemap is empty: {sitemap_url}")
return {"urls": [], "sitemaps": [], "total_urls": 0}
# Check if content looks like HTML to give a better error message
try:
if "content" in locals() and ("<html" in content.lower() or "<body" in content.lower() or "<div" in content.lower()):
raise Exception("URL returned a webpage (HTML), not a valid XML sitemap")
except Exception:
pass
logger.warning(f"Failed to parse sitemap XML: {e}")
raise Exception(f"Failed to parse sitemap XML: {e}")
except Exception as e:
logger.error(f"Error fetching sitemap data: {e}")
if "no element found" in str(e) or "not a valid XML sitemap" in str(e):
logger.warning(f"⚠️ Sitemap parsing failed for {sitemap_url}: {e}")
else:
logger.error(f"Error fetching sitemap data for {sitemap_url}: {e}")
raise
finally:
# Only close the session if we created it
if local_session and session:
await session.close()
def _analyze_sitemap_structure(self, sitemap_data: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze the structure of the sitemap"""
@@ -239,14 +349,60 @@ class SitemapService:
# Calculate statistics
avg_path_depth = sum(path_levels) / len(path_levels) if path_levels else 0
# Enhancement: Keyword Clustering & Strategic Pillar Mapping
keyword_clusters = self._cluster_keywords_from_urls(urls)
strategic_pillars = self._map_strategic_pillars(urls)
return {
"total_urls": len(urls),
"url_patterns": dict(sorted(url_patterns.items(), key=lambda x: x[1], reverse=True)[:10]),
"file_types": dict(sorted(file_types.items(), key=lambda x: x[1], reverse=True)),
"average_path_depth": round(avg_path_depth, 2),
"max_path_depth": max(path_levels) if path_levels else 0,
"keyword_clusters": keyword_clusters,
"strategic_pillars": strategic_pillars,
"structure_quality": self._assess_structure_quality(url_patterns, avg_path_depth)
}
def _cluster_keywords_from_urls(self, urls: List[Dict[str, Any]]) -> Dict[str, int]:
    """Count recurring keywords in URL path slugs to identify content strategy focus.

    Only the first 1000 entries of ``urls`` are sampled. The path component of
    each URL is split on every non-alphanumeric character (so hyphens, slashes,
    dots and underscores all act as separators). A token is counted when it is
    longer than three characters, is not purely numeric, and is not one of the
    stop words below.

    Args:
        urls: Sitemap entries; each is a dict whose ``loc`` key holds the URL.
            Entries with a missing, empty, or ``None`` ``loc`` contribute nothing.

    Returns:
        Mapping of lower-cased keyword to occurrence count, limited to the 15
        most frequent keywords and ordered by descending count (ties keep
        first-seen order).
    """
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'from', 'category', 'tag', 'blog', 'posts', 'archive'}
    keywords: Dict[str, int] = {}

    for entry in urls[:1000]:  # Sample 1000 for performance
        # `loc` may be present but None (e.g. an empty <loc/> element).
        # urlparse(None) returns a bytes-based result, which would make the
        # str regex below raise TypeError, so coerce to an empty string first.
        path = urlparse(entry.get('loc') or '').path
        for part in re.split(r'[^a-zA-Z0-9]', path):
            token = part.lower()
            if len(token) > 3 and token not in stop_words and not token.isdigit():
                keywords[token] = keywords.get(token, 0) + 1

    # Stable sort: equal counts stay in first-seen order; keep the top 15.
    ranked = sorted(keywords.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:15])
def _map_strategic_pillars(self, urls: List[Dict[str, Any]]) -> Dict[str, int]:
    """Categorize URLs into strategic content pillars based on common URL tokens.

    Each URL is lower-cased and tested against the token lists below using
    plain substring matching on the whole URL string (host and query included,
    not just the path). A URL is counted for the first pillar, in declaration
    order, that has any matching token; URLs matching no pillar are not counted.

    Args:
        urls: Sitemap entries; each is a dict whose ``loc`` key holds the URL.
            Entries with a missing, empty, or ``None`` ``loc`` match nothing.

    Returns:
        Mapping of every pillar name to the number of URLs assigned to it
        (pillars with no matches are present with a count of 0).
    """
    pillars = {
        "Educational": ["blog", "guides", "how-to", "learn", "academy", "resource", "documentation", "docs"],
        "Transactional": ["product", "features", "pricing", "plans", "solutions", "buy", "checkout", "cart"],
        "Comparison": ["vs", "alternative", "comparison", "reviews", "best-of"],
        "Company": ["about", "careers", "press", "contact", "team", "legal", "privacy", "terms"],
        "Tools": ["calculator", "tool", "generator", "checker", "analyzer"]
    }

    results = {name: 0 for name in pillars}
    for entry in urls:
        # `loc` may be present but None; treat it like a missing URL instead
        # of failing on None.lower().
        loc = (entry.get('loc') or '').lower()
        for pillar, tokens in pillars.items():
            if any(token in loc for token in tokens):
                results[pillar] += 1
                break  # first matching pillar wins

    # Optional: Add "Other" category if needed
    return results
def _analyze_content_trends(self, urls: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Analyze content publishing trends"""
@@ -334,7 +490,9 @@ class SitemapService:
competitors: List[str] = None,
industry_context: str = None,
analyze_content_trends: bool = True,
analyze_publishing_patterns: bool = True
analyze_publishing_patterns: bool = True,
include_ai_insights: bool = True,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""Enhanced sitemap analysis specifically for onboarding Step 3 competitive analysis"""
@@ -343,7 +501,9 @@ class SitemapService:
analysis_result = await self.analyze_sitemap(
sitemap_url=sitemap_url,
analyze_content_trends=analyze_content_trends,
analyze_publishing_patterns=analyze_publishing_patterns
analyze_publishing_patterns=analyze_publishing_patterns,
include_ai_insights=include_ai_insights,
user_id=user_id
)
# Enhance with onboarding-specific insights
@@ -351,7 +511,8 @@ class SitemapService:
analysis_result,
user_url,
competitors,
industry_context
industry_context,
user_id=user_id
)
# Combine results
@@ -374,7 +535,8 @@ class SitemapService:
analysis_result: Dict[str, Any],
user_url: str,
competitors: List[str] = None,
industry_context: str = None
industry_context: str = None,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""Generate onboarding-specific insights for competitive analysis"""
@@ -389,10 +551,37 @@ class SitemapService:
user_url, competitors, industry_context
)
# Define JSON schema for structured output
json_struct = {
"type": "object",
"properties": {
"competitive_positioning": {"type": "string"},
"content_gaps": {
"type": "array",
"items": {"type": "string"}
},
"growth_opportunities": {
"type": "array",
"items": {"type": "string"}
},
"industry_benchmarks": {
"type": "array",
"items": {"type": "string"}
},
"strategic_recommendations": {
"type": "array",
"items": {"type": "string"}
}
},
"required": ["competitive_positioning", "content_gaps", "growth_opportunities", "industry_benchmarks", "strategic_recommendations"]
}
# Generate AI insights
ai_response = llm_text_gen(
prompt=prompt,
system_prompt=self._get_onboarding_system_prompt()
system_prompt=self._get_onboarding_system_prompt(),
json_struct=json_struct,
user_id=user_id
)
# Parse and structure insights
@@ -402,7 +591,7 @@ class SitemapService:
await seo_logger.log_ai_analysis(
tool_name=f"{self.service_name}_onboarding",
prompt=prompt,
response=ai_response,
response=ai_response if isinstance(ai_response, str) else str(ai_response),
model_used="gemini-2.0-flash-001"
)
@@ -422,7 +611,8 @@ class SitemapService:
structure_analysis: Dict[str, Any],
content_trends: Dict[str, Any],
publishing_patterns: Dict[str, Any],
sitemap_url: str
sitemap_url: str,
user_id: Optional[str] = None
) -> Dict[str, Any]:
"""Generate AI-powered insights for sitemap analysis"""
@@ -435,7 +625,8 @@ class SitemapService:
# Generate AI insights
ai_response = llm_text_gen(
prompt=prompt,
system_prompt=self._get_system_prompt()
system_prompt=self._get_system_prompt(),
user_id=user_id
)
# Parse and structure insights
@@ -697,7 +888,12 @@ Focus on actionable insights for content creators and digital marketing professi
try:
# Test with a simple sitemap
test_url = "https://www.google.com/sitemap.xml"
result = await self.analyze_sitemap(test_url, False, False)
result = await self.analyze_sitemap(
sitemap_url=test_url,
analyze_content_trends=False,
analyze_publishing_patterns=False,
include_ai_insights=False
)
return {
"status": "operational",
@@ -731,7 +927,7 @@ Focus on actionable insights for content creators and digital marketing professi
competitor_info = ""
if competitors:
competitor_info = f"\nCompetitors to consider: {', '.join(competitors[:5])}"
competitor_info = f"\nCompetitors to consider: {', '.join(competitors)}"
industry_info = ""
if industry_context:
@@ -753,12 +949,12 @@ Content Publishing Patterns:
- Publishing Rate: {publishing_velocity:.2f} pages per day
- Content Categories: {len(url_patterns)} main categories identified
Please provide competitive analysis insights focusing on:
Please provide competitive analysis insights focusing on the following sections:
1. **COMPETITIVE POSITIONING**: How does this site's content structure compare to industry standards?
2. **CONTENT GAPS**: What content categories or topics are missing based on the URL structure?
3. **GROWTH OPPORTUNITIES**: Specific content expansion opportunities to compete better
4. **INDUSTRY BENCHMARKS**: How does publishing frequency and content depth compare to competitors?
1. **COMPETITIVE POSITIONING**: How does this site's content structure compare to industry standards? (Provide a brief paragraph)
2. **CONTENT GAPS**: What content categories or topics are missing based on the URL structure? (List 3-5 specific gaps)
3. **GROWTH OPPORTUNITIES**: Specific content expansion opportunities to compete better (List 3-5 opportunities)
4. **INDUSTRY BENCHMARKS**: How does publishing frequency and content depth compare to competitors? (List 3 key comparisons)
5. **STRATEGIC RECOMMENDATIONS**: 3-5 actionable steps for content strategy improvement
Focus on actionable insights that help content creators understand their competitive position and identify growth opportunities.
@@ -783,69 +979,61 @@ Provide practical, data-driven insights that help content creators make informed
Format your response as structured insights that can be easily parsed and displayed in a user interface."""
def _parse_onboarding_insights(self, ai_response: str) -> Dict[str, Any]:
def _parse_onboarding_insights(self, ai_response: Any) -> Dict[str, Any]:
"""Parse AI response for onboarding-specific insights"""
try:
# Initialize structured response
insights = {
"competitive_positioning": "Analysis in progress...",
"content_gaps": [],
"growth_opportunities": [],
"industry_benchmarks": [],
"strategic_recommendations": []
insights = {}
# If it's already a dict (structured output), use it
if isinstance(ai_response, dict):
insights = ai_response
elif isinstance(ai_response, str):
# Try to parse JSON string
try:
insights = json.loads(ai_response)
except json.JSONDecodeError:
# Try to extract JSON from markdown block
json_match = re.search(r'```json\s*(.*?)\s*```', ai_response, re.DOTALL)
if json_match:
try:
insights = json.loads(json_match.group(1))
except json.JSONDecodeError:
pass
# Ensure all required keys exist
required_keys = [
"competitive_positioning",
"content_gaps",
"growth_opportunities",
"industry_benchmarks",
"strategic_recommendations"
]
# Validate and fill missing keys
validated_insights = {
"competitive_positioning": insights.get("competitive_positioning", "Analysis in progress..."),
"content_gaps": insights.get("content_gaps", []),
"growth_opportunities": insights.get("growth_opportunities", []),
"industry_benchmarks": insights.get("industry_benchmarks", []),
"strategic_recommendations": insights.get("strategic_recommendations", [])
}
# Simple parsing logic - look for structured sections
lines = ai_response.split('\n')
current_section = None
for line in lines:
line = line.strip()
if not line:
continue
# Detect sections
if any(keyword in line.lower() for keyword in ['competitive positioning', 'market position']):
current_section = 'competitive_positioning'
insights[current_section] = line
elif any(keyword in line.lower() for keyword in ['content gaps', 'missing content']):
current_section = 'content_gaps'
elif any(keyword in line.lower() for keyword in ['growth opportunities', 'expansion']):
current_section = 'growth_opportunities'
elif any(keyword in line.lower() for keyword in ['industry benchmarks', 'benchmarks']):
current_section = 'industry_benchmarks'
elif any(keyword in line.lower() for keyword in ['strategic recommendations', 'recommendations']):
current_section = 'strategic_recommendations'
elif line.startswith('-') or line.startswith(''):
# This is a list item
if current_section and current_section in insights:
if isinstance(insights[current_section], str):
insights[current_section] = [insights[current_section]]
insights[current_section].append(line[1:].strip())
elif current_section == 'competitive_positioning':
# Append to competitive positioning text
if insights[current_section] == "Analysis in progress...":
insights[current_section] = line
# Ensure lists are actually lists
for key in required_keys[1:]:
if not isinstance(validated_insights[key], list):
if isinstance(validated_insights[key], str):
validated_insights[key] = [validated_insights[key]]
else:
insights[current_section] += " " + line
# Fallback: if no structured parsing worked, use the full response
if insights["competitive_positioning"] == "Analysis in progress...":
insights["competitive_positioning"] = ai_response[:500] + "..." if len(ai_response) > 500 else ai_response
# Ensure lists are properly formatted
for key in ['content_gaps', 'growth_opportunities', 'industry_benchmarks', 'strategic_recommendations']:
if isinstance(insights[key], str):
insights[key] = [insights[key]] if insights[key] else []
return insights
validated_insights[key] = []
return validated_insights
except Exception as e:
logger.error(f"Error parsing onboarding insights: {e}")
return {
"competitive_positioning": ai_response[:300] + "..." if len(ai_response) > 300 else ai_response,
"content_gaps": ["Analysis parsing error - see full response above"],
"competitive_positioning": "Analysis unavailable",
"content_gaps": [],
"growth_opportunities": [],
"industry_benchmarks": [],
"strategic_recommendations": []
@@ -889,6 +1077,48 @@ Format your response as structured insights that can be easily parsed and displa
logger.error(f"Error discovering sitemap for {website_url}: {e}")
return None
async def _find_sitemap_on_homepage(self, base_url: str) -> Optional[str]:
    """
    Check homepage for sitemap links in HTML.

    Fetches ``base_url``, collects every ``href`` attribute value that contains
    "sitemap"/"Sitemap" followed by ".xml", resolves relative links against
    ``base_url``, and returns the first candidate that
    ``self._check_sitemap_url`` accepts. This is a best-effort lookup: any
    exception is logged at debug level and reported as "not found".

    Args:
        base_url: Base URL of the website

    Returns:
        Sitemap URL if found on homepage, None otherwise
    """
    try:
        logger.debug(f"Checking homepage for sitemap links: {base_url}")
        async with aiohttp.ClientSession() as session:
            async with session.get(base_url, timeout=aiohttp.ClientTimeout(total=15), headers={"User-Agent": "ALwrity-SEO-Bot/1.0"}) as response:
                if response.status != 200:
                    return None
                content = await response.text()

        # The homepage session is closed at this point; candidate verification
        # below does not use it, so there is no reason to keep it open.

        # Look for sitemap links in href attributes
        # Matches: href="...sitemap.xml..." or href='...sitemap.xml...'
        # Simple regex to catch common variations
        sitemap_matches = re.findall(r'href=["\']([^"\']*[sS]itemap[^"\']*\.xml[^"\']*)["\']', content)

        # The same link often appears more than once on a page (header,
        # footer); verify each distinct URL only once, in document order.
        already_checked = set()
        for match in sitemap_matches:
            potential_url = match.strip()

            # Handle relative URLs
            if not potential_url.startswith(('http://', 'https://')):
                potential_url = urljoin(base_url, potential_url)

            if potential_url in already_checked:
                continue
            already_checked.add(potential_url)

            logger.debug(f"Found potential sitemap link on homepage: {potential_url}")

            # Verify accessibility
            if await self._check_sitemap_url(potential_url, "homepage link"):
                return potential_url

        return None

    except Exception as e:
        logger.debug(f"Error checking homepage for sitemap: {e}")
        return None
async def _find_sitemap_in_robots_txt(self, base_url: str) -> Optional[str]:
"""
Check robots.txt for sitemap directives.
@@ -1027,4 +1257,4 @@ Format your response as structured insights that can be easily parsed and displa
return response.status == 200
except Exception:
return False
return False

View File

@@ -5,8 +5,12 @@ Comprehensive technical SEO crawler and analyzer with AI-enhanced
insights for website optimization and search engine compatibility.
"""
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import time
from typing import Dict, Any, List, Optional
from datetime import datetime
from loguru import logger
class TechnicalSEOService:
@@ -16,6 +20,9 @@ class TechnicalSEOService:
"""Initialize the technical SEO service"""
self.service_name = "technical_seo_analyzer"
logger.info(f"Initialized {self.service_name}")
self.headers = {
'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +http://alwrity.com/bot)'
}
async def analyze_technical_seo(
self,
@@ -25,20 +32,115 @@ class TechnicalSEOService:
analyze_performance: bool = True
) -> Dict[str, Any]:
"""Analyze technical SEO factors"""
# Placeholder implementation
return {
"url": url,
"pages_crawled": 25,
"crawl_depth": crawl_depth,
"technical_issues": [
{"type": "Missing robots.txt", "severity": "Medium", "pages_affected": 1},
{"type": "Slow loading pages", "severity": "High", "pages_affected": 3}
],
"site_structure": {"internal_links": 150, "external_links": 25 if include_external_links else 0},
"performance_metrics": {"avg_load_time": 2.5, "largest_contentful_paint": 1.8} if analyze_performance else {},
"recommendations": ["Implement robots.txt", "Optimize page load speed"],
"crawl_summary": {"successful": 23, "errors": 2, "redirects": 5}
}
try:
start_time = time.time()
async with aiohttp.ClientSession(headers=self.headers) as session:
async with session.get(url, timeout=30) as response:
load_time = time.time() - start_time
status_code = response.status
content = await response.text()
headers = response.headers
# Basic parsing
soup = BeautifulSoup(content, 'html.parser')
# 1. Meta Tags Analysis
title = soup.title.string if soup.title else None
meta_desc = soup.find('meta', attrs={'name': 'description'})
meta_desc_content = meta_desc['content'] if meta_desc else None
# 2. Heading Structure
h1_tags = soup.find_all('h1')
h2_tags = soup.find_all('h2')
h3_tags = soup.find_all('h3')
# 3. Image Analysis
images = soup.find_all('img')
images_without_alt = [img['src'] for img in images if not img.get('alt')]
# 4. Link Analysis
links = soup.find_all('a')
internal_links = []
external_links = []
domain = urlparse(url).netloc
for link in links:
href = link.get('href')
if not href:
continue
if href.startswith('http'):
if domain in href:
internal_links.append(href)
else:
external_links.append(href)
elif href.startswith('/'):
internal_links.append(urljoin(url, href))
# 5. Technical Issues Detection
issues = []
# Status Code Issues
if status_code != 200:
issues.append({"type": f"Status Code {status_code}", "severity": "High", "pages_affected": 1})
# Performance Issues
if load_time > 2.0:
issues.append({"type": "Slow Server Response", "severity": "Medium", "pages_affected": 1})
# Meta Issues
if not title:
issues.append({"type": "Missing Title Tag", "severity": "High", "pages_affected": 1})
elif len(title) > 60:
issues.append({"type": "Title Tag Too Long", "severity": "Low", "pages_affected": 1})
if not meta_desc_content:
issues.append({"type": "Missing Meta Description", "severity": "High", "pages_affected": 1})
# Content Structure Issues
if not h1_tags:
issues.append({"type": "Missing H1 Tag", "severity": "High", "pages_affected": 1})
elif len(h1_tags) > 1:
issues.append({"type": "Multiple H1 Tags", "severity": "Medium", "pages_affected": 1})
# Image Issues
if images_without_alt:
issues.append({"type": "Images Missing Alt Text", "severity": "Medium", "pages_affected": len(images_without_alt)})
# Security Issues
if url.startswith('http:'):
issues.append({"type": "Insecure Protocol (HTTP)", "severity": "High", "pages_affected": 1})
return {
"url": url,
"pages_crawled": 1, # Currently single page
"crawl_depth": 1,
"technical_issues": issues,
"site_structure": {
"internal_links": len(internal_links),
"external_links": len(external_links) if include_external_links else 0,
"h1_count": len(h1_tags),
"h2_count": len(h2_tags),
"h3_count": len(h3_tags)
},
"performance_metrics": {
"response_time": round(load_time, 3),
"content_size": len(content)
} if analyze_performance else {},
"recommendations": [issue['type'] for issue in issues],
"crawl_summary": {
"successful": 1 if status_code == 200 else 0,
"errors": 1 if status_code >= 400 else 0,
"redirects": 1 if 300 <= status_code < 400 else 0
}
}
except Exception as e:
logger.error(f"Error in technical SEO analysis: {e}")
return {
"url": url,
"error": str(e),
"technical_issues": [{"type": "Crawl Failed", "severity": "High", "pages_affected": 1}]
}
async def health_check(self) -> Dict[str, Any]:
"""Health check for the technical SEO service"""