Recovered state: integrated TrendSurferAgent, restored frontend/backend files, and cleaned up recovery scripts
This commit is contained in:
@@ -5,8 +5,12 @@ Comprehensive technical SEO crawler and analyzer with AI-enhanced
|
||||
insights for website optimization and search engine compatibility.
|
||||
"""
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import time
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
|
||||
class TechnicalSEOService:
|
||||
@@ -16,6 +20,9 @@ class TechnicalSEOService:
|
||||
"""Initialize the technical SEO service"""
|
||||
self.service_name = "technical_seo_analyzer"
|
||||
logger.info(f"Initialized {self.service_name}")
|
||||
self.headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; ALwritySEO/1.0; +http://alwrity.com/bot)'
|
||||
}
|
||||
|
||||
async def analyze_technical_seo(
|
||||
self,
|
||||
@@ -25,20 +32,115 @@ class TechnicalSEOService:
|
||||
analyze_performance: bool = True
|
||||
) -> Dict[str, Any]:
|
||||
"""Analyze technical SEO factors"""
|
||||
# Placeholder implementation
|
||||
return {
|
||||
"url": url,
|
||||
"pages_crawled": 25,
|
||||
"crawl_depth": crawl_depth,
|
||||
"technical_issues": [
|
||||
{"type": "Missing robots.txt", "severity": "Medium", "pages_affected": 1},
|
||||
{"type": "Slow loading pages", "severity": "High", "pages_affected": 3}
|
||||
],
|
||||
"site_structure": {"internal_links": 150, "external_links": 25 if include_external_links else 0},
|
||||
"performance_metrics": {"avg_load_time": 2.5, "largest_contentful_paint": 1.8} if analyze_performance else {},
|
||||
"recommendations": ["Implement robots.txt", "Optimize page load speed"],
|
||||
"crawl_summary": {"successful": 23, "errors": 2, "redirects": 5}
|
||||
}
|
||||
try:
|
||||
start_time = time.time()
|
||||
async with aiohttp.ClientSession(headers=self.headers) as session:
|
||||
async with session.get(url, timeout=30) as response:
|
||||
load_time = time.time() - start_time
|
||||
status_code = response.status
|
||||
content = await response.text()
|
||||
headers = response.headers
|
||||
|
||||
# Basic parsing
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
|
||||
# 1. Meta Tags Analysis
|
||||
title = soup.title.string if soup.title else None
|
||||
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
||||
meta_desc_content = meta_desc['content'] if meta_desc else None
|
||||
|
||||
# 2. Heading Structure
|
||||
h1_tags = soup.find_all('h1')
|
||||
h2_tags = soup.find_all('h2')
|
||||
h3_tags = soup.find_all('h3')
|
||||
|
||||
# 3. Image Analysis
|
||||
images = soup.find_all('img')
|
||||
images_without_alt = [img['src'] for img in images if not img.get('alt')]
|
||||
|
||||
# 4. Link Analysis
|
||||
links = soup.find_all('a')
|
||||
internal_links = []
|
||||
external_links = []
|
||||
domain = urlparse(url).netloc
|
||||
|
||||
for link in links:
|
||||
href = link.get('href')
|
||||
if not href:
|
||||
continue
|
||||
if href.startswith('http'):
|
||||
if domain in href:
|
||||
internal_links.append(href)
|
||||
else:
|
||||
external_links.append(href)
|
||||
elif href.startswith('/'):
|
||||
internal_links.append(urljoin(url, href))
|
||||
|
||||
# 5. Technical Issues Detection
|
||||
issues = []
|
||||
|
||||
# Status Code Issues
|
||||
if status_code != 200:
|
||||
issues.append({"type": f"Status Code {status_code}", "severity": "High", "pages_affected": 1})
|
||||
|
||||
# Performance Issues
|
||||
if load_time > 2.0:
|
||||
issues.append({"type": "Slow Server Response", "severity": "Medium", "pages_affected": 1})
|
||||
|
||||
# Meta Issues
|
||||
if not title:
|
||||
issues.append({"type": "Missing Title Tag", "severity": "High", "pages_affected": 1})
|
||||
elif len(title) > 60:
|
||||
issues.append({"type": "Title Tag Too Long", "severity": "Low", "pages_affected": 1})
|
||||
|
||||
if not meta_desc_content:
|
||||
issues.append({"type": "Missing Meta Description", "severity": "High", "pages_affected": 1})
|
||||
|
||||
# Content Structure Issues
|
||||
if not h1_tags:
|
||||
issues.append({"type": "Missing H1 Tag", "severity": "High", "pages_affected": 1})
|
||||
elif len(h1_tags) > 1:
|
||||
issues.append({"type": "Multiple H1 Tags", "severity": "Medium", "pages_affected": 1})
|
||||
|
||||
# Image Issues
|
||||
if images_without_alt:
|
||||
issues.append({"type": "Images Missing Alt Text", "severity": "Medium", "pages_affected": len(images_without_alt)})
|
||||
|
||||
# Security Issues
|
||||
if url.startswith('http:'):
|
||||
issues.append({"type": "Insecure Protocol (HTTP)", "severity": "High", "pages_affected": 1})
|
||||
|
||||
return {
|
||||
"url": url,
|
||||
"pages_crawled": 1, # Currently single page
|
||||
"crawl_depth": 1,
|
||||
"technical_issues": issues,
|
||||
"site_structure": {
|
||||
"internal_links": len(internal_links),
|
||||
"external_links": len(external_links) if include_external_links else 0,
|
||||
"h1_count": len(h1_tags),
|
||||
"h2_count": len(h2_tags),
|
||||
"h3_count": len(h3_tags)
|
||||
},
|
||||
"performance_metrics": {
|
||||
"response_time": round(load_time, 3),
|
||||
"content_size": len(content)
|
||||
} if analyze_performance else {},
|
||||
"recommendations": [issue['type'] for issue in issues],
|
||||
"crawl_summary": {
|
||||
"successful": 1 if status_code == 200 else 0,
|
||||
"errors": 1 if status_code >= 400 else 0,
|
||||
"redirects": 1 if 300 <= status_code < 400 else 0
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in technical SEO analysis: {e}")
|
||||
return {
|
||||
"url": url,
|
||||
"error": str(e),
|
||||
"technical_issues": [{"type": "Crawl Failed", "severity": "High", "pages_affected": 1}]
|
||||
}
|
||||
|
||||
async def health_check(self) -> Dict[str, Any]:
|
||||
"""Health check for the technical SEO service"""
|
||||
|
||||
Reference in New Issue
Block a user