378 lines
16 KiB
Python
378 lines
16 KiB
Python
import streamlit as st
|
||
import advertools as adv
|
||
import pandas as pd
|
||
from urllib.parse import urlparse
|
||
import requests
|
||
from datetime import datetime
|
||
import tempfile
|
||
import os
|
||
|
||
|
||
# Title and introduction
|
||
def show_title_and_intro():
|
||
st.title("🌟 URL SEO Checkup: Your Link's Health Report 🌟")
|
||
st.write("""
|
||
Welcome to the URL SEO Checkup! This tool is like a doctor for your website links.
|
||
Just paste your URL, and we'll check if it's healthy and ready to climb the search engine ladder.
|
||
""")
|
||
|
||
|
||
# Basic HTTPS Check
|
||
def check_https(url):
|
||
st.subheader("The Basics - Are We Looking Good?")
|
||
st.write("---")
|
||
|
||
if url.startswith("https://"):
|
||
st.success("✨ You're using HTTPS! This adds extra security, and Google rewards that with better rankings. Keep it up! ✨")
|
||
else:
|
||
st.warning("🚧 Heads Up! Your URL doesn't use 'https://'. This is a red flag for Google.")
|
||
st.info("🔧 **How to fix:** Contact your hosting provider or website developer to install an SSL certificate. This will secure your site with HTTPS.")
|
||
|
||
|
||
# URL Length Check
|
||
def check_url_length(path):
|
||
st.subheader("The Length Test - Keep it Short and Sweet!")
|
||
st.write("---")
|
||
|
||
if len(path) <= 50:
|
||
st.success("🏆 Great! Your URL is short and user-friendly. Google loves short URLs! 🏆")
|
||
else:
|
||
st.warning("🧭 Tip: Try shortening your URL. Shorter URLs are easier to remember and better for SEO.")
|
||
st.info("🔧 **How to fix:** Consider removing unnecessary words or folders in the URL. Aim for concise, descriptive URLs that are easy for users to read.")
|
||
|
||
|
||
# Hyphen Check
|
||
def check_hyphens(path):
|
||
st.subheader("The Hyphen Check - Use Hyphens for Clear Separation!")
|
||
st.write("---")
|
||
|
||
if "-" in path:
|
||
st.success("😎 You're on the right track! Using hyphens makes your URL more readable for both users and Google. 😎")
|
||
else:
|
||
st.warning("❓ Did you know? Using hyphens between words (like 'shoes-for-sale') helps Google understand your URL better!")
|
||
st.info("🔧 **How to fix:** Update your URL to use hyphens (-) instead of spaces or underscores (_). For example, 'shoes-for-sale' instead of 'shoes_for_sale'.")
|
||
|
||
|
||
# File Extension Check
|
||
def check_file_extension(path):
|
||
st.subheader("File Extension Check - Showing Your Files With Pride!")
|
||
st.write("---")
|
||
|
||
if "." in path:
|
||
st.success("🥳 File Extension Check: Your URL includes a file extension like '.html', which helps Google categorize your page. Nice job! 🥳")
|
||
else:
|
||
st.warning("🤔 Your URL seems to be missing a file extension like '.html' or '.php'.")
|
||
st.info("🔧 **How to fix:** While file extensions are not always required, adding them to static pages (like .html or .php) can improve clarity for search engines.")
|
||
|
||
|
||
# Keyword Insights
|
||
def show_keyword_insights(netloc, path):
|
||
st.subheader("Bonus Insight - Let's Talk Keywords")
|
||
st.write("---")
|
||
|
||
st.info("Keywords are the words people use to search for information online. Your goal is to help Google understand what your page is about by using the right keywords in your URL!")
|
||
|
||
st.markdown(f"""
|
||
**Your Domain:** {netloc}
|
||
**Your URL Path:** {path}
|
||
|
||
**Suggestion:** Consider adding a primary keyword to your URL if it aligns with your page content. But don't overdo it – too many keywords can hurt your SEO. Keep it natural!
|
||
""")
|
||
|
||
|
||
# Enhanced HTTP Headers Analysis using advertools
|
||
def analyze_http_headers(url):
|
||
"""Analyze HTTP headers using advertools for comprehensive SEO insights."""
|
||
st.subheader("🔍 Advanced HTTP Headers Analysis")
|
||
st.write("---")
|
||
|
||
try:
|
||
with st.spinner("Analyzing HTTP headers..."):
|
||
# Create a temporary file for output
|
||
with tempfile.NamedTemporaryFile(mode='w', suffix='.jl', delete=False) as tmp_file:
|
||
temp_filename = tmp_file.name
|
||
|
||
# Use advertools to crawl headers
|
||
adv.crawl_headers([url], temp_filename)
|
||
|
||
# Read the results
|
||
headers_df = pd.read_json(temp_filename, lines=True)
|
||
|
||
# Clean up temp file
|
||
os.unlink(temp_filename)
|
||
|
||
if not headers_df.empty:
|
||
# Display key SEO-relevant headers
|
||
st.success("✅ Successfully analyzed HTTP headers!")
|
||
|
||
# Create tabs for different header categories
|
||
tab1, tab2, tab3, tab4 = st.tabs(["🔒 Security", "📈 SEO Headers", "⚡ Performance", "📊 Technical Details"])
|
||
|
||
with tab1:
|
||
st.write("### Security Headers Analysis")
|
||
security_headers = {
|
||
'resp_headers_X-Frame-Options': 'X-Frame-Options',
|
||
'resp_headers_X-Content-Type-Options': 'X-Content-Type-Options',
|
||
'resp_headers_X-XSS-Protection': 'X-XSS-Protection',
|
||
'resp_headers_Strict-Transport-Security': 'Strict-Transport-Security',
|
||
'resp_headers_Content-Security-Policy': 'Content-Security-Policy',
|
||
'resp_headers_Referrer-Policy': 'Referrer-Policy'
|
||
}
|
||
|
||
for header_key, header_name in security_headers.items():
|
||
if header_key in headers_df.columns and not pd.isna(headers_df[header_key].iloc[0]):
|
||
st.success(f"✅ **{header_name}**: Present")
|
||
with st.expander(f"View {header_name} Details"):
|
||
st.code(headers_df[header_key].iloc[0])
|
||
else:
|
||
st.warning(f"⚠️ **{header_name}**: Missing")
|
||
st.info(f"💡 **Recommendation**: Add {header_name} header for better security")
|
||
|
||
with tab2:
|
||
st.write("### SEO-Related Headers")
|
||
seo_headers = {
|
||
'resp_headers_Content-Type': 'Content-Type',
|
||
'resp_headers_Content-Language': 'Content-Language',
|
||
'resp_headers_Cache-Control': 'Cache-Control',
|
||
'resp_headers_Expires': 'Expires',
|
||
'resp_headers_Last-Modified': 'Last-Modified',
|
||
'resp_headers_ETag': 'ETag'
|
||
}
|
||
|
||
for header_key, header_name in seo_headers.items():
|
||
if header_key in headers_df.columns and not pd.isna(headers_df[header_key].iloc[0]):
|
||
st.success(f"✅ **{header_name}**: {headers_df[header_key].iloc[0]}")
|
||
else:
|
||
st.info(f"ℹ️ **{header_name}**: Not set or not detected")
|
||
|
||
# Special handling for content-type
|
||
if 'resp_headers_Content-Type' in headers_df.columns:
|
||
content_type = headers_df['resp_headers_Content-Type'].iloc[0]
|
||
if 'text/html' in str(content_type):
|
||
st.success("🎯 **Content-Type**: Properly set for HTML content")
|
||
if 'charset=utf-8' in str(content_type):
|
||
st.success("🌍 **Character Encoding**: UTF-8 detected - Great for international SEO!")
|
||
|
||
with tab3:
|
||
st.write("### Performance Headers")
|
||
perf_headers = {
|
||
'resp_headers_Server': 'Server',
|
||
'resp_headers_X-Powered-By': 'X-Powered-By',
|
||
'resp_headers_Connection': 'Connection',
|
||
'resp_headers_Transfer-Encoding': 'Transfer-Encoding',
|
||
'resp_headers_Content-Encoding': 'Content-Encoding',
|
||
'resp_headers_Content-Length': 'Content-Length'
|
||
}
|
||
|
||
for header_key, header_name in perf_headers.items():
|
||
if header_key in headers_df.columns and not pd.isna(headers_df[header_key].iloc[0]):
|
||
st.info(f"📊 **{header_name}**: {headers_df[header_key].iloc[0]}")
|
||
|
||
# Check for compression
|
||
if 'resp_headers_Content-Encoding' in headers_df.columns:
|
||
encoding = headers_df['resp_headers_Content-Encoding'].iloc[0]
|
||
if 'gzip' in str(encoding) or 'br' in str(encoding):
|
||
st.success("🚀 **Compression**: Enabled - Great for page speed!")
|
||
else:
|
||
st.warning("⚠️ **Compression**: Consider enabling GZIP or Brotli compression")
|
||
else:
|
||
st.warning("⚠️ **Compression**: Not detected - Consider enabling compression")
|
||
|
||
# Check status code
|
||
if 'status' in headers_df.columns:
|
||
status = headers_df['status'].iloc[0]
|
||
if status == 200:
|
||
st.success(f"✅ **HTTP Status**: {status} OK")
|
||
else:
|
||
st.warning(f"⚠️ **HTTP Status**: {status}")
|
||
|
||
with tab4:
|
||
st.write("### Complete Headers Analysis")
|
||
|
||
# Show response headers only (more relevant for SEO)
|
||
response_headers = {col: col.replace('resp_headers_', '') for col in headers_df.columns if col.startswith('resp_headers_')}
|
||
if response_headers:
|
||
st.write("**Response Headers:**")
|
||
for col, display_name in response_headers.items():
|
||
if not pd.isna(headers_df[col].iloc[0]):
|
||
st.write(f"**{display_name}**: `{headers_df[col].iloc[0]}`")
|
||
|
||
# Show crawl metadata
|
||
st.write("**Crawl Information:**")
|
||
metadata_cols = ['url', 'status', 'crawl_time', 'download_latency']
|
||
for col in metadata_cols:
|
||
if col in headers_df.columns:
|
||
st.write(f"**{col.replace('_', ' ').title()}**: `{headers_df[col].iloc[0]}`")
|
||
|
||
# Download option
|
||
csv = headers_df.to_csv(index=False)
|
||
st.download_button(
|
||
label="📥 Download Complete Headers Data as CSV",
|
||
data=csv,
|
||
file_name=f"headers_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
|
||
mime="text/csv"
|
||
)
|
||
|
||
else:
|
||
st.error("❌ Could not retrieve headers data")
|
||
|
||
except Exception as e:
|
||
st.error(f"❌ Error analyzing headers: {str(e)}")
|
||
st.info("💡 **Tip**: Make sure the URL is accessible and try again")
|
||
|
||
|
||
# Enhanced robots.txt and sitemap detection
|
||
def check_robots_and_sitemap(url):
|
||
"""Check for robots.txt and sitemap files."""
|
||
st.subheader("🤖 Robots.txt & Sitemap Detection")
|
||
st.write("---")
|
||
|
||
parsed_url = urlparse(url)
|
||
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||
|
||
# Check robots.txt
|
||
try:
|
||
robots_url = f"{base_url}/robots.txt"
|
||
response = requests.get(robots_url, timeout=10)
|
||
if response.status_code == 200:
|
||
st.success(f"✅ **Robots.txt found**: {robots_url}")
|
||
with st.expander("View robots.txt content"):
|
||
st.code(response.text[:1000]) # Show first 1000 characters
|
||
else:
|
||
st.warning(f"⚠️ **Robots.txt not found**: Consider creating one at {robots_url}")
|
||
except:
|
||
st.error("❌ Could not check robots.txt")
|
||
|
||
# Check common sitemap locations
|
||
sitemap_locations = [
|
||
f"{base_url}/sitemap.xml",
|
||
f"{base_url}/sitemap_index.xml",
|
||
f"{base_url}/sitemaps.xml"
|
||
]
|
||
|
||
sitemap_found = False
|
||
for sitemap_url in sitemap_locations:
|
||
try:
|
||
response = requests.get(sitemap_url, timeout=10)
|
||
if response.status_code == 200:
|
||
st.success(f"✅ **Sitemap found**: {sitemap_url}")
|
||
sitemap_found = True
|
||
break
|
||
except:
|
||
continue
|
||
|
||
if not sitemap_found:
|
||
st.warning("⚠️ **Sitemap not found**: Consider creating an XML sitemap")
|
||
st.info("💡 **Recommendation**: Submit your sitemap to Google Search Console")
|
||
|
||
|
||
# Enhanced URL structure analysis
|
||
def enhanced_url_analysis(url):
|
||
"""Provide enhanced URL structure analysis."""
|
||
st.subheader("🔗 Enhanced URL Structure Analysis")
|
||
st.write("---")
|
||
|
||
parsed_url = urlparse(url)
|
||
|
||
# URL components analysis
|
||
col1, col2 = st.columns(2)
|
||
|
||
with col1:
|
||
st.write("**URL Components:**")
|
||
st.info(f"**Protocol**: {parsed_url.scheme}")
|
||
st.info(f"**Domain**: {parsed_url.netloc}")
|
||
st.info(f"**Path**: {parsed_url.path}")
|
||
if parsed_url.query:
|
||
st.info(f"**Query**: {parsed_url.query}")
|
||
if parsed_url.fragment:
|
||
st.info(f"**Fragment**: {parsed_url.fragment}")
|
||
|
||
with col2:
|
||
st.write("**SEO Analysis:**")
|
||
|
||
# URL length analysis
|
||
url_length = len(url)
|
||
if url_length <= 60:
|
||
st.success(f"✅ **URL Length**: {url_length} characters (Excellent)")
|
||
elif url_length <= 100:
|
||
st.warning(f"⚠️ **URL Length**: {url_length} characters (Good, but could be shorter)")
|
||
else:
|
||
st.error(f"❌ **URL Length**: {url_length} characters (Too long)")
|
||
|
||
# Path depth analysis
|
||
path_segments = [seg for seg in parsed_url.path.split('/') if seg]
|
||
depth = len(path_segments)
|
||
if depth <= 3:
|
||
st.success(f"✅ **URL Depth**: {depth} levels (Good)")
|
||
else:
|
||
st.warning(f"⚠️ **URL Depth**: {depth} levels (Consider flattening)")
|
||
|
||
# Special characters check
|
||
special_chars = set(url) - set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~:/?#[]@!$&\'()*+,;=')
|
||
if not special_chars:
|
||
st.success("✅ **Special Characters**: Clean URL structure")
|
||
else:
|
||
st.warning(f"⚠️ **Special Characters**: Found {len(special_chars)} special characters")
|
||
|
||
|
||
# Enhanced main function to run the analysis
|
||
def run_analysis(url):
|
||
# Parse the URL
|
||
parsed_url = urlparse(url)
|
||
netloc = parsed_url.netloc # Domain name
|
||
path = parsed_url.path # Path after the domain
|
||
|
||
# Run existing checks
|
||
check_https(url)
|
||
check_url_length(path)
|
||
check_hyphens(path)
|
||
check_file_extension(path)
|
||
|
||
# Add new enhanced analyses
|
||
enhanced_url_analysis(url)
|
||
analyze_http_headers(url)
|
||
check_robots_and_sitemap(url)
|
||
|
||
# Keep existing keyword insights
|
||
show_keyword_insights(netloc, path)
|
||
|
||
# Add summary section
|
||
st.subheader("📋 Analysis Summary & Recommendations")
|
||
st.write("---")
|
||
st.success("🎉 **Analysis Complete!** Review the findings above and implement the recommendations for better SEO performance.")
|
||
|
||
recommendations = [
|
||
"✅ Ensure HTTPS is enabled for security and SEO benefits",
|
||
"🔗 Keep URLs short, descriptive, and user-friendly",
|
||
"🔒 Implement security headers to protect your site",
|
||
"🤖 Create and maintain robots.txt and XML sitemaps",
|
||
"⚡ Enable compression and optimize HTTP headers for performance",
|
||
"📊 Monitor your URL structure and avoid excessive depth"
|
||
]
|
||
|
||
st.write("**Key Recommendations:**")
|
||
for rec in recommendations:
|
||
st.write(rec)
|
||
|
||
|
||
# Display the app
|
||
def url_seo_checker():
|
||
show_title_and_intro()
|
||
|
||
# User input for URL
|
||
url_input = st.text_input("Paste your URL here:", "https://www.example.com/")
|
||
st.write(" ") # Add spacing
|
||
|
||
# When the analyze button is clicked
|
||
if st.button("Let's Analyze!"):
|
||
with st.spinner('Checking your link...'):
|
||
url = url_input.strip() # Clean up the input
|
||
|
||
# Validate URL format
|
||
if not url.startswith(("http://", "https://")):
|
||
st.error("Oops! It seems like your URL needs 'http://' or 'https://' at the beginning. Please add it!")
|
||
st.stop()
|
||
|
||
# Run the analysis
|
||
run_analysis(url)
|