AI SEO tools - Readibility & Analysis, Agents content ideator
This commit is contained in:
270
lib/ai_seo_tools/sitemap_analysis.py
Normal file
270
lib/ai_seo_tools/sitemap_analysis.py
Normal file
@@ -0,0 +1,270 @@
|
||||
import streamlit as st
|
||||
import advertools as adv
|
||||
import pandas as pd
|
||||
import plotly.graph_objects as go
|
||||
from urllib.error import URLError
|
||||
import xml.etree.ElementTree as ET
|
||||
import requests
|
||||
|
||||
def main():
|
||||
st.title("📊 Sitemap Analyzer")
|
||||
st.write("""
|
||||
This tool analyzes a website's sitemap to understand its content structure and publishing trends.
|
||||
Enter a sitemap URL to start your analysis.
|
||||
""")
|
||||
|
||||
sitemap_url = st.text_input("Please enter the sitemap URL:", "https://www.example.com/sitemap.xml")
|
||||
|
||||
if st.button("Analyze Sitemap"):
|
||||
try:
|
||||
sitemap_df = fetch_all_sitemaps(sitemap_url)
|
||||
if sitemap_df is not None and not sitemap_df.empty:
|
||||
sitemap_df = process_lastmod_column(sitemap_df)
|
||||
ppmonth = analyze_content_trends(sitemap_df)
|
||||
sitemap_df = categorize_and_shorten_sitemaps(sitemap_df)
|
||||
|
||||
display_key_metrics(sitemap_df, ppmonth)
|
||||
plot_sitemap_content_distribution(sitemap_df)
|
||||
plot_content_trends(ppmonth)
|
||||
plot_content_type_breakdown(sitemap_df)
|
||||
plot_publishing_frequency(sitemap_df)
|
||||
|
||||
st.success("🎉 Analysis complete!")
|
||||
else:
|
||||
st.error("No valid URLs found in the sitemap.")
|
||||
except URLError as e:
|
||||
st.error(f"Error fetching the sitemap: {e}")
|
||||
except Exception as e:
|
||||
st.error(f"An unexpected error occurred: {e}")
|
||||
|
||||
def fetch_all_sitemaps(sitemap_url):
|
||||
st.write(f"🚀 Fetching and analyzing the sitemap: {sitemap_url}...")
|
||||
|
||||
try:
|
||||
sitemap_df = fetch_sitemap(sitemap_url)
|
||||
|
||||
if sitemap_df is not None:
|
||||
all_sitemaps = sitemap_df.loc[sitemap_df['loc'].str.contains('sitemap'), 'loc'].tolist()
|
||||
|
||||
if all_sitemaps:
|
||||
st.write(f"🔄 Found {len(all_sitemaps)} additional sitemaps. Fetching data from them...")
|
||||
all_urls_df = pd.DataFrame()
|
||||
|
||||
for sitemap in all_sitemaps:
|
||||
try:
|
||||
st.write(f"Fetching URLs from {sitemap}...")
|
||||
temp_df = fetch_sitemap(sitemap)
|
||||
if temp_df is not None:
|
||||
all_urls_df = pd.concat([all_urls_df, temp_df], ignore_index=True)
|
||||
except Exception as e:
|
||||
st.error(f"Error fetching {sitemap}: {e}")
|
||||
|
||||
st.write(f"✅ Successfully fetched {len(all_urls_df)} URLs from all sitemaps.")
|
||||
return all_urls_df
|
||||
|
||||
else:
|
||||
st.write(f"✅ Successfully fetched {len(sitemap_df)} URLs from the main sitemap.")
|
||||
return sitemap_df
|
||||
else:
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error fetching the sitemap: {e}")
|
||||
return None
|
||||
|
||||
def fetch_sitemap(url):
|
||||
try:
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
ET.fromstring(response.content)
|
||||
|
||||
sitemap_df = adv.sitemap_to_df(url)
|
||||
return sitemap_df
|
||||
|
||||
except requests.RequestException as e:
|
||||
st.error(f"⚠️ Request error: {e}")
|
||||
return None
|
||||
except ET.ParseError as e:
|
||||
st.error(f"⚠️ XML parsing error: {e}")
|
||||
return None
|
||||
|
||||
def process_lastmod_column(sitemap_df):
|
||||
st.write("📅 Converting 'lastmod' column to DateTime format and setting it as the index...")
|
||||
|
||||
try:
|
||||
sitemap_df = sitemap_df.dropna(subset=['lastmod'])
|
||||
sitemap_df['lastmod'] = pd.to_datetime(sitemap_df['lastmod'])
|
||||
sitemap_df.set_index('lastmod', inplace=True)
|
||||
|
||||
st.write("✅ 'lastmod' column successfully converted to DateTime format and set as the index.")
|
||||
return sitemap_df
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error processing the 'lastmod' column: {e}")
|
||||
return None
|
||||
|
||||
def categorize_and_shorten_sitemaps(sitemap_df):
|
||||
st.write("🔍 Categorizing and shortening sitemap names...")
|
||||
|
||||
try:
|
||||
sitemap_df['sitemap_name'] = sitemap_df['sitemap'].str.split('/').str[4]
|
||||
sitemap_df['sitemap_name'] = sitemap_df['sitemap_name'].replace({
|
||||
'sitemap-site-kasko-fiyatlari.xml': 'Kasko',
|
||||
'sitemap-site-bireysel.xml': 'Personal',
|
||||
'sitemap-site-kurumsal.xml': 'Cooperate',
|
||||
'sitemap-site-arac-sigortasi.xml': 'Car',
|
||||
'sitemap-site.xml': 'Others'
|
||||
})
|
||||
|
||||
st.write("✅ Sitemap names categorized and shortened.")
|
||||
return sitemap_df
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error categorizing sitemap names: {e}")
|
||||
return sitemap_df
|
||||
|
||||
def analyze_content_trends(sitemap_df):
|
||||
st.write("📅 Analyzing content publishing trends...")
|
||||
|
||||
try:
|
||||
ppmonth = sitemap_df.resample('M').size()
|
||||
sitemap_df['monthly_count'] = sitemap_df.index.to_period('M').value_counts().sort_index()
|
||||
|
||||
st.write("✅ Content trends analysis completed.")
|
||||
return ppmonth
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error during content trends analysis: {e}")
|
||||
return pd.Series()
|
||||
|
||||
def display_key_metrics(sitemap_df, ppmonth):
|
||||
st.write("### Key Metrics")
|
||||
|
||||
total_urls = len(sitemap_df)
|
||||
total_articles = ppmonth.sum()
|
||||
average_frequency = ppmonth.mean()
|
||||
|
||||
st.write(f"**Total URLs Found:** {total_urls:,}")
|
||||
st.write(f"**Total Articles Published:** {total_articles:,}")
|
||||
st.write(f"**Average Monthly Publishing Frequency:** {average_frequency:.2f} articles/month")
|
||||
|
||||
def plot_sitemap_content_distribution(sitemap_df):
|
||||
st.write("📊 Visualizing content amount by sitemap categories...")
|
||||
|
||||
try:
|
||||
if 'sitemap_name' in sitemap_df.columns:
|
||||
stmc = sitemap_df.groupby('sitemap_name').size()
|
||||
fig = go.Figure()
|
||||
fig.add_bar(x=stmc.index, y=stmc.values, name='Sitemap Categories')
|
||||
fig.update_layout(
|
||||
title='Content Amount by Sitemap Categories',
|
||||
xaxis_title='Sitemap Categories',
|
||||
yaxis_title='Number of Articles',
|
||||
paper_bgcolor='#E5ECF6'
|
||||
)
|
||||
st.plotly_chart(fig)
|
||||
else:
|
||||
st.warning("⚠️ The 'sitemap_name' column is missing in the data.")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error during sitemap content distribution plotting: {e}")
|
||||
|
||||
def plot_content_trends(ppmonth):
|
||||
st.write("📈 Plotting content publishing trends over time...")
|
||||
|
||||
try:
|
||||
fig = go.Figure()
|
||||
fig.add_scatter(x=ppmonth.index, y=ppmonth.values, mode='lines+markers', name='Publishing Trends')
|
||||
fig.update_layout(
|
||||
title='Content Publishing Trends Over Time',
|
||||
xaxis_title='Month',
|
||||
yaxis_title='Number of Articles',
|
||||
paper_bgcolor='#E5ECF6'
|
||||
)
|
||||
st.plotly_chart(fig)
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error during content trends plotting: {e}")
|
||||
|
||||
def plot_content_type_breakdown(sitemap_df):
|
||||
st.write("🔍 Plotting content type breakdown...")
|
||||
|
||||
try:
|
||||
if 'sitemap_name' in sitemap_df.columns and not sitemap_df['sitemap_name'].empty:
|
||||
content_type_counts = sitemap_df['sitemap_name'].value_counts()
|
||||
st.write("Content Type Counts:", content_type_counts) # Debug line
|
||||
|
||||
if not content_type_counts.empty:
|
||||
fig = go.Figure(data=[go.Pie(labels=content_type_counts.index, values=content_type_counts.values)])
|
||||
fig.update_layout(
|
||||
title='Content Type Breakdown',
|
||||
paper_bgcolor='#E5ECF6'
|
||||
)
|
||||
st.plotly_chart(fig)
|
||||
else:
|
||||
st.warning("⚠️ No content types to display.")
|
||||
else:
|
||||
st.warning("⚠️ The 'sitemap_name' column is missing or empty.")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error during content type breakdown plotting: {e}")
|
||||
|
||||
def visualize_content_by_sitemap_category(sitemap_df):
|
||||
st.write("🔍 Visualizing content amount by sitemap categories...")
|
||||
|
||||
try:
|
||||
# Check if the 'sitemap_name' column exists and is non-empty
|
||||
if 'sitemap_name' in sitemap_df.columns and not sitemap_df['sitemap_name'].empty:
|
||||
st.write("Sitemap Data Preview:", sitemap_df['sitemap_name'].head()) # Check the data
|
||||
|
||||
# Group and count the number of entries per sitemap category
|
||||
sitemap_category_counts = sitemap_df['sitemap_name'].value_counts()
|
||||
|
||||
# Display the grouped data
|
||||
st.write("Sitemap Category Counts:", sitemap_category_counts) # Debug line
|
||||
|
||||
# If we have valid data, plot it
|
||||
if not sitemap_category_counts.empty:
|
||||
fig = go.Figure()
|
||||
fig.add_bar(x=sitemap_category_counts.index, y=sitemap_category_counts.values, name='Sitemap Names')
|
||||
fig.update_layout(
|
||||
title='Content Amount by Sitemap Categories',
|
||||
paper_bgcolor='#E5ECF6',
|
||||
yaxis_title='Article Amount',
|
||||
)
|
||||
st.plotly_chart(fig)
|
||||
else:
|
||||
st.warning("⚠️ No data to display for sitemap categories.")
|
||||
else:
|
||||
st.warning("⚠️ The 'sitemap_name' column is missing or empty.")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error during content amount visualization: {e}")
|
||||
|
||||
def plot_publishing_frequency(sitemap_df):
|
||||
st.write("📆 Plotting publishing frequency by month...")
|
||||
|
||||
try:
|
||||
if not sitemap_df.empty:
|
||||
frequency_by_month = sitemap_df.index.to_period('M').value_counts().sort_index()
|
||||
frequency_by_month.index = frequency_by_month.index.astype(str) # Convert Period to string for Plotly
|
||||
|
||||
fig = go.Figure()
|
||||
fig.add_bar(x=frequency_by_month.index, y=frequency_by_month.values, name='Publishing Frequency')
|
||||
fig.update_layout(
|
||||
title='Publishing Frequency by Month',
|
||||
xaxis_title='Month',
|
||||
yaxis_title='Number of Articles',
|
||||
paper_bgcolor='#E5ECF6'
|
||||
)
|
||||
st.plotly_chart(fig)
|
||||
else:
|
||||
st.warning("⚠️ No data available to plot publishing frequency.")
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"⚠️ Error during publishing frequency plotting: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user