Code improvements. Error handling: make error messages more informative, and log errors for debugging purposes. Code readability: add docstrings and comments that explain the purpose of functions and complex code blocks; use consistent formatting and naming conventions. Modularization: split large functions into smaller, reusable functions, and group related functions together. Optimization: use caching where possible to reduce redundant operations, and optimize data-processing steps for better performance. User experience improvements. User feedback: provide immediate feedback on actions (e.g., loading spinners and success or error messages); use placeholders and help text to guide users on what inputs are expected. Interactive elements: use more interactive elements such as sliders, date pickers, and multi-selects to enhance the user interface.
341 lines
11 KiB
Python
341 lines
11 KiB
Python
import streamlit as st
|
|
import advertools as adv
|
|
import pandas as pd
|
|
import plotly.graph_objects as go
|
|
from urllib.error import URLError
|
|
import xml.etree.ElementTree as ET
|
|
import requests
|
|
|
|
|
|
def main():
    """
    Main function to run the Sitemap Analyzer Streamlit app.

    Renders the title and introduction, reads a sitemap URL from a text
    input and, once the "Analyze Sitemap" button is pressed, fetches the
    sitemap, prepares the data and renders the key metrics and charts.
    Every failure is reported to the user through ``st.error`` instead of
    being raised.
    """
    st.title("📊 Sitemap Analyzer")
    st.write("""
    This tool analyzes a website's sitemap to understand its content structure and publishing trends.
    Enter a sitemap URL to start your analysis.
    """)

    sitemap_url = st.text_input(
        "Please enter the sitemap URL:",
        "https://www.example.com/sitemap.xml"
    )

    if not st.button("Analyze Sitemap"):
        return

    # Reject blank or scheme-less input before any network call is made;
    # the fetch step uses `requests`, which only accepts http(s) URLs.
    sitemap_url = (sitemap_url or "").strip()
    if not sitemap_url.lower().startswith(("http://", "https://")):
        st.error("Please enter a valid sitemap URL starting with http:// or https://.")
        return

    try:
        sitemap_df = fetch_all_sitemaps(sitemap_url)
        if sitemap_df is None or sitemap_df.empty:
            st.error("No valid URLs found in the sitemap.")
            return

        sitemap_df = process_lastmod_column(sitemap_df)
        if sitemap_df is None:
            # The processing step has already reported the reason.
            return
        if sitemap_df.empty:
            st.error(
                "No usable 'lastmod' dates found in the sitemap; "
                "publishing trends cannot be analyzed."
            )
            return

        ppmonth = analyze_content_trends(sitemap_df)
        sitemap_df = categorize_and_shorten_sitemaps(sitemap_df)

        display_key_metrics(sitemap_df, ppmonth)
        plot_sitemap_content_distribution(sitemap_df)
        plot_content_trends(ppmonth)
        plot_content_type_breakdown(sitemap_df)
        plot_publishing_frequency(sitemap_df)

        st.success("🎉 Analysis complete!")
    except URLError as e:
        st.error(f"Error fetching the sitemap: {e}")
    except Exception as e:
        # Top-level UI boundary: show the problem rather than crash the app.
        st.error(f"An unexpected error occurred: {e}")
|
|
|
|
|
|
def fetch_all_sitemaps(sitemap_url):
    """
    Fetches the sitemap at the provided URL plus any nested sitemaps it lists.

    Entries of the main sitemap whose 'loc' contains the text 'sitemap' are
    treated as nested sitemaps and fetched in turn. The remaining entries of
    the main sitemap are kept and combined with the rows of the nested
    sitemaps.

    Parameters:
    sitemap_url (str): The URL of the sitemap.

    Returns:
    DataFrame: A DataFrame containing all URLs from the sitemaps, or None
    when the main sitemap could not be fetched.
    """
    st.write(f"🚀 Fetching and analyzing the sitemap: {sitemap_url}...")

    try:
        sitemap_df = fetch_sitemap(sitemap_url)
        if sitemap_df is None:
            return None

        # na=False: a missing 'loc' value must count as "not a sitemap"
        # instead of putting NaN into the boolean mask.
        is_nested = sitemap_df['loc'].str.contains('sitemap', na=False)
        # dict.fromkeys drops duplicates while keeping the original order,
        # so the same nested sitemap is never downloaded twice.
        all_sitemaps = list(dict.fromkeys(sitemap_df.loc[is_nested, 'loc']))

        if not all_sitemaps:
            st.write(f"✅ Successfully fetched {len(sitemap_df)} URLs from the main sitemap.")
            return sitemap_df

        st.write(
            f"🔄 Found {len(all_sitemaps)} additional sitemaps. Fetching data from them..."
        )

        # Start from the ordinary page URLs of the main sitemap so they are
        # not lost just because some other entry looked like a sitemap.
        frames = [sitemap_df.loc[~is_nested]]

        for sitemap in all_sitemaps:
            try:
                st.write(f"Fetching URLs from {sitemap}...")
                temp_df = fetch_sitemap(sitemap)
                if temp_df is not None:
                    frames.append(temp_df)
            except Exception as e:
                # Best effort: one broken nested sitemap must not abort the rest.
                st.error(f"Error fetching {sitemap}: {e}")

        # Concatenate once at the end; empty frames are skipped so they do
        # not influence the resulting column dtypes.
        frames = [frame for frame in frames if not frame.empty]
        all_urls_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        st.write(
            f"✅ Successfully fetched {len(all_urls_df)} URLs from all sitemaps."
        )
        return all_urls_df

    except Exception as e:
        st.error(f"⚠️ Error fetching the sitemap: {e}")
        return None
|
|
|
|
|
|
def fetch_sitemap(url, timeout=30):
    """
    Fetches and parses the sitemap from the provided URL.

    The URL is first downloaded with `requests` and checked to be
    well-formed XML (gzip-compressed payloads are decompressed for that
    check); only then is it handed to `adv.sitemap_to_df`.

    Parameters:
    url (str): The URL of the sitemap.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    DataFrame: A DataFrame containing the URLs from the sitemap, or None
    when the download or the XML validation fails (the reason is shown
    with `st.error`).
    """
    # Local imports: only needed for compressed sitemaps.
    import gzip
    import zlib

    try:
        # Without a timeout a stalled server would block the app forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        st.error(f"⚠️ Request error: {e}")
        return None

    payload = response.content
    try:
        # Sitemaps served as .xml.gz files arrive still compressed; detect
        # them by the gzip magic bytes so the XML check sees real XML.
        if payload[:2] == b"\x1f\x8b":
            payload = gzip.decompress(payload)
        # NOTE(review): xml.etree is not hardened against maliciously
        # constructed XML; consider a hardened parser for untrusted URLs.
        ET.fromstring(payload)
    except ET.ParseError as e:
        st.error(f"⚠️ XML parsing error: {e}")
        return None
    except (OSError, EOFError, zlib.error) as e:
        st.error(f"⚠️ Could not decompress the gzipped sitemap: {e}")
        return None

    # Errors raised here are left to the caller, as before.
    return adv.sitemap_to_df(url)
|
|
|
|
|
|
def process_lastmod_column(sitemap_df):
    """
    Processes the 'lastmod' column in the sitemap DataFrame by converting it to DateTime format and setting it as the index.

    Values that cannot be parsed as dates are dropped together with the
    rows that have no 'lastmod' at all. Timestamps are normalized to UTC so
    that entries with different UTC offsets can share one index. The input
    DataFrame is not modified.

    Parameters:
    sitemap_df (DataFrame): The sitemap DataFrame.

    Returns:
    DataFrame: The processed sitemap DataFrame with 'lastmod' as the index,
    or None when the column is missing or processing fails.
    """
    st.write("📅 Converting 'lastmod' column to DateTime format and setting it as the index...")

    try:
        if 'lastmod' not in sitemap_df.columns:
            st.error(
                "⚠️ Error processing the 'lastmod' column: "
                "the sitemap does not provide any 'lastmod' values."
            )
            return None

        # errors='coerce' turns unparseable entries into NaT instead of
        # aborting the whole analysis because of a single bad value.
        parsed = pd.to_datetime(sitemap_df['lastmod'], errors='coerce', utc=True)

        # assign() builds a new frame, which avoids writing into a slice of
        # the caller's data.
        processed = (
            sitemap_df.assign(lastmod=parsed)
            .dropna(subset=['lastmod'])
            .set_index('lastmod')
        )

        st.write("✅ 'lastmod' column successfully converted to DateTime format and set as the index.")
        return processed

    except Exception as e:
        st.error(f"⚠️ Error processing the 'lastmod' column: {e}")
        return None
|
|
|
|
|
|
def categorize_and_shorten_sitemaps(sitemap_df, name_map=None):
    """
    Categorizes and shortens the sitemap names in the sitemap DataFrame.

    A 'sitemap_name' column is added that holds the file name (last path
    segment) of each row's 'sitemap' URL, with known file names replaced by
    short category labels.

    Parameters:
    sitemap_df (DataFrame): The sitemap DataFrame; it must have a 'sitemap' column.
    name_map (dict): Optional mapping of sitemap file name to display label.
        When omitted, the built-in mapping below is used.

    Returns:
    DataFrame: The sitemap DataFrame with categorized and shortened sitemap
    names. On failure the DataFrame is returned as received.
    """
    st.write("🔍 Categorizing and shortening sitemap names...")

    default_name_map = {
        'sitemap-site-kasko-fiyatlari.xml': 'Kasko',
        'sitemap-site-bireysel.xml': 'Personal',
        'sitemap-site-kurumsal.xml': 'Corporate',
        'sitemap-site-arac-sigortasi.xml': 'Car',
        'sitemap-site.xml': 'Others',
    }

    try:
        labels = default_name_map if name_map is None else name_map

        # Take the last path segment rather than a fixed position, so
        # sitemaps at the site root (https://host/sitemap.xml) get a name
        # as well instead of NaN.
        file_names = (
            sitemap_df['sitemap']
            .str.rstrip('/')
            .str.rsplit('/', n=1)
            .str[-1]
        )
        sitemap_df['sitemap_name'] = file_names.replace(labels) if labels else file_names

        st.write("✅ Sitemap names categorized and shortened.")
        return sitemap_df

    except Exception as e:
        st.error(f"⚠️ Error categorizing sitemap names: {e}")
        return sitemap_df
|
|
|
|
|
|
def analyze_content_trends(sitemap_df):
    """
    Analyzes content publishing trends in the sitemap DataFrame.

    Counts the rows per calendar month and also stores, in a
    'monthly_count' column of the given DataFrame, how many rows share each
    row's month.

    Parameters:
    sitemap_df (DataFrame): The sitemap DataFrame, indexed by datetime.

    Returns:
    Series: A Series representing the number of contents published each
    month; an empty Series when the analysis fails.
    """
    st.write("📅 Analyzing content publishing trends...")

    try:
        try:
            # 'ME' is the month-end alias in recent pandas releases.
            ppmonth = sitemap_df.resample('ME').size()
        except ValueError:
            # Older pandas releases only know the alias 'M'.
            ppmonth = sitemap_df.resample('M').size()

        row_index = sitemap_df.index
        if getattr(row_index, 'tz', None) is not None:
            # Periods carry no timezone; drop it explicitly up front.
            row_index = row_index.tz_localize(None)
        months = row_index.to_period('M')

        # Look up each row's month in the per-month counts and assign by
        # position: the counts are indexed by Period while the frame is
        # indexed by timestamp, so index alignment cannot match them.
        sitemap_df['monthly_count'] = months.value_counts().reindex(months).to_numpy()

        st.write("✅ Content trends analysis completed.")
        return ppmonth

    except Exception as e:
        st.error(f"⚠️ Error during content trends analysis: {e}")
        return pd.Series(dtype='int64')
|
|
|
|
|
|
def display_key_metrics(sitemap_df, ppmonth):
    """
    Displays key metrics of the sitemap analysis.

    Shows the number of rows in the DataFrame, the sum of the monthly
    counts and their mean. Missing or empty inputs are shown as zero
    instead of failing or printing "nan".

    Parameters:
    sitemap_df (DataFrame): The sitemap DataFrame.
    ppmonth (Series): The Series representing the number of contents published each month.
    """
    st.write("### Key Metrics")

    total_urls = 0 if sitemap_df is None else len(sitemap_df)

    has_trend_data = ppmonth is not None and len(ppmonth) > 0
    # int()/float() keep the formatting stable whatever dtype the Series has.
    total_articles = int(ppmonth.sum()) if has_trend_data else 0
    average_frequency = float(ppmonth.mean()) if has_trend_data else 0.0

    st.write(f"**Total URLs Found:** {total_urls:,}")
    st.write(f"**Total Articles Published:** {total_articles:,}")
    st.write(f"**Average Monthly Publishing Frequency:** {average_frequency:.2f} articles/month")
|
|
|
|
|
|
def plot_sitemap_content_distribution(sitemap_df):
    """
    Draws a bar chart of the number of rows per sitemap category.

    A warning is shown instead of a chart when the DataFrame has no
    'sitemap_name' column; any other failure is reported with `st.error`.

    Parameters:
    sitemap_df (DataFrame): The sitemap DataFrame.
    """
    st.write("📊 Visualizing content amount by sitemap categories...")

    try:
        # Guard clause: nothing to plot without the category column.
        if 'sitemap_name' not in sitemap_df.columns:
            st.warning("⚠️ The 'sitemap_name' column is missing in the data.")
            return

        rows_per_category = sitemap_df.groupby('sitemap_name').size()

        layout_options = {
            'title': 'Content Amount by Sitemap Categories',
            'xaxis_title': 'Sitemap Categories',
            'yaxis_title': 'Number of Articles',
            'paper_bgcolor': '#E5ECF6',
        }

        chart = go.Figure()
        chart.add_bar(
            x=rows_per_category.index,
            y=rows_per_category.values,
            name='Sitemap Categories',
        )
        chart.update_layout(**layout_options)
        st.plotly_chart(chart)

    except Exception as err:
        st.error(f"⚠️ Error during sitemap content distribution plotting: {err}")
|
|
|
|
|
|
def plot_content_trends(ppmonth):
    """
    Draws a line-and-marker chart of the monthly publishing counts.

    Failures while building or rendering the chart are reported with
    `st.error`.

    Parameters:
    ppmonth (Series): The Series representing the number of contents published each month.
    """
    st.write("📈 Plotting content publishing trends over time...")

    try:
        layout_options = {
            'title': 'Content Publishing Trends Over Time',
            'xaxis_title': 'Month',
            'yaxis_title': 'Number of Articles',
            'paper_bgcolor': '#E5ECF6',
        }

        chart = go.Figure()
        chart.add_scatter(
            x=ppmonth.index,
            y=ppmonth.values,
            mode='lines+markers',
            name='Publishing Trends',
        )
        chart.update_layout(**layout_options)
        st.plotly_chart(chart)

    except Exception as err:
        st.error(f"⚠️ Error during content trends plotting: {err}")
|
|
|
|
|
|
def plot_content_type_breakdown(sitemap_df):
    """
    Draws a pie chart of how the rows are split across sitemap categories.

    The per-category counts are also written out before the chart. A
    warning replaces the chart when the 'sitemap_name' column is missing or
    empty, or when there are no counts to show.

    Parameters:
    sitemap_df (DataFrame): The sitemap DataFrame.
    """
    st.write("🔍 Plotting content type breakdown...")

    try:
        # Guard clause: the category column must exist and hold rows.
        if 'sitemap_name' not in sitemap_df.columns or sitemap_df['sitemap_name'].empty:
            st.warning("⚠️ The 'sitemap_name' column is missing or empty.")
            return

        counts_by_type = sitemap_df['sitemap_name'].value_counts()
        st.write("Content Type Counts:", counts_by_type)

        if counts_by_type.empty:
            st.warning("⚠️ No content types to display.")
            return

        pie = go.Pie(labels=counts_by_type.index, values=counts_by_type.values)
        chart = go.Figure(data=[pie])
        chart.update_layout(
            title='Content Type Breakdown',
            paper_bgcolor='#E5ECF6',
        )
        st.plotly_chart(chart)

    except Exception as err:
        st.error(f"⚠️ Error during content type breakdown plotting: {err}")
|
|
|
|
|
|
def plot_publishing_frequency(sitemap_df):
    """
    Draws a bar chart of the number of rows per calendar month.

    Months are derived from the DataFrame's index. A warning replaces the
    chart when the DataFrame is empty; other failures are reported with
    `st.error`.

    Parameters:
    sitemap_df (DataFrame): The sitemap DataFrame.
    """
    st.write("📆 Plotting publishing frequency by month...")

    try:
        # Guard clause: nothing to plot for an empty frame.
        if sitemap_df.empty:
            st.warning("⚠️ No data available to plot publishing frequency.")
            return

        months = sitemap_df.index.to_period('M')
        monthly_counts = months.value_counts().sort_index()
        # Plot the months as plain text labels rather than Period objects.
        monthly_counts.index = monthly_counts.index.astype(str)

        layout_options = {
            'title': 'Publishing Frequency by Month',
            'xaxis_title': 'Month',
            'yaxis_title': 'Number of Articles',
            'paper_bgcolor': '#E5ECF6',
        }

        chart = go.Figure()
        chart.add_bar(
            x=monthly_counts.index,
            y=monthly_counts.values,
            name='Publishing Frequency',
        )
        chart.update_layout(**layout_options)
        st.plotly_chart(chart)

    except Exception as err:
        st.error(f"⚠️ Error during publishing frequency plotting: {err}")
|
|
|
|
|
|
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|