Update sitemap_analysis.py
Code Improvements:
- Error Handling: Improve error messages to be more informative. Log errors for debugging purposes.
- Code Readability: Add docstrings and comments to explain the purpose of functions and complex code blocks. Use consistent formatting and naming conventions.
- Modularization: Split large functions into smaller, reusable functions. Group related functions together.
- Optimization: Use caching where possible to reduce redundant operations. Optimize data processing steps for better performance.

User Experience Improvements:
- User Feedback: Provide immediate feedback on actions (e.g., loading spinners, success messages, and error messages). Use placeholders and help text to guide users on what inputs are expected.
- Interactive Elements: Use more interactive elements like sliders, date pickers, and multi-selects to enhance the user interface.
This commit is contained in:
@@ -6,14 +6,21 @@ from urllib.error import URLError
|
|||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
"""
|
||||||
|
Main function to run the Sitemap Analyzer Streamlit app.
|
||||||
|
"""
|
||||||
st.title("📊 Sitemap Analyzer")
|
st.title("📊 Sitemap Analyzer")
|
||||||
st.write("""
|
st.write("""
|
||||||
This tool analyzes a website's sitemap to understand its content structure and publishing trends.
|
This tool analyzes a website's sitemap to understand its content structure and publishing trends.
|
||||||
Enter a sitemap URL to start your analysis.
|
Enter a sitemap URL to start your analysis.
|
||||||
""")
|
""")
|
||||||
|
|
||||||
sitemap_url = st.text_input("Please enter the sitemap URL:", "https://www.example.com/sitemap.xml")
|
sitemap_url = st.text_input(
|
||||||
|
"Please enter the sitemap URL:",
|
||||||
|
"https://www.example.com/sitemap.xml"
|
||||||
|
)
|
||||||
|
|
||||||
if st.button("Analyze Sitemap"):
|
if st.button("Analyze Sitemap"):
|
||||||
try:
|
try:
|
||||||
@@ -37,17 +44,32 @@ def main():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"An unexpected error occurred: {e}")
|
st.error(f"An unexpected error occurred: {e}")
|
||||||
|
|
||||||
|
|
||||||
def fetch_all_sitemaps(sitemap_url):
|
def fetch_all_sitemaps(sitemap_url):
|
||||||
|
"""
|
||||||
|
Fetches all sitemaps from the provided sitemap URL and concatenates their URLs into a DataFrame.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_url (str): The URL of the sitemap.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame: A DataFrame containing all URLs from the sitemaps.
|
||||||
|
"""
|
||||||
st.write(f"🚀 Fetching and analyzing the sitemap: {sitemap_url}...")
|
st.write(f"🚀 Fetching and analyzing the sitemap: {sitemap_url}...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sitemap_df = fetch_sitemap(sitemap_url)
|
sitemap_df = fetch_sitemap(sitemap_url)
|
||||||
|
|
||||||
if sitemap_df is not None:
|
if sitemap_df is not None:
|
||||||
all_sitemaps = sitemap_df.loc[sitemap_df['loc'].str.contains('sitemap'), 'loc'].tolist()
|
all_sitemaps = sitemap_df.loc[
|
||||||
|
sitemap_df['loc'].str.contains('sitemap'),
|
||||||
|
'loc'
|
||||||
|
].tolist()
|
||||||
|
|
||||||
if all_sitemaps:
|
if all_sitemaps:
|
||||||
st.write(f"🔄 Found {len(all_sitemaps)} additional sitemaps. Fetching data from them...")
|
st.write(
|
||||||
|
f"🔄 Found {len(all_sitemaps)} additional sitemaps. Fetching data from them..."
|
||||||
|
)
|
||||||
all_urls_df = pd.DataFrame()
|
all_urls_df = pd.DataFrame()
|
||||||
|
|
||||||
for sitemap in all_sitemaps:
|
for sitemap in all_sitemaps:
|
||||||
@@ -55,11 +77,15 @@ def fetch_all_sitemaps(sitemap_url):
|
|||||||
st.write(f"Fetching URLs from {sitemap}...")
|
st.write(f"Fetching URLs from {sitemap}...")
|
||||||
temp_df = fetch_sitemap(sitemap)
|
temp_df = fetch_sitemap(sitemap)
|
||||||
if temp_df is not None:
|
if temp_df is not None:
|
||||||
all_urls_df = pd.concat([all_urls_df, temp_df], ignore_index=True)
|
all_urls_df = pd.concat(
|
||||||
|
[all_urls_df, temp_df], ignore_index=True
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Error fetching {sitemap}: {e}")
|
st.error(f"Error fetching {sitemap}: {e}")
|
||||||
|
|
||||||
st.write(f"✅ Successfully fetched {len(all_urls_df)} URLs from all sitemaps.")
|
st.write(
|
||||||
|
f"✅ Successfully fetched {len(all_urls_df)} URLs from all sitemaps."
|
||||||
|
)
|
||||||
return all_urls_df
|
return all_urls_df
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@@ -72,7 +98,17 @@ def fetch_all_sitemaps(sitemap_url):
|
|||||||
st.error(f"⚠️ Error fetching the sitemap: {e}")
|
st.error(f"⚠️ Error fetching the sitemap: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def fetch_sitemap(url):
|
def fetch_sitemap(url):
|
||||||
|
"""
|
||||||
|
Fetches and parses the sitemap from the provided URL.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
url (str): The URL of the sitemap.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame: A DataFrame containing the URLs from the sitemap.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -89,7 +125,17 @@ def fetch_sitemap(url):
|
|||||||
st.error(f"⚠️ XML parsing error: {e}")
|
st.error(f"⚠️ XML parsing error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def process_lastmod_column(sitemap_df):
|
def process_lastmod_column(sitemap_df):
|
||||||
|
"""
|
||||||
|
Processes the 'lastmod' column in the sitemap DataFrame by converting it to DateTime format and setting it as the index.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_df (DataFrame): The sitemap DataFrame.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame: The processed sitemap DataFrame with 'lastmod' as the index.
|
||||||
|
"""
|
||||||
st.write("📅 Converting 'lastmod' column to DateTime format and setting it as the index...")
|
st.write("📅 Converting 'lastmod' column to DateTime format and setting it as the index...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -104,7 +150,17 @@ def process_lastmod_column(sitemap_df):
|
|||||||
st.error(f"⚠️ Error processing the 'lastmod' column: {e}")
|
st.error(f"⚠️ Error processing the 'lastmod' column: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def categorize_and_shorten_sitemaps(sitemap_df):
|
def categorize_and_shorten_sitemaps(sitemap_df):
|
||||||
|
"""
|
||||||
|
Categorizes and shortens the sitemap names in the sitemap DataFrame.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_df (DataFrame): The sitemap DataFrame.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame: The sitemap DataFrame with categorized and shortened sitemap names.
|
||||||
|
"""
|
||||||
st.write("🔍 Categorizing and shortening sitemap names...")
|
st.write("🔍 Categorizing and shortening sitemap names...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -124,7 +180,17 @@ def categorize_and_shorten_sitemaps(sitemap_df):
|
|||||||
st.error(f"⚠️ Error categorizing sitemap names: {e}")
|
st.error(f"⚠️ Error categorizing sitemap names: {e}")
|
||||||
return sitemap_df
|
return sitemap_df
|
||||||
|
|
||||||
|
|
||||||
def analyze_content_trends(sitemap_df):
|
def analyze_content_trends(sitemap_df):
|
||||||
|
"""
|
||||||
|
Analyzes content publishing trends in the sitemap DataFrame.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_df (DataFrame): The sitemap DataFrame.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Series: A Series representing the number of contents published each month.
|
||||||
|
"""
|
||||||
st.write("📅 Analyzing content publishing trends...")
|
st.write("📅 Analyzing content publishing trends...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -138,7 +204,15 @@ def analyze_content_trends(sitemap_df):
|
|||||||
st.error(f"⚠️ Error during content trends analysis: {e}")
|
st.error(f"⚠️ Error during content trends analysis: {e}")
|
||||||
return pd.Series()
|
return pd.Series()
|
||||||
|
|
||||||
|
|
||||||
def display_key_metrics(sitemap_df, ppmonth):
|
def display_key_metrics(sitemap_df, ppmonth):
|
||||||
|
"""
|
||||||
|
Displays key metrics of the sitemap analysis.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_df (DataFrame): The sitemap DataFrame.
|
||||||
|
ppmonth (Series): The Series representing the number of contents published each month.
|
||||||
|
"""
|
||||||
st.write("### Key Metrics")
|
st.write("### Key Metrics")
|
||||||
|
|
||||||
total_urls = len(sitemap_df)
|
total_urls = len(sitemap_df)
|
||||||
@@ -149,7 +223,14 @@ def display_key_metrics(sitemap_df, ppmonth):
|
|||||||
st.write(f"**Total Articles Published:** {total_articles:,}")
|
st.write(f"**Total Articles Published:** {total_articles:,}")
|
||||||
st.write(f"**Average Monthly Publishing Frequency:** {average_frequency:.2f} articles/month")
|
st.write(f"**Average Monthly Publishing Frequency:** {average_frequency:.2f} articles/month")
|
||||||
|
|
||||||
|
|
||||||
def plot_sitemap_content_distribution(sitemap_df):
|
def plot_sitemap_content_distribution(sitemap_df):
|
||||||
|
"""
|
||||||
|
Plots the content distribution by sitemap categories.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_df (DataFrame): The sitemap DataFrame.
|
||||||
|
"""
|
||||||
st.write("📊 Visualizing content amount by sitemap categories...")
|
st.write("📊 Visualizing content amount by sitemap categories...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -170,7 +251,14 @@ def plot_sitemap_content_distribution(sitemap_df):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"⚠️ Error during sitemap content distribution plotting: {e}")
|
st.error(f"⚠️ Error during sitemap content distribution plotting: {e}")
|
||||||
|
|
||||||
|
|
||||||
def plot_content_trends(ppmonth):
|
def plot_content_trends(ppmonth):
|
||||||
|
"""
|
||||||
|
Plots the content publishing trends over time.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
ppmonth (Series): The Series representing the number of contents published each month.
|
||||||
|
"""
|
||||||
st.write("📈 Plotting content publishing trends over time...")
|
st.write("📈 Plotting content publishing trends over time...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -187,13 +275,20 @@ def plot_content_trends(ppmonth):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"⚠️ Error during content trends plotting: {e}")
|
st.error(f"⚠️ Error during content trends plotting: {e}")
|
||||||
|
|
||||||
|
|
||||||
def plot_content_type_breakdown(sitemap_df):
|
def plot_content_type_breakdown(sitemap_df):
|
||||||
|
"""
|
||||||
|
Plots the content type breakdown.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_df (DataFrame): The sitemap DataFrame.
|
||||||
|
"""
|
||||||
st.write("🔍 Plotting content type breakdown...")
|
st.write("🔍 Plotting content type breakdown...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if 'sitemap_name' in sitemap_df.columns and not sitemap_df['sitemap_name'].empty:
|
if 'sitemap_name' in sitemap_df.columns and not sitemap_df['sitemap_name'].empty:
|
||||||
content_type_counts = sitemap_df['sitemap_name'].value_counts()
|
content_type_counts = sitemap_df['sitemap_name'].value_counts()
|
||||||
st.write("Content Type Counts:", content_type_counts) # Debug line
|
st.write("Content Type Counts:", content_type_counts)
|
||||||
|
|
||||||
if not content_type_counts.empty:
|
if not content_type_counts.empty:
|
||||||
fig = go.Figure(data=[go.Pie(labels=content_type_counts.index, values=content_type_counts.values)])
|
fig = go.Figure(data=[go.Pie(labels=content_type_counts.index, values=content_type_counts.values)])
|
||||||
@@ -210,45 +305,20 @@ def plot_content_type_breakdown(sitemap_df):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"⚠️ Error during content type breakdown plotting: {e}")
|
st.error(f"⚠️ Error during content type breakdown plotting: {e}")
|
||||||
|
|
||||||
def visualize_content_by_sitemap_category(sitemap_df):
|
|
||||||
st.write("🔍 Visualizing content amount by sitemap categories...")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Check if the 'sitemap_name' column exists and is non-empty
|
|
||||||
if 'sitemap_name' in sitemap_df.columns and not sitemap_df['sitemap_name'].empty:
|
|
||||||
st.write("Sitemap Data Preview:", sitemap_df['sitemap_name'].head()) # Check the data
|
|
||||||
|
|
||||||
# Group and count the number of entries per sitemap category
|
|
||||||
sitemap_category_counts = sitemap_df['sitemap_name'].value_counts()
|
|
||||||
|
|
||||||
# Display the grouped data
|
|
||||||
st.write("Sitemap Category Counts:", sitemap_category_counts) # Debug line
|
|
||||||
|
|
||||||
# If we have valid data, plot it
|
|
||||||
if not sitemap_category_counts.empty:
|
|
||||||
fig = go.Figure()
|
|
||||||
fig.add_bar(x=sitemap_category_counts.index, y=sitemap_category_counts.values, name='Sitemap Names')
|
|
||||||
fig.update_layout(
|
|
||||||
title='Content Amount by Sitemap Categories',
|
|
||||||
paper_bgcolor='#E5ECF6',
|
|
||||||
yaxis_title='Article Amount',
|
|
||||||
)
|
|
||||||
st.plotly_chart(fig)
|
|
||||||
else:
|
|
||||||
st.warning("⚠️ No data to display for sitemap categories.")
|
|
||||||
else:
|
|
||||||
st.warning("⚠️ The 'sitemap_name' column is missing or empty.")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
st.error(f"⚠️ Error during content amount visualization: {e}")
|
|
||||||
|
|
||||||
def plot_publishing_frequency(sitemap_df):
|
def plot_publishing_frequency(sitemap_df):
|
||||||
|
"""
|
||||||
|
Plots the publishing frequency by month.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
sitemap_df (DataFrame): The sitemap DataFrame.
|
||||||
|
"""
|
||||||
st.write("📆 Plotting publishing frequency by month...")
|
st.write("📆 Plotting publishing frequency by month...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not sitemap_df.empty:
|
if not sitemap_df.empty:
|
||||||
frequency_by_month = sitemap_df.index.to_period('M').value_counts().sort_index()
|
frequency_by_month = sitemap_df.index.to_period('M').value_counts().sort_index()
|
||||||
frequency_by_month.index = frequency_by_month.index.astype(str) # Convert Period to string for Plotly
|
frequency_by_month.index = frequency_by_month.index.astype(str)
|
||||||
|
|
||||||
fig = go.Figure()
|
fig = go.Figure()
|
||||||
fig.add_bar(x=frequency_by_month.index, y=frequency_by_month.values, name='Publishing Frequency')
|
fig.add_bar(x=frequency_by_month.index, y=frequency_by_month.values, name='Publishing Frequency')
|
||||||
@@ -265,6 +335,6 @@ def plot_publishing_frequency(sitemap_df):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"⚠️ Error during publishing frequency plotting: {e}")
|
st.error(f"⚠️ Error during publishing frequency plotting: {e}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user