ALwrity Chatbot, SEO, Social media, Settings, Dashboard UI styling changes

This commit is contained in:
ajaysi
2025-06-08 05:59:22 +05:30
parent fad9647b46
commit bbe56a364d
24 changed files with 7248 additions and 2222 deletions

View File

@@ -7,13 +7,16 @@ from bs4 import BeautifulSoup
import requests
import csv
import time
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
import validators
import readability
import textstat
import re
from PIL import Image
import io
import advertools as adv
import pandas as pd
from collections import Counter
from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
def fetch_and_parse_html(url):
@@ -421,6 +424,314 @@ def check_alt_text(soup):
st.warning(f"⚠️ Error checking alt text: {e}")
return {}
def analyze_keyword_density(text, url=None):
    """
    Analyze keyword density and word frequency using advertools.

    Args:
        text (str): The main content text from the webpage.
        url (str): Optional URL for additional context. Currently unused; kept
            so existing callers that pass it keep working.

    Returns:
        dict: Always contains the keys ``word_frequency`` (list of row dicts
        from advertools), ``keyword_density`` (dict with ``high_density``,
        ``medium_density`` and ``low_density`` lists, or ``{}`` when nothing
        could be analyzed), ``top_keywords`` (up to 15 entries with ``word``,
        ``count`` and ``density``), ``total_words``, ``analysis_message`` and
        ``recommendations``.
    """

    def _empty_result(message):
        # Single place that defines the "nothing to report" shape so every
        # return path exposes the same keys (including total_words).
        return {
            "word_frequency": [],
            "keyword_density": {},
            "top_keywords": [],
            "total_words": 0,
            "analysis_message": message,
            "recommendations": []
        }

    try:
        total_words = len(text.split()) if text else 0
        if total_words == 0:
            return _empty_result("⚠️ Unable to analyze content - no words found")

        # advertools expects a list of documents. Passing the bare string makes
        # it iterate character by character, so no real word is ever counted.
        word_freq_df = adv.word_frequency([text])

        if word_freq_df.empty:
            return _empty_result("⚠️ Unable to analyze content - no words found")

        # Filter out common stopwords and very short words
        common_stopwords = {
            'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
            'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
            'before', 'after', 'above', 'below', 'between', 'among', 'this',
            'that', 'these', 'those', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
            'could', 'should', 'may', 'might', 'must', 'can', 'a', 'an', 'i',
            'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us',
            'them'
        }

        # Walk the two needed columns directly instead of building a Series
        # per row with iterrows().
        filtered_words = []
        for raw_word, raw_count in zip(word_freq_df['word'], word_freq_df['abs_freq']):
            word = str(raw_word).lower().strip()
            count = int(raw_count)  # plain int keeps the result serializable

            # Filter criteria: real alphabetic words of 3+ characters that
            # occur at least twice.
            if (len(word) >= 3 and
                    word not in common_stopwords and
                    word.isalpha() and
                    count >= 2):
                density = (count / total_words) * 100
                filtered_words.append({
                    'word': word,
                    'count': count,
                    'density': round(density, 2)
                })

        # Sort by frequency and take top 15
        top_keywords = sorted(filtered_words, key=lambda kw: kw['count'], reverse=True)[:15]

        # Calculate keyword density categories
        keyword_density = {
            'high_density': [kw for kw in top_keywords if kw['density'] > 3],
            'medium_density': [kw for kw in top_keywords if 1 <= kw['density'] <= 3],
            'low_density': [kw for kw in top_keywords if kw['density'] < 1]
        }

        # Generate analysis messages and recommendations
        analysis_messages = []
        recommendations = []

        if not top_keywords:
            analysis_messages.append("⚠️ No significant keywords found in content")
            recommendations.append("Add more descriptive and relevant keywords to your content")
        else:
            analysis_messages.append(f"✅ Found {len(top_keywords)} significant keywords")

            # Check for keyword stuffing
            if keyword_density['high_density']:
                high_density_words = [kw['word'] for kw in keyword_density['high_density']]
                analysis_messages.append(f"⚠️ Potential keyword stuffing detected: {', '.join(high_density_words[:3])}")
                recommendations.append("Consider reducing frequency of over-optimized keywords (>3% density)")

            # Check for good keyword distribution
            if len(keyword_density['medium_density']) >= 3:
                analysis_messages.append("✅ Good keyword distribution found")
            else:
                recommendations.append("Consider adding more medium-density keywords (1-3% density)")

        # Check total word count
        if total_words < 300:
            recommendations.append("Content is quite short - consider expanding to at least 300 words")
        elif total_words > 2000:
            recommendations.append("Content is quite long - ensure it's well-structured with headings")

        return {
            "word_frequency": word_freq_df.to_dict('records'),
            "keyword_density": keyword_density,
            "top_keywords": top_keywords,
            "total_words": total_words,
            "analysis_message": " | ".join(analysis_messages) if analysis_messages else "✅ Keyword analysis complete",
            "recommendations": recommendations
        }

    except Exception as e:
        # Best-effort analysis: surface the problem in the UI and return an
        # empty result instead of aborting the whole SEO report.
        st.warning(f"⚠️ Error in keyword density analysis: {e}")
        return _empty_result(f"⚠️ Error analyzing keywords: {str(e)}")
def analyze_url_structure_with_advertools(text, url):
    """
    Extract URLs from page text with advertools and classify them.

    Args:
        text (str): The main content text from the webpage.
        url (str): The current webpage URL; its host decides which links count
            as internal.

    Returns:
        dict: Always contains ``extracted_urls`` (flat list of URL strings),
        ``url_analysis`` (totals, per-category lists and internal/external
        ratios, or ``{}`` when nothing was analyzed), ``link_insights``,
        ``recommendations`` and ``problematic_urls``.
    """
    # Social media domains for classification
    social_domains = ('facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com',
                      'youtube.com', 'pinterest.com', 'tiktok.com', 'snapchat.com')
    # File extensions to identify downloadable content
    file_extensions = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
                       '.zip', '.rar', '.mp4', '.mp3', '.jpg', '.png', '.gif')

    def _bare_host(hostname):
        # Lower-case host without a leading "www." so www/non-www variants of
        # the same site compare equal.
        host = (hostname or '').lower()
        return host[4:] if host.startswith('www.') else host

    def _same_or_subdomain(host, base):
        # Exact host or a true subdomain of base. Plain substring tests would
        # accept "nottwitter.com" for "twitter.com", and an empty base would
        # match every host.
        return bool(base) and (host == base or host.endswith('.' + base))

    try:
        # advertools takes a list of documents and returns a summary dict; the
        # individual URLs live under 'urls_flat'. Iterating the summary itself
        # would yield its key names rather than URLs.
        url_summary = adv.extract_urls([text])
        if isinstance(url_summary, dict):
            extracted_urls = list(url_summary.get('urls_flat') or [])
        else:
            extracted_urls = list(url_summary or [])

        if not extracted_urls:
            return {
                "extracted_urls": [],
                "url_analysis": {},
                "link_insights": [],
                "recommendations": ["No URLs found in content text"],
                "problematic_urls": []
            }

        current_domain = _bare_host(urlparse(url).hostname)

        # Categorize URLs
        internal_urls = []
        external_urls = []
        social_urls = []
        email_urls = []
        file_urls = []
        broken_or_suspicious = []

        for extracted_url in extracted_urls:
            # URL quality analysis: check for common issues
            issue = None
            if extracted_url.count('http') > 1:
                issue = f"Malformed URL: {extracted_url}"
            elif len(extracted_url) > 200:
                issue = f"Very long URL: {extracted_url[:100]}..."

            try:
                parsed_url = urlparse(extracted_url)
                domain = _bare_host(parsed_url.hostname)
            except ValueError:
                # urlparse rejects some inputs (e.g. broken IPv6 literals);
                # report that one URL instead of failing the whole analysis.
                parsed_url = None
                issue = issue or f"Malformed URL: {extracted_url}"

            if issue:
                broken_or_suspicious.append(issue)
            if parsed_url is None:
                continue

            if extracted_url.startswith('mailto:'):
                email_urls.append(extracted_url)
            elif parsed_url.path.lower().endswith(file_extensions):
                # Match the extension at the end of the path only, so hosts
                # such as "docs.example.com" are not mistaken for ".doc" files.
                file_urls.append(extracted_url)
            elif any(_same_or_subdomain(domain, social) for social in social_domains):
                social_urls.append(extracted_url)
            elif domain == '' or _same_or_subdomain(domain, current_domain):
                # No host means a relative link, which is internal.
                internal_urls.append(extracted_url)
            else:
                external_urls.append(extracted_url)

        # Generate insights and recommendations
        insights = []
        recommendations = []

        # URL distribution analysis (total_urls is > 0 past the early return)
        total_urls = len(extracted_urls)
        insights.append(f"✅ Found {total_urls} URLs in content")

        # Internal vs External ratio analysis
        internal_ratio = (len(internal_urls) / total_urls) * 100
        external_ratio = (len(external_urls) / total_urls) * 100

        if internal_ratio > 70:
            insights.append(f"✅ Good internal linking: {len(internal_urls)} internal URLs ({internal_ratio:.1f}%)")
        elif internal_ratio < 30:
            insights.append(f"⚠️ Low internal linking: {len(internal_urls)} internal URLs ({internal_ratio:.1f}%)")
            recommendations.append("Consider adding more internal links to improve site structure")
        else:
            insights.append(f"✅ Balanced linking: {len(internal_urls)} internal, {len(external_urls)} external URLs")

        # External links analysis
        if external_urls:
            insights.append(f"🔗 {len(external_urls)} external links found ({external_ratio:.1f}%)")
            if len(external_urls) > 10:
                recommendations.append("Consider reviewing external links - too many might dilute page authority")
        else:
            recommendations.append("Consider adding relevant external links to authoritative sources")

        # Social media presence
        if social_urls:
            insights.append(f"📱 {len(social_urls)} social media links found")
        else:
            recommendations.append("Consider adding social media links for better engagement")

        # File downloads
        if file_urls:
            insights.append(f"📄 {len(file_urls)} downloadable files linked")

        # Email links
        if email_urls:
            insights.append(f"📧 {len(email_urls)} email links found")

        if broken_or_suspicious:
            insights.append(f"⚠️ {len(broken_or_suspicious)} potentially problematic URLs found")
            recommendations.extend(broken_or_suspicious[:3])  # Show first 3

        # Performance insights
        if total_urls > 50:
            recommendations.append("High number of URLs - ensure they're all necessary for user experience")
        elif total_urls < 5:
            recommendations.append("Consider adding more relevant links to improve content value")

        return {
            "extracted_urls": extracted_urls,
            "url_analysis": {
                "total_urls": total_urls,
                "internal_urls": internal_urls,
                "external_urls": external_urls,
                "social_urls": social_urls,
                "email_urls": email_urls,
                "file_urls": file_urls,
                "internal_ratio": round(internal_ratio, 1),
                "external_ratio": round(external_ratio, 1)
            },
            "link_insights": insights,
            "recommendations": recommendations,
            "problematic_urls": broken_or_suspicious
        }

    except Exception as e:
        # Best-effort analysis: warn in the UI and hand back an empty result
        # with the same keys as the success path.
        st.warning(f"⚠️ Error in URL analysis: {e}")
        return {
            "extracted_urls": [],
            "url_analysis": {},
            "link_insights": [f"⚠️ Error analyzing URLs: {str(e)}"],
            "recommendations": [],
            "problematic_urls": []
        }
def enhanced_content_analysis(soup, url):
    """
    Run the existing content extraction and add keyword and URL analysis.

    Args:
        soup (BeautifulSoup): Parsed HTML content. It is not modified.
        url (str): The URL of the webpage.

    Returns:
        dict: The result of ``extract_content_data(soup, url)`` extended with
        ``keyword_analysis``, ``url_analysis``, ``clean_text_length`` and
        ``clean_word_count``. ``link_insights`` is replaced by the advertools
        insights when any were produced. If anything fails, the plain
        ``extract_content_data`` result is returned instead.
    """
    import copy  # local import: only needed to protect the caller's tree

    try:
        # Strip boilerplate from a copy. decompose() is destructive, and the
        # caller's soup is still used afterwards (extract_content_data below
        # and other extractors in the caller), so it must stay intact.
        text_soup = copy.copy(soup)
        for tag in text_soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        # Get text content and collapse it to single-space separated words
        main_text = text_soup.get_text()
        stripped_lines = (line.strip() for line in main_text.splitlines())
        chunks = (phrase.strip() for line in stripped_lines for phrase in line.split(" "))
        clean_text = ' '.join(chunk for chunk in chunks if chunk)

        # Perform keyword density analysis
        keyword_analysis = analyze_keyword_density(clean_text, url)

        # Perform URL analysis using advertools
        # NOTE(review): only URLs written out in the visible text are seen
        # here; href attribute values are not part of clean_text - confirm
        # this is the intended scope of the link analysis.
        url_analysis = analyze_url_structure_with_advertools(clean_text, url)

        # Get existing content data from the untouched document
        content_data = extract_content_data(soup, url)

        # Enhance with keyword and URL analysis
        content_data.update({
            "keyword_analysis": keyword_analysis,
            "url_analysis": url_analysis,
            "clean_text_length": len(clean_text),
            "clean_word_count": len(clean_text.split())
        })

        # Update link insights with advertools analysis
        if url_analysis.get('link_insights'):
            content_data['link_insights'] = url_analysis['link_insights']

        return content_data

    except Exception as e:
        st.warning(f"⚠️ Error in enhanced content analysis: {e}")
        return extract_content_data(soup, url)  # Fallback to original
def fetch_seo_data(url):
"""
Fetches SEO-related data from the provided URL and returns a dictionary with results.
@@ -444,7 +755,7 @@ def fetch_seo_data(url):
ctas = suggest_ctas(soup)
alternates_and_canonicals = extract_alternates_and_canonicals(soup)
schema_markup = extract_schema_markup(soup)
content_data = extract_content_data(soup, url)
content_data = enhanced_content_analysis(soup, url)
open_graph = extract_open_graph(soup)
return {
@@ -481,10 +792,11 @@ def analyze_onpage_seo():
"""
Main function to analyze on-page SEO using Streamlit.
"""
st.title("ALwrity On Page SEO Analyzer")
st.title("🔍 ALwrity On-Page SEO Analyzer")
st.write("Enhanced with AI-powered keyword density and URL analysis")
url = st.text_input("Enter URL to Analyze", "")
if st.button("Analyze"):
if st.button("🚀 Analyze"):
if not url:
st.error("⚠️ Please enter a URL.")
else:
@@ -496,72 +808,263 @@ def analyze_onpage_seo():
alt_text = check_alt_text(fetch_and_parse_html(url))
if results:
st.subheader("Meta Data")
st.write(f"**Title:** {results['meta_data']['metatitle']}")
st.write(f"**Description:** {results['meta_data']['metadescription']}")
st.write(f"**Robots Directives:** {', '.join(results['meta_data']['robots_directives'])}")
st.write(f"**Viewport:** {results['meta_data']['viewport']}")
st.write(f"**Charset:** {results['meta_data']['charset']}")
st.write(f"**Language:** {results['meta_data']['html_language']}")
st.write(results['meta_data']['title_message'])
st.write(results['meta_data']['description_message'])
# Create tabs for better organization
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"📄 Meta & Content",
"🔤 Keywords & Density",
"🖼️ Media & Links",
"📱 Technical",
"📊 Performance"
])
with tab1:
st.subheader("Meta Data")
col1, col2 = st.columns(2)
with col1:
st.write(f"**Title:** {results['meta_data']['metatitle']}")
st.write(f"**Description:** {results['meta_data']['metadescription']}")
st.write(f"**Language:** {results['meta_data']['html_language']}")
st.write(results['meta_data']['title_message'])
st.write(results['meta_data']['description_message'])
with col2:
st.write(f"**Robots Directives:** {', '.join(results['meta_data']['robots_directives'])}")
st.write(f"**Viewport:** {results['meta_data']['viewport']}")
st.write(f"**Charset:** {results['meta_data']['charset']}")
st.subheader("Headings")
st.write(results['headings'])
st.subheader("Content Overview")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Text Length", f"{results['content_data']['text_length']} chars")
with col2:
if 'clean_word_count' in results['content_data']:
st.metric("Word Count", results['content_data']['clean_word_count'])
with col3:
st.metric("Readability Score", f"{results['readability_score']:.1f}")
st.write(results['content_data']['h1_message'])
st.write(results['content_data']['content_message'])
st.subheader("Readability Score")
st.write(f"**Readability Score:** {results['readability_score']}")
st.subheader("Headings Structure")
if results['headings']:
headings_df = pd.DataFrame(results['headings'])
st.dataframe(headings_df, use_container_width=True)
else:
st.write("No headings found")
st.subheader("Images")
st.write(results['images'])
with tab2:
st.subheader("🎯 Keyword Density Analysis")
if 'keyword_analysis' in results['content_data']:
keyword_data = results['content_data']['keyword_analysis']
# Display analysis message
st.write(keyword_data['analysis_message'])
# Show recommendations if any
if keyword_data['recommendations']:
st.write("**💡 Recommendations:**")
for rec in keyword_data['recommendations']:
st.write(f"{rec}")
# Display top keywords
if keyword_data['top_keywords']:
st.subheader("📈 Top Keywords")
# Create a DataFrame for better visualization
keywords_df = pd.DataFrame(keyword_data['top_keywords'])
# Color code by density
def highlight_density(val):
if val > 3:
return 'background-color: #ffcccc' # Light red for high density
elif val >= 1:
return 'background-color: #ccffcc' # Light green for good density
else:
return 'background-color: #ffffcc' # Light yellow for low density
styled_df = keywords_df.style.applymap(highlight_density, subset=['density'])
st.dataframe(styled_df, use_container_width=True)
# Keyword density categories
col1, col2, col3 = st.columns(3)
with col1:
st.write("**🔴 High Density (>3%)**")
if keyword_data['keyword_density']['high_density']:
for kw in keyword_data['keyword_density']['high_density']:
st.write(f"{kw['word']}: {kw['density']}%")
else:
st.write("None found ✅")
with col2:
st.write("**🟢 Good Density (1-3%)**")
if keyword_data['keyword_density']['medium_density']:
for kw in keyword_data['keyword_density']['medium_density'][:5]:
st.write(f"{kw['word']}: {kw['density']}%")
else:
st.write("None found")
with col3:
st.write("**🟡 Low Density (<1%)**")
if keyword_data['keyword_density']['low_density']:
for kw in keyword_data['keyword_density']['low_density'][:5]:
st.write(f"{kw['word']}: {kw['density']}%")
else:
st.write("None found")
else:
st.warning("No significant keywords found in content")
else:
st.warning("Keyword analysis not available")
st.subheader("Broken Links")
st.write(results['broken_links'])
with tab3:
st.subheader("Images Analysis")
st.write(results['content_data']['alt_text_message'])
if results['images']:
st.write(f"**Total Images:** {len(results['images'])}")
with st.expander("View Image Details"):
for i, img in enumerate(results['images'][:10]): # Show first 10
st.write(f"**Image {i+1}:** {img}")
st.subheader("🔗 Advanced Link Analysis")
# Display advertools URL analysis if available
if 'url_analysis' in results['content_data']:
url_data = results['content_data']['url_analysis']
# URL Statistics
st.subheader("📊 URL Statistics")
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total URLs", url_data['url_analysis'].get('total_urls', 0))
with col2:
st.metric("Internal Links", len(url_data['url_analysis'].get('internal_urls', [])))
with col3:
st.metric("External Links", len(url_data['url_analysis'].get('external_urls', [])))
with col4:
st.metric("Social Links", len(url_data['url_analysis'].get('social_urls', [])))
# Link Distribution
if url_data['url_analysis'].get('total_urls', 0) > 0:
st.subheader("🎯 Link Distribution")
col1, col2 = st.columns(2)
with col1:
st.write("**Internal vs External Ratio:**")
internal_ratio = url_data['url_analysis'].get('internal_ratio', 0)
external_ratio = url_data['url_analysis'].get('external_ratio', 0)
st.write(f"• Internal: {internal_ratio}%")
st.write(f"• External: {external_ratio}%")
with col2:
st.write("**Link Categories:**")
if url_data['url_analysis'].get('email_urls'):
st.write(f"• Email: {len(url_data['url_analysis']['email_urls'])}")
if url_data['url_analysis'].get('file_urls'):
st.write(f"• Files: {len(url_data['url_analysis']['file_urls'])}")
if url_data['url_analysis'].get('social_urls'):
st.write(f"• Social: {len(url_data['url_analysis']['social_urls'])}")
# URL Insights and Recommendations
if url_data.get('link_insights'):
st.subheader("💡 Link Analysis Insights")
for insight in url_data['link_insights']:
st.write(f"{insight}")
if url_data.get('recommendations'):
st.subheader("🎯 Link Optimization Recommendations")
for rec in url_data['recommendations']:
st.write(f"{rec}")
# Show extracted URLs
if url_data.get('extracted_urls'):
with st.expander(f"📋 View All Extracted URLs ({len(url_data['extracted_urls'])})"):
# Categorize and display URLs
internal_urls = url_data['url_analysis'].get('internal_urls', [])
external_urls = url_data['url_analysis'].get('external_urls', [])
social_urls = url_data['url_analysis'].get('social_urls', [])
if internal_urls:
st.write("**🏠 Internal URLs:**")
for url in internal_urls[:10]: # Show first 10
st.write(f"{url}")
if external_urls:
st.write("**🌐 External URLs:**")
for url in external_urls[:10]: # Show first 10
st.write(f"{url}")
if social_urls:
st.write("**📱 Social Media URLs:**")
for url in social_urls:
st.write(f"{url}")
else:
# Fallback to original link analysis
st.subheader("Links Analysis")
for insight in results['content_data']['link_insights']:
st.write(f"- {insight}")
st.write(results['content_data']['internal_links_message'])
st.write(results['content_data']['external_links_message'])
if results['broken_links']:
st.subheader("⚠️ Broken Links")
for link in results['broken_links'][:5]: # Show first 5
st.write(f"{link}")
else:
st.success("✅ No broken links detected")
st.subheader("Suggested CTAs")
st.write(results['ctas'])
with tab4:
st.subheader("Schema Markup")
st.write(f"**Schema Types:** {results['schema_markup']['schema_types']}")
st.write(results['schema_markup']['schema_message'])
st.subheader("Canonical and Hreflangs")
st.write(f"**Canonical:** {results['alternates_and_canonicals']['canonical']}")
st.write(f"**Hreflangs:** {results['alternates_and_canonicals']['hreflangs']}")
st.write(f"**Mobile Alternate:** {results['alternates_and_canonicals']['mobile_alternate']}")
st.write(results['alternates_and_canonicals']['canonical_message'])
st.write(results['alternates_and_canonicals']['hreflangs_message'])
st.subheader("Open Graph & Social")
st.write(f"**Open Graph Tags:** {results['open_graph']['open_graph']}")
st.write(results['open_graph']['open_graph_message'])
st.write(f"**Twitter Cards:** {social_tags['twitter_cards']}")
st.write(social_tags['twitter_message'])
st.write(f"**Facebook Open Graph:** {social_tags['facebook_open_graph']}")
st.write(social_tags['facebook_message'])
with tab5:
st.subheader("Performance & Usability")
col1, col2 = st.columns(2)
with col1:
st.write("**Page Speed**")
st.write(speed['speed_message'])
st.write("**Mobile Usability**")
st.write(mobile_usability['mobile_message'])
with col2:
st.write("**Accessibility**")
st.write(alt_text['alt_text_message'])
st.write("**CTAs Found**")
if results['ctas']:
for cta in results['ctas']:
st.write(f"{cta}")
else:
st.write("No common CTAs detected")
st.subheader("Canonical and Hreflangs")
st.write(f"**Canonical:** {results['alternates_and_canonicals']['canonical']}")
st.write(f"**Hreflangs:** {results['alternates_and_canonicals']['hreflangs']}")
st.write(f"**Mobile Alternate:** {results['alternates_and_canonicals']['mobile_alternate']}")
st.write(results['alternates_and_canonicals']['canonical_message'])
st.write(results['alternates_and_canonicals']['hreflangs_message'])
st.subheader("Schema Markup")
st.write(f"**Schema Types:** {results['schema_markup']['schema_types']}")
st.write(results['schema_markup']['schema_message'])
st.subheader("Content Data")
st.write(f"**Text Length:** {results['content_data']['text_length']} characters")
st.write(results['content_data']['h1_message'])
st.write(results['content_data']['content_message'])
st.write(results['content_data']['alt_text_message'])
for insight in results['content_data']['link_insights']:
st.write(f"- {insight}")
st.write(results['content_data']['internal_links_message'])
st.write(results['content_data']['external_links_message'])
st.subheader("Open Graph Data")
st.write(f"**Open Graph Tags:** {results['open_graph']['open_graph']}")
st.write(results['open_graph']['open_graph_message'])
st.subheader("Social Tags")
st.write(f"**Twitter Cards:** {social_tags['twitter_cards']}")
st.write(social_tags['twitter_message'])
st.write(f"**Facebook Open Graph:** {social_tags['facebook_open_graph']}")
st.write(social_tags['facebook_message'])
st.subheader("Performance Metrics")
st.write(speed['speed_message'])
st.subheader("Mobile Usability")
st.write(mobile_usability['mobile_message'])
st.subheader("Accessibility")
st.write(alt_text['alt_text_message'])
if st.button("Download CSV"):
# Export functionality
st.subheader("📥 Export Data")
if st.button("Download Complete Analysis as CSV"):
download_csv(results)