From c9b22b3653215fa7a58282190aa30585b99cb748 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D9=8A?= Date: Fri, 17 Jan 2025 12:23:10 +0530 Subject: [PATCH] Update on_page_seo_analyzer.py Additional Insights for Non-Technical Users Content Quality Insights: Readability Score: Use libraries like textstat to calculate a readability score (e.g., Flesch Reading Ease) for the webpage content. Keywords Highlighting: Extract and highlight frequently used keywords in the content, helping users understand what topics are emphasized. Duplicate Content Check: Flag if the meta description or titles are repeated multiple times in the page content. SEO Health Checks: Broken Links Detection: Identify broken internal or external links and recommend fixing them. Image Optimization Tips: Suggest reducing image sizes if the file sizes exceed a certain threshold. Recommend modern formats like WebP for better performance. Alt Text Suggestions: Provide actionable suggestions for missing or insufficient alt text, such as "Describe the image's purpose or key elements." Social Media Enhancement: Suggest best practices for Open Graph and Twitter tags, such as recommended tag content length or formats. Generate suggested meta descriptions and Open Graph descriptions for improved click-through rates. Accessibility Recommendations: Heading Structure Audit: Check for skipped heading levels (e.g., an h4 directly following an h2) and provide guidance on correcting them. Contrast Ratio Check: Flag potential text-to-background contrast issues for visually impaired users (can use APIs like Lighthouse). ARIA Tags: Check for the presence of ARIA (Accessible Rich Internet Applications) tags and recommend their addition if missing. Performance Insights: Lazy Loading Suggestions: Highlight images without loading="lazy" and recommend lazy loading to improve page load speed. Critical CSS Suggestions: Advise inlining critical CSS for faster initial render. 
Script Optimization: Highlight unminified or unused JavaScript and recommend optimization. Custom Recommendations: Call to Action (CTA) Suggestions: Analyze the text for actionable elements like buttons or links and recommend improving CTAs. Internal Linking Suggestions: Suggest adding internal links for keywords or headings that lack links. Schema Markup Expansion: Recommend additional schema types (e.g., FAQ, Product, Review) based on the page content. Mobile Friendliness Enhancements: Check for touch targets (buttons and links) being too small or too close together. Flag pages without mobile-friendly navigation menus. Enhancements to User Experience Highlight Strengths and Weaknesses: Use color-coded sections to differentiate between well-optimized and underperforming areas. Simplified Metrics: Break down complex scores (like PageSpeed or SEO scores) into "Good," "Needs Improvement," and "Poor" categories. Provide plain-English explanations for non-technical users. Recommendations Section: Provide step-by-step instructions or examples for fixing identified issues, such as "How to add a canonical tag" or "How to structure hreflang attributes." Actionable Insights Dashboard: Summarize all findings in a visually appealing dashboard with prioritized to-do lists. Export Reports: Allow users to export the analysis and recommendations in a PDF or CSV format for easier sharing and tracking. 
--- lib/ai_seo_tools/on_page_seo_analyzer.py | 150 ++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/lib/ai_seo_tools/on_page_seo_analyzer.py b/lib/ai_seo_tools/on_page_seo_analyzer.py index 6f3e76ae..945bca0a 100644 --- a/lib/ai_seo_tools/on_page_seo_analyzer.py +++ b/lib/ai_seo_tools/on_page_seo_analyzer.py @@ -7,6 +7,13 @@ from bs4 import BeautifulSoup import requests import csv import time +from urllib.parse import urljoin, urlparse +import validators +from readability import Readability +import textstat +import re +from PIL import Image +import io from ..gpt_providers.text_generation.main_text_generation import llm_text_gen def fetch_and_parse_html(url): @@ -65,6 +72,121 @@ def extract_meta_data(soup): st.warning(f"⚠️ Error extracting meta data: {e}") return {} +def analyze_headings(soup): + """ + Analyzes the headings on the webpage. + + Args: + soup (BeautifulSoup): Parsed HTML content. + + Returns: + dict: Count of each heading tag. + """ + try: + headings = { + 'h1': len(soup.find_all('h1')), + 'h2': len(soup.find_all('h2')), + 'h3': len(soup.find_all('h3')), + 'h4': len(soup.find_all('h4')), + 'h5': len(soup.find_all('h5')), + 'h6': len(soup.find_all('h6')) + } + return headings + except Exception as e: + st.warning(f"⚠️ Error analyzing headings: {e}") + return {} + +def check_readability(text): + """ + Checks the readability score of the text. + + Args: + text (str): The text content of the webpage. + + Returns: + float: Readability score. + """ + try: + readability_score = textstat.flesch_reading_ease(text) + return readability_score + except Exception as e: + st.warning(f"⚠️ Error checking readability: {e}") + return None + +def analyze_images(soup, url): + """ + Analyzes the images on the webpage. + + Args: + soup (BeautifulSoup): Parsed HTML content. + url (str): The URL of the webpage. + + Returns: + list: List of dictionaries with the image src (resolved against url) and alt text. 
+ """ + try: + images = soup.find_all('img') + image_data = [] + for img in images: + src = img.get('src') + if not src: + continue + if not validators.url(src): + src = urljoin(url, src) + alt_text = img.get('alt', '') + image_data.append({'src': src, 'alt': alt_text}) + return image_data + except Exception as e: + st.warning(f"⚠️ Error analyzing images: {e}") + return [] + +def analyze_links(soup): + """ + Analyzes the links on the webpage. + + Args: + soup (BeautifulSoup): Parsed HTML content. + + Returns: + list: List of broken links. + """ + try: + links = soup.find_all('a', href=True) + broken_links = [] + for link in links: + href = link['href'] + if not validators.url(href): + continue + try: + response = requests.head(href, timeout=5, allow_redirects=True) + if response.status_code >= 400: + broken_links.append(href) + except requests.RequestException: + broken_links.append(href) + return broken_links + except Exception as e: + st.warning(f"⚠️ Error analyzing links: {e}") + return [] + +def suggest_ctas(soup): + """ + Suggests call-to-action phrases present on the webpage. + + Args: + soup (BeautifulSoup): Parsed HTML content. + + Returns: + list: List of found CTA phrases. + """ + try: + cta_keywords = ['buy now', 'subscribe', 'learn more', 'sign up', 'get started'] + text = soup.get_text().lower() + ctas_found = [cta for cta in cta_keywords if cta in text] + return ctas_found + except Exception as e: + st.warning(f"⚠️ Error suggesting CTAs: {e}") + return [] + def extract_alternates_and_canonicals(soup): """ Extracts canonical URL, hreflangs, and mobile alternate links from the parsed HTML. 
@@ -167,7 +289,7 @@ def extract_content_data(soup, url): link_insights = [] if internal_links: link_insights.append("✅ Internal links are present.") - if external_links: + if external_links: link_insights.append("✅ External links are present.") return { @@ -313,6 +435,12 @@ def fetch_seo_data(url): return {} meta_data = extract_meta_data(soup) + headings = analyze_headings(soup) + text = soup.get_text() + readability_score = check_readability(text) + images = analyze_images(soup, url) + broken_links = analyze_links(soup) + ctas = suggest_ctas(soup) alternates_and_canonicals = extract_alternates_and_canonicals(soup) schema_markup = extract_schema_markup(soup) content_data = extract_content_data(soup, url) @@ -320,6 +448,11 @@ def fetch_seo_data(url): return { "meta_data": meta_data, + "headings": headings, + "readability_score": readability_score, + "images": images, + "broken_links": broken_links, + "ctas": ctas, "alternates_and_canonicals": alternates_and_canonicals, "schema_markup": schema_markup, "content_data": content_data, @@ -371,6 +504,21 @@ def analyze_onpage_seo(): st.write(f"**Language:** {results['meta_data']['html_language']}") st.write(results['meta_data']['title_message']) st.write(results['meta_data']['description_message']) + + st.subheader("Headings") + st.write(results['headings']) + + st.subheader("Readability Score") + st.write(f"**Readability Score:** {results['readability_score']}") + + st.subheader("Images") + st.write(results['images']) + + st.subheader("Broken Links") + st.write(results['broken_links']) + + st.subheader("Suggested CTAs") + st.write(results['ctas']) st.subheader("Canonical and Hreflangs") st.write(f"**Canonical:** {results['alternates_and_canonicals']['canonical']}")