Update on_page_seo_analyzer.py

Additional Insights for Non-Technical Users Content Quality Insights: Readability Score: Use libraries like textstat to calculate a readability score (e.g., Flesch Reading Ease) for the webpage content. Keywords Highlighting: Extract and highlight frequently used keywords in the content, helping users understand what topics are emphasized. Duplicate Content Check: Flag if the meta description or titles are repeated multiple times in the page content. SEO Health Checks: Broken Links Detection: Identify broken internal or external links and recommend fixing them. Image Optimization Tips: Suggest reducing image sizes if the file sizes exceed a certain threshold. Recommend modern formats like WebP for better performance. Alt Text Suggestions: Provide actionable suggestions for missing or insufficient alt text, such as "Describe the image's purpose or key elements." Social Media Enhancement: Suggest best practices for Open Graph and Twitter tags, such as recommended tag content length or formats. Generate suggested meta descriptions and Open Graph descriptions for improved click-through rates. Accessibility Recommendations: Heading Structure Audit: Check for skipped heading levels (e.g., an h4 directly following an h2) and provide guidance on correcting them. Contrast Ratio Check: Flag potential text-to-background contrast issues for visually impaired users (can use APIs like Lighthouse). ARIA Attributes: Check for the presence of ARIA (Accessible Rich Internet Applications) attributes and recommend their addition if missing. Performance Insights: Lazy Loading Suggestions: Highlight images without loading="lazy" and recommend lazy loading to improve page load speed. Critical CSS Suggestions: Advise inlining critical CSS for faster initial render. Script Optimization: Highlight unminified or unused JavaScript and recommend optimization. Custom Recommendations: Call to Action (CTA) Suggestions: Analyze the text for actionable elements like buttons or links and recommend improving CTAs.
Internal Linking Suggestions: Suggest adding internal links for keywords or headings that lack links. Schema Markup Expansion: Recommend additional schema types (e.g., FAQ, Product, Review) based on the page content. Mobile Friendliness Enhancements: Check for touch targets (buttons and links) being too small or too close together. Flag pages without mobile-friendly navigation menus. Enhancements to User Experience Highlight Strengths and Weaknesses: Use color-coded sections to differentiate between well-optimized and underperforming areas. Simplified Metrics: Break down complex scores (like PageSpeed or SEO scores) into "Good," "Needs Improvement," and "Poor" categories. Provide plain-English explanations for non-technical users. Recommendations Section: Provide step-by-step instructions or examples for fixing identified issues, such as "How to add a canonical tag" or "How to structure hreflang attributes." Actionable Insights Dashboard: Summarize all findings in a visually appealing dashboard with prioritized to-do lists. Export Reports: Allow users to export the analysis and recommendations in a PDF or CSV format for easier sharing and tracking.
2025-01-17 12:23:10 +05:30
parent 6bfc851a1c
commit c9b22b3653
1 changed files with 149 additions and 1 deletions
--- a/lib/ai_seo_tools/on_page_seo_analyzer.py
+++ b/lib/ai_seo_tools/on_page_seo_analyzer.py
@@ -7,6 +7,13 @@ from bs4 import BeautifulSoup
 import requests
 import csv
 import time
+from urllib.parse import urlparse
+import validators
+from readability import Readability
+import textstat
+import re
+from PIL import Image
+import io
 from ..gpt_providers.text_generation.main_text_generation import llm_text_gen

 def fetch_and_parse_html(url):
@@ -65,6 +72,121 @@ def extract_meta_data(soup):
        st.warning(f"⚠️ Error extracting meta data: {e}")
        return {}

+def analyze_headings(soup):
+    """
+    Counts how many times each heading level (h1 through h6) occurs.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+
+    Returns:
+        dict: Maps each tag name 'h1'..'h6' to its number of occurrences;
+            an empty dict if counting fails.
+    """
+    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
+    try:
+        # One find_all() per level, visited in h1 -> h6 order so the
+        # resulting dict keeps the same key order.
+        counts = {}
+        for tag_name in heading_tags:
+            matches = soup.find_all(tag_name)
+            counts[tag_name] = len(matches)
+    except Exception as e:
+        st.warning(f"⚠️ Error analyzing headings: {e}")
+        return {}
+    return counts
+
+def check_readability(text):
+    """
+    Scores the text with textstat's Flesch Reading Ease formula.
+
+    Args:
+        text (str): The text content of the webpage.
+
+    Returns:
+        float: Readability score, or None if scoring raised an error.
+    """
+    try:
+        return textstat.flesch_reading_ease(text)
+    except Exception as e:
+        # Report the failure through st.warning and signal "no score".
+        st.warning(f"⚠️ Error checking readability: {e}")
+        return None
+
+def analyze_images(soup, url):
+    """
+    Collects the absolute source URL and alt text of every image on the page.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+        url (str): The URL of the webpage, used as the base for relative srcs.
+
+    Returns:
+        list: One {'src': ..., 'alt': ...} dict per <img> that has a src;
+            'alt' is '' when the attribute is missing. Empty list on error.
+    """
+    from urllib.parse import urljoin  # local import: file only imports urlparse
+    try:
+        image_data = []
+        for img in soup.find_all('img'):
+            src = img.get('src')
+            if not src:
+                continue
+            # urljoin resolves relative srcs ('a.png', '/a.png', '//cdn/a.png')
+            # against the page URL and leaves absolute URLs untouched.
+            image_data.append({'src': urljoin(url, src), 'alt': img.get('alt', '')})
+        return image_data
+    except Exception as e:
+        st.warning(f"⚠️ Error analyzing images: {e}")
+        return []
+
+def analyze_links(soup):
+    """
+    Finds absolute links on the webpage that appear to be broken.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+
+    Returns:
+        list: Unique hrefs that failed or answered with an error status.
+    """
+    try:
+        # dict.fromkeys de-duplicates hrefs while keeping first-seen order.
+        hrefs = dict.fromkeys(a['href'] for a in soup.find_all('a', href=True))
+        broken_links = []
+        for href in filter(validators.url, hrefs):  # absolute URLs only
+            try:
+                # requests.head() does not follow redirects unless asked to.
+                status = requests.head(href, timeout=5, allow_redirects=True).status_code
+                # 405/501 only mean the server rejects HEAD, not a dead link.
+                if status >= 400 and status not in (405, 501):
+                    broken_links.append(href)
+            except requests.RequestException:
+                broken_links.append(href)
+        return broken_links
+    except Exception as e:
+        st.warning(f"⚠️ Error analyzing links: {e}")
+        return []
+
+def suggest_ctas(soup):
+    """
+    Reports which known call-to-action phrases occur in the page text.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+
+    Returns:
+        list: The CTA phrases found (case-insensitive substring match).
+    """
+    known_phrases = ('buy now', 'subscribe', 'learn more', 'sign up', 'get started')
+    try:
+        page_text = soup.get_text().lower()
+        # Keep the phrases in their listed order; each appears at most once.
+        return [phrase for phrase in known_phrases if phrase in page_text]
+    except Exception as e:
+        st.warning(f"⚠️ Error suggesting CTAs: {e}")
+        return []
+
 def extract_alternates_and_canonicals(soup):
    """
    Extracts canonical URL, hreflangs, and mobile alternate links from the parsed HTML.
@@ -167,7 +289,7 @@ def extract_content_data(soup, url):
        link_insights = []
        if internal_links:
            link_insights.append("✅ Internal links are present.")
-        if external_links:
+        if external_links:
            link_insights.append("✅ External links are present.")

        return {
@@ -313,6 +435,12 @@ def fetch_seo_data(url):
        return {}
    
    meta_data = extract_meta_data(soup)
+    headings = analyze_headings(soup)
+    text = soup.get_text()
+    readability_score = check_readability(text)
+    images = analyze_images(soup, url)
+    broken_links = analyze_links(soup)
+    ctas = suggest_ctas(soup)
    alternates_and_canonicals = extract_alternates_and_canonicals(soup)
    schema_markup = extract_schema_markup(soup)
    content_data = extract_content_data(soup, url)
@@ -320,6 +448,11 @@ def fetch_seo_data(url):
    
    return {
        "meta_data": meta_data,
+        "headings": headings,
+        "readability_score": readability_score,
+        "images": images,
+        "broken_links": broken_links,
+        "ctas": ctas,
        "alternates_and_canonicals": alternates_and_canonicals,
        "schema_markup": schema_markup,
        "content_data": content_data,
@@ -371,6 +504,21 @@ def analyze_onpage_seo():
                st.write(f"**Language:** {results['meta_data']['html_language']}")
                st.write(results['meta_data']['title_message'])
                st.write(results['meta_data']['description_message'])
+
+                st.subheader("Headings")
+                st.write(results['headings'])
+
+                st.subheader("Readability Score")
+                st.write(f"**Readability Score:** {results['readability_score']}")
+
+                st.subheader("Images")
+                st.write(results['images'])
+
+                st.subheader("Broken Links")
+                st.write(results['broken_links'])
+
+                st.subheader("Suggested CTAs")
+                st.write(results['ctas'])
                
                st.subheader("Canonical and Hreflangs")
                st.write(f"**Canonical:** {results['alternates_and_canonicals']['canonical']}")