From c9b22b3653215fa7a58282190aa30585b99cb748 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D9=8A?= <ajay.calsoft@gmail.com>
Date: Fri, 17 Jan 2025 12:23:10 +0530
Subject: [PATCH] Update on_page_seo_analyzer.py

Additional Insights for Non-Technical Users
Content Quality Insights:

Readability Score: Use libraries like textstat to calculate a readability score (e.g., Flesch Reading Ease) for the webpage content.
Keywords Highlighting: Extract and highlight frequently used keywords in the content, helping users understand what topics are emphasized.
Duplicate Content Check: Flag if the meta description or titles are repeated multiple times in the page content.
SEO Health Checks:

Broken Links Detection: Identify broken internal or external links and recommend fixing them.
Image Optimization Tips:
Suggest reducing image sizes if the file sizes exceed a certain threshold.
Recommend modern formats like WebP for better performance.
Alt Text Suggestions: Provide actionable suggestions for missing or insufficient alt text, such as "Describe the image's purpose or key elements."
Social Media Enhancement:

Suggest best practices for Open Graph and Twitter tags, such as recommended tag content length or formats.
Generate suggested meta descriptions and Open Graph descriptions for improved click-through rates.
Accessibility Recommendations:

Heading Structure Audit: Check for skipped heading levels (e.g., an h4 directly follows an h2 with no h3 in between) and provide guidance on correcting them.
Contrast Ratio Check: Flag potential text-to-background contrast issues for visually impaired users (can use auditing tools like Lighthouse).
ARIA Attributes: Check for the presence of ARIA (Accessible Rich Internet Applications) roles and attributes and recommend their addition if missing.
Performance Insights:

Lazy Loading Suggestions: Highlight images without loading="lazy" and recommend lazy loading to improve page load speed.
Critical CSS Suggestions: Advise inlining critical CSS for faster initial render.
Script Optimization: Highlight unminified or unused JavaScript and recommend optimization.
Custom Recommendations:

Call to Action (CTA) Suggestions: Analyze the text for actionable elements like buttons or links and recommend improving CTAs.
Internal Linking Suggestions: Suggest adding internal links for keywords or headings that lack links.
Schema Markup Expansion: Recommend additional schema types (e.g., FAQ, Product, Review) based on the page content.
Mobile Friendliness Enhancements:

Check for touch targets (buttons and links) being too small or too close together.
Flag pages without mobile-friendly navigation menus.
Enhancements to User Experience
Highlight Strengths and Weaknesses: Use color-coded sections to differentiate between well-optimized and underperforming areas.

Simplified Metrics:

Break down complex scores (like PageSpeed or SEO scores) into "Good," "Needs Improvement," and "Poor" categories.
Provide plain-English explanations for non-technical users.
Recommendations Section:

Provide step-by-step instructions or examples for fixing identified issues, such as "How to add a canonical tag" or "How to structure hreflang attributes."
Actionable Insights Dashboard: Summarize all findings in a visually appealing dashboard with prioritized to-do lists.

Export Reports: Allow users to export the analysis and recommendations in a PDF or CSV format for easier sharing and tracking.
---
 lib/ai_seo_tools/on_page_seo_analyzer.py | 150 ++++++++++++++++++++++-
 1 file changed, 149 insertions(+), 1 deletion(-)

diff --git a/lib/ai_seo_tools/on_page_seo_analyzer.py b/lib/ai_seo_tools/on_page_seo_analyzer.py
index 6f3e76ae..945bca0a 100644
--- a/lib/ai_seo_tools/on_page_seo_analyzer.py
+++ b/lib/ai_seo_tools/on_page_seo_analyzer.py
@@ -7,6 +7,13 @@ from bs4 import BeautifulSoup
 import requests
 import csv
 import time
+from urllib.parse import urljoin, urlparse
+import validators
+from readability import Readability
+import textstat
+import re
+from PIL import Image
+import io
 from ..gpt_providers.text_generation.main_text_generation import llm_text_gen
 
 def fetch_and_parse_html(url):
@@ -65,6 +72,121 @@ def extract_meta_data(soup):
         st.warning(f"⚠️ Error extracting meta data: {e}")
         return {}
 
+def analyze_headings(soup):
+    """
+    Analyzes the headings on the webpage.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+
+    Returns:
+        dict: Count of each heading tag.
+    """
+    try:
+        headings = {
+            'h1': len(soup.find_all('h1')),
+            'h2': len(soup.find_all('h2')),
+            'h3': len(soup.find_all('h3')),
+            'h4': len(soup.find_all('h4')),
+            'h5': len(soup.find_all('h5')),
+            'h6': len(soup.find_all('h6'))
+        }
+        return headings
+    except Exception as e:
+        st.warning(f"⚠️ Error analyzing headings: {e}")
+        return {}
+
+def check_readability(text):
+    """
+    Checks the readability score of the text.
+
+    Args:
+        text (str): The text content of the webpage.
+
+    Returns:
+        float: Readability score.
+    """
+    try:
+        readability_score = textstat.flesch_reading_ease(text)
+        return readability_score
+    except Exception as e:
+        st.warning(f"⚠️ Error checking readability: {e}")
+        return None
+
+def analyze_images(soup, url):
+    """
+    Analyzes the images on the webpage.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+        url (str): The URL of the webpage.
+
+    Returns:
+        list: List of dictionaries containing image src and alt text.
+    """
+    try:
+        images = soup.find_all('img')
+        image_data = []
+        for img in images:
+            src = img.get('src')
+            if not src:
+                continue
+            if not validators.url(src):
+                src = urljoin(url, src)
+            alt_text = img.get('alt', '')
+            image_data.append({'src': src, 'alt': alt_text})
+        return image_data
+    except Exception as e:
+        st.warning(f"⚠️ Error analyzing images: {e}")
+        return []
+
+def analyze_links(soup):
+    """
+    Analyzes the links on the webpage.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+
+    Returns:
+        list: List of broken links.
+    """
+    try:
+        links = soup.find_all('a', href=True)
+        broken_links = []
+        for link in links:
+            href = link['href']
+            if not validators.url(href):
+                continue
+            try:
+                response = requests.head(href, timeout=5)
+                if response.status_code >= 400:
+                    broken_links.append(href)
+            except requests.RequestException:
+                broken_links.append(href)
+        return broken_links
+    except Exception as e:
+        st.warning(f"⚠️ Error analyzing links: {e}")
+        return []
+
+def suggest_ctas(soup):
+    """
+    Suggests call-to-action phrases present on the webpage.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+
+    Returns:
+        list: List of found CTA phrases.
+    """
+    try:
+        cta_keywords = ['buy now', 'subscribe', 'learn more', 'sign up', 'get started']
+        text = soup.get_text().lower()
+        ctas_found = [cta for cta in cta_keywords if cta in text]
+        return ctas_found
+    except Exception as e:
+        st.warning(f"⚠️ Error suggesting CTAs: {e}")
+        return []
+
 def extract_alternates_and_canonicals(soup):
     """
     Extracts canonical URL, hreflangs, and mobile alternate links from the parsed HTML.
@@ -167,7 +289,7 @@ def extract_content_data(soup, url):
         link_insights = []
         if internal_links:
             link_insights.append("✅ Internal links are present.")
-        if external_links:
+        if external_links:
             link_insights.append("✅ External links are present.")
 
         return {
@@ -313,6 +435,12 @@ def fetch_seo_data(url):
         return {}
     
     meta_data = extract_meta_data(soup)
+    headings = analyze_headings(soup)
+    text = soup.get_text()
+    readability_score = check_readability(text)
+    images = analyze_images(soup, url)
+    broken_links = analyze_links(soup)
+    ctas = suggest_ctas(soup)
     alternates_and_canonicals = extract_alternates_and_canonicals(soup)
     schema_markup = extract_schema_markup(soup)
     content_data = extract_content_data(soup, url)
@@ -320,6 +448,11 @@ def fetch_seo_data(url):
     
     return {
         "meta_data": meta_data,
+        "headings": headings,
+        "readability_score": readability_score,
+        "images": images,
+        "broken_links": broken_links,
+        "ctas": ctas,
         "alternates_and_canonicals": alternates_and_canonicals,
         "schema_markup": schema_markup,
         "content_data": content_data,
@@ -371,6 +504,21 @@ def analyze_onpage_seo():
                 st.write(f"**Language:** {results['meta_data']['html_language']}")
                 st.write(results['meta_data']['title_message'])
                 st.write(results['meta_data']['description_message'])
+
+                st.subheader("Headings")
+                st.write(results['headings'])
+
+                st.subheader("Readability Score")
+                st.write(f"**Readability Score:** {results['readability_score']}")
+
+                st.subheader("Images")
+                st.write(results['images'])
+
+                st.subheader("Broken Links")
+                st.write(results['broken_links'])
+
+                st.subheader("Suggested CTAs")
+                st.write(results['ctas'])
                 
                 st.subheader("Canonical and Hreflangs")
                 st.write(f"**Canonical:** {results['alternates_and_canonicals']['canonical']}")