ALwrity/lib/ai_seo_tools/on_page_seo_analyzer.py

import os
import json
import streamlit as st
from tenacity import retry, stop_after_attempt, wait_random_exponential
import cloudscraper
from bs4 import BeautifulSoup
import requests
import csv
import time
from ..gpt_providers.text_generation.main_text_generation import llm_text_gen

def fetch_and_parse_html(url):
    """
    Fetches HTML content from the given URL using CloudScraper and parses it with BeautifulSoup.

    Args:
        url (str): The URL of the webpage to fetch.

    Returns:
        BeautifulSoup: Parsed HTML content.
    """
    try:
        scraper = cloudscraper.create_scraper()
        html = scraper.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        return soup
    except Exception as e:
        st.error(f"⚠️ Error fetching or parsing HTML: {e}")
        return None

def extract_meta_data(soup):
    """
    Extracts meta data like title, description, and robots directives from the parsed HTML.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        dict: Extracted meta data.
    """
    try:
        metatitle = soup.find('title').get_text() if soup.find('title') else "Title not found"
        metadescription = soup.find('meta', attrs={'name': 'description'})["content"] if soup.find('meta', attrs={'name': 'description'}) else "Description not found"
        robots_directives = [directive.strip() for directive in soup.find('meta', attrs={'name': 'robots'})["content"].split(",")] if soup.find('meta', attrs={'name': 'robots'}) else []
        viewport = soup.find('meta', attrs={'name': 'viewport'})["content"] if soup.find('meta', attrs={'name': 'viewport'}) else "Viewport not found"
        charset = soup.find('meta', attrs={'charset': True})["charset"] if soup.find('meta', attrs={'charset': True}) else "Charset not found"
        html_language = soup.find('html')["lang"] if soup.find('html') else "Language not found"

        title_length = len(metatitle) if metatitle != "Title not found" else 0
        description_length = len(metadescription) if metadescription != "Description not found" else 0
        title_message = "✅ Title length is good." if 30 <= title_length <= 60 else "⚠️ Title length should be between 30-60 characters."
        description_message = "✅ Meta description length is good." if 70 <= description_length <= 160 else "⚠️ Meta description should be between 70-160 characters."

        return {
            "metatitle": metatitle,
            "metadescription": metadescription,
            "robots_directives": robots_directives,
            "viewport": viewport,
            "charset": charset,
            "html_language": html_language,
            "title_message": title_message,
            "description_message": description_message
        }
    except Exception as e:
        st.warning(f"⚠️ Error extracting meta data: {e}")
        return {}

def extract_alternates_and_canonicals(soup):
    """
    Extracts canonical URL, hreflangs, and mobile alternate links from the parsed HTML.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        dict: Extracted alternates and canonicals.
    """
    try:
        canonical = soup.find('link', attrs={'rel': 'canonical'})["href"] if soup.find('link', attrs={'rel': 'canonical'}) else "Canonical not found"
        list_hreflangs = [[a['href'], a["hreflang"]] for a in soup.find_all('link', href=True, hreflang=True)] if soup.find_all('link', href=True, hreflang=True) else []
        mobile_alternate = soup.find('link', attrs={'media': 'only screen and (max-width: 640px)'})["href"] if soup.find('link', attrs={'media': 'only screen and (max-width: 640px)'}) else "Mobile Alternate not found"

        canonical_message = "✅ Canonical tag found. Great! This helps avoid duplicate content issues." if canonical != "Canonical not found" else "⚠️ Consider adding a canonical tag."
        hreflangs_message = "✅ Hreflang tags are implemented. Good job!" if list_hreflangs else "⚠️ Consider implementing hreflang tags."

        return {
            "canonical": canonical,
            "hreflangs": list_hreflangs,
            "mobile_alternate": mobile_alternate,
            "canonical_message": canonical_message,
            "hreflangs_message": hreflangs_message
        }
    except Exception as e:
        st.warning(f"⚠️ Error extracting alternates and canonicals: {e}")
        return {}

def extract_schema_markup(soup):
    """
    Extracts schema markup data from the parsed HTML.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        dict: Extracted schema markup data.
    """
    try:
        json_schema = soup.find('script', attrs={'type': 'application/ld+json'})
        if json_schema:
            json_file = json.loads(json_schema.get_text())
            schema_types = [x['@type'] for x in json_file.get("@graph", [])] if "@graph" in json_file else [json_file["@type"]]
            schema_message = "✅ Schema markup found. Wonderful!" if schema_types else "⚠️ No schema markup found."
            return {
                "schema_types": schema_types,
                "schema_message": schema_message
            }
        else:
            return {
                "schema_message": "⚠️ No schema markup found."
            }
    except Exception as e:
        st.warning(f"⚠️ Error extracting schema markup: {e}")
        return {}

def extract_content_data(soup, url):
    """
    Extracts content data such as text length, headers, and insights about images and links.

    Args:
        soup (BeautifulSoup): Parsed HTML content.
        url (str): The URL of the webpage.

    Returns:
        dict: Extracted content data.
    """
    try:
        paragraph = [a.get_text() for a in soup.find_all('p')]
        text_length = sum([len(a) for a in paragraph])
        h1 = [a.get_text() for a in soup.find_all('h1')]
        headers = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        list_headers = [[str(x)[1:3], x.get_text()] for x in headers]

        images = []
        for img in soup.find_all('img'):
            src = img.get("src", "No src attribute")
            alt_text = img.get("alt", "No alt text")
            images.append([src, alt_text])

        internal_links = []
        external_links = []
        domain = url.split("//")[-1].split("/")[0]

        for link in soup.find_all('a', href=True):
            href = link['href']
            if domain in href:
                internal_links.append(href)
            else:
                external_links.append(href)

        content_message = "✅ Content length is adequate." if text_length > 300 else "⚠️ Consider adding more content (minimum 300 words)."
        h1_message = "✅ H1 tag found. Good!" if h1 else "⚠️ Missing H1 tag."
        missing_alt_texts = sum([1 for img in images if img[1] == "No alt text"])
        alt_text_message = "✅ All images have alt text. Great!" if missing_alt_texts == 0 else f"⚠️ {missing_alt_texts} images are missing alt text."
        internal_links_message = f"✅ {len(internal_links)} internal links found."
        external_links_message = f"✅ {len(external_links)} external links found."

        link_insights = []
        if internal_links:
            link_insights.append("✅ Internal links are present.")
        if external_links:
            link_insights.append("✅ External links are present.")

        return {
            "text_length": text_length,
            "headers": list_headers,
            "images": images,
            "h1_message": h1_message,
            "content_message": content_message,
            "alt_text_message": alt_text_message,
            "internal_links_message": internal_links_message,
            "external_links_message": external_links_message,
            "link_insights": link_insights
        }
    except Exception as e:
        st.warning(f"⚠️ Error extracting content data: {e}")
        return {}

def extract_open_graph(soup):
    """
    Extracts Open Graph data from the parsed HTML.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        dict: Extracted Open Graph data.
    """
    try:
        open_graph = [[a["property"].replace("og:", ""), a["content"]] for a in soup.select("meta[property^=og]")]
        open_graph_message = "✅ Open Graph tags found. Awesome!" if open_graph else "⚠️ No Open Graph tags found."
        return {
            "open_graph": open_graph,
            "open_graph_message": open_graph_message
        }
    except Exception as e:
        st.warning(f"⚠️ Error extracting Open Graph data: {e}")
        return {}

def extract_social_tags(soup):
    """
    Extracts Twitter Card and Facebook Open Graph data from the parsed HTML.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        dict: Extracted social tags.
    """
    try:
        twitter_cards = [[a["name"].replace("twitter:", ""), a["content"]] for a in soup.select("meta[name^=twitter]")]
        facebook_open_graph = [[a["property"].replace("og:", ""), a["content"]] for a in soup.select("meta[property^=og]")]

        twitter_message = "✅ Twitter Card tags found." if twitter_cards else "⚠️ No Twitter Card tags found."
        facebook_message = "✅ Facebook Open Graph tags found." if facebook_open_graph else "⚠️ No Facebook Open Graph tags found."

        return {
            "twitter_cards": twitter_cards,
            "facebook_open_graph": facebook_open_graph,
            "twitter_message": twitter_message,
            "facebook_message": facebook_message
        }
    except Exception as e:
        st.warning(f"⚠️ Error extracting social tags: {e}")
        return {}

def check_page_speed(url):
    """
    Fetches and analyzes page speed metrics using Google PageSpeed Insights API.

    Args:
        url (str): The URL of the webpage.

    Returns:
        dict: Page speed data.
    """
    try:
        api_key = "YOUR_GOOGLE_PAGESPEED_API_KEY"
        response = requests.get(f"https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={url}&key={api_key}")
        data = response.json()
        score = data.get('overall_category_score', 'N/A')
        speed_message = f"Page Speed Score: {score}" if score != 'N/A' else "⚠️ Unable to retrieve page speed score."
        return {
            "speed_score": score,
            "speed_message": speed_message
        }
    except Exception as e:
        st.warning(f"⚠️ Error fetching page speed data: {e}")
        return {}

def check_mobile_usability(soup):
    """
    Checks if the website is mobile-friendly based on viewport and other elements.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        dict: Mobile usability data.
    """
    try:
        viewport = soup.find('meta', attrs={'name': 'viewport'})["content"] if soup.find('meta', attrs={'name': 'viewport'}) else ""
        mobile_message = "✅ Mobile viewport is set." if viewport else "⚠️ Mobile viewport meta tag is missing."
        return {
            "mobile_message": mobile_message
        }
    except Exception as e:
        st.warning(f"⚠️ Error checking mobile usability: {e}")
        return {}

def check_alt_text(soup):
    """
    Checks if all images have alt text.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        dict: Alt text data.
    """
    try:
        images = soup.find_all('img')
        missing_alt_texts = sum([1 for img in images if not img.get("alt")])
        alt_text_message = "✅ All images have alt text. Great!" if missing_alt_texts == 0 else f"⚠️ {missing_alt_texts} images are missing alt text."
        return {
            "alt_text_message": alt_text_message
        }
    except Exception as e:
        st.warning(f"⚠️ Error checking alt text: {e}")
        return {}

def fetch_seo_data(url):
    """
    Fetches SEO-related data from the provided URL and returns a dictionary with results.

    Args:
        url (str): The URL of the webpage to analyze.

    Returns:
        dict: SEO data.
    """
    soup = fetch_and_parse_html(url)
    if not soup:
        return {}

    meta_data = extract_meta_data(soup)
    alternates_and_canonicals = extract_alternates_and_canonicals(soup)
    schema_markup = extract_schema_markup(soup)
    content_data = extract_content_data(soup, url)
    open_graph = extract_open_graph(soup)

    return {
        "meta_data": meta_data,
        "alternates_and_canonicals": alternates_and_canonicals,
        "schema_markup": schema_markup,
        "content_data": content_data,
        "open_graph": open_graph
    }

def download_csv(data, filename='seo_data.csv'):
    """
    Downloads the data as a CSV file.

    Args:
        data (dict): SEO data to download.
        filename (str): Filename for the downloaded CSV file.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for key, value in data.items():
            if isinstance(value, list):
                writer.writerow([key] + value)
            else:
                writer.writerow([key, value])
    st.success(f"Data exported to {filename}")

def analyze_onpage_seo():
    """
    Main function to analyze on-page SEO using Streamlit.
    """
    st.title("ALwrity On Page SEO Analyzer")

    url = st.text_input("Enter URL to Analyze", "")
    if st.button("Analyze"):
        if not url:
            st.error("⚠️ Please enter a URL.")
        else:
            with st.spinner("Fetching and analyzing data..."):
                results = fetch_seo_data(url)
                social_tags = extract_social_tags(fetch_and_parse_html(url))
                speed = check_page_speed(url)
                mobile_usability = check_mobile_usability(fetch_and_parse_html(url))
                alt_text = check_alt_text(fetch_and_parse_html(url))

            if results:
                st.subheader("Meta Data")
                st.write(f"**Title:** {results['meta_data']['metatitle']}")
                st.write(f"**Description:** {results['meta_data']['metadescription']}")
                st.write(f"**Robots Directives:** {', '.join(results['meta_data']['robots_directives'])}")
                st.write(f"**Viewport:** {results['meta_data']['viewport']}")
                st.write(f"**Charset:** {results['meta_data']['charset']}")
                st.write(f"**Language:** {results['meta_data']['html_language']}")
                st.write(results['meta_data']['title_message'])
                st.write(results['meta_data']['description_message'])

                st.subheader("Canonical and Hreflangs")
                st.write(f"**Canonical:** {results['alternates_and_canonicals']['canonical']}")
                st.write(f"**Hreflangs:** {results['alternates_and_canonicals']['hreflangs']}")
                st.write(f"**Mobile Alternate:** {results['alternates_and_canonicals']['mobile_alternate']}")
                st.write(results['alternates_and_canonicals']['canonical_message'])
                st.write(results['alternates_and_canonicals']['hreflangs_message'])

                st.subheader("Schema Markup")
                st.write(f"**Schema Types:** {results['schema_markup']['schema_types']}")
                st.write(results['schema_markup']['schema_message'])

                st.subheader("Content Data")
                st.write(f"**Text Length:** {results['content_data']['text_length']} characters")
                st.write(results['content_data']['h1_message'])
                st.write(results['content_data']['content_message'])
                st.write(results['content_data']['alt_text_message'])

                for insight in results['content_data']['link_insights']:
                    st.write(f"- {insight}")

                st.write(results['content_data']['internal_links_message'])
                st.write(results['content_data']['external_links_message'])

                st.subheader("Open Graph Data")
                st.write(f"**Open Graph Tags:** {results['open_graph']['open_graph']}")
                st.write(results['open_graph']['open_graph_message'])

                st.subheader("Social Tags")
                st.write(f"**Twitter Cards:** {social_tags['twitter_cards']}")
                st.write(social_tags['twitter_message'])
                st.write(f"**Facebook Open Graph:** {social_tags['facebook_open_graph']}")
                st.write(social_tags['facebook_message'])

                st.subheader("Performance Metrics")
                st.write(speed['speed_message'])

                st.subheader("Mobile Usability")
                st.write(mobile_usability['mobile_message'])

                st.subheader("Accessibility")
                st.write(alt_text['alt_text_message'])

                if st.button("Download CSV"):
                    download_csv(results)