WIP: Try AI-Writer and Web research; working.

2024-02-24 15:15:01 +05:30
parent d89d9ad3d2
commit a87a87a620
21 changed files with 587 additions and 279 deletions
--- a/lib/utils/seo_module/README.md
+++ b/lib/utils/seo_module/README.md
@@ -1,33 +0,0 @@
-## Implementation approach
-
-To implement the SEO module, we will use the following open-source tools and frameworks:
-
-1. Natural Language Toolkit (NLTK): NLTK is a popular library for natural language processing in Python. We can leverage NLTK to perform various SEO checks on the given text, such as keyword density, readability analysis, and sentiment analysis.
-
-2. Beautiful Soup: Beautiful Soup is a Python library for web scraping. We can use Beautiful Soup to extract relevant information from the given text, such as meta tags, headings, and image alt attributes.
-
-3. PyEnchant: PyEnchant is a spell checking library for Python. We can utilize PyEnchant to check the spelling and grammar of the given text and provide suggestions for improvement.
-
-4. TextBlob: TextBlob is a library for processing textual data. We can use TextBlob to perform part-of-speech tagging, noun phrase extraction, and other linguistic analyses on the given text.
-
-5. Flask: Use Flask for local testing and development purposes. Flask provides a lightweight web framework that allows us to quickly build and test our SEO module.
-
-Overall, by leveraging these open-source tools and frameworks, we can develop a comprehensive and efficient SEO module that meets the requirements and provides valuable insights and suggestions for improving the SEO of the given text.
-
-## Required Python third-party packages
-
- nltk==3.6.2
- beautifulsoup4==4.9.3
- pyenchant==3.2.1
- textblob==0.15.3
- flask==1.1.2
-
-## Modules
-
-The 'text_processor.py' file contains the TextProcessor class, which is responsible for extracting meta tags, headings, and image alt attributes from the given text.
-        
-The 'spell_checker.py' file contains the SpellChecker class, which is responsible for checking the spelling and grammar of the given text.
-        
-The 'seo_checker.py' file contains the SEOChecker class, which is responsible for coordinating the SEO checks by utilizing the TextProcessor and SpellChecker classes.
-
-
--- a/lib/utils/seo_module/cgpt_seo_analyzer.py
+++ b/lib/utils/seo_module/cgpt_seo_analyzer.py
@@ -1,135 +0,0 @@
-###################################################
-#
-# The script covers many SEO factors, including keyword presence, title length, 
-# meta description, images, img alt text, headings, internal links, external links, 
-# spelling errors, grammar errors, and readability.
-#
-##################################################
-
-import re
-from bs4 import BeautifulSoup
-from textstat import flesch_reading_ease
-import spellchecker
-
-class SEOAnalyzer:
-    def __init__(self, html_content, target_keywords):
-        self.html_content = html_content
-        self.target_keywords = target_keywords
-
-    def analyze_html_content(self):
-        try:
-            soup = BeautifulSoup(self.html_content, 'html.parser')
-
-            # Extract and clean text from HTML
-            text = ' '.join(soup.stripped_strings)
-            text = re.sub(r'\s+', ' ', text)
-
-            # Calculate keyword density
-            keyword_density = {}
-            for keyword in self.target_keywords:
-                keyword_density[keyword] = (text.lower().count(keyword.lower()) / len(text.split())) * 100
-
-            # Check for the presence of keywords in the title
-            title_tag = soup.find('title')
-            title_text = title_tag.text.lower() if title_tag else ''
-            keyword_presence_in_title = {keyword: keyword.lower() in title_text for keyword in self.target_keywords}
-
-            # Check for the presence of images and keywords in image alt text
-            images = soup.find_all('img')
-            img_alt_text = [img.get('alt', '').lower() for img in images]
-            keyword_presence_in_img_alt_text = {keyword: any(keyword.lower() in alt_text for alt_text in img_alt_text) for keyword in self.target_keywords}
-
-            # Check for the presence of headings
-            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
-            headings_text = ' '.join(heading.text.lower() for heading in headings)
-
-            # Check for the presence of internal and external links
-            internal_links = len([link for link in soup.find_all('a') if '#' not in link.get('href', '')])
-            external_links = len([link for link in soup.find_all('a') if 'http' in link.get('href', '')])
-
-            # Calculate readability score
-            readability_score = flesch_reading_ease(text)
-
-            # Check for spelling and grammar errors
-            spell = spellchecker.SpellChecker()
-            spelling_errors = len(spell.unknown(text.split()))
-            grammar_errors = len(spell.check_grammar(text))
-
-            # Calculate SEO score
-            seo_score = 0
-
-            # Check for the presence of relevant keywords
-            for keyword in self.target_keywords:
-                if keyword in text.lower():
-                    seo_score += 1
-
-            # Check for title length
-            title_length = len(title_text.split()) if title_text else 0
-            recommended_title_length = (50, 70)
-
-            if recommended_title_length[0] <= title_length <= recommended_title_length[1]:
-                seo_score += 1
-
-            # Generate suggestions for improvement
-            suggestions = []
-            if seo_score < 5:
-                suggestions.append("Add more relevant keywords to your HTML content.")
-                suggestions.append("Make sure your title contains keywords.")
-                suggestions.append("Add keywords to image alt text.")
-                suggestions.append("Add headings to your HTML content.")
-                suggestions.append("Add internal links to your HTML content.")
-
-            return {
-                'Keyword Density': keyword_density,
-                'Keyword Presence in Title': keyword_presence_in_title,
-                'Keyword Presence in Image Alt Text': keyword_presence_in_img_alt_text,
-                'Headings Text': headings_text,
-                'Internal Links': internal_links,
-                'External Links': external_links,
-                'Readability Score': readability_score,
-                'Spelling Errors': spelling_errors,
-                'Grammar Errors': grammar_errors,
-                'SEO Score': seo_score,
-                'Suggestions': suggestions
-            }
-        except Exception as e:
-            return {'error': str(e)}
-
-# Example usage:
-if __name__ == "__main__":
-    html_content = """
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <title>SEO Analyzer - Sample Page</title>
-        <meta name="description" content="This is a sample page for SEO analysis.">
-    </head>
-    <body>
-        <h1>Welcome to the SEO Analyzer</h1>
-        <p>This is a sample page with some sample content for SEO analysis. It mentions the target keywords SEO, keywords, and content.</p>
-        <img src="image1.jpg" alt="SEO image">
-        <img src="image2.jpg" alt="Keywords image">
-    </body>
-    </html>
-    """
-
-    keywords = ['SEO', 'keywords', 'content']  # Replace with your target keywords
-
-    seo_analyzer = SEOAnalyzer(html_content, keywords)
-    results = seo_analyzer.analyze_html_content()
-
-    print("SEO Analysis Results:")
-    print(f"Keyword Density: {results['Keyword Density']}")
-    print(f"Keyword Presence in Title: {results['Keyword Presence in Title']}")
-    print(f"Keyword Presence in Image Alt Text: {results['Keyword Presence in Image Alt Text']}")
-    print(f"Headings Text: {results['Headings Text']}")
-    print(f"Internal Links: {results['Internal Links']}")
-    print(f"External Links: {results['External Links']}")
-    print(f"Readability Score: {results['Readability Score']}")
-    print(f"Spelling Errors: {results['Spelling Errors']}")
-    print(f"Grammar Errors: {results['Grammar Errors']}")
-    print(f"SEO Score: {results['SEO Score']}")
-    print("Suggestions:")
-    for suggestion in results['Suggestions']:
-        print(suggestion)
-
--- a/lib/utils/seo_module/is_content_ai_generated.py
+++ b/lib/utils/seo_module/is_content_ai_generated.py
@@ -1,65 +0,0 @@
-##############################################################################################
-#
-# Checks for:
-# Short, fragmented sentences that lack human-like coherence.
-# Frequent use of overly complex words or technical jargon.
-#
-# These checks are based on common observations that AI-generated content may sometimes produce 
-# text with unusual patterns or characteristics. However, please keep in mind that these 
-# heuristics are not guaranteed to detect all AI-generated content, and false positives or 
-# negatives can still occur. More advanced techniques and models would be required for more accurate detection.
-#
-#############################################################################################
-
-import spacy
-
-# Load the English language model from spaCy
-nlp = spacy.load("en_core_web_sm")
-
-def is_ai_generated(text):
-    # Tokenize the text using spaCy
-    doc = nlp(text)
-
-    # Check for indicators of AI-generated content
-    ai_indicators = [
-        "generated by AI",
-        "auto-generated",
-        "machine-generated",
-        "artificial intelligence",
-        "neural network",
-        "GPT-3",
-        "AI model",
-    ]
-
-    for indicator in ai_indicators:
-        if indicator.lower() in text.lower():
-            return True
-
-    # Check for repetitive patterns or lack of human-like variations
-    for i in range(len(doc) - 2):
-        if doc[i].text == doc[i + 1].text == doc[i + 2].text:
-            return True
-
-    # Check for short, fragmented sentences that lack human-like coherence
-    for sentence in doc.sents:
-        if len(sentence) < 5:
-            return True
-
-    # Check for frequent use of overly complex words or technical jargon
-    complex_word_count = sum(1 for token in doc if token.is_alpha and len(token.text) > 10)
-    if complex_word_count > len(doc) // 10:  # Adjust the threshold as needed
-        return True
-
-    return False
-
-if __name__ == "__main__":
-    input_text = """
-    This is an article generated by a state-of-the-art AI model.
-    The content is machine-generated and may not represent human writing style.
-    """
-
-    if is_ai_generated(input_text):
-        print("The content appears to be AI-generated.")
-    else:
-        print("The content appears to be written by a human.")
-
--- a/lib/utils/seo_module/plagiarism_checker_from_known_sources.py
+++ b/lib/utils/seo_module/plagiarism_checker_from_known_sources.py
@@ -1,80 +0,0 @@
-import re
-
-#######################################################################
-#
-# I want you to act as a plagiarism checker. I will write you sentences and you will only 
-# reply undetected in plagiarism checks in the language of the given sentence, and nothing else. 
-# Do not write explanations on replies. My first sentence is {}
-#
-######################################################################
-
-
-class PlagiarismChecker:
-    def __init__(self, known_sources):
-        self.known_sources = known_sources
-
-    def check_plagiarism(self, html_content):
-        try:
-            # Preprocess the HTML content by removing HTML tags and extra spaces
-            text = re.sub(r'<[^>]+>', ' ', html_content)
-            text = re.sub(r'\s+', ' ', text).strip().lower()
-
-            # Check for exact matches with known sources
-            for source in self.known_sources:
-                source_text = re.sub(r'<[^>]+>', ' ', source)
-                source_text = re.sub(r'\s+', ' ', source_text).strip().lower()
-                if text == source_text:
-                    return f"Plagiarism detected: Matches known source - {source}"
-
-            # If no exact matches are found
-            return "No plagiarism detected. Content is original."
-
-        except Exception as e:
-            return str(e)
-
-# Example usage:
-if __name__ == "__main__":
-    # List of known sources
-    known_sources = [
-        """
-        <html>
-        <head>
-            <title>Sample Page 1</title>
-        </head>
-        <body>
-            <h1>Hello, World!</h1>
-            <p>This is sample content from known source 1.</p>
-        </body>
-        </html>
-        """,
-        """
-        <html>
-        <head>
-            <title>Sample Page 2</title>
-        </head>
-        <body>
-            <h1>Welcome to Known Source 2</h1>
-            <p>This is some content from another known source.</p>
-        </body>
-        </html>
-        """
-    ]
-
-    # HTML content to check for plagiarism
-    html_content = """
-    <html>
-    <head>
-        <title>Sample Page</title>
-    </head>
-    <body>
-        <h1>Hello, World!</h1>
-        <p>This is sample content.</p>
-    </body>
-    </html>
-    """
-
-    plagiarism_checker = PlagiarismChecker(known_sources)
-    result = plagiarism_checker.check_plagiarism(html_content)
-
-    print(result)
-
--- a/lib/utils/seo_module/prompt
+++ b/lib/utils/seo_module/prompt
@@ -1,3 +0,0 @@
-Act as an SEO specialist, analyze [website URL], and make improvement suggestions regarding technical SEO with the ways to make those improvements listed in a table.
-
-
--- a/lib/utils/seo_module/seo_analysis.py
+++ b/lib/utils/seo_module/seo_analysis.py
@@ -1,115 +0,0 @@
-from typing import List, Dict, Union
-from nltk import tokenize, stem, pos_tag
-from textblob import TextBlob
-import enchant
-
-class TextPreprocessor:
-    def preprocess_text(self, text: str) -> str:
-        # Tokenize the text
-        tokens = tokenize.word_tokenize(text)
-        
-        # Stem the tokens
-        stemmer = stem.PorterStemmer()
-        stemmed_tokens = [stemmer.stem(token) for token in tokens]
-        
-        # Join the stemmed tokens back into a string
-        preprocessed_text = ' '.join(stemmed_tokens)
-        
-        return preprocessed_text
-
-class SEOAnalyzer:
-    def calculate_seo_percentage(self, text: str, keywords: List[str]) -> float:
-        # Calculate the keyword density
-        keyword_density = self.calculate_keyword_density(text, keywords)
-        
-        # Calculate the readability score
-        readability_score = self.calculate_readability_score(text)
-        
-        # Perform semantic analysis
-        semantic_score = self.perform_semantic_analysis(text)
-        
-        # Calculate the SEO percentage based on the metrics
-        seo_percentage = (keyword_density + readability_score + semantic_score) / 3
-        
-        return seo_percentage
-    
-    def calculate_keyword_density(self, text: str, keywords: List[str]) -> float:
-        # Count the number of occurrences of each keyword in the text
-        keyword_counts = {keyword: text.lower().count(keyword.lower()) for keyword in keywords}
-        
-        # Calculate the total number of words in the text
-        word_count = len(tokenize.word_tokenize(text))
-        
-        # Calculate the keyword density
-        keyword_density = sum(keyword_counts.values()) / word_count
-        
-        return keyword_density
-    
-    def calculate_readability_score(self, text: str) -> float:
-        # Calculate the average number of words per sentence
-        sentences = tokenize.sent_tokenize(text)
-        word_count = sum(len(tokenize.word_tokenize(sentence)) for sentence in sentences)
-        sentence_count = len(sentences)
-        average_words_per_sentence = word_count / sentence_count
-        
-        # Calculate the readability score
-        readability_score = 1 / average_words_per_sentence
-        
-        return readability_score
-    
-    def perform_semantic_analysis(self, text: str) -> float:
-        # Perform part-of-speech tagging on the text
-        tagged_text = pos_tag(tokenize.word_tokenize(text))
-        
-        # Calculate the semantic score based on the number of nouns and verbs
-        noun_count = sum(1 for word, pos in tagged_text if pos.startswith('N'))
-        verb_count = sum(1 for word, pos in tagged_text if pos.startswith('V'))
-        semantic_score = (noun_count + verb_count) / len(tagged_text)
-        
-        return semantic_score
-
-class SpellChecker:
-    def check_spelling(self, text: str) -> List[str]:
-        # Create a spellchecker object
-        spellchecker = enchant.Dict("en_US")
-        
-        # Tokenize the text
-        tokens = tokenize.word_tokenize(text)
-        
-        # Check the spelling of each token
-        misspelled_words = [token for token in tokens if not spellchecker.check(token)]
-        
-        return misspelled_words
-
-class SEOAnalysisModule:
-    def __init__(self):
-        self.text_preprocessor = TextPreprocessor()
-        self.seo_analyzer = SEOAnalyzer()
-        self.spell_checker = SpellChecker()
-    
-    def analyze_text(self, text: str, keywords: List[str]) -> Dict[str, Union[float, List[str]]]:
-        # Preprocess the text
-        preprocessed_text = self.text_preprocessor.preprocess_text(text)
-        
-        # Calculate the SEO percentage
-        seo_percentage = self.seo_analyzer.calculate_seo_percentage(preprocessed_text, keywords)
-        
-        # Calculate the keyword density
-        keyword_density = self.seo_analyzer.calculate_keyword_density(preprocessed_text, keywords)
-        
-        # Calculate the readability score
-        readability_score = self.seo_analyzer.calculate_readability_score(preprocessed_text)
-        
-        # Perform semantic analysis
-        semantic_score = self.seo_analyzer.perform_semantic_analysis(preprocessed_text)
-        
-        # Check the spelling
-        spelling_errors = self.spell_checker.check_spelling(preprocessed_text)
-        
-        return {
-            'seo_percentage': seo_percentage,
-            'keyword_density': keyword_density,
-            'readability_score': readability_score,
-            'semantic_score': semantic_score,
-            'spelling_errors': spelling_errors
-        }
				`@@ -1,3 +0,0 @@`
				`Act as an SEO specialist, analyze [website URL], and make improvement suggestions regarding technical SEO with the ways to make those improvements listed in a table.`