Blogen-V0.1 Added features. WIP

2023-12-21 21:21:09 +05:30
parent eaf13c2d16
commit 8f89de7b69
21 changed files with 775 additions and 471 deletions
--- a/lib/seo_module/is_content_ai_generated.py
+++ b/lib/seo_module/is_content_ai_generated.py
@@ -0,0 +1,65 @@
+##############################################################################################
+#
+# Checks for:
+# Explicit AI indicator phrases (e.g. "generated by AI") and a token repeated three times in a row.
+# Short, fragmented sentences that lack human-like coherence.
+# Frequent use of overly complex words or technical jargon.
+#
+# These checks are based on common observations that AI-generated content may sometimes produce text with unusual
+# patterns or characteristics. They are heuristics: they are not guaranteed to detect all AI-generated content, and
+# false positives or negatives can still occur. More accurate detection would require more advanced techniques and models.
+#
+#############################################################################################
+
+import spacy
+
+# Load spaCy's "en_core_web_sm" English pipeline once, at import time, so every call reuses it
+nlp = spacy.load("en_core_web_sm")
+
+def is_ai_generated(text):
+    """Return True if ``text`` trips any heuristic for AI-generated content.
+
+    ``text`` is the string to inspect. The result is a bool; the checks are
+    crude pattern matches, so false positives and negatives are expected.
+    """
+    # Cheapest check first: a plain substring scan needs no spaCy parse, so a
+    # hit here skips the (comparatively expensive) pipeline run entirely.
+    lowered = text.lower()
+    ai_indicators = (
+        "generated by AI", "auto-generated", "machine-generated",
+        "artificial intelligence", "neural network", "GPT-3", "AI model",
+    )
+    if any(indicator.lower() in lowered for indicator in ai_indicators):
+        return True
+
+    # Tokenize and sentence-split the text with the module-level spaCy model.
+    doc = nlp(text)
+    tokens = [token.text for token in doc]
+
+    # The same token three times in a row counts as a repetitive pattern.
+    if any(a == b == c for a, b, c in zip(tokens, tokens[1:], tokens[2:])):
+        return True
+
+    # Any sentence shorter than five tokens counts as fragmented.
+    if any(len(sentence) < 5 for sentence in doc.sents):
+        return True
+
+    # Alphabetic tokens longer than 10 characters count as complex words;
+    # flag the text when they outnumber a tenth of all tokens.
+    complex_word_count = sum(
+        1 for token in doc if token.is_alpha and len(token.text) > 10
+    )
+    # NOTE: integer division makes the threshold 0 for texts under 10 tokens.
+    return complex_word_count > len(doc) // 10
+
+if __name__ == "__main__":
+    # Demo run; the sample contains the listed phrase "machine-generated".
+    input_text = """
+    This is an article generated by a state-of-the-art AI model.
+    The content is machine-generated and may not represent human writing style.
+    """
+    flagged = is_ai_generated(input_text)
+    ai_message = "The content appears to be AI-generated."
+    human_message = "The content appears to be written by a human."
+    print(ai_message if flagged else human_message)
+
--- a/lib/seo_module/plagiarism_checker_from_known_sources.py
+++ b/lib/seo_module/plagiarism_checker_from_known_sources.py
@@ -0,0 +1,80 @@
+import re
+
+#######################################################################
+# LLM prompt template kept for reference; the code below does not use it:
+# I want you to act as a plagiarism checker. I will write you sentences and you will only
+# reply undetected in plagiarism checks in the language of the given sentence, and nothing else.
+# Do not write explanations on replies. My first sentence is {}
+#
+######################################################################
+
+
+class PlagiarismChecker:
+    """Exact-match plagiarism check of HTML content against known sources."""
+
+    def __init__(self, known_sources):
+        # Iterable of raw HTML strings; each is normalized on every check.
+        self.known_sources = known_sources
+
+    @staticmethod
+    def _plain_text(html):
+        """Strip tags, collapse whitespace runs, trim and lower-case ``html``."""
+        return re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', html)).strip().lower()
+
+    def check_plagiarism(self, html_content):
+        """Return a verdict message, or the error text if any step raises."""
+        try:
+            candidate = self._plain_text(html_content)
+            for known in self.known_sources:
+                if self._plain_text(known) == candidate:
+                    return f"Plagiarism detected: Matches known source - {known}"
+            return "No plagiarism detected. Content is original."
+        except Exception as err:  # deliberately broad: errors come back as text
+            return str(err)
+
+# Example usage:
+if __name__ == "__main__":
+    # Known sources as raw HTML; check_plagiarism strips the tags before comparing
+    known_sources = [
+        """
+        <html>
+        <head>
+            <title>Sample Page 1</title>
+        </head>
+        <body>
+            <h1>Hello, World!</h1>
+            <p>This is sample content from known source 1.</p>
+        </body>
+        </html>
+        """,
+        """
+        <html>
+        <head>
+            <title>Sample Page 2</title>
+        </head>
+        <body>
+            <h1>Welcome to Known Source 2</h1>
+            <p>This is some content from another known source.</p>
+        </body>
+        </html>
+        """
+    ]
+
+    # Page to check: shares its heading with source 1 but differs in title and paragraph
+    html_content = """
+    <html>
+    <head>
+        <title>Sample Page</title>
+    </head>
+    <body>
+        <h1>Hello, World!</h1>
+        <p>This is sample content.</p>
+    </body>
+    </html>
+    """
+
+    plagiarism_checker = PlagiarismChecker(known_sources)
+    result = plagiarism_checker.check_plagiarism(html_content)
+    # No source matches exactly after normalization, so this prints the no-plagiarism verdict
+    print(result)
+