WIP - Under maintenance - Web research working.

2024-02-05 15:15:07 +05:30
parent fd7053fb4b
commit 2a3315f211
96 changed files with 4320 additions and 565 deletions
--- a/lib/blog_postprocessing/blog_proof_reader.py
+++ b/lib/blog_postprocessing/blog_proof_reader.py
@@ -0,0 +1,43 @@
+from .gpt_providers.gemini_pro_text import gemini_text_response
+from .gpt_providers.openai_chat_completion import openai_chatgpt
+
+
+def blog_proof_editor(blog_content, blog_keywords, gpt_provider="openai"):
+    """Proofread a blog post with an LLM and return the edited text.
+
+    The provider is picked by substring match on `gpt_provider` ('openai', then 'gemini').
+    Raises SystemError if the provider call fails, ValueError if none matches.
+    """
+    prompt = f"""I am looking for detailed editing and enhancement of the given blog post, 
+        with a particular focus on maintaining originality. 
+        The topic of the content is [{blog_keywords}]. Please go through the blog and make direct edits to improve it, 
+        ensuring the final output is both high-quality and original. 
+        Note: There are duplicates headings and corresponding paragraphs, rewrite into one subheading.
+
+        Here are the specific areas to focus on:
+
+        1). Ensure Originality: Edit any sections that lack originality, replacing them with unique and creative content.
+        2). Eliminate Repetitive Language: Rewrite repetitive phrases with varied and engaging language.
+        3). Vocabulary and Grammar Enhancement: Directly correct any grammatical errors and upgrade the 
+        vocabulary for better readability.
+        4). Improve Sentence Structure: Enhance sentence construction for better clarity and flow.
+        5). Tone and Brand Alignment: Adjust the tone, voice, personality of given content to make it unique.
+        6). Optimize Content Structure: Reorganize the content for a more impactful presentation, 
+        including better paragraphing and transitions.
+        7). Remove Redundancies: Important, Cut out any redundant information or overly complex jargon.
+        8). Refine Overall Structure: Make structural changes to improve the overall impact of the content.
+        9). Remember, rewrite all content that repeated, while maintaining the formatting of the given blog text.
+
+        Please apply these changes directly to the following blog post and provide the edited version:\n 
+        '{blog_content}'. """
+
+    if 'openai' in gpt_provider:
+        provider_label, ask_llm = "Openai", openai_chatgpt
+    elif 'gemini' in gpt_provider:
+        provider_label, ask_llm = "Gemini", gemini_text_response
+    else:
+        raise ValueError(f"Unsupported gpt_provider: {gpt_provider!r}")
+    try:
+        return ask_llm(prompt)
+    except Exception as err:
+        raise SystemError(f"{provider_label} Error Blog Proof Reading: {err}") from err
--- a/lib/blog_postprocessing/convert_content_to_markdown.py
+++ b/lib/blog_postprocessing/convert_content_to_markdown.py
@@ -0,0 +1,75 @@
+from .gpt_providers.openai_chat_completion import openai_chatgpt
+from .gpt_providers.gemini_pro_text import gemini_text_response
+
+
+def convert_tomarkdown_format(blog_content, gpt_provider="openai"):
+    """Reformat a blog post as Markdown via an LLM and return the response.
+
+    'openai' in `gpt_provider` selects the detailed prompt, 'gemini' a short one.
+    Raises SystemError if the provider call fails, ValueError if none matches.
+    """
+    prompt = f"""As an expert in markdown language format and font matter,
+    I will provide you with a blog post.
+    Your task is to only Improve the formatting and structure of a blog post to enhance readability, visual appeal, and overall user experience. Do not alter the content of the provided blog. Modify only for the formatting.
+    Dont provide explanations, just your final response.
+
+    Guidelines to do formatting:
+    1. **Headings for Structure:**
+   - Use # for the main title of the blog post.
+   - Use ## for subheadings that divide the post into clear sections.
+   - Use ###, ####, etc. for additional subheadings as needed.
+   - Keep the headings concise and descriptive.
+
+    2. **Emphasizing Text:**
+   - Use * or _ for italicizing important words or phrases.
+   - Use ** or __ for bolding key points.
+   - Use *** or ___ for bold italicizing very important text.
+   - Use sparingly to avoid overwhelming the reader.
+
+    3. **Lists:**
+   - Use - or * for unordered lists.
+   - Use 1., 2., etc. for ordered lists.
+   - Keep list items concise and to the point.
+   - Use consistent formatting for all lists.
+
+    4. **Blockquotes:**
+   - Use > to indent and highlight quotes or important information.
+   - Use additional > for nested blockquotes.
+   - Attribute quotes to their original source if applicable.
+
+    5. **Code Blocks:**
+   - Use backticks ` for inline code.
+   - Use triple backticks ``` for code blocks.
+   - Specify the language of the code block for syntax highlighting, e.g., ```python```.
+   - Use code blocks to display code snippets or technical information.
+
+    6. **Horizontal Lines:**
+   - Use three or more asterisks, dashes, or underscores to create a horizontal line, e.g., ***, ---, or ___
+   - Use horizontal lines to separate different sections of the blog post.
+
+    7. **Table Formatting:**
+   - Use pipes | and dashes - to create tables.
+   - Align text within columns using colons :.
+   - Use tables to present data or information in a structured format.
+
+    8. **Other Best Practices:**
+   - Use emojis sparingly and appropriately to add visual interest and enhance the reader's experience.
+   - Proofread carefully for any errors in grammar, spelling, or formatting.
+   - Keep the blog post organized and easy to navigate.
+   - Use a consistent formatting style throughout the post.
+    
+    Blog Post: '{blog_content}'"""
+
+    if 'openai' in gpt_provider:
+        provider_label, ask_llm = "Openai", openai_chatgpt
+    elif 'gemini' in gpt_provider:
+        provider_label, ask_llm = "Gemini", gemini_text_response
+        prompt = f""" Convert the given blog post into well structured MARKDOWN content. 
+        Do not alter the given blog post.
+        blog post: "{blog_content}" """
+    else:
+        raise ValueError(f"Unsupported gpt_provider: {gpt_provider!r}")
+    try:
+        return ask_llm(prompt)
+    except Exception as err:
+        raise SystemError(f"{provider_label} Error in converting to Markdown format: {err}") from err
--- a/lib/blog_postprocessing/convert_markdown_to_html.py
+++ b/lib/blog_postprocessing/convert_markdown_to_html.py
@@ -0,0 +1,37 @@
+from .gpt_providers.openai_chat_completion import openai_chatgpt
+
+def convert_markdown_to_html(md_content):
+    """Convert Markdown to HTML via the OpenAI chat model and return its response.
+
+    Raises SystemError if the model request fails."""
+    prompt =f"""
+			You are a skilled web developer tasked with converting a Markdown-formatted text to HTML. 
+            You will be given text in markdown format. Follow these steps to perform the conversion:
+			
+			1. Parse User's Markdown Input: You will receive a Markdown-formatted text as input from the user. 
+            Carefully analyze the provided Markdown text, paying attention to different elements such as headings (#), 
+            lists (unordered and ordered), bold and italic text, links, images, and code blocks.
+			2. Generate and Validate HTML: Generate corresponding HTML code for each Markdown element following 
+            the conversion guidelines below. Ensure the generated HTML is well-structured and syntactically correct.
+			3. Preserve Line Breaks: Markdown line breaks (soft breaks) represented by two spaces at the end of a 
+            line should be converted to <br> tags in HTML to preserve the line breaks.
+			4. REMEMBER to generate complete, valid HTML response only.
+			
+			Follow below Conversion Guidelines:
+			- Headers: Convert Markdown headers (#, ##, ###, etc.) to corresponding HTML header tags (<h1>, <h2>, <h3>, etc.).
+			- Lists: Convert unordered lists (*) and ordered lists (1., 2., 3., etc.) to <ul> and <ol> HTML tags, respectively. 
+            List items should be enclosed in <li> tags.
+			- Emphasis: Convert bold (**) and italic (*) text to <strong> and <em> HTML tags, respectively.
+			- Links: Convert Markdown links ([text](url)) to HTML anchor (<a>) tags. Ensure the href attribute contains the correct URL.
+			- Images: Convert Markdown image tags (![alt text](image_url)) to HTML image (<img>) tags. 
+            Include the alt attribute for accessibility.
+			- Code: Convert inline code (`code`) to <code> HTML tags. Convert code blocks (```) to <pre> HTML tags 
+            for preserving formatting.
+			- Blockquotes: Convert blockquotes (>) to <blockquote> HTML tags.
+			Convert the following Markdown text to HTML:  {md_content}
+            """
+    try:
+        # TBD: Add logic for which_provider and which_model
+        return openai_chatgpt(prompt)
+    except Exception as err:
+        raise SystemError(f"Error in convert to HTML: {err}") from err
--- a/lib/blog_postprocessing/save_blog_to_file.py
+++ b/lib/blog_postprocessing/save_blog_to_file.py
@@ -0,0 +1,135 @@
+import sys
+import os
+import re
+import datetime
+import random
+from dateutil.relativedelta import relativedelta
+from textwrap import dedent
+import logging
+from zoneinfo import ZoneInfo
+import nltk
+from nltk.corpus import stopwords
+from loguru import logger
+# Drop the handlers loguru already has and log to one colourised stdout sink.
+logger.remove()
+logger.add(
+    sys.stdout,
+    colorize=True,
+    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
+)
+
+# fixme: Remove the hardcoding, need add another option OR in config ?
+image_dir = os.path.join(os.getcwd(), "blog_images")
+# TBD: This can come from config file.
+output_path = os.path.join(os.getcwd(), "blogs")
+
+
+def random_date_last_three_months():
+    """Return a random Asia/Kolkata timestamp from the past three months.
+
+    The result is a string formatted as '%Y-%m-%d %H:%M:%S %z'.
+    """
+    now = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
+    window_start = now - relativedelta(months=3)
+    window_seconds = int((now - window_start).total_seconds())
+    offset = datetime.timedelta(seconds=random.randint(0, window_seconds))
+    return (window_start + offset).strftime('%Y-%m-%d %H:%M:%S %z')
+
+
+def _front_matter_items(values):
+    """Return a str unchanged, or join an iterable's items with ', ' for an inline YAML list."""
+    return values if isinstance(values, str) else ", ".join(str(item) for item in values or [])
+
+
+def _slugify_title(title):
+    """Build the file-name slug: stop words dropped, only [A-Za-z0-9-] kept."""
+    # Stop words must be removed before hyphenation: a hyphen-joined title
+    # tokenizes as a single word, so filtering afterwards removes nothing.
+    cleaned = re.sub('[^A-Za-z0-9 -]', '', title)
+    slug = remove_stop_words(cleaned).replace(" ", "-")
+    return re.sub('-+', '-', slug).strip('-')
+
+
+def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_categories, main_img_path=None, file_type="md"):
+    """
+    Save a blog post as a Markdown file with YAML front matter.
+
+    The file is written to the module-level `output_path` directory as
+    '<today>-<title-slug>.md'; the front matter date is randomized.
+
+    Args:
+        blog_content (str): The main content of the blog.
+        blog_title (str): Title of the blog.
+        blog_meta_desc (str): Meta description of the blog.
+        blog_tags (list | str): Tags; a list is joined with ', ', a str is used as is.
+        blog_categories (list | str): Categories, handled like `blog_tags`.
+        main_img_path (str, optional): Main image path; only its base name is written.
+        file_type (str, optional): Only 'md' is implemented; other values write nothing.
+
+    Returns:
+        str | None: Path of the written file, or None when nothing was written.
+
+    Raises:
+        FileNotFoundError: If the output directory does not exist.
+        Exception: If the blog content cannot be written to the file.
+    """
+    blog_title_md = _slugify_title(blog_title)
+    logger.debug(f"Blog Title is: {blog_title_md}")
+
+    if not os.path.exists(output_path):
+        logger.error(f"Error: Blog output directory is set to {output_path}, which does not exist.")
+        raise FileNotFoundError(f"Output directory does not exist: {output_path}")
+
+    if file_type != "md":
+        logger.warning(f"Unsupported file_type '{file_type}': only 'md' is implemented, nothing was saved.")
+        return None
+
+    logger.info("Writing/Saving the resultant blog content in Markdown format.")
+    # Bulk generation benefits from randomized publishing dates.
+    formatted_date = random_date_last_three_months()
+    blog_title = blog_title.replace(":", "-").replace('"', '').replace('**', '')
+    # A colon would end the unquoted YAML scalar; '**' is stray Markdown bold.
+    description = blog_meta_desc.replace(":", "-").replace('**', '')
+    # Build line by line: dedent() on an interpolated template breaks on multi-line values.
+    front_matter = [
+        "---",
+        f"title: {blog_title}",
+        f"date: {formatted_date}",
+        f"categories: [{_front_matter_items(blog_categories)}]",
+        f"tags: [{_front_matter_items(blog_tags)}]",
+        f"description: {description}",
+    ]
+    if main_img_path:
+        image_name = os.path.basename(main_img_path)
+        front_matter += ["img_path: '/assets/'", "image:", f"    path: {image_name}", f"    alt: {blog_title}"]
+    front_matter.append("---")
+
+    file_name = f"{datetime.date.today().strftime('%Y-%m-%d')}-{blog_title_md}.md"
+    blog_output_path = os.path.join(output_path, file_name)
+    try:
+        # Explicit UTF-8: the platform default encoding may reject non-ASCII text.
+        with open(blog_output_path, "w", encoding="utf-8") as f:
+            f.write("\n".join(front_matter) + "\n\n")
+            f.write(blog_content)
+    except Exception as e:
+        raise Exception(f"Failed to write blog content: {e}") from e
+
+    logger.info(f"Successfully saved and posted blog at: {blog_output_path}")
+    return blog_output_path
+
+
+# Helper function
+def remove_stop_words(sentence):
+    """
+    Strip English stop words out of a sentence.
+
+    Args:
+        sentence (str): Text to tokenize with NLTK and filter.
+
+    Returns:
+        str: The remaining tokens joined by single spaces.
+    """
+    tokens = nltk.word_tokenize(sentence)
+    english_stops = frozenset(stopwords.words('english'))
+    kept = filter(lambda tok: tok.lower() not in english_stops, tokens)
+    return ' '.join(kept)