# ALwrity/lib/blog_postprocessing/save_blog_to_file.py

import sys
import os
import re
import datetime
import random
from dateutil.relativedelta import relativedelta
from textwrap import dedent
import logging
from zoneinfo import ZoneInfo
import nltk
from nltk.corpus import stopwords
from loguru import logger
# Configure loguru for this module: drop the handlers registered so far, then
# add one colorized sink on stdout whose lines read
# "LEVEL|file:line:function| message".
logger.remove()
logger.add(sys.stdout,
        colorize=True,
        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
    )

# FIXME: The directory names are hardcoded; expose them as an option or read
# them from the config file instead.
# Both directories are resolved against the working directory at import time.
image_dir = os.path.join(os.getcwd(), "blog_images")
# TBD: This can come from the config file.
output_path = os.path.join(os.getcwd(), "blogs")


def random_date_last_three_months():
    """
    Picks a random moment within the last three months.

    The window runs from three calendar months before "now" (evaluated in
    the 'Asia/Kolkata' time zone) up to "now", with one-second granularity.

    Returns:
        str: The chosen moment formatted as '%Y-%m-%d %H:%M:%S %z'.
    """
    now = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
    window_start = now - relativedelta(months=3)

    # Length of the window in whole seconds; the random offset is drawn from
    # the inclusive range [0, window_seconds].
    window_seconds = int((now - window_start).total_seconds())
    offset = datetime.timedelta(seconds=random.randint(0, window_seconds))

    picked = window_start + offset
    return picked.strftime('%Y-%m-%d %H:%M:%S %z')


def _slugify_title(title):
    """Turn a blog title into a filename-safe, hyphen-separated slug."""
    # Drop colons/ampersands and every character that is not an ASCII letter,
    # digit, space or hyphen while the words are still space-separated.
    cleaned = title.replace(":", "").replace("&", "")
    cleaned = re.sub(r'[^A-Za-z0-9 -]', '', cleaned)
    # Stop words must be removed BEFORE hyphenation: once the words are joined
    # with hyphens the tokenizer sees a single token and filters nothing.
    cleaned = remove_stop_words(cleaned)
    # Collapse any run of whitespace/hyphens into one hyphen and trim the ends.
    slug = re.sub(r'[\s-]+', '-', cleaned).strip('-')
    # Never produce an empty name such as "2024-01-01-.md".
    return slug or "untitled"


def _front_matter_text(text, strip_quotes=False):
    """Make free text safe to place on a single front matter line."""
    # A colon would be read as a YAML key/value separator; '**' is leftover
    # Markdown bold markup.
    text = text.replace(":", "-").replace('**', '')
    if strip_quotes:
        text = text.replace('"', '')
    # A line break inside the value would split the entry across lines.
    return re.sub(r'\s*[\r\n]+\s*', ' ', text).strip()


def _front_matter_items(value):
    """Render tags/categories as the comma-separated body of an inline list."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        # Interpolating the list itself would emit "[['a', 'b']]".
        return ", ".join(str(item).strip() for item in value)
    # Strings (e.g. "a, b") are passed through unchanged.
    return str(value)


def _build_front_matter(title, date, categories, tags, description, main_img_path=None):
    """Assemble the '---' delimited front matter block, ending in a blank line."""
    lines = [
        "---",
        f"title: {title}",
        f"date: {date}",
        f"categories: [{_front_matter_items(categories)}]",
        f"tags: [{_front_matter_items(tags)}]",
        f"description: {description}",
    ]
    if main_img_path:
        lines += [
            "img_path: '/assets/'",
            "image:",
            f"    path: {os.path.basename(main_img_path)}",
            f"    alt: {title}",
        ]
    lines.append("---")
    # Built line by line instead of dedenting an f-string: dedent runs after
    # interpolation, so a multi-line value would leave every line indented.
    return "\n".join(lines) + "\n\n"


def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_categories, main_img_path=None, file_type="md"):
    """
    Saves the provided blog content to a Markdown file with front matter.

    The file is written into the module-level ``output_path`` directory and is
    named ``<today's date>-<title slug>.md``. The front matter ``date`` is a
    random timestamp from the last three months (useful for bulk generation).

    Args:
        blog_content (str): The main content of the blog.
        blog_title (str): Title of the blog.
        blog_meta_desc (str): Meta description of the blog.
        blog_tags (list | str): Tags for the blog, either a list/tuple of
            tags or an already comma-separated string.
        blog_categories (list | str): Categories for the blog, in the same
            forms as ``blog_tags``.
        main_img_path (str, optional): Path to the main image of the blog.
            Only its base name is written to the front matter.
        file_type (str, optional): The file format for saving the blog. Only
            'md' (Markdown) is implemented; any other value logs a warning
            and writes nothing. Defaults to 'md'.

    Raises:
        FileNotFoundError: If the ``output_path`` directory does not exist.
        Exception: If the blog content cannot be written to the file.
    """
    blog_title_md = _slugify_title(blog_title)
    logger.debug(f"Blog Title is: {blog_title_md}")

    # Check if output path exists
    if not os.path.exists(output_path):
        logger.error(f"Error: Blog output directory is set to {output_path}, which does not exist.")
        raise FileNotFoundError(f"Output directory does not exist: {output_path}")

    # Only Markdown is implemented; make the no-op visible instead of silent.
    if file_type != "md":
        logger.warning(f"Unsupported file_type '{file_type}': only 'md' is implemented, nothing was written.")
        return

    logger.info("Writing/Saving the resultant blog content in Markdown format.")
    # Bulk generation benefits from randomized publishing dates.
    formatted_date = random_date_last_three_months()
    blog_frontmatter = _build_front_matter(
        title=_front_matter_text(blog_title, strip_quotes=True),
        date=formatted_date,
        categories=blog_categories,
        tags=blog_tags,
        description=_front_matter_text(blog_meta_desc),
        main_img_path=main_img_path,
    )

    blog_output_path = os.path.join(
        output_path,
        f"{datetime.date.today().strftime('%Y-%m-%d')}-{blog_title_md}.md"
    )

    # Write to the file. UTF-8 is explicit so non-ASCII content does not
    # depend on the platform's default locale encoding.
    try:
        with open(blog_output_path, "w", encoding="utf-8") as f:
            f.write(blog_frontmatter)
            f.write(blog_content)
    except Exception as e:
        # Keep the generic Exception type callers expect, but chain the cause.
        raise Exception(f"Failed to write blog content: {e}") from e

    logger.info(f"Successfully saved and posted blog at: {blog_output_path}")


# Helper function
def remove_stop_words(sentence):
    """
    Strips English stop words out of a sentence.

    The sentence is tokenized with NLTK, every token whose lowercase form is
    in NLTK's English stop word list is dropped, and the remaining tokens are
    joined back together with single spaces.

    Args:
        sentence (str): The sentence to filter.

    Returns:
        str: The space-joined tokens that are not stop words.
    """
    tokens = nltk.word_tokenize(sentence)
    english_stop_words = frozenset(stopwords.words('english'))
    # Match case-insensitively, but keep each surviving token's original casing.
    return ' '.join(
        token for token in tokens if token.lower() not in english_stop_words
    )