Long form content generation, web researched

2024-04-23 19:40:07 +05:30
parent 9c45762680
commit 48d4371fa5
8 changed files with 365 additions and 81 deletions
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ Alwrity automates and enhances the process of blog creation, optimization, and m
 Leveraging AI technologies, it assists content creators and digital marketers in generating, formatting, and uploading blog content efficiently. The toolkit integrates advanced AI models for text generation, image creation, and data analysis, streamlining the content creation pipeline.

 # AI Content Generation Toolkit - Alwrity
-![](https://github.com/AJaySi/AI-Blog-Writer/blob/main/workspace/keyword_blog.gif)
+![](https://github.com/AJaySi/AI-Writer/blob/main/lib/workspace/keyword_blog.gif)

 ---

@@ -109,7 +109,7 @@ Coming Soon....

 ### AI-Driven Content Creation
 - **Text Generation**: Leverages OpenAI's ChatGPT, Google Gemini Pro for generating text for blogs.
- **Customizable AI Parameters**: (FIXME) Offers flexibility in adjusting AI parameters like model selection, temperature, and token limits to suit different content needs.
+- [**Customizable AI Parameters**](https://github.com/AJaySi/AI-Writer/blob/main/main_config): Offers flexibility in adjusting AI parameters like model selection, temperature, and token limits to suit different content needs.

 ### Image Detail Extraction
 - **Analyzing and Extracting Image Details**: Uses OpenAI's Vision API, Google Gemini vision to analyze images and extract details such as alt text, descriptions, titles, and captions, enhancing the SEO of image content.
@@ -148,11 +148,13 @@ Coming Soon....

 ## Packages, Tools, and APIs Used

-### Standing on the shoulders of Giants:
+### Standing on the shoulders of Giants - Credits:
 - **APIs**:
  - Metaphor API: Provides semantic search capabilities for finding similar topics and technologies.
  - Tavily API: Offers AI-powered web search functionality for conducting in-depth keyword research.
  - SerperDev API: Enables access to search engine results and competitor analysis data.
+  - YOU.com: You.com enhances web search, writing, coding, digital art creation, and solving complex problems.
+  - Stability AI: Activating humanity's potential through generative AI. Open models in every modality, for everyone, everywhere.
  - OpenAI API: Powers the Large Language Models (LLMs) for generating blog content and conducting research.
  - Gemini API: Another LLM provider for natural language processing tasks.
  - Ollama API (Work In Progress): An upcoming LLM provider for additional research and content generation capabilities.
--- a/alwrity.py
+++ b/alwrity.py
@@ -245,8 +245,12 @@ if __name__ == "__main__":
    os.system("clear" if os.name == "posix" else "cls")
    check_search_apis()
    check_llm_environs()
-    os.environ["SEARCH_SAVE_FILE"] = os.path.join(os.getcwd(), "lib", "workspace") + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+    # Export the paths and file names. Don't want alwrity to be chatty and prompt for inputs.
+    os.environ["SEARCH_SAVE_FILE"] = os.path.join(os.getcwd(), "lib", "workspace",
+                                                  f"web_research_report_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
    os.environ["IMG_SAVE_DIR"] = os.path.join(os.getcwd(), "lib", "workspace")
+    os.environ["CONTENT_SAVE_DIR"] = os.path.join(os.getcwd(), "lib", "workspace")

    load_dotenv(Path('.env'))
    app()
--- a/lib/ai_writers/long_form_ai_writer.py
+++ b/lib/ai_writers/long_form_ai_writer.py
@@ -0,0 +1,267 @@
+#####################################################
+#
+# Alwrity, AI Long form writer - Writing_with_Prompt_Chaining
+# and generative AI.
+#
+#####################################################
+
+import os
+import sys
+from pathlib import Path
+from dotenv import load_dotenv
+from google.api_core import retry
+import google.generativeai as genai
+from pprint import pprint
+from textwrap import dedent
+
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )
+
+from ..utils.read_main_config_params import read_return_config_section
+from ..ai_web_researcher.gpt_online_researcher import do_google_serp_search
+from ..ai_web_researcher.gpt_online_researcher import do_google_serp_search, do_metaphor_ai_research
+from ..blog_metadata.get_blog_metadata import blog_metadata
+from ..blog_postprocessing.save_blog_to_file import save_blog_to_file
+
+
+def generate_with_retry(model, prompt):
+    """
+    Generates content from the model with retry handling for errors.
+
+    Parameters:
+        model (GenerativeModel): The generative model to use for content generation.
+        prompt (str): The prompt to generate content from.
+
+    Returns:
+        The response object from model.generate_content (callers read its .text), or "" if generation fails.
+    """
+    try:
+        # FIXME: Need a progress bar here.
+        return model.generate_content(prompt, request_options={'retry':retry.Retry()})
+    except Exception as e:
+        logger.error(f"Error generating content: {e}")
+        return ""
+
+
+def long_form_generator(content_keywords):
+    """
+    Write long form content using prompt chaining and iterative generation.
+    Parameters: content_keywords - the keywords to research and write the content about.
+    """
+    # Read the main_config to define tone, character, personality of the content to be generated.
+    try:    
+        logger.info(f"Starting to write content on {content_keywords}.")
+        # Define persona and writing guidelines
+        content_tone, target_audience, content_type, content_language, output_format = read_return_config_section('blog_characteristics')
+    except Exception as err:
+        logger.error(f"Failed to Read config params from main_config: {err}")
+        return
+
+    writing_guidelines = f'''\
+    Writing Guidelines
+
+    As an expert Content writer and web researcher, demostrate your world class {content_type} content writing skills.
+    
+    Follow the below writing guidelines for writing your content:
+    1). You must write in {content_language} language.
+    2). Your content must appeal to target audience of {target_audience}.
+    3). The tone of your content must be consistent for {content_tone}, type of content.
+    4). I will provide you with web research, make use of provided context.
+    5). Always use web research content for providing citations and referances, to demostrate trust. 
+    6). Always ensure orignality and human-like content. Use simple words and ensure high readibility.
+    7). Use simple {content_language} words, to appeal to all readers.
+    7). Your content must be well formatted using {output_format} language.
+    8). Do not use words like: Unleash, ultimate, Uncover, Discover, Elevate, Revolutionizing, Unveiling, Harnessing, Dive, Delve into, Embrace.
+
+    Remember, your main goal is to write as much as you can. If you get through the content too fast, that is bad. 
+    Expand, never summarize.
+    '''
+
+    remove_ai_words = f'''\
+	    As an expert content writer and editor, I will provide you with my 'blog content' and 'Exception-list'.
+        Your task is to replace all occurances of words from 'Exception-list' from given 'blog content'.
+        Before generating any text, examine the Exception-list and avoid all cases of these words and phrases.
+        These instructions are critical and require absolute adherence!
+	
+	    \n\nException-list: ["realm", "navigating", "beacon", "bustling", "treasure trove", "landscape", "tailored", "tailor", “roadmap” , "tailoring", "delving", “streamlining” "dynamic", "robust", "stay tuned", "in conclusion", "seamless", "bustling", “not just a”, “cornerstone”, “paramount” ,“diving into”, “delve into”, “pivotal”, “navigating”,“dive deep”, journey”, “maze”, “puzzle”, “overwhelmed” 'Tapestry', 'Bustling', 'In summary', 'In conclusion', 'Unleash', 'Unveiling', 'ever-evolving', 'Remember that', 'Take a dive into', 'Navigating', 'Navigating the landscape', 'Navigating the complexities of', 'Landscape', 'The landscape of', 'Testament', 'a testament to', 'In the world of', 'Realm', 'Embark', 'virtuoso', 'Let's explore', 'symphony', 'game changing', 'ever-changing', 'Embrace', 'Embracing', 'game-changing', 'ever-evolving']
+	
+        \n\nBlog Content: '{{blog_content}}'
+    '''
+
+    # Generate prompts
+    content_title = f'''\
+    As an expert {content_language} digital content writer, specilizing in SEO writing for {target_audience}.
+    Your task is to write a title following guidelines below:
+
+    1). Write a digital content title for given keywords {content_keywords}. 
+    2). The title should appeal to audience level of {target_audience}.
+    3). Review the given web research result for {content_keywords}. Your title should compete against them.
+    4). Do not use words like: Unleash, ultimate, Uncover, Discover, Elevate, Revolutionizing, Unveiling, Harnessing, Dive, Delve into, Embrace.
+
+    Web research Result:
+
+    {{web_research_result}}
+
+    '''
+
+    content_outline = f'''\
+    As an expert {content_language} content writer & web researcher, specilizing in writing SEO optimised content.
+    I will provide you with 'title' of my content and relevant web research results.
+    Your task is write a detailed content outline for the given 'Title', based on given web research.
+
+    Your Content Title is:
+
+    {{content_title}}
+
+    Web research Result is:
+    
+    {{web_research_result}}
+
+    Write an outline for the content title using web research results.
+
+    '''
+
+    starting_prompt = f'''\
+    As an expert {content_language} content writer & web researcher, specilizing in writing SEO optimised content.
+
+    Your Content title is:
+
+    {{content_title}}
+
+    The outline of the content is:
+
+    {{content_outline}}
+
+    First, silently review the outline and the content title. Consider how to start writing your content.
+    Start to write the very beginning of the content. You are not expected to finish the whole content now. 
+    Your writing should be detailed enough that you are only scratching the surface of the first bullet of your outline. 
+    Try to write AT MINIMUM 600 WORDS.
+    Pay special attention to orignality, formatting and readibility of your content.
+
+    {writing_guidelines}
+    '''
+
+    continuation_prompt = f'''\
+    As an expert {content_language} content writer & web researcher, specilizing in writing SEO optimised content.
+
+    Your Content title is:
+
+    {{content_title}}
+
+    The outline of the content is:
+
+    {{content_outline}}
+
+    Relevant web research results:
+
+    {{web_research_result}}
+
+    ============\n
+
+    You've begun to write the essay and continue to do so.
+    Here's what you've written so far:
+
+    {{content_text}}
+
+    =====
+
+    First, silently review the outline and essay so far. 
+    Identify what the single next part of your outline you should write.
+
+    Your task is to continue where you left off and write the next part of the Essay.
+    You are not expected to finish the whole essay now. Your writing should be
+    detailed enough that you are only scratching the surface of the next part of
+    your outline. Try to write AT MINIMUM 600 WORDS. However, only once the essay
+    is COMPLETELY finished, write IAMDONE. Remember, do NOT write a whole chapter
+    right now.
+
+    {writing_guidelines}
+    '''
+
+    # Configure generative AI
+    load_dotenv(Path('../.env'))
+    genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
+    # Initialize the generative model
+    model = genai.GenerativeModel('gemini-pro')
+
+    # Do SERP web research for given keywords to generate title and outline.
+    web_research_result, g_titles = do_google_serp_search(content_keywords)
+
+    # Generate the content title from the title prompt and the SERP research result.
+    try:
+        content_title = generate_with_retry(model, content_title.format(web_research_result=web_research_result)).text
+        logger.info(f"The title of the content is: {content_title}")
+    except Exception as err:
+        logger.error(f"Content title Generation Error: {err}")
+        return
+
+    try:
+        content_outline = generate_with_retry(model, 
+                        content_outline.format(content_title=content_title, web_research_result=web_research_result)).text
+        logger.info(f"The content Outline is: {content_outline}\n\n")
+    except Exception as err:
+        logger.error(f"Failed to generate content outline: {err}")
+
+    try:
+        starting_draft = generate_with_retry(model, 
+                starting_prompt.format(content_title=content_title, content_outline=content_outline)).text
+    except Exception as err:
+        logger.error(f"Failed to Generate Starting draft: {err}")
+        return
+
+    try:
+        draft = starting_draft
+        continuation = generate_with_retry(model, 
+                continuation_prompt.format(content_title=content_title, 
+                            content_outline=content_outline, content_text=draft, web_research_result=web_research_result)).text
+    except Exception as err:
+        logger.error(f"Failed to write the initial draft: {err}")
+
+    # Add the continuation to the initial draft, keep building the story until we see 'IAMDONE'
+    try:
+        draft += '\n\n' + continuation
+    except Exception as err:
+        logger.error(f"Failed as: {err} and {continuation}")
+
+    try:
+        # Do Metaphor/Exa AI search.
+        web_research_result, m_titles = do_metaphor_ai_research(content_keywords)
+    except Exception as err:
+        logger.error(f"Failed to do Metaphor AI search: {err}")
+        return
+
+    
+    while 'IAMDONE' not in continuation:
+        try:
+            continuation = generate_with_retry(model, 
+                    continuation_prompt.format(content_title=content_title,
+                            content_outline=content_outline, content_text=draft, web_research_result=web_research_result)).text
+            draft += '\n\n' + continuation
+        except Exception as err:
+            print(f"Failed to continually write the Essay: {err}")
+            return
+
+    # Remove 'IAMDONE' and print the final story
+    final = draft.replace('IAMDONE', '').strip()
+    print(final)
+
+    blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(final,
+            content_keywords, m_titles)
+
+    generated_image_filepath = None
+    # TBD: Save the blog content as a .md file. Markdown or HTML ?
+    save_blog_to_file(final, blog_title, blog_meta_desc, blog_tags, blog_categories, generated_image_filepath)
+
+    blog_frontmatter = dedent(f"""\n\n\n\
+                ---
+                title: {blog_title}
+                categories: [{blog_categories}]
+                tags: [{blog_tags}]
+                Meta description: {blog_meta_desc.replace(":", "-")}
+                ---\n\n""")
+    logger.info(f"\n{blog_frontmatter}{final}\n\n")
+    logger.info(f"\n\n ################ Finished writing Blog for : {content_keywords} #################### \n")
--- a/lib/blog_postprocessing/save_blog_to_file.py
+++ b/lib/blog_postprocessing/save_blog_to_file.py
@@ -7,8 +7,6 @@ from dateutil.relativedelta import relativedelta
 from textwrap import dedent
 import logging
 from zoneinfo import ZoneInfo
-import nltk
-from nltk.corpus import stopwords
 from loguru import logger
 logger.remove()
 logger.add(sys.stdout,
@@ -16,13 +14,6 @@ logger.add(sys.stdout,
        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
    )

-# fixme: Remove the hardcoding, need add another option OR in config ?
-image_dir = "blog_images"
-image_dir = os.path.join(os.getcwd(), image_dir)
-# TBD: This can come from config file.
-output_path = "blogs"
-output_path = os.path.join(os.getcwd(), output_path)
-

 def random_date_last_three_months():
    current_date = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
@@ -63,10 +54,11 @@ def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_
    blog_title_md = re.sub('[^A-Za-z0-9-]', '', blog_title_md)
    # Replace multiple consecutive dashes with a single dash
    blog_title_md = re.sub('-+', '-', blog_title_md)
-    blog_title_md = remove_stop_words(blog_title_md)
+    #blog_title_md = remove_stop_words(blog_title_md)
    logger.debug(f"Blog Title is: {blog_title_md}")

    # Check if output path exists
+    output_path = os.getenv('CONTENT_SAVE_DIR')
    if not os.path.exists(output_path):
        logger.error(f"Error: Blog output directory is set to {output_path}, which does not exist.")
        raise FileNotFoundError(f"Output directory does not exist: {output_path}")
@@ -116,20 +108,3 @@ def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_
            raise Exception(f"Failed to write blog content: {e}")

        logger.info(f"Successfully saved and posted blog at: {blog_output_path}")
-
-
-# Helper function
-def remove_stop_words(sentence):
-    """
-    Removes stop words from a given sentence.
-
-    Args:
-        sentence (str): The sentence from which to remove stop words.
-
-    Returns:
-        str: The sentence after removing stop words.
-    """
-    words = nltk.word_tokenize(sentence)
-    stop_words = set(stopwords.words('english'))
-    filtered_words = [word for word in words if word.lower() not in stop_words]
-    return ' '.join(filtered_words)
--- a/lib/gpt_providers/text_generation/main_text_generation.py
+++ b/lib/gpt_providers/text_generation/main_text_generation.py
@@ -14,6 +14,7 @@ logger.add(sys.stdout,

 from .openai_text_gen import openai_chatgpt
 from .gemini_pro_text import gemini_text_response
+from ...utils.read_main_config_params import read_return_config_section


 def llm_text_gen(prompt):
@@ -25,10 +26,9 @@ def llm_text_gen(prompt):
        str: Generated text based on the prompt.
    """
    try:
-        config_path = Path(__file__).resolve().parents[3] / "main_config"
-        gpt_provider, model, temperature, max_tokens, top_p, n, fp = read_llm_parameters(config_path)
+        gpt_provider, model, temperature, max_tokens, top_p, n, fp = read_return_config_section('llm_config')

-        gpt_provider = check_gpt_provider(gpt_provider)
+        #gpt_provider = check_gpt_provider(gpt_provider)
        # Check if API key is provided for the given gpt_provider
        get_api_key(gpt_provider)

@@ -101,43 +101,3 @@ def get_api_key(gpt_provider):

    logger.info(f"Using API key for {gpt_provider}")
    return api_key
-
-
-
-def read_llm_parameters(config_path: str) -> tuple:
-    """
-    Read Language Model (LLM) parameters from the configuration file.
-
-    Args:
-        config_path (str): The path to the configuration file.
-
-    Returns:
-        tuple: A tuple containing the LLM parameters (gpt_provider, model, temperature, max_tokens, top_p, n, frequency_penalty).
-
-    Raises:
-        FileNotFoundError: If the configuration file is not found.
-        configparser.Error: If there is an error parsing the configuration file.
-    """
-    try:
-        config = configparser.ConfigParser()
-        config.read(config_path, encoding="utf-8")
-
-        gpt_provider = config.get('llm_options', 'gpt_provider')
-        model = config.get('llm_options', 'model')
-        temperature = config.getfloat('llm_options', 'temperature')
-        max_tokens = config.getint('llm_options', 'max_tokens')
-        top_p = config.getfloat('llm_options', 'top_p')
-        n = config.getint('llm_options', 'n')
-        frequency_penalty = config.getfloat('llm_options', 'frequency_penalty')
-
-        return gpt_provider, model, temperature, max_tokens, top_p, n, frequency_penalty
-
-    except FileNotFoundError:
-        logger.error(f"Configuration file not found: {config_path}")
-        raise
-    except configparser.Error as err:
-        logger.error(f"Error reading LLM parameters from config file: {err}")
-        raise
-    except Exception as err:
-        logger.error(f"An unexpected error occurred: {err}")
-        raise
--- a/lib/utils/alwrity_utils.py
+++ b/lib/utils/alwrity_utils.py
@@ -18,6 +18,8 @@ from lib.ai_writers.speech_to_blog.main_audio_to_blog import generate_audio_blog
 from lib.gpt_providers.text_generation.ai_story_writer import ai_story_generator
 from lib.gpt_providers.text_generation.ai_essay_writer import ai_essay_generator
 from lib.gpt_providers.text_to_image_generation.main_generate_image_from_prompt import generate_image
+from lib.ai_writers.long_form_ai_writer import long_form_generator
+


 def blog_from_audio():
@@ -63,12 +65,27 @@ def blog_from_keyword():
                    title='Error',
                    text='🚫 Blog keywords should be at least two words long. Please try again.'
                ).run()
-    if blog_keywords:
+    choice = radiolist_dialog(
+        title="Select content type:",
+        values=[
+            ("normal", "Normal-length content"),
+            ("long", "Long-form content")
+        ],
+        default="normal"
+    ).run()
+
+    if choice == "normal":
        try:
            write_blog_from_keywords(blog_keywords)
        except Exception as err:
            print(f"Failed to write blog on {blog_keywords}, Error: {err}\n")
            exit(1)
+    elif choice == "long":
+        try:
+            long_form_generator(blog_keywords)
+        except Exception as err:
+            print(f"Failed to write blog on {blog_keywords}, Error: {err}\n")
+            exit(1)


 def do_web_research():
--- a/lib/utils/read_main_config_params.py
+++ b/lib/utils/read_main_config_params.py
@@ -0,0 +1,63 @@
+#
+# Common utils for lib
+#
+import os
+import sys
+import configparser
+from pathlib import Path
+from loguru import logger
+logger.remove()
+logger.add(sys.stdout,
+        colorize=True,
+        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
+    )
+
+
+def read_return_config_section(config_section):
+    """ read_return_config_section
+    Read the values of one section of the main_config file.
+
+    Args:
+        config_section (str): A name containing 'llm_config' or 'blog_characteristics'; any other value returns None.
+
+    Returns:
+        tuple: (gpt_provider, model, temperature, max_tokens, top_p, n, frequency_penalty) for 'llm_config', or (blog_tone, blog_demographic, blog_type, blog_language, blog_output_format) for 'blog_characteristics'.
+
+    Raises:
+        FileNotFoundError: Logged and re-raised if encountered (note: ConfigParser.read itself skips missing files).
+        configparser.Error: If a requested section or option is missing or cannot be parsed.
+    """
+    try:
+        config_path = Path(__file__).resolve().parents[2] / "main_config"
+        config = configparser.ConfigParser()
+        config.read(config_path, encoding="utf-8")
+        
+        if 'llm_config' in config_section:
+	        gpt_provider = config.get('llm_options', 'gpt_provider')
+	        model = config.get('llm_options', 'model')
+	        temperature = config.getfloat('llm_options', 'temperature')
+	        max_tokens = config.getint('llm_options', 'max_tokens')
+	        top_p = config.getfloat('llm_options', 'top_p')
+	        n = config.getint('llm_options', 'n')
+	        frequency_penalty = config.getfloat('llm_options', 'frequency_penalty')
+	
+	        return gpt_provider, model, temperature, max_tokens, top_p, n, frequency_penalty
+        elif 'blog_characteristics' in config_section:
+            # Access and return the specified config values
+            blog_tone = config.get('blog_characteristics', 'blog_tone')
+            blog_demographic = config.get('blog_characteristics', 'blog_demographic')
+            blog_type = config.get('blog_characteristics', 'blog_type')
+            blog_language = config.get('blog_characteristics', 'blog_language')
+            blog_output_format = config.get('blog_characteristics', 'blog_output_format')
+
+            return blog_tone, blog_demographic, blog_type, blog_language, blog_output_format
+
+    except FileNotFoundError:
+        logger.error(f"Configuration file not found: {config_path}")
+        raise
+    except configparser.Error as err:
+        logger.error(f"Error reading LLM parameters from config file: {err}")
+        raise
+    except Exception as err:
+        logger.error(f"An unexpected error occurred: {err}")
+        raise
--- a/8
+++ b/8
@@ -11,6 +11,8 @@
 # Length of blogs Or word count. Note: It wont be exact and depends on GPT providers and Max token count.
 blog_length = 3000

+# company/brand-name
+
 # professional, how-to, begginer, research, programming, casual, etc
 blog_tone = "professional"

@@ -26,12 +28,6 @@ blog_language = "English"
 # Specify the output format of the blog as: HTML, markdown, plaintext. Defaults to markdown.
 blog_output_format = "markdown"

-# Specify full path to folder where the final blog should be stored. ex: _posts
-blog_output_folder = ""
-
-# Specify full path to folder where blog images will be stored. ex: assets
-blog_image_output_folder = ""
-


 ############################################################