WIP - Streamlit UI, Porting CLI

2024-06-02 23:05:27 +05:30
parent ae8c9d0ac3
commit 2a1bb49020
11 changed files with 645 additions and 884 deletions
--- a/lib/ai_web_researcher/common_utils.py
+++ b/lib/ai_web_researcher/common_utils.py
@@ -2,7 +2,8 @@
 import os
 import sys
 import re
-import configparser
+import json
+from pathlib import Path
 import streamlit as st
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -16,33 +17,33 @@ logger.add(sys.stdout,

 def cfg_search_param(flag):
    """
-    Read values from the main_config file and return them as variables and a dictionary.
+    Read values from the main_config.json file and return them as variables and a dictionary.

    Args:
-        file_path (str): The path to the main_config file.
+        flag (str): A flag to determine which configuration values to return.

    Returns:
-        dict: A dictionary containing the values read from the config file.
-        str: The geographic location value.
-        str: The search language value.
-        int: The number of search results to fetch.
+        various: The values read from the config file based on the flag.
    """
    try:
-        file_path = Path(__file__).resolve().parents[2] / "main_config"
+        file_path = Path(os.environ.get("ALWRITY_CONFIG", ""))
+        if not file_path.is_file():
+            raise FileNotFoundError(f"Configuration file not found: {file_path}")
        logger.info(f"Reading search config params from {file_path}")
-        config = configparser.ConfigParser()
-        config.read(file_path, encoding="utf-8")
-        web_research_section = config["web_research"]
+        
+        with open(file_path, 'r', encoding='utf-8') as file:
+            config = json.load(file)
+        web_research_section = config["Search Engine Parameters"]

        if 'serperdev' in flag:
            # Get values as variables
-            geo_location = web_research_section.get("geo_location")
-            search_language = web_research_section.get("search_language")
-            num_results = web_research_section.getint("num_results")
+            geo_location = web_research_section.get("Geographic Location")
+            search_language = web_research_section.get("Search Language")
+            num_results = web_research_section.get("Number of Results")
            return geo_location, search_language, num_results

        elif 'tavily' in flag:
-            include_urls = web_research_section.get("include_domains")
+            include_urls = web_research_section.get("Include Domains")
            pattern = re.compile(r"^(https?://[^\s,]+)(,\s*https?://[^\s,]+)*$")
            if pattern.match(include_urls):
                include_urls = [url.strip() for url in include_urls.split(',')]
@@ -51,7 +52,7 @@ def cfg_search_param(flag):
            return include_urls

        elif 'exa' in flag:
-            include_urls = web_research_section.get("include_domains")
+            include_urls = web_research_section.get("Include Domains")
            pattern = re.compile(r"^(https?://\w+)(,\s*https?://\w+)*$")
            if pattern.match(include_urls) is not None:
                include_urls = include_urls.split(',')
@@ -60,9 +61,9 @@ def cfg_search_param(flag):
            else:
                include_urls = None

-            num_results = web_research_section.getint("num_results")
-            similar_url = web_research_section.get("similar_url")
-            time_range = web_research_section.get("time_range")
+            num_results = web_research_section.get("Number of Results")
+            similar_url = web_research_section.get("Similar URL")
+            time_range = web_research_section.get("Time Range")
            if time_range == "past day":
                start_published_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
            elif time_range == "past week":
@@ -86,7 +87,6 @@ def cfg_search_param(flag):
        logger.error(f"Error: Invalid value in config file: {e}")
        return {}, None, None, None

-
 def save_in_file(table_content):
    """ Helper function to save search analysis in a file. """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
--- a/lib/ai_writers/blog_from_google_serp.py
+++ b/lib/ai_writers/blog_from_google_serp.py
@@ -1,7 +1,7 @@
 import os
 import sys
 import json
-import configparser
+from pathlib import Path

 from loguru import logger
 logger.remove()
@@ -13,26 +13,27 @@ logger.add(sys.stdout,
 from ..gpt_providers.text_generation.main_text_generation import llm_text_gen


-# FIXME: Provide num_blogs, num_faqs as inputs.
 def write_blog_google_serp(search_keyword, search_results):
-    """Combine the given online research and gpt blog content"""
+    """Combine the given online research and GPT blog content"""
    try:
-        config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'main_config'))
-        config = configparser.ConfigParser()
-        config.read(config_path, encoding='utf-8')
+        config_path = Path(os.environ["ALWRITY_CONFIG"])
+        with open(config_path, 'r', encoding='utf-8') as file:
+            config = json.load(file)
    except Exception as err:
-        print(f"Error: Failed to read values from config: {err}")
+        logger.error(f"Error: Failed to read values from config: {err}")
        exit(1)

+    blog_characteristics = config['Blog Content Characteristics']
+    
    prompt = f"""
        As expert Creative Content writer,
-        I want you to write {config.get('blog_characteristics', 'blog_type')} blog post,
+        I want you to write {blog_characteristics['Blog Type']} blog post,
        that explores {search_keyword} and also include 5 FAQs.
        
        Below are the guidelines to follow:
-        1). You must respond in {config.get('blog_characteristics', 'blog_language')} language.
-        2). Tone and Brand Alignment: Adjust your tone, voice, personality for {config.get('blog_characteristics', 'blog_tone')} audience.
-        3). Make sure your response content length is of {config.get('blog_characteristics', 'blog_length')} words.
+        1). You must respond in {blog_characteristics['Blog Language']} language.
+        2). Tone and Brand Alignment: Adjust your tone, voice, personality for {blog_characteristics['Blog Tone']} audience.
+        3). Make sure your response content length is of {blog_characteristics['Blog Length']} words.
        4). Include FAQs from 'People also Ask' section of provided context 'google search result'.

        I want the post to offer unique insights, relatable examples, and a fresh perspective on the topic.
@@ -40,7 +41,9 @@ def write_blog_google_serp(search_keyword, search_results):
        \n\n
        \"\"\"{search_results}\"\"\"
        """
+    
    logger.info("Generating blog and FAQs from Google web search results.")
+    
    try:
        response = llm_text_gen(prompt)
        return response
@@ -48,7 +51,6 @@ def write_blog_google_serp(search_keyword, search_results):
        logger.error(f"Exit: Failed to get response from LLM: {err}")
        exit(1)

-
 def improve_blog_intro(blog_content, blog_intro):
    """Combine the given online research and gpt blog content"""
    prompt = f"""
--- a/lib/ai_writers/keywords_to_blog_streamlit.py
+++ b/lib/ai_writers/keywords_to_blog_streamlit.py
@@ -44,9 +44,11 @@ def write_blog_from_keywords(search_keywords, url=None):

            status.update(label=f"🛀 Starting Tavily AI research: {search_keywords}")
            tavily_search_result, t_titles, t_answer = do_tavily_ai_search(search_keywords)
-            status.update(label=f"🙆 Finished Google Search & Tavily AI Search on: {search_keywords}", expanded=False)
+            status.update(label=f"🙆 Finished Google Search & Tavily AI Search on: {search_keywords}", 
+                    state="complete", expanded=False)

        except Exception as err:
+            st.error(f"Failed in web research: {err}")
            logger.error(f"Failed in web research: {err}")

    with st.status("Started Writing blog from google search..", expanded=True) as status:
@@ -56,12 +58,9 @@ def write_blog_from_keywords(search_keywords, url=None):
            status.update(label=f"🛀 Writing blog from Google Search on: {search_keywords}")
            blog_markdown_str = write_blog_google_serp(search_keywords, google_search_result)
            st.markdown(blog_markdown_str)
-
-            # Hate the robotic introductions.
-            #blog_markdown_str = improve_blog_intro(blog_markdown_str, t_answer)
-            #st.markdown(blog_markdown_str)
-            status.update(label="🙎 Draft 1: Your Content from Google search result.", expanded=False)
+            status.update(label="🙎 Draft 1: Your Content from Google search result.", state="complete", expanded=False)
        except Exception as err:
+            st.error(f"Failed in Google web research: {err}")
            logger.error(f"Failed in Google web research: {err}")

    # logger.info/check the final blog content.
--- a/lib/ai_writers/long_form_ai_writer.py
+++ b/lib/ai_writers/long_form_ai_writer.py
@@ -60,22 +60,26 @@ def long_form_generator(content_keywords):
    """
    with st.status("Start Writing Long Form Article, Hold my Beer..", expanded=True) as status:
        # Read the main_config to define tone, character, personality of the content to be generated.
-        try:    
+        try:
+            status.update(label=f"Starting to write content on {content_keywords}.")
            logger.info(f"Starting to write content on {content_keywords}.")
            # Define persona and writing guidelines
-            content_tone, target_audience, content_type, content_language, output_format = read_return_config_section('blog_characteristics')
+            content_tone, target_audience, content_type, content_language, output_format, content_length = read_return_config_section('blog_characteristics')
        except Exception as err:
            logger.error(f"Failed to Read config params from main_config: {err}")
-            return
+            st.error(f"Failed to Read config params from main_config: {err}")
+            return False
    
        try:
            filepath = os.path.join(os.environ["PROMPTS_DIR"], "long_form_ai_writer.prompts")
+            status.update(label=f"Reading Prompts from {filepath}.")
            # Check if file exists
            if not os.path.exists(filepath):
                raise FileNotFoundError(f"File {filepath} does not exist")
            with open(filepath, 'r') as file:
                prompts = yaml.safe_load(file)
        except Exception as err:
+            st.error(f"Exit: Failed to read prompts from {filepath}: {err}")
            logger.error(f"Exit: Failed to read prompts from {filepath}: {err}")
            exit(1)
    
@@ -147,11 +151,12 @@ def long_form_generator(content_keywords):
                content_title=content_title, 
                web_research_result=web_research_result)).text
            logger.info(f"The content Outline is: {content_outline}\n\n")
-            status.update(label="Generated the content outline.")
+            status.update(label=f"Completed with Content Outline.")
        except Exception as err:
            logger.error(f"Failed to generate content outline: {err}")
    
        try:
+            status.update(label=f"Do web research with Tavily to provide context for content creation.")
            logger.info("Do web research with Tavily to provide context for content creation.")
            # Do Metaphor/Exa AI search.
            table_data = []
@@ -163,6 +168,7 @@ def long_form_generator(content_keywords):
            web_research_result = table_data
        except Exception as err:
            logger.error(f"Failed to do Tavily AI search: {err}")
+            st.error(f"Failed to do Tavily AI search: {err}")
            return
    
        try:
@@ -172,6 +178,7 @@ def long_form_generator(content_keywords):
                    web_research_result=web_research_result,
                    writing_guidelines=writing_guidelines)).text
        except Exception as err:
+            st.error(f"Failed to Generate Starting draft: {err}")
            logger.error(f"Failed to Generate Starting draft: {err}")
            return
        
@@ -194,10 +201,11 @@ def long_form_generator(content_keywords):
            logger.error(f"Failed as: {err} and {continuation}")
    
        logger.info(f"Writing in progress... Current draft length: {len(draft)} characters")
+        status.update(label=f"Writing in progress... Current draft length: {len(draft)} characters")
        search_terms = f"""
            I will provide you with blog outline, your task is to read the outline & return 3 google search keywords.
            Your response will be used to do web research for writing on the given outline.
-            Do not explain your response, provide 3 google search sentences encompassing the given content outline.
+            Do not explain your response, provide 8 google search sentences encompassing the given content outline.
            Provide the search term results as comma separated values.\n\n
            Content Outline:\n
            '{content_outline}'
@@ -227,26 +235,36 @@ def long_form_generator(content_keywords):
                # At this point, the context is little stale. We should more web research on
                # related queries as per the content outline, to augment the LLM context.
            except Exception as err:
+                st.error(f"Failed to continually write the Essay: {err}")
                logger.error(f"Failed to continually write the Essay: {err}")
                return
        
        # Remove 'IAMDONE' and print the final story
        final = draft.replace('IAMDONE', '').strip()
-    
-        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(final,
-                content_keywords, m_titles)
-    
-        generated_image_filepath = None
-        # TBD: Save the blog content as a .md file. Markdown or HTML ?
-        save_blog_to_file(final, blog_title, blog_meta_desc, blog_tags, blog_categories, generated_image_filepath)
-    
-        blog_frontmatter = f"""
-        ---
-        title: {blog_title}
-        categories: [{blog_categories}]
-        tags: [{blog_tags}]
-        Meta description: {blog_meta_desc.replace(":", "-")}
-        ---"""
-        logger.info(f"\n{blog_frontmatter}{final}\n\n")
-        st.write(f"\n{blog_frontmatter}{final}\n\n")
+        status.update(label="Success: Finished writing Long form content.")
+
+        # FIXME: The current implementation is suited to normal-length content.
+        # For long-form content, sending the whole draft for each piece of metadata is expensive.
+#        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(final,
+#                content_keywords, m_titles)
+#        status.update(label="Success: Finished with Title, Meta Description, Tags, categories")
+#        generated_image_filepath = None
+#        # TBD: Save the blog content as a .md file. Markdown or HTML ?
+#        save_blog_to_file(final, blog_title, blog_meta_desc, blog_tags, blog_categories, generated_image_filepath)
+#    
+#        blog_frontmatter = dedent(f"""
+#        \n---------------------------------------------------------------------
+#        title: {blog_title.strip()}\n
+#        categories: [{blog_categories.strip()}]\n
+#        tags: [{blog_tags.strip()}]\n
+#        Meta description: {blog_meta_desc.replace(":", "-").strip()}\n
+#        ---------------------------------------------------------------------\n
+#        """)
+#
+#        logger.info(f"\n{blog_frontmatter}{final}\n\n")
+#        st.markdown(f"\n{blog_frontmatter}{final}\n\n")
+        logger.info(f"\n{final}\n\n")
+
        logger.info(f"\n\n ################ Finished writing Blog for : {content_keywords} #################### \n")
+    with st.expander("**Click to View the final content draft:**"):
+        st.markdown(f"\n{final}\n\n")
--- a/lib/gpt_providers/text_generation/openai_text_gen.py
+++ b/lib/gpt_providers/text_generation/openai_text_gen.py
@@ -1,12 +1,10 @@
 import os
 import time #IWish
-import logging
 import openai
-import configparser

 # Configure standard logging
+import logging
 logging.basicConfig(level=logging.INFO, format='[%(asctime)s-%(levelname)s-%(module)s-%(lineno)d]- %(message)s')
-
 logger = logging.getLogger(__name__)
 from tenacity import (
    retry,
--- a/lib/utils/read_main_config_params.py
+++ b/lib/utils/read_main_config_params.py
@@ -1,73 +1,67 @@
-#
-# Common utils for lib
-#
 import os
-import sys
-import configparser
+import json
 from pathlib import Path
-from loguru import logger
-logger.remove()
-logger.add(sys.stdout,
-        colorize=True,
-        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
-    )
-

 def read_return_config_section(config_section):
    """ read_return_config_section
-    Read Language Model (LLM) parameters from the configuration file.
+    Read configuration parameters from the JSON configuration file.

    Args:
-        config_path (str): The path to the configuration file.
+        config_section (str): The section of the configuration file to read.

    Returns:
-        tuple: A tuple containing the LLM parameters (gpt_provider, model, temperature, max_tokens, top_p, n, frequency_penalty).
+        tuple: A tuple containing the specified configuration parameters.

    Raises:
        FileNotFoundError: If the configuration file is not found.
-        configparser.Error: If there is an error parsing the configuration file.
+        json.JSONDecodeError: If there is an error parsing the JSON configuration file.
    """
    try:
-        config_path = Path(__file__).resolve().parents[2] / "main_config"
-        config = configparser.ConfigParser()
-        config.read(config_path, encoding="utf-8")
+        config_path = Path(os.environ["ALWRITY_CONFIG"])
        
-        if 'llm_config' in config_section:
-	        gpt_provider = config.get('llm_options', 'gpt_provider')
-	        model = config.get('llm_options', 'model')
-	        temperature = config.getfloat('llm_options', 'temperature')
-	        max_tokens = config.getint('llm_options', 'max_tokens')
-	        top_p = config.getfloat('llm_options', 'top_p')
-	        n = config.getint('llm_options', 'n')
-	        frequency_penalty = config.getfloat('llm_options', 'frequency_penalty')
-	
-	        return gpt_provider, model, temperature, max_tokens, top_p, n, frequency_penalty
-        elif 'blog_characteristics' in config_section:
-            # Access and return the specified config values
-            blog_tone = config.get('blog_characteristics', 'blog_tone')
-            blog_demographic = config.get('blog_characteristics', 'blog_demographic')
-            blog_type = config.get('blog_characteristics', 'blog_type')
-            blog_language = config.get('blog_characteristics', 'blog_language')
-            blog_output_format = config.get('blog_characteristics', 'blog_output_format')
+        with open(config_path, 'r', encoding="utf-8") as file:
+            config = json.load(file)
+        
+        if config_section == 'llm_config':
+            gpt_provider = config['LLM Options']['GPT Provider']
+            model = config['LLM Options']['Model']
+            temperature = config['LLM Options']['Temperature']
+            max_tokens = config['LLM Options']['Max Tokens']
+            top_p = config['LLM Options']['Top-p']
+            n = config['LLM Options']['N']
+            frequency_penalty = config['LLM Options']['Frequency Penalty']
+            presence_penalty = config['LLM Options']['Presence Penalty']
+            
+            return gpt_provider, model, temperature, max_tokens, top_p, n, frequency_penalty

-            return blog_tone, blog_demographic, blog_type, blog_language, blog_output_format
+        elif config_section == 'blog_characteristics':
+            blog_tone = config['Blog Content Characteristics']['Blog Tone']
+            blog_demographic = config['Blog Content Characteristics']['Blog Demographic']
+            blog_type = config['Blog Content Characteristics']['Blog Type']
+            blog_language = config['Blog Content Characteristics']['Blog Language']
+            blog_output_format = config['Blog Content Characteristics']['Blog Output Format']
+            blog_length = config['Blog Content Characteristics']['Blog Length']

-        elif 'web_research' in config_section:
-            # Access the config file and return the specified values
-            geo_location = config.get('web_research', 'geo_location')
-            search_language = config.get('web_research', 'search_language')
-            num_results = config.getint('web_research', 'num_results')
-            time_range = config.get('web_research', 'time_range')
-            include_domains = config.get('web_research', 'include_domains')
-            similar_url = config.get('web_research', 'similar_url')
+            return blog_tone, blog_demographic, blog_type, blog_language, blog_output_format, blog_length
+
+        elif config_section == 'web_research':
+            geo_location = config['Search Engine Parameters']['Geographic Location']
+            search_language = config['Search Engine Parameters']['Search Language']
+            num_results = config['Search Engine Parameters']['Number of Results']
+            time_range = config['Search Engine Parameters']['Time Range']
+            include_domains = config['Search Engine Parameters']['Include Domains']
+            similar_url = config['Search Engine Parameters']['Similar URL']

            return geo_location, search_language, num_results, time_range, include_domains, similar_url

    except FileNotFoundError:
        logger.error(f"Configuration file not found: {config_path}")
        raise
-    except configparser.Error as err:
-        logger.error(f"Error reading LLM parameters from config file: {err}")
+    except json.JSONDecodeError as err:
+        logger.error(f"Error reading parameters from config file: {err}")
+        raise
+    except KeyError as err:
+        logger.error(f"Missing key in the configuration file: {err}")
        raise
    except Exception as err:
        logger.error(f"An unexpected error occurred: {err}")
--- a/lib/workspace/alwrity_config/main_config.json
+++ b/lib/workspace/alwrity_config/main_config.json
@@ -0,0 +1,32 @@
+{
+    "Blog Content Characteristics": {
+        "Blog Length": "2000",
+        "Blog Tone": "Beginner",
+        "Blog Demographic": "Tech-savvy",
+        "Blog Type": "Informational",
+        "Blog Language": "English",
+        "Blog Output Format": "markdown"
+    },
+    "Blog Images Details": {
+        "Image Generation Model": "stable-diffusion",
+        "Number of Blog Images": 1
+    },
+    "LLM Options": {
+        "GPT Provider": "google",
+        "Model": "gemini-1.5-flash-latest",
+        "Temperature": 0.7,
+        "Top-p": 0.9,
+        "Max Tokens": 4000,
+        "N": 1,
+        "Frequency Penalty": 1.0,
+        "Presence Penalty": 1.0
+    },
+    "Search Engine Parameters": {
+        "Geographic Location": "us",
+        "Search Language": "en",
+        "Number of Results": 10,
+        "Time Range": "anytime",
+        "Include Domains": "",
+        "Similar URL": ""
+    }
+}