Keyword, Audio to Blog - WIP

2024-04-09 18:07:06 +05:30
parent d968e06a9d
commit c30adb3716
10 changed files with 231 additions and 128 deletions
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ To start using this tool, simply follow one of the options below:
 - Open PowerShell or Windows Terminal: Press `Windows Key + X`, then select "Windows Terminal".
 - Paste or type and press enter:⏎.⏎.<br> 
-`winget install --id Git.Git -e --source winget`
+winget install --id Git.Git -e --source winget
 - Wait for download bars to finish
 *Note for Linux Users:* If you're on Linux and can't install these, get lost 🧙♂️
@@ -43,11 +43,11 @@ To clone the repository to your local machine, perform the following steps:
 2. **Navigate to the Desired Directory:** Use the `cd` command to move to the directory where you want to clone the repository. 
 3. **Clone the Repository:** Run the following command in PowerShell to clone the repository:
-`git clone https://github.com/AJaySi/AI-Blog-Writer.git`
+git clone https://github.com/AJaySi/AI-Blog-Writer.git
 This command will download all the files from the repository to your local machine.
 4. **Verify the Clone:** After the cloning process is complete, navigate into the newly created directory using: 
-`cd AI-Blog-Writer`
+cd AI-Blog-Writer
 ```
 Once you've cloned the repository, you can proceed with the next steps for installation and setup.
--- a/alwrity.py
+++ b/alwrity.py
@@ -1,5 +1,6 @@
 import os
 from pathlib import Path
 import configparser
 import typer
 from prompt_toolkit.shortcuts import checkboxlist_dialog, message_dialog, input_dialog
@@ -268,7 +269,7 @@ def do_web_research():
        while True:
            print("________________________________________________________________")
            search_keywords = input_dialog(
-                    title='Enter Search Keywords below:',
+                    title='Enter Search Keywords below: More Options in main_config.',
                    text='👋 Enter keywords for web research (Or keywords from your blog):',
                ).run()
            if search_keywords and len(search_keywords.split()) >= 2:
@@ -278,34 +279,15 @@ def do_web_research():
                    title='Warning',
                    text='🚫 Search keywords should be at least three words long. Please try again.'
                ).run()
    selected_time_range = prompt_for_time_range()
    # Display input dialog for similar search URL (optional)
    similar_url = input_dialog(
        title="Enter a similar search URL",
        text="👋 Enter a similar search URL (Optional: Enter to skip):\n🙋Usecases: Competitor Analysis Tool. 📡Discover similar companies, startups and technologies.",
        default="",
    ).run()
    # Display input dialog for included URLs (optional)
    include_urls = input_dialog(
        title="Enter URLs to include in the web search:",
        text="👋 Enter comma-separated URLs to include in web research (press Enter to skip):\n🙋 If you wish to [bold]confine search[/bold] to certain domains like wikipedia etc.",
        default="",
    ).run()
    try:
        print(f"🚀🎬🚀 [bold green]Starting web research on given keywords: {search_keywords}..")
-        #print(f"Web Research: Time Range - {time_range}, Search Keywords - {search_keywords}, Include URLs - {include_urls}")
+        web_research_result = gpt_web_researcher(search_keywords)
        web_research_result = gpt_web_researcher(search_keywords,
                time_range=selected_time_range,
                include_domains=include_urls,
                similar_url=similar_url)
    except Exception as err:
        print(f"\n💥🤯 [bold red]ERROR 🤯 : Failed to do web research: {err}\n")
 def check_llm_environs():
    """ Function to check which LLM api is given. """
    # Check if GPT_PROVIDER is defined in .env file
--- a/lib/ai_web_researcher/common_utils.py
+++ b/lib/ai_web_researcher/common_utils.py
@@ -0,0 +1,101 @@
 # Common utils for web_researcher
 import os
 import sys
 import re
 import configparser
 from datetime import datetime, timedelta
 from pathlib import Path
 from loguru import logger
 logger.remove()
 logger.add(sys.stdout,
        colorize=True,
        format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
    )
 # Matches one full http(s) URL that contains no whitespace or commas.
 _URL_PATTERN = re.compile(r"^https?://[^\s,]+$")

 # Supported `time_range` values mapped to the number of days to look back.
 _TIME_RANGE_DAYS = {
    "past day": 1,
    "past week": 7,
    "past month": 30,
    "past year": 365,
 }


 def _parse_include_domains(raw_value):
    """ Turn the comma-separated `include_domains` option into a list of URLs.

    Returns None when the option is empty/missing, or when any entry is not a
    full http(s) URL, so that callers can treat None as "do not restrict".
    """
    if not raw_value:
        return None
    urls = [item.strip() for item in raw_value.split(",") if item.strip()]
    if urls and all(_URL_PATTERN.match(url) for url in urls):
        return urls
    logger.warning(f"Ignoring include_domains, expected comma separated full URLs: {raw_value}")
    return None


 def _start_date_for(time_range):
    """ Convert a `time_range` option into a YYYY-MM-DD start date, or None. """
    normalized = (time_range or "").strip().lower()
    if not normalized or normalized == "anytime":
        return None
    days = _TIME_RANGE_DAYS.get(normalized)
    if days is None:
        # An unknown value used to leave the start date unbound and crash.
        logger.warning(f"Unknown time_range '{time_range}' in config, searching anytime.")
        return None
    return (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")


 def _cfg_fallback(flag):
    """ Return the 'nothing configured' value in the shape the caller for `flag` unpacks. """
    if 'serperdev' in flag:
        return None, None, None
    if 'tavily' in flag:
        return None
    # Shape kept from the original error return; used for 'exa' and any other flag.
    return {}, None, None, None


 def cfg_search_param(flag):
    """
    Read web research settings from the [web_research] section of main_config.

    The config file is looked up two directories above this module.

    Args:
        flag (str): Selects which settings to return. It is matched by
            substring: 'serperdev', 'tavily' or 'exa'.

    Returns:
        For 'serperdev': tuple (geo_location, search_language, num_results).
        For 'tavily': list of URLs to include, or None for no restriction.
        For 'exa': tuple (include_urls, start_published_date, num_results, similar_url),
            where start_published_date is a 'YYYY-MM-DD' string or None.
        For any other flag: None.
        If the config file is missing or invalid, the error is logged and a
        fallback with the same shape as above is returned (all values None;
        for 'exa' the first element is an empty dict).
    """
    file_path = Path(__file__).resolve().parents[2] / "main_config"
    logger.info(f"Reading search config params from {file_path}")

    # ConfigParser.read() silently skips missing files, so check explicitly.
    if not file_path.is_file():
        logger.error(f"Error: Config file '{file_path}' not found.")
        return _cfg_fallback(flag)

    try:
        config = configparser.ConfigParser()
        config.read(file_path)
        web_research_section = config["web_research"]

        if 'serperdev' in flag:
            geo_location = web_research_section.get("geo_location")
            search_language = web_research_section.get("search_language")
            num_results = web_research_section.getint("num_results")
            return geo_location, search_language, num_results
        elif 'tavily' in flag:
            return _parse_include_domains(web_research_section.get("include_domains"))
        elif 'exa' in flag:
            include_urls = _parse_include_domains(web_research_section.get("include_domains"))
            num_results = web_research_section.getint("num_results")
            similar_url = web_research_section.get("similar_url")
            start_published_date = _start_date_for(web_research_section.get("time_range"))
            return include_urls, start_published_date, num_results, similar_url
        # Unrecognised flag: nothing to read, same result as before.
        return None
    except KeyError as e:
        logger.error(f"Error: Missing section or option in config file: {e}")
    except ValueError as e:
        logger.error(f"Error: Invalid value in config file: {e}")
    except configparser.Error as e:
        logger.error(f"Error: Failed to parse config file: {e}")
    return _cfg_fallback(flag)
 def save_in_file(table_content):
    """ Helper function to append search analysis to the report file.

    The target path is read from the SEARCH_SAVE_FILE environment variable.
    The content is appended, followed by three newlines. This is best effort:
    failures are logged and never raised to the caller.

    Args:
        table_content (str): Text to append to the report file.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set.")
        return

    try:
        # Create the report directory if it does not exist yet.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Explicit UTF-8: the default encoding on Windows cannot store all
        # Unicode text (for example emoji) and would fail the write.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        # Deliberately broad: saving the report must not abort the research run.
        logger.error(f"Error occurred while writing to the file: {e}")
--- a/lib/ai_web_researcher/google_serp_search.py
+++ b/lib/ai_web_researcher/google_serp_search.py
@@ -20,20 +20,18 @@ Modifications:
 - Customize the search parameters, such as location and language, in the functions as needed.
 - Adjust logging configurations, table formatting, and other aspects based on preferences.
 To-Do (TBD):
 - Consider adding further enhancements or customization based on specific use cases.
 Note: This script depends on external libraries such as SerpApi, Loguru, Rich, and Tabulate. Install them using 'pip install serpapi loguru rich tabulate' if not already installed.
 """
 import os
 from pathlib import Path
 import sys
-
+import configparser
 from pathlib import Path
 import pandas as pd
 import json
 import requests
 from clint.textui import progress
 #from serpapi import GoogleSearch
 from loguru import logger
 from tabulate import tabulate
@@ -49,6 +47,8 @@ logger.add(
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
           )
 from .common_utils import save_in_file, cfg_search_param
 from tenacity import retry, stop_after_attempt, wait_random_exponential
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@@ -87,16 +87,6 @@ def google_search(query):
 #    except Exception as err:
 #        logger.error("FIXME: Failed to do Google search with BROWSERLESS API.")
 #        logger.debug("FIXME: Trying with dataforSEO API.")
 #                
 #    # Retry with dataforSEO API
 #    try:
 #        logger.info("Perform SERP with Data for SEO.")
 #        #search_result = perform_dataforseo_google_search(query)
 #        #return process_search_results(search_result, flag)
 #    except Exception as err:
 #        logger.error("FIXME: Failed to do Google search with dataforSEO API.")
 #        logger.debug("All retries failed. Giving up.")
 #        raise
@@ -159,12 +149,17 @@ def perform_serperdev_google_search(query):
    # Serper API endpoint URL
    url = "https://google.serper.dev/search"
    try:
        geo_loc, lang, num_results = cfg_search_param('serperdev') 
    except Exception as err:
        logger.error(f"Failed to read config {err}")
    # FIXME: Expose options to end user. Request payload
    payload = json.dumps({
        "q": query,
-        "gl": "in",
+        "gl": geo_loc,
-        "hl": "en",
+        "hl": lang,
-        "num": 10,
+        "num": num_results,
        "autocorrect": True,
        "page": 1,
        "type": "search",
@@ -294,16 +289,3 @@ def process_search_results(search_results):
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    return search_results
 def save_in_file(table_content):
    """ Helper function to save search analysis in a file.

    Appends `table_content`, followed by three newlines, to the file named by
    the SEARCH_SAVE_FILE environment variable. Any exception raised while
    writing is logged and not re-raised.
    """
    # NOTE(review): this module also imports save_in_file from .common_utils;
    # this local definition looks like a duplicate of that helper - confirm
    # which one is meant to be kept.
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    try:
        # Save the content to the file
        with open(file_path, "a+") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
--- a/lib/ai_web_researcher/gpt_online_researcher.py
+++ b/lib/ai_web_researcher/gpt_online_researcher.py
@@ -1,6 +1,22 @@
 ################################################################
 # 
 # ## Features
 #
 # - **Web Research**: Alwrity enables users to conduct web research efficiently. 
 # By providing keywords or topics of interest, users can initiate searches across multiple platforms simultaneously.
 #
 # - **Google SERP Search**: The tool integrates with Google Search Engine Results Pages (SERP) 
 # to retrieve relevant information based on user queries. It offers insights into organic search results, 
 # People Also Ask, and related searches.
 #
 # - **Tavily AI Integration**: Alwrity leverages Tavily AI's capabilities to enhance web research. 
 # It utilizes advanced algorithms to search for information and extract relevant data from various sources.
 #
 # - **Metaphor AI Semantic Search**: Alwrity employs Metaphor AI's semantic search technology to find related articles and content. 
 # By analyzing context and meaning, it delivers precise and accurate results.
 #
 # - **Google Trends Analysis**: The tool provides Google Trends analysis for user-defined keywords. 
 # It helps users understand the popularity and trends associated with specific topics over time.
 # 
 ##############################################################
@@ -26,8 +42,9 @@ logger.add(sys.stdout,
           )
-def gpt_web_researcher(search_keywords, time_range=None, include_domains=list(), similar_url=None):
+def gpt_web_researcher(search_keywords):
-    """ """
+    """ Keyword based web researcher, basic, neural and Semantic search."""
    print(f"Web Research:Time Range - {time_range},Search Keywords - {search_keywords},Include URLs - {include_domains}")
    # TBD: Keeping the results directory as fixed, for now.
    os.environ["SEARCH_SAVE_FILE"] = os.path.join(os.getcwd(), "workspace", "web_research_reports",                                                 search_keywords.replace(" ", "_") + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
@@ -35,8 +52,8 @@ def gpt_web_researcher(search_keywords, time_range=None, include_domains=list(),
        include_domains = list()
    google_search_result = do_google_serp_search(search_keywords)
-    tavily_search_result = do_tavily_ai_search(search_keywords, include_domains)
+    tavily_search_result = do_tavily_ai_search(search_keywords)
-    metaphor_search_result = do_metaphor_ai_research(search_keywords, include_domains, time_range, similar_url)
+    metaphor_search_result = do_metaphor_ai_research(search_keywords)
    gtrends_search_result = do_google_pytrends_analysis(search_keywords)
    # get_rag_results(search_query)
    print(f"\n\nReview the analysis in this file at: {os.environ.get('SEARCH_SAVE_FILE')}\n")
@@ -54,30 +71,23 @@ def do_google_serp_search(search_keywords):
        # Not failing, as tavily would do same and then GPT-V to search.
-def do_tavily_ai_search(search_keywords, include_domains=None):
+def do_tavily_ai_search(search_keywords):
    """ Common function to do Tavily AI web research."""
    try:
        # FIXME: Include the follow-up questions as blog FAQs.
        logger.info(f"Doing Tavily AI search for: {search_keywords}")
-        t_results = get_tavilyai_results(search_keywords, include_domains)
+        t_results = get_tavilyai_results(search_keywords)
        t_titles = tavily_extract_information(t_results, 'titles')
        return(t_results, t_titles)
    except Exception as err:
        logger.error(f"Failed to do Tavily AI Search: {err}")
-def do_metaphor_ai_research(search_keywords,
+def do_metaphor_ai_research(search_keywords):
        include_domains=None,
        time_range=None,
        similar_url=None):
    """ """
    try:
        logger.info(f"Start Semantic/Neural web search with Metahpor: {search_keywords}")
-        response_articles = metaphor_search_articles(
+        response_articles = metaphor_search_articles(search_keywords)
                search_keywords,
                include_domains=include_domains,
                time_range=time_range,
                similar_url=similar_url)
        m_titles = metaphor_extract_titles_or_text(response_articles, return_titles=True)
        return(response_articles, m_titles)
    except Exception as err:
--- a/lib/ai_web_researcher/metaphor_basic_neural_web_search.py
+++ b/lib/ai_web_researcher/metaphor_basic_neural_web_search.py
@@ -26,6 +26,7 @@ from exa_py import Exa
 from tenacity import (retry, stop_after_attempt, wait_random_exponential,)# for exponential backoff
 from .gpt_summarize_web_content import summarize_web_content
 from .gpt_competitor_analysis import summarize_competitor_content
 from .common_utils import save_in_file, cfg_search_param
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@@ -99,12 +100,7 @@ def metaphor_find_similar(similar_url):
-def metaphor_search_articles(query, 
+def metaphor_search_articles(query):
        num_results=5,
        use_autoprompt=True,
        include_domains=[],
        time_range=None,
        similar_url=None):
    """
    Search for articles using the Metaphor API.
@@ -120,16 +116,7 @@ def metaphor_search_articles(query,
    """
    metaphor = get_metaphor_client()
    try:
-        if time_range == "past day":
+        include_domains, start_published_date, num_results, similar_url = cfg_search_param('exa')
            start_published_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        elif time_range == "past week":
            start_published_date = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d")
        elif time_range == "past month":
            start_published_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
        elif time_range == "past year":
            start_published_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
        else:
            start_published_date = None
        logger.info(f"Metaphor web search with Date: {start_published_date} and Query: {query}")
        try:
@@ -145,6 +132,7 @@ def metaphor_search_articles(query,
        # From each webpage, get a summary of the web page.
        contents_response = search_response.results
        # FIXME: Need to summarize for smaller input context window.
 #        for content in tqdm(contents_response, desc="Reading Web URL content:", unit="content"):
 #            summarized_content = summarize_web_content(content.text, "gemini")
 #            content.text = summarized_content
@@ -200,19 +188,6 @@ def print_search_result(contents_response):
        logger.error(f"Failed to save search results: {save_results_err}")
 def save_in_file(table_content):
    """ Helper function to save search analysis in a file.

    Appends `table_content`, followed by three newlines, to the file named by
    the SEARCH_SAVE_FILE environment variable. Any exception raised while
    writing is logged and not re-raised.
    """
    # NOTE(review): this module also imports save_in_file from .common_utils;
    # this local definition looks like a duplicate of that helper - confirm
    # which one is meant to be kept.
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    try:
        # Save the content to the file
        with open(file_path, "a+") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
 def metaphor_scholar_search(query, include_domains=None, time_range="anytime"):
    """
    Search for papers using the Metaphor API.
--- a/lib/ai_web_researcher/tavily_ai_search.py
+++ b/lib/ai_web_researcher/tavily_ai_search.py
@@ -22,7 +22,6 @@ Modifications:
 To-Do (TBD):
 - Consider adding further enhancements or customization based on specific use cases.
 Note: This script depends on external libraries such as Tavily, Rich, Tabulate, Loguru, and Tenacity. Install them using 'pip install tavily rich tabulate loguru tenacity' if not already installed.
 """
@@ -44,11 +43,14 @@ logger.add(sys.stdout,
           colorize=True,
           format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
           )
 from .common_utils import save_in_file, cfg_search_param
 from tenacity import retry, stop_after_attempt, wait_random_exponential
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
-def get_tavilyai_results(keywords, include_urls, search_depth="advanced"):
+def get_tavilyai_results(keywords):
    """
    Get Tavily AI search results based on specified keywords and options.
@@ -74,11 +76,23 @@ def get_tavilyai_results(keywords, include_urls, search_depth="advanced"):
    except Exception as err:
        logger.error(f"Failed to create Tavily client. Check TAVILY_API_KEY: {err}")
    # Read search config params from the file.
    try:
        include_urls = cfg_search_param('tavily')
    except Exception as err:
        logger.error(f"Failed to read search params from main_config: {err}")
    try:
        if include_urls:
-            tavily_search_result = client.search(keywords, search_depth, include_answer=True, include_domains=include_urls)
+            tavily_search_result = client.search(keywords, 
                    search_depth="advanced", 
                    include_answer=True, 
                    include_domains=include_urls)
        else:
-            tavily_search_result = client.search(keywords, search_depth, include_answer=True)
+            tavily_search_result = client.search(keywords, 
                    search_depth = "advanced", 
                    include_answer=True)
        print_result_table(tavily_search_result)
        return(tavily_search_result)
    except Exception as err:
--- a/lib/ai_writers/keywords_to_blog.py
+++ b/lib/ai_writers/keywords_to_blog.py
@@ -37,26 +37,26 @@ def write_blog_from_keywords(search_keywords, url=None):
    blog_markdown_str = ""
    example_blog_titles = []
-    logger.info(f"Researching and Writing Blog on keywords: {search_keywords}")
+#    logger.info(f"Researching and Writing Blog on keywords: {search_keywords}")
-    # Call on the got-researcher, tavily apis for this. Do google search for organic competition.
+#    # Call on the got-researcher, tavily apis for this. Do google search for organic competition.
-    try:
+#    try:
-        google_search_result, g_titles = do_google_serp_search(search_keywords)
+#        google_search_result, g_titles = do_google_serp_search(search_keywords)
-        example_blog_titles.append(g_titles)
+#        example_blog_titles.append(g_titles)
-        blog_markdown_str = write_blog_google_serp(search_keywords, google_search_result)
+#        blog_markdown_str = write_blog_google_serp(search_keywords, google_search_result)
-    except Exception as err:
+#    except Exception as err:
-        logger.error(f"Failed in Google web research: {err}")
+#        logger.error(f"Failed in Google web research: {err}")
-    # logger.info/check the final blog content.
+#    # logger.info/check the final blog content.
-    logger.info("\n######### Draft1: Finished Blog from Google web search: ###########\n\n")
+#    logger.info("\n######### Draft1: Finished Blog from Google web search: ###########\n\n")
-    # Do Tavily AI research to augument the above blog.
+#    # Do Tavily AI research to augument the above blog.
-    try:
+#    try:
-        tavily_search_result, t_titles = do_tavily_ai_search(search_keywords)
+#        tavily_search_result, t_titles = do_tavily_ai_search(search_keywords)
-        example_blog_titles.append(t_titles)
+#        example_blog_titles.append(t_titles)
-        blog_markdown_str = blog_with_research(blog_markdown_str, tavily_search_result)
+#        blog_markdown_str = blog_with_research(blog_markdown_str, tavily_search_result)
-        logger.info(f"######### Blog content after Tavily AI research: ######### \n\n{blog_markdown_str}\n\n")
+#        logger.info(f"######### Blog content after Tavily AI research: ######### \n\n{blog_markdown_str}\n\n")
-    except Exception as err:
+#    except Exception as err:
-        logger.error(f"Failed to do Tavily AI research: {err}")
+#        logger.error(f"Failed to do Tavily AI research: {err}")
-    logger.info("######### Draft2: Blog content after Tavily AI research: #########\n\n")
+#    logger.info("######### Draft2: Blog content after Tavily AI research: #########\n\n")
    try:
        # Do Metaphor/Exa AI search.
--- a/lib/gpt_providers/audio_to_text_generation/stt_audio_blog.py
+++ b/lib/gpt_providers/audio_to_text_generation/stt_audio_blog.py
@@ -99,7 +99,7 @@ def speech_to_text(video_url, output_path='.'):
                os.remove(audio_file)
                logger.info("Temporary audio file removed.")
        except PermissionError:
-            logger.error(f"Permission error: Cannot remove '{audio_file}'. Please make sure you have the necessary permissions.")
+            logger.error(f"Permission error: Cannot remove '{audio_file}'. Please make sure of necessary permissions.")
        except Exception as e:
            logger.error(f"An error occurred removing audio file: {e}")
--- a/39
+++ b/39
@@ -33,6 +33,7 @@ blog_output_folder = ""
 blog_image_output_folder = ""
 ############################################################
 #
 # Blog Images details.
@@ -41,6 +42,7 @@ blog_image_output_folder = ""
 #
 ############################################################
 [img_details]
 # Options are dalle2, dalle3, stable-diffusion.
 image_gen_model = "stable-diffusion"
@@ -48,6 +50,7 @@ image_gen_model = "stable-diffusion"
 num_images = 1
 ###########################################################
 #
 # Define LLM and its characteristics for fine control on output
@@ -93,3 +96,39 @@ frequency_penalty = 1
 # "Try using different words instead of repeating the same ones."
 # from -2 (more flexible while generating text) to 2 (strong discouragement in repetition).
 presence_penalty = 1
 ######################################################
 #
 # Search Engine Parameters.
 # Alwrity does comprehensive web research for given content topic.
 # Choose search engine parameters below, this finetunes search results
 # and makes the generated content more accurate.
 #
 ######################################################
 # Visit https://serper.dev/playground and provide values from there.
 # https://api.serper.dev/locations
 [web_research]
 # Geographic location (gl): This value restricts the web search to the given country.
 # Examples: us for United States, in for India, fr for France, cn for China, etc.
 geo_location = us
 # Locale (hl): Define the language you want search results in.
 # Examples: en for English, zh-cn for Chinese, de for German, hi for Hindi, etc.
 search_language: en
 # num_results: Default 10 - Number of google search results to fetch.
 num_results = 10
 # time_range: Acceptable values are anytime, past day, past week, past month, past year.
 # This limits the search results to the given time duration, counting back from today.
 time_range = anytime
 # include_domains (give full URLs including http:// or https://, separated by commas): A list of domains to specifically include in the search results.
 # Default is empty, which includes all domains. Example: https://wikipedia.org, https://stackoverflow.com, https://scholar.google.com, https://reddit.com
 include_domains =
 # similar_url : A single URL, this will instruct search engines to give results similar to the given URL.
 similar_url =