WIP-AI writer, Try Web research working.

2024-02-18 10:00:24 +05:30
parent 2a3315f211
commit d89d9ad3d2
21 changed files with 117 additions and 1512 deletions
--- a/lib/ai_web_researcher/google_serp_search.py
+++ b/lib/ai_web_researcher/google_serp_search.py
@@ -213,7 +213,6 @@ def process_search_results(search_results):
    data = []
    logger.info(f"Google Search Parameters: {search_results.get('searchParameters', {})}")
    organic_results = search_results.get("organic", [])
-    print(search_results)

    # Displaying Organic Results
    organic_data = []
--- a/lib/ai_web_researcher/google_trends_researcher.py
+++ b/lib/ai_web_researcher/google_trends_researcher.py
@@ -7,7 +7,7 @@ Features:
 - Visualizes Google Trends data, including interest over time and interest by region.
 - Retrieves related queries and topics for a set of search keywords.
 - Utilizes visualization libraries such as Matplotlib, Plotly, and Rich for displaying results.
- Incorporates logging for error handling and informative messages.
+- Incorporates logging (via Loguru) for error handling and informative messages.

 Usage:
 - Provide a search term or a list of search terms for analysis.
@@ -22,6 +22,7 @@ Modifications:
 Note: Ensure that the required libraries are installed using 'pip install pytrends requests_html tqdm tabulate plotly rich'.
 """

+import os
 import requests
 import numpy as np
 import sys
@@ -37,14 +38,12 @@ import pandas as pd
 import matplotlib.pyplot as plt
 import plotly.express as px
 import plotly.io as pio
-import logging
 from requests_html import HTML, HTMLSession
 from urllib.parse import quote_plus
 from tqdm import tqdm
 from tabulate import tabulate
 from pytrends.request import TrendReq
-import wordcloud
-logging.basicConfig(level=logging.INFO)
+from wordcloud import WordCloud
 from loguru import logger

 # Configure logger
@@ -75,7 +74,7 @@ def fetch_google_trends_interest_overtime(keyword):

        return data
    except Exception as e:
-        logging.error(f"Error in fetch_google_trends_data: {e}")
+        logger.error(f"Error in fetch_google_trends_data: {e}")
        return pd.DataFrame()


@@ -151,10 +150,11 @@ def get_related_queries_and_save_csv(keywords, hl='en-US', tz=360, cat=0, timefr
        print("\n\033[1m🔝 Top\033[0m: The most popular search queries. Scoring is on a relative scale where a value of 100 is the most commonly searched query, 50 is a query searched half as often, and a value of 0 is a query searched for less than 1% as often as the most popular query.\n")
        print("\n\033[1m🚀 Rising\033[0m: Queries with the biggest increase in search frequency since the last time period. Results marked 'Breakout' had a tremendous increase, probably because these queries are new and had few (if any) prior searches.\n")
        # Display the DataFrame using tabulate
-        print(tabulate(all_queries_df, headers='keys', tablefmt='fancy_grid'))
+        table = tabulate(all_queries_df, headers='keys', tablefmt='fancy_grid')
+        print(table)
        # Save the combined table to a file
        try:
-            save_in_file(all_queries_df)
+            save_in_file(table)
        except Exception as save_results_err:
            logger.error(f"Failed to save search results: {save_results_err}")
        return top_rising_queries
@@ -178,6 +178,7 @@ def get_related_topics_and_save_csv(search_keywords):
        pytrends = TrendReq(hl='en-US', tz=360)
        
        # Build payload
+        # FIXME: Remove the hardcoded cat/timeframe values below; take them as parameters.
        pytrends.build_payload(search_keywords, cat=0, timeframe='today 12-m')

        # Get related topics
@@ -198,10 +199,7 @@ def get_related_topics_and_save_csv(search_keywords):
        # Rename columns to avoid duplicates and provide meaningful names
        df_top_topics.columns = ['Top- ' + col if col != 'topic_title' else col for col in df_top_topics.columns]
        df_rising_topics.columns = ['Rising- ' + col if col != 'topic_title' else col for col in df_rising_topics.columns]
-
-        # Save to CSV
        all_topics_df = pd.concat([df_top_topics, df_rising_topics], axis=1)
-        #all_topics_df.to_csv('related_topics.csv', index=False)

        print(f"\n\n 📢❗🚨 Rising and Trending Keywords for {search_keywords}\n")
        print("\033[1m🔝 Top\033[0m: The most popular search topics.")
@@ -209,11 +207,15 @@ def get_related_topics_and_save_csv(search_keywords):
        # Display the DataFrame using tabulate
        pd.set_option('display.max_rows', all_topics_df.shape[0]+1)
        print(all_topics_df.head(10))
-        #print(tabulate(all_topics_df, headers='keys', tablefmt='fancy_grid'))
+        table = tabulate(all_topics_df, headers='keys', tablefmt='fancy_grid')
+        try:
+            save_in_file(table)
+        except Exception as save_results_err:
+            logger.error(f"Failed to save search results: {save_results_err}")
        return all_topics_df

    except Exception as e:
-        print(f"ERROR: An error occurred: {e}")
+        print(f"ERROR: An error occurred in related topics: {e}")
        return pd.DataFrame()


@@ -224,7 +226,7 @@ def get_source(url):
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response
    except requests.exceptions.RequestException as e:
-        logging.error(f"Error during HTTP request: {e}")
+        logger.error(f"Error during HTTP request: {e}")
        return None


@@ -240,10 +242,10 @@ def get_results(query):
        else:
            return None
    except json.JSONDecodeError as e:
-        logging.error(f"Error decoding JSON response: {e}")
+        logger.error(f"Error decoding JSON response: {e}")
        return None
    except requests.exceptions.RequestException as e:
-        logging.error(f"Error during HTTP request: {e}")
+        logger.error(f"Error during HTTP request: {e}")
        return None


@@ -256,7 +258,7 @@ def format_results(results):
            suggestions.append(suggestion)
        return suggestions
    except (KeyError, IndexError) as e:
-        logging.error(f"Error parsing search results: {e}")
+        logger.error(f"Error parsing search results: {e}")
        return []


@@ -288,7 +290,7 @@ def get_expanded_terms(query):

        return terms
    except Exception as e:
-        logging.error(f"Error in get_expanded_terms: {e}")
+        logger.error(f"Error in get_expanded_terms: {e}")
        return []


@@ -307,7 +309,7 @@ def get_expanded_suggestions(query):

        return all_results
    except Exception as e:
-        logging.error(f"Error in get_expanded_suggestions: {e}")
+        logger.error(f"Error in get_expanded_suggestions: {e}")
        return []


@@ -321,10 +323,14 @@ def get_suggestions_for_keyword(search_term):
        #expanded_results_df.to_csv('results.csv', index=False)
        pd.set_option('display.max_rows', expanded_results_df.shape[0]+1)
        expanded_results_df.drop_duplicates('Keywords', inplace=True)
-
+        table = tabulate(expanded_results_df, headers=['Keywords', 'Relevance'], tablefmt='fancy_grid')
+        try:
+            save_in_file(table)
+        except Exception as save_results_err:
+            logger.error(f"Failed to save search results: {save_results_err}")
        return expanded_results_df
    except Exception as e:
-        logging.error(f"get_suggestions_for_keyword: Error in main: {e}")
+        logger.error(f"get_suggestions_for_keyword: Error in main: {e}")



@@ -355,7 +361,7 @@ def perform_keyword_clustering(expanded_results_df, num_clusters=5):

        return expanded_results_df
    except Exception as e:
-        logging.error(f"Error in perform_keyword_clustering: {e}")
+        logger.error(f"Error in perform_keyword_clustering: {e}")
        return pd.DataFrame()


@@ -404,7 +410,7 @@ def visualize_silhouette(X, labels):

        plt.show()
    except Exception as e:
-        logging.error(f"Error in visualize_silhouette: {e}")
+        logger.error(f"Error in visualize_silhouette: {e}")



@@ -435,9 +441,9 @@ def print_and_return_top_keywords(expanded_results_df, num_clusters=5):
    table = tabulate(top_keywords_df, headers='keys', tablefmt='fancy_grid')
    # Save the combined table to a file
    try:
-        save_in_file(top_keywords_df)
+        save_in_file(table)
    except Exception as save_results_err:
-        logger.error(f"Failed to save search results: {save_results_err}")
+        logger.error(f"🚨 Failed to save search results: {save_results_err}")
    print(table)
    return top_keywords_df

@@ -484,11 +490,15 @@ def do_google_trends_analysis(search_term):
        for asearch_term in search_term:
            #FIXME: Lets work with a single root keyword.
            suggestions_df = get_suggestions_for_keyword(asearch_term)
+            if len(suggestions_df['Keywords']) > 10:
+                result_df = perform_keyword_clustering(suggestions_df)
+                # Display top keywords in each cluster
+                top_keywords = print_and_return_top_keywords(result_df)
+                all_the_keywords.append(top_keywords['Keywords'].tolist())
+            else:
+                all_the_keywords.append(suggestions_df['Keywords'].tolist())
+            all_the_keywords = ','.join([', '.join(filter(None, map(str, sublist))) for sublist in all_the_keywords])

-            result_df = perform_keyword_clustering(suggestions_df)
-            # Display top keywords in each cluster
-            top_keywords = print_and_return_top_keywords(result_df)
-            all_the_keywords.append(top_keywords['Keywords'].tolist())
 #        
 #        # FIXME: Get result from vision GPT. Fetch and visualize Google Trends data
 #        #trends_data = fetch_google_trends_interest_overtime("llamaindex")
@@ -496,23 +506,17 @@ def do_google_trends_analysis(search_term):
 #        # FIXME: Plot Interest Over time.
 #        result_df = plot_interest_by_region(search_term)
 #        
-#        # Display additional information
+        # Display additional information
        result_df = get_related_topics_and_save_csv(search_term)
        # Extract 'Top' topic_title
        top_topic_title = result_df['topic_title'].values.tolist()
-
        # Join each sublist into one string separated by comma
        #top_topic_title = [','.join(filter(None, map(str, sublist))) for sublist in top_topic_title]
        top_topic_title = ','.join([', '.join(filter(None, map(str, sublist))) for sublist in top_topic_title])

-        print(f"\nRising and Top keywords: {top_topic_title}")
-        # Print or use the extracted topic titles
-        all_the_keywords = ','.join([', '.join(filter(None, map(str, sublist))) for sublist in all_the_keywords])
-        print(f"\n\n📢❗🚨 Important keywords to target: {all_the_keywords}\n\n")
-        all_the_keywords += top_topic_title
-        print(all_the_keywords)
+        # TBD: Not getting great results OR unable to understand them.
+        #all_the_keywords += top_topic_title
        all_the_keywords = all_the_keywords.split(',')
-        
        # Split the list into chunks of 5 keywords
        chunk_size = 4
        chunks = [all_the_keywords[i:i + chunk_size] for i in range(0, len(all_the_keywords), chunk_size)]
@@ -520,11 +524,15 @@ def do_google_trends_analysis(search_term):
        combined_df = pd.DataFrame(chunks, columns=[f'K📢eyword Col{i + 1}' for i in range(chunk_size)])
        
        # Print the table
-        print(tabulate(combined_df, headers='keys', tablefmt='fancy_grid'))
-        #combined_df = pd.DataFrame({'📢❗🚨 Important keywords to target': chunks})
+        table = tabulate(combined_df, headers='keys', tablefmt='fancy_grid')
+        # Save the combined table to a file
+        try:
+            save_in_file(table)
+        except Exception as save_results_err:                 
+            logger.error(f"Failed to save search results: {save_results_err}")
+        print(table)
        
-        print(all_the_keywords)
-        generate_wordcloud(all_the_keywords.split(','))
+        #generate_wordcloud(all_the_keywords)
        return(all_the_keywords)
    except Exception as e:
-        logging.error(f"Error in main: {e}")
+        logger.error(f"Error in Google Trends Analysis: {e}")
--- a/lib/ai_web_researcher/gpt_online_researcher.py
+++ b/lib/ai_web_researcher/gpt_online_researcher.py
@@ -47,33 +47,33 @@ def gpt_web_researcher(search_keywords, time_range=None, include_domains=list(),
    google_result = None
    tavily_result = None
    report = None
-#    try:
-#        logger.info(f"Doing Google search for: {search_keywords}\n")
-#        google_result = google_search(search_keywords)
-#        blog_titles.append(extract_info(google_result, "titles"))
-#    except Exception as err:
-#        logger.error(f"Failed to do Google Serpapi research: {err}")
-#        # Not failing, as tavily would do same and then GPT-V to search.
-#
-#    try:
-#        # FIXME: Include the follow-up questions as blog FAQs.
-#        logger.info(f"Doing Tavily AI search for: {search_keywords}")
-#        tavily_result = get_tavilyai_results(search_keywords, include_domains)
-#        blog_titles.append(tavily_extract_information(tavily_result, "titles"))
-#    except Exception as err:
-#        logger.error(f"Failed to do Tavily AI Search: {err}")
+    try:
+        logger.info(f"Doing Google search for: {search_keywords}\n")
+        google_result = google_search(search_keywords)
+        blog_titles.append(extract_info(google_result, "titles"))
+    except Exception as err:
+        logger.error(f"Failed to do Google Serpapi research: {err}")
+        # Not failing, as tavily would do same and then GPT-V to search.

-#    try:
-#        logger.info(f"Start Semantic/Neural web search with Metahpor: {search_keywords}")
-#        response_articles = metaphor_search_articles(
-#                search_keywords, 
-#                include_domains=include_domains, 
-#                time_range=time_range,
-#                similar_url=similar_url)
-#        blog_titles.append(metaphor_extract_titles_or_text(response_articles, return_titles=True))
-#    except Exception as err:
-#        logger.error(f"Failed to do Metaphor search: {err}")
-#    print(blog_titles)
+    try:
+        # FIXME: Include the follow-up questions as blog FAQs.
+        logger.info(f"Doing Tavily AI search for: {search_keywords}")
+        tavily_result = get_tavilyai_results(search_keywords, include_domains)
+        blog_titles.append(tavily_extract_information(tavily_result, "titles"))
+    except Exception as err:
+        logger.error(f"Failed to do Tavily AI Search: {err}")
+
+    try:
+        logger.info(f"Start Semantic/Neural web search with Metahpor: {search_keywords}")
+        response_articles = metaphor_search_articles(
+                search_keywords, 
+                include_domains=include_domains, 
+                time_range=time_range,
+                similar_url=similar_url)
+        blog_titles.append(metaphor_extract_titles_or_text(response_articles, return_titles=True))
+    except Exception as err:
+        logger.error(f"Failed to do Metaphor search: {err}")
+    print(blog_titles)

    try:
        logger.info(f"Do Google Trends analysis for given keywords: {search_keywords}")
@@ -85,18 +85,7 @@ def gpt_web_researcher(search_keywords, time_range=None, include_domains=list(),
    # 1. Return a list of related keywords along with search volumes.
    # 2. New blog titles to write on(niche, top) and blog sections.
    # 3. Competitors list, similar urls if given.
-
-
-class Result(NamedTuple):
-    url: str
-    id: str
-    title: str
-    score: float
-    published_date: str
-    author: str
-    text: str
-    highlights: List[str]
-    highlight_scores: List[float]
+    print(f"\n\nReview the analysis in this file at: {os.environ.get('SEARCH_SAVE_FILE')}\n")


 def metaphor_extract_titles_or_text(json_data, return_titles=True):
@@ -110,12 +99,10 @@ def metaphor_extract_titles_or_text(json_data, return_titles=True):
    Returns:
        list: List of titles or text.
    """
-    result_list = [Result(**result) for result in json_data]
-
    if return_titles:
-        return [result.title for result in result_list]
+        return [(result.title) for result in json_data]
    else:
-        return [result.text for result in result_list]
+        return [result.text for result in json_data]


 def extract_info(json_data, info_type):
--- a/lib/ai_web_researcher/metaphor_basic_neural_web_search.py
+++ b/lib/ai_web_researcher/metaphor_basic_neural_web_search.py
@@ -142,10 +142,11 @@ def metaphor_search_articles(query,
            logger.error(f"Failed in metaphor.search_and_contents: {err}")
        
        # From each webpage, get a summary of the web page.
+        print(search_response)
        contents_response = search_response.results
-        for content in tqdm(contents_response, desc="Reading Web URL content:", unit="content"):
-            summarized_content = summarize_web_content(content.text, "gemini")
-            content.text = summarized_content
+#        for content in tqdm(contents_response, desc="Reading Web URL content:", unit="content"):
+#            summarized_content = summarize_web_content(content.text, "gemini")
+#            content.text = summarized_content
        
        print_search_result(contents_response)

--- a/lib/ai_web_researcher/tavily_ai_search.py
+++ b/lib/ai_web_researcher/tavily_ai_search.py
@@ -129,18 +129,18 @@ def print_result_table(output_data):
        logger.error(f"Failed to save search results: {save_results_err}")
    
    # Display the 'follow_up_questions' in a table
-    table_headers = [f"Search Engine follow up questions for query: {output_data.get('query')}"]
-    table_data = [[output_data.get("follow_up_questions")]]
-    table = tabulate(table_data, 
+    if output_data.get("follow_up_questions"):
+        table_headers = [f"Search Engine follow up questions for query: {output_data.get('query')}"]
+        table_data = [[output_data.get("follow_up_questions")]]
+        table = tabulate(table_data, 
            headers=table_headers, 
            tablefmt="fancy_grid",
            maxcolwidths=[80])
-    print(table)
-    # Save the combined table to a file
-    try:
-        save_in_file(table)
-    except Exception as save_results_err:
-        logger.error(f"Failed to save search results: {save_results_err}")
+        print(table)
+        try:
+            save_in_file(table)
+        except Exception as save_results_err:
+            logger.error(f"Failed to save search results: {save_results_err}")


 def save_in_file(table_content):