WIP - Under maintenance - Web research working.

This commit is contained in:
AjaySi
2024-02-05 15:15:07 +05:30
parent fd7053fb4b
commit 2a3315f211
96 changed files with 4320 additions and 565 deletions

View File

@@ -258,6 +258,15 @@ def arxiv_bibtex(arxiv_id):
#search = GoogleSearch(params)
#results = search.get_dict()
#from llmsherpa.readers import LayoutPDFReader
#llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
#pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" # also allowed is a file path e.g. /home/downloads/xyz.pdf
#pdf_reader = LayoutPDFReader(llmsherpa_api_url)
#doc = pdf_reader.read_pdf(pdf_url)
def extract_arxiv_ids_from_line(line):
"""

View File

@@ -0,0 +1,302 @@
"""
This Python script performs Google searches using various services such as SerpApi, Serper.dev, and more. It displays the search results, including organic results, People Also Ask, and Related Searches, in formatted tables. The script also utilizes GPT to generate titles and FAQs for the Google search results.
Features:
- Utilizes SerpApi, Serper.dev, and other services for Google searches.
- Displays organic search results, including position, title, link, and snippet.
- Presents People Also Ask questions and snippets in a formatted table.
- Includes Related Searches in the combined table with People Also Ask.
- Configures logging with Loguru for informative messages.
- Uses Rich and Tabulate for visually appealing and formatted tables.
Usage:
- Ensure the necessary API keys are set in the .env file.
- Run the script to perform a Google search with the specified query.
- View the displayed tables with organic results, People Also Ask, and Related Searches.
- Additional information, such as generated titles and FAQs using GPT, is presented.
Modifications:
- Update the environment variables in the .env file with the required API keys.
- Customize the search parameters, such as location and language, in the functions as needed.
- Adjust logging configurations, table formatting, and other aspects based on preferences.
To-Do (TBD):
- Consider adding further enhancements or customization based on specific use cases.
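Note: This script depends on external libraries such as SerpApi, Loguru, Rich, Tabulate, Requests, pandas, clint, and python-dotenv. Install them using 'pip install google-search-results loguru rich tabulate requests pandas clint python-dotenv' if not already installed (the SerpApi client that provides 'from serpapi import GoogleSearch' is published as 'google-search-results').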
"""
import os
from pathlib import Path
import sys
import pandas as pd
import json
import requests
from clint.textui import progress
#from serpapi import GoogleSearch
from loguru import logger
from tabulate import tabulate
# Configure logger
logger.remove()
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv(Path('../../.env'))
logger.add(
sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from .gpt_titles_faq import gpt_titles_faqs_google_search
#from tenacity import retry, stop_after_attempt, wait_random_exponential
#@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
#FIXME: Accept language, country and time frame to search for.
def google_search(query):
    """
    Perform a Google search for the given query.

    SerpApi is tried first on a best-effort basis (its result is not used
    yet). The search is then run through Serper.dev; that response is handed
    to ``process_search_results`` and returned.

    Args:
        query (str): The search query.

    Returns:
        dict | None: The Serper.dev JSON response, or None when the
        Serper.dev search failed (the failure is logged, not raised).
    """
    try:
        perform_serpapi_google_search(query)
        logger.info(f"FIXME: Google serapi: {query}")
        #return process_search_results(search_result)
    except Exception as err:
        logger.error(f"ERROR: Check Here: https://serpapi.com/. Your requests may be over. {err}")

    # Bound before the try so the final return cannot hit an unassigned
    # name when the Serper.dev call itself raises.
    search_result = None
    # Retry with serper.dev
    try:
        logger.info("Trying Google search with Serper.dev: https://serper.dev/api-key")
        search_result = perform_serperdev_google_search(query)
        process_search_results(search_result)
    except Exception as err:
        logger.error(f"Failed to do Google search with serper.dev: {err}")
    return search_result
# # Retry with BROWSERLESS API
# try:
# search_result = perform_browserless_google_search(query)
# #return process_search_results(search_result, flag)
# except Exception as err:
# logger.error("FIXME: Failed to do Google search with BROWSERLESS API.")
# logger.debug("FIXME: Trying with dataforSEO API.")
#
# # Retry with dataforSEO API
# try:
# logger.info("Perform SERP with Data for SEO.")
# #search_result = perform_dataforseo_google_search(query)
# #return process_search_results(search_result, flag)
# except Exception as err:
# logger.error("FIXME: Failed to do Google search with dataforSEO API.")
# logger.debug("All retries failed. Giving up.")
# raise
def perform_serpapi_google_search(query, location="in"):
    """
    Perform a Google search using the SerpApi service.

    The API key is read from the SERPAPI_KEY environment variable.

    Args:
        query (str): The search query.
        location (str, optional): Value sent as the SerpApi "location"
            parameter (default is "in").

    Returns:
        dict | None: The SerpApi result dictionary, or None when the key is
        missing, the client library cannot be imported, or the request
        fails. Failures are logged, not raised.
    """
    try:
        # Keep the key in a local: it is needed both for the presence check
        # and for the request parameters.
        api_key = os.getenv("SERPAPI_KEY")
        if not api_key:
            raise ValueError("SERPAPI_KEY key is required for SerpApi")

        # Imported lazily because the module-level import is commented out;
        # a missing package is then reported by the handler below instead of
        # surfacing as a NameError.
        from serpapi import GoogleSearch

        search = GoogleSearch({
            "q": query,
            "location": location,
            "api_key": api_key,
        })
        # Get search results as a dictionary
        return search.get_dict()
    except ValueError as ve:
        # Handle missing API key error
        logger.info(f"SERPAPI ValueError: {ve}")
    except Exception as e:
        # Handle other exceptions (import failure, network/API errors)
        logger.info(f"SERPAPI An error occurred: {e}")
    return None
def perform_serperdev_google_search(query, timeout=30):
    """
    Perform a Google search using the Serper API.

    Args:
        query (str): The search query.
        timeout (float, optional): Seconds to wait for the Serper API before
            giving up (default is 30).

    Returns:
        dict | None: The JSON response from the Serper API, or None when the
        API answered with a non-200 status (the status and body are logged).

    Raises:
        ValueError: If SERPER_API_KEY is not set in the environment.
        requests.exceptions.RequestException: If the request fails or times out.
    """
    logger.info("Doing serper.dev google search.")
    # Get the Serper API key from environment variables
    serper_api_key = os.getenv('SERPER_API_KEY')
    if not serper_api_key:
        raise ValueError("SERPER_API_KEY is missing. Set it in the .env file.")

    # Serper API endpoint URL
    url = "https://google.serper.dev/search"
    # FIXME: Expose options to end user. Request payload
    payload = json.dumps({
        "q": query,
        "gl": "in",
        "hl": "en",
        "num": 5,
        "autocorrect": True,
        "page": 1,
        "type": "search",
        "engine": "google"
    })
    # Request headers with API key
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }

    # The bar is only shown while the request is in flight; it is never
    # advanced, so it is not bound to a name.
    with progress.Bar(label="Searching", expected_size=100):
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.post(url, headers=headers, data=payload, stream=True, timeout=timeout)

    if response.status_code == 200:
        # Parse and return the JSON response
        return response.json()

    logger.error(f"Error: {response.status_code}, {response.text}")
    return None
def perform_browserless_google_search():
    """Placeholder for a Browserless-backed Google search; not implemented, returns None."""
    return None
def perform_dataforseo_google_search():
    """Placeholder for a DataForSEO-backed Google search; not implemented, returns None."""
    return None
def _render_table(rows, headers, colalign, maxcolwidths):
    """Render ``rows`` as a fancy_grid table, dropping per-column options when there are no rows."""
    if not rows:
        # NOTE(review): tabulate has raised IndexError for colalign/maxcolwidths
        # on empty data in some releases - confirm against the pinned version.
        return tabulate(rows, headers=headers, tablefmt="fancy_grid")
    return tabulate(rows,
                    headers=headers,
                    tablefmt="fancy_grid",
                    colalign=colalign,
                    maxcolwidths=maxcolwidths)


def process_search_results(search_results):
    """
    Print the search results as tables and append those tables to the report file.

    Two tables are produced: the organic results, and "People Also Ask"
    combined with related searches (or related searches alone when there
    are no "People Also Ask" entries).

    Args:
        search_results (dict): The search results JSON. The keys read are
            "searchParameters", "organic", "peopleAlsoAsk" and
            "relatedSearches"; all of them are optional.

    Returns:
        dict: The ``search_results`` argument, unchanged.
    """
    logger.info(f"Google Search Parameters: {search_results.get('searchParameters', {})}")
    logger.debug("Raw search results: {}", search_results)

    # Organic results. A response without an "organic" key yields an empty
    # table instead of a KeyError.
    organic_data = [
        [
            result.get("position", ""),
            result.get("title", ""),
            result.get("link", ""),
            result.get("snippet", ""),
        ]
        for result in search_results.get("organic", [])
    ]
    organic_table = _render_table(
        organic_data,
        headers=["Rank", "Title", "Link", "Snippet"],
        colalign=["center", "left", "left", "left"],
        maxcolwidths=[5, 25, 35, 50],
    )
    print("\n\n📢❗🚨 Google search Organic Results:")
    print(organic_table)

    # People Also Ask: best effort, a malformed section must not abort the report.
    try:
        people_also_ask_data = [
            [question.get("title", ""), question.get("snippet", ""), question.get("link", "")]
            for question in search_results.get("peopleAlsoAsk", [])
        ]
    except Exception as people_also_ask_err:
        logger.error(f"Error processing 'peopleAlsoAsk': {people_also_ask_err}")
        people_also_ask_data = []

    related_searches_data = [
        [related.get("query", "")]
        for related in search_results.get("relatedSearches", [])
    ]

    if people_also_ask_data:
        # Add Related Searches as a fourth column; rows beyond the number of
        # related searches get an empty cell.
        combined_data = [
            row + [related_searches_data[i][0] if i < len(related_searches_data) else ""]
            for i, row in enumerate(people_also_ask_data)
        ]
        combined_table = _render_table(
            combined_data,
            headers=["Question", "Snippet", "Link", "Related Search"],
            colalign=["left", "left", "left", "left"],
            maxcolwidths=[20, 50, 20, 30],
        )
    else:
        combined_table = _render_table(
            related_searches_data,
            headers=["Related Search"],
            colalign=["left"],
            maxcolwidths=[60],
        )
    print("\n\n📢❗🚨 People Also Ask & Related Searches:")
    print(combined_table)

    # Save both tables to the report file
    try:
        save_in_file(organic_table)
        save_in_file(combined_table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    return search_results
def save_in_file(table_content):
    """
    Helper function to save search analysis in a file.

    Appends ``table_content`` followed by three newlines to the file named
    by the SEARCH_SAVE_FILE environment variable. Failures are logged, never
    raised.

    Args:
        table_content (str): Text to append.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return
    try:
        # The report directory may not exist yet on a first run.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # The tables contain emoji and box-drawing characters, so do not
        # depend on the platform default encoding.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")

View File

@@ -0,0 +1,530 @@
"""
This Python script analyzes Google search keywords by fetching auto-suggestions, performing keyword clustering, and visualizing Google Trends data. It uses various libraries such as pytrends, requests_html, tqdm, and more.
Features:
- Fetches auto-suggestions for a given search keyword from Google.
- Performs keyword clustering using K-means algorithm based on TF-IDF vectors.
- Visualizes Google Trends data, including interest over time and interest by region.
- Retrieves related queries and topics for a set of search keywords.
- Utilizes visualization libraries such as Matplotlib, Plotly, and Rich for displaying results.
- Incorporates logging for error handling and informative messages.
Usage:
- Provide a search term or a list of search terms for analysis.
- Run the script to fetch auto-suggestions, perform clustering, and visualize Google Trends data.
- Explore the displayed results, including top keywords in each cluster and related topics.
Modifications:
- Customize the search terms in the 'do_google_trends_analysis' function.
- Adjust the number of clusters for keyword clustering and other parameters as needed.
- Explore further visualizations and analyses based on the generated data.
Note: Ensure that the required libraries are installed using 'pip install pytrends requests_html tqdm tabulate plotly rich scikit-learn matplotlib wordcloud pandas numpy loguru'.
"""
import requests
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, silhouette_samples
from rich.console import Console
from rich.progress import Progress
import urllib
import json
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import logging
from requests_html import HTML, HTMLSession
from urllib.parse import quote_plus
from tqdm import tqdm
from tabulate import tabulate
from pytrends.request import TrendReq
import wordcloud
logging.basicConfig(level=logging.INFO)
from loguru import logger
# Configure logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def fetch_google_trends_interest_overtime(keyword):
    """
    Fetch and plot Google Trends interest over time for one keyword.

    The request uses hl='en-US', tz=360, timeframe 'today 1-y' and geo 'US'.

    Args:
        keyword (str): The keyword to look up.

    Returns:
        pd.DataFrame: The interest-over-time data with the index reset, or
        an empty DataFrame when anything fails (the error is logged).
    """
    try:
        trends_client = TrendReq(hl='en-US', tz=360)
        trends_client.build_payload([keyword], timeframe='today 1-y', geo='US')

        # 1. Interest Over Time, with the index turned back into a column
        interest_df = trends_client.interest_over_time().reset_index()

        # Line chart of the keyword's interest against the date column
        plt.figure(figsize=(10, 6))
        plt.plot(interest_df['date'], interest_df[keyword], label=keyword)
        plt.title(f'Interest Over Time for "{keyword}"')
        plt.xlabel('Date')
        plt.ylabel('Interest')
        plt.legend()
        plt.show()
    except Exception as e:
        logging.error(f"Error in fetch_google_trends_data: {e}")
        return pd.DataFrame()
    return interest_df
def plot_interest_by_region(kw_list):
    """
    Print and plot the ten regions with the highest Google Trends interest.

    Args:
        kw_list (list[str] | str): Keyword(s) to look up. Regions are ranked
            by the first keyword; every keyword is drawn in the bar chart.

    Returns:
        None. Errors are printed, not raised.
    """
    try:
        keywords = [kw_list] if isinstance(kw_list, str) else list(kw_list)
        trends = TrendReq()
        trends.build_payload(kw_list=keywords)
        display_name = ' '.join(keywords)

        data = trends.interest_by_region()
        # The result has one column per keyword, so the space-joined string
        # only names a real column when a single keyword was given.
        data = data.sort_values(by=keywords[0], ascending=False)

        print("\n📢❗🚨 ")
        print(f"Top 10 regions with highest interest for keyword: {display_name}")
        data = data.head(10)  # Top 10
        print(data)

        # The style must be selected before drawing to affect this figure.
        plt.style.use('fivethirtyeight')
        data.reset_index().plot(x="geoName", y=keywords,
                                figsize=(20, 15), kind="bar")
        plt.show()
        # FIXME: Send this image to vision GPT for analysis.
    except Exception as e:
        print(f"Error plotting interest by region: {e}")
    return None
def get_related_queries_and_save_csv(keywords, hl='en-US', tz=360, cat=0, timeframe='today 12-m'):
    """
    Get related queries for the given keywords, display them and save the table.

    Only the entry for the first keyword in the Google Trends response is
    used. Despite the name, no CSV is written: the rendered table is passed
    to ``save_in_file``.

    Args:
        keywords (list): List of search keywords.
        hl (str): Language parameter, default is 'en-US'.
        tz (int): Timezone parameter, default is 360.
        cat (int): Category parameter, default is 0.
        timeframe (str): Timeframe parameter, default is 'today 12-m'.

    Returns:
        pd.DataFrame | None: Top queries followed by rising queries, or None
        when an error occurred (the error is printed).
    """
    try:
        # Build model
        pytrends = TrendReq(hl=hl, tz=tz)
        pytrends.build_payload(kw_list=keywords, cat=cat, timeframe=timeframe)

        # Get related queries
        data = pytrends.related_queries()
        first_keyword_data = list(data.values())[0]

        # pytrends reports a section without data as None; normalise that to
        # an empty frame so the steps below need no special cases.
        empty_section = pd.DataFrame(columns=['query', 'value'])
        top_queries = first_keyword_data['top']
        rising_queries = first_keyword_data['rising']
        if top_queries is None:
            top_queries = empty_section
        if rising_queries is None:
            rising_queries = empty_section

        # Stack the rows. `+` on two DataFrames adds them cell by cell
        # instead of joining them.
        populated = [frame for frame in (top_queries, rising_queries) if not frame.empty]
        if populated:
            top_rising_queries = pd.concat(populated, ignore_index=True)
        else:
            top_rising_queries = empty_section.copy()

        # Side-by-side view for display: one block of columns per section
        df_top_queries = top_queries.reset_index(drop=True).rename(columns={'query': 'Top query'})
        df_rising_queries = rising_queries.reset_index(drop=True).rename(columns={'query': 'Rising query'})
        all_queries_df = pd.concat([df_top_queries, df_rising_queries], axis=1)
        #all_queries_df.to_csv('related_queries.csv', index=False)

        # Display additional information with emojis and bold formatting
        print("\n📢❗🚨 ")
        print("\n\033[1m🔝 Top\033[0m: The most popular search queries. Scoring is on a relative scale where a value of 100 is the most commonly searched query, 50 is a query searched half as often, and a value of 0 is a query searched for less than 1% as often as the most popular query.\n")
        print("\n\033[1m🚀 Rising\033[0m: Queries with the biggest increase in search frequency since the last time period. Results marked 'Breakout' had a tremendous increase, probably because these queries are new and had few (if any) prior searches.\n")

        # Display the DataFrame using tabulate
        table = tabulate(all_queries_df, headers='keys', tablefmt='fancy_grid')
        print(table)

        # Save the rendered table (text), not the DataFrame object
        try:
            save_in_file(table)
        except Exception as save_results_err:
            logger.error(f"Failed to save search results: {save_results_err}")
        return top_rising_queries
    except Exception as e:
        print(f"get_related_queries_and_save_csv: ERROR: An error occurred: {e}")
        return None
def get_related_topics_and_save_csv(search_keywords):
    """
    Get Google Trends related topics for the given keywords and print the first ten rows.

    Only the entry for the first keyword in the response is used. The CSV
    export is currently commented out.

    Args:
        search_keywords (list): List of search keywords.

    Returns:
        pd.DataFrame: Top and rising topics side by side (both sections keep
        a column named 'topic_title'), or an empty DataFrame on error.
    """
    try:
        # Build model and payload
        trends_client = TrendReq(hl='en-US', tz=360)
        trends_client.build_payload(search_keywords, cat=0, timeframe='today 12-m')

        # Related topics of the first keyword
        first_entry = list(trends_client.related_topics().values())[0]
        top_topics = first_entry['top']
        rising_topics = first_entry['rising']

        # FIXME:Exclude specified columns
        hidden_columns = ['hasData', 'value', 'topic_mid', 'link']

        section_frames = []
        for raw_section, prefix in ((top_topics, 'Top- '), (rising_topics, 'Rising- ')):
            section_df = pd.DataFrame(raw_section).drop(columns=hidden_columns, errors='ignore')
            # Prefix every column except 'topic_title', which keeps its name
            section_df.columns = [
                name if name == 'topic_title' else prefix + name
                for name in section_df.columns
            ]
            section_frames.append(section_df)

        all_topics_df = pd.concat(section_frames, axis=1)
        #all_topics_df.to_csv('related_topics.csv', index=False)

        print(f"\n\n 📢❗🚨 Rising and Trending Keywords for {search_keywords}\n")
        print("\033[1m🔝 Top\033[0m: The most popular search topics.")
        print("\033[1m🚀 Rising\033[0m: Topics experiencing a significant increase in search frequency since the last time period. Topics marked :pile_of_poop:'Breakout' had a tremendous surge, likely because they are new and had few prior searches.")

        # Let pandas show every row, then print the first ten
        pd.set_option('display.max_rows', all_topics_df.shape[0]+1)
        print(all_topics_df.head(10))
        #print(tabulate(all_topics_df, headers='keys', tablefmt='fancy_grid'))
        return all_topics_df
    except Exception as e:
        print(f"ERROR: An error occurred: {e}")
        return pd.DataFrame()
def get_source(url, timeout=10):
    """
    Fetch ``url`` and return the response.

    Args:
        url (str): Address to request.
        timeout (float, optional): Seconds to wait for the server (default is 10).

    Returns:
        The response object, or None when the request failed or the server
        answered with an error status (the error is logged).
    """
    try:
        # Close the session when done; this function is called once per
        # expanded term, so an open session per call would leak connections.
        with HTMLSession() as session:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            return response
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
def get_results(query):
    """
    Query the Google suggest endpoint for ``query`` and return the decoded JSON.

    Returns None when the request yields no usable response or the body is
    not valid JSON; both failure kinds are logged.
    """
    try:
        encoded_query = urllib.parse.quote_plus(query)
        response = get_source(f"https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q={encoded_query}")
        if not response:
            return None
        response.raise_for_status()
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding JSON response: {e}")
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
def format_results(results):
    """
    Convert a raw suggest response into a list of term/relevance dicts.

    Args:
        results: Decoded suggest response. ``results[1]`` is read as the
            list of suggested terms and
            ``results[4]['google:suggestrelevance']`` as the parallel list
            of relevance scores.

    Returns:
        list[dict]: One ``{'term': ..., 'relevance': ...}`` dict per
        suggestion, or an empty list when the payload does not have the
        expected shape (the problem is logged).
    """
    try:
        terms = results[1]
        relevances = results[4]['google:suggestrelevance']
        # Index explicitly (no zip) so a relevance list shorter than the
        # term list is reported as malformed instead of silently truncated.
        return [
            {'term': term, 'relevance': relevances[index]}
            for index, term in enumerate(terms)
        ]
    except (KeyError, IndexError, TypeError) as e:
        # TypeError covers None or otherwise non-indexable payloads, which
        # would otherwise abort the caller's whole suggestion run.
        logging.error(f"Error parsing search results: {e}")
        return []
def get_expanded_term_suffixes():
    """Return the lowercase letters 'a' through 'z' as a list of one-character suffixes."""
    return [chr(code_point) for code_point in range(ord('a'), ord('z') + 1)]
def get_expanded_term_prefixes():
    """Return the prefixes used to expand a search term."""
    # For shopping, review type blogs.
    #return ['discount *', 'pricing *', 'cheap', 'best price *', 'lowest price', 'best value', 'sale', 'affordable', 'promo', 'budget''what *', 'where *', 'how to *', 'why *', 'buy*', 'how much*','best *', 'worse *', 'rent*', 'sale*', 'offer*','vs*','or*']
    question_prefixes = ['what *', 'where *', 'how to *', 'why *']
    comparison_prefixes = ['best *', 'vs*', 'or*']
    return question_prefixes + comparison_prefixes
def get_expanded_terms(query):
    """
    Build the list of search terms derived from ``query``.

    The list starts with the query itself, followed by every
    "<prefix> <query>" combination and then every "<query> <suffix>"
    combination. Returns an empty list if anything fails (the error is logged).
    """
    try:
        prefixes = get_expanded_term_prefixes()
        suffixes = get_expanded_term_suffixes()
        prefixed_terms = [f"{prefix} {query}" for prefix in prefixes]
        suffixed_terms = [f"{query} {suffix}" for suffix in suffixes]
        return [query] + prefixed_terms + suffixed_terms
    except Exception as e:
        logging.error(f"Error in get_expanded_terms: {e}")
        return []
def get_expanded_suggestions(query):
    """
    Collect auto-suggestions for every expanded form of ``query``.

    Each expanded term is looked up in turn (with a progress bar) and the
    formatted suggestions are gathered into one list, sorted by descending
    'relevance'. Returns an empty list if anything fails (the error is logged).
    """
    try:
        collected = []
        term_iterator = tqdm(get_expanded_terms(query), desc="📢❗🚨 Fetching Google AutoSuggestions", unit="term")
        for expanded_term in term_iterator:
            raw_results = get_results(expanded_term)
            if not raw_results:
                continue
            collected.extend(format_results(raw_results))
        # Highest relevance first; entries without a score count as 0
        collected.sort(key=lambda entry: entry.get('relevance', 0), reverse=True)
        return collected
    except Exception as e:
        logging.error(f"Error in get_expanded_suggestions: {e}")
        return []
def get_suggestions_for_keyword(search_term):
    """
    Fetch Google auto-suggestions for ``search_term`` as a DataFrame.

    Args:
        search_term (str): The root keyword to expand.

    Returns:
        pd.DataFrame: Columns 'Keywords' and 'Relevance', with duplicate
        keywords removed. The frame is empty (same columns) when no
        suggestions were found or an error occurred.
    """
    result_columns = ['Keywords', 'Relevance']
    try:
        expanded_results = get_expanded_suggestions(search_term)
        # Naming the source columns up front keeps the rename valid when
        # the suggestion list is empty.
        expanded_results_df = pd.DataFrame(expanded_results, columns=['term', 'relevance'])
        expanded_results_df.columns = result_columns
        #expanded_results_df.to_csv('results.csv', index=False)
        pd.set_option('display.max_rows', expanded_results_df.shape[0]+1)
        expanded_results_df.drop_duplicates('Keywords', inplace=True)
        return expanded_results_df
    except Exception as e:
        logging.error(f"get_suggestions_for_keyword: Error in main: {e}")
        # Return a typed empty frame rather than an implicit None so
        # callers can keep treating the result as a DataFrame.
        return pd.DataFrame(columns=result_columns)
def perform_keyword_clustering(expanded_results_df, num_clusters=5):
    """
    Cluster keywords with K-means over their TF-IDF vectors.

    The input frame is modified in place: 'Keywords' is lower-cased and a
    'cluster_label' column is added.

    Args:
        expanded_results_df (pd.DataFrame): Frame with a 'Keywords' column.
        num_clusters (int): Requested number of clusters. It is capped at
            the number of keywords.

    Returns:
        pd.DataFrame: The same frame with cluster labels, or an empty
        DataFrame when clustering failed (the error is logged).
    """
    try:
        # Preprocessing: Convert the keywords to lowercase
        expanded_results_df['Keywords'] = expanded_results_df['Keywords'].str.lower()

        # Vectorization: fit a TF-IDF vectorizer to the keywords
        vectorizer = TfidfVectorizer()
        tfidf_vectors = vectorizer.fit_transform(expanded_results_df['Keywords'])

        # K-means cannot form more clusters than there are samples.
        sample_count = tfidf_vectors.shape[0]
        effective_clusters = max(1, min(int(num_clusters), sample_count))
        kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(tfidf_vectors)

        # Add cluster labels to the DataFrame
        expanded_results_df['cluster_label'] = cluster_labels

        # The silhouette score is only defined for 2..n_samples-1 distinct
        # labels. Skip it otherwise so a valid clustering is not discarded.
        distinct_labels = len(set(cluster_labels))
        if 2 <= distinct_labels <= sample_count - 1:
            silhouette_avg = silhouette_score(tfidf_vectors, cluster_labels)
            print(f"Silhouette Score: {silhouette_avg}")
        else:
            logging.info("Silhouette score skipped: not defined for this number of clusters and samples.")

        # Visualize cluster quality using a silhouette plot
        #visualize_silhouette(tfidf_vectors, cluster_labels)
        return expanded_results_df
    except Exception as e:
        logging.error(f"Error in perform_keyword_clustering: {e}")
        return pd.DataFrame()
def visualize_silhouette(X, labels):
    """
    Print the mean silhouette score and draw a silhouette plot.

    Args:
        X: Feature matrix that was clustered (``X.shape[0]`` is read as the sample count).
        labels: Cluster label per sample; compared element-wise against each
            cluster id, so presumably a NumPy array - verify against callers.

    Errors are logged, not raised.
    """
    try:
        mean_score = silhouette_score(X, labels)
        print(f"Silhouette Score: {mean_score}")

        band_gap = 10  # blank rows separating consecutive cluster bands

        # Single axes holding the silhouette plot
        figure, axis = plt.subplots(1, 1, figsize=(8, 6))
        axis.set_xlim([-0.1, 1])
        cluster_ids = set(labels)
        axis.set_ylim([0, X.shape[0] + (len(cluster_ids) + 1) * band_gap])

        # Silhouette value of every individual sample
        per_sample_scores = silhouette_samples(X, labels)

        band_start = band_gap
        for cluster_id in cluster_ids:
            # Sorted scores of the samples assigned to this cluster
            band_scores = per_sample_scores[labels == cluster_id]
            band_scores.sort()
            band_size = band_scores.shape[0]
            band_end = band_start + band_size

            band_color = plt.cm.nipy_spectral(float(cluster_id) / len(cluster_ids))
            axis.fill_betweenx(np.arange(band_start, band_end),
                               0, band_scores,
                               facecolor=band_color, edgecolor=band_color, alpha=0.7)
            # Cluster number, written at the vertical middle of its band
            axis.text(-0.05, band_start + 0.5 * band_size, str(cluster_id))

            # The next band starts after a gap
            band_start = band_end + band_gap

        axis.set_title("Silhouette plot for KMeans clustering")
        axis.set_xlabel("Silhouette coefficient values")
        axis.set_ylabel("Cluster label")
        # Dashed vertical line at the mean silhouette score
        axis.axvline(x=mean_score, color="red", linestyle="--")
        plt.show()
    except Exception as e:
        logging.error(f"Error in visualize_silhouette: {e}")
def print_and_return_top_keywords(expanded_results_df, num_clusters=5):
    """
    Display, save and return the top keywords of each cluster.

    Args:
        expanded_results_df (pd.DataFrame): DataFrame containing expanded
            keywords, relevance, and cluster labels ('Relevance' and
            'cluster_label' columns are read).
        num_clusters (int or str): Number of clusters (labels
            0..num_clusters-1 are used) or 'all' for every label present.

    Returns:
        pd.DataFrame: The five most relevant rows of each cluster, stacked
        in cluster order.
    """
    if num_clusters == 'all':
        unique_clusters = expanded_results_df['cluster_label'].unique()
    else:
        unique_clusters = range(int(num_clusters))

    # Gather the per-cluster slices first and concatenate once.
    per_cluster_top = [
        expanded_results_df[expanded_results_df['cluster_label'] == cluster_id]
        .sort_values(by='Relevance', ascending=False)
        .head(5)
        for cluster_id in unique_clusters
    ]
    top_keywords_df = pd.concat(per_cluster_top) if per_cluster_top else pd.DataFrame()

    print("\n📢❗🚨 GTop Keywords for All Clusters:")
    table = tabulate(top_keywords_df, headers='keys', tablefmt='fancy_grid')
    # Save the rendered table (text); the file helper writes strings, not DataFrames
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    print(table)
    return top_keywords_df
def generate_wordcloud(keywords):
    """
    Generate and display a word cloud from a list of keywords.

    Args:
        keywords (list): List of keywords (strings). Nothing is drawn when
            the list contains no text.
    """
    # The file only does `import wordcloud`, so the class has to be
    # imported explicitly.
    from wordcloud import WordCloud

    # Convert the list of keywords to a string
    text = ' '.join(keywords)
    if not text.strip():
        logging.warning("generate_wordcloud: no keywords to draw.")
        return

    # Generate word cloud
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # figsize is in inches, so 8x4 matches the 800x400 image at 100 dpi;
    # 600x200 inches would be an enormous canvas.
    plt.figure(figsize=(8, 4))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def save_in_file(table_content):
    """
    Helper function to save search analysis in a file.

    Appends ``table_content`` followed by three newlines to the file named
    by the SEARCH_SAVE_FILE environment variable. Failures are logged, never
    raised.

    Args:
        table_content: Text to append. Non-string values (for example a
            DataFrame) are converted with ``str()`` first.
    """
    # `os` is not imported at the top of this file.
    import os

    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return
    try:
        text = table_content if isinstance(table_content, str) else str(table_content)
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Append, so content already written to the same report file is
        # kept; UTF-8 because the tables contain emoji and box-drawing characters.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(text)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
def do_google_trends_analysis(search_term):
    """
    Get a google search keyword, get its stats.

    Fetches auto-suggestions for the keyword, clusters them, collects the
    top keywords per cluster, adds the related Google Trends topic titles,
    prints the combined keywords as a table and draws a word cloud.

    Args:
        search_term (str): The root keyword to analyse.

    Returns:
        list[str] | None: Cluster keywords followed by related topic titles,
        or None when the analysis failed (the error is logged).
    """
    search_term = [f"{search_term}"]
    all_the_keywords = []
    try:
        for asearch_term in search_term:
            #FIXME: Lets work with a single root keyword.
            suggestions_df = get_suggestions_for_keyword(asearch_term)
            result_df = perform_keyword_clustering(suggestions_df)
            # Display top keywords in each cluster
            top_keywords = print_and_return_top_keywords(result_df)
            all_the_keywords.extend(
                str(keyword) for keyword in top_keywords['Keywords'].tolist() if keyword
            )

        # # FIXME: Get result from vision GPT. Fetch and visualize Google Trends data
        # #trends_data = fetch_google_trends_interest_overtime("llamaindex")
        #
        # # FIXME: Plot Interest Over time.
        # result_df = plot_interest_by_region(search_term)

        # Display additional information
        result_df = get_related_topics_and_save_csv(search_term)
        # A boolean mask always yields a DataFrame, whether zero, one or
        # several columns are named 'topic_title'.
        title_columns = result_df.loc[:, result_df.columns == 'topic_title']
        # Drop missing cells before converting to text, otherwise they
        # would become the strings 'nan' / 'None'.
        top_topic_title = [
            str(title)
            for row in title_columns.values.tolist()
            for title in row
            if pd.notna(title) and title
        ]
        print(f"\nRising and Top keywords: {', '.join(top_topic_title)}")
        print(f"\n\n📢❗🚨 Important keywords to target: {', '.join(all_the_keywords)}\n\n")

        # Combine as lists so the last keyword and the first topic stay
        # separate entries.
        all_the_keywords = all_the_keywords + top_topic_title

        # Split the list into rows of 4 keywords, padding the last row so
        # every row has as many cells as there are columns.
        chunk_size = 4
        chunks = [all_the_keywords[i:i + chunk_size] for i in range(0, len(all_the_keywords), chunk_size)]
        chunks = [chunk + [''] * (chunk_size - len(chunk)) for chunk in chunks]
        combined_df = pd.DataFrame(chunks, columns=[f'K📢eyword Col{i + 1}' for i in range(chunk_size)])
        # Print the table
        print(tabulate(combined_df, headers='keys', tablefmt='fancy_grid'))

        # The word cloud is only a visual aid; a failure there must not
        # discard the collected keywords.
        try:
            generate_wordcloud(all_the_keywords)
        except Exception as wordcloud_err:
            logging.error(f"Failed to generate word cloud: {wordcloud_err}")
        return all_the_keywords
    except Exception as e:
        logging.error(f"Error in main: {e}")
        return None

View File

@@ -0,0 +1,49 @@
import sys
import json
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
# FIXME: Provide num_blogs, num_faqs as inputs.
def _strip_markdown_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    lines = text.strip().split('\n')
    if lines and lines[0].lstrip().startswith('```'):
        lines = lines[1:]
    if lines and lines[-1].strip().startswith('```'):
        lines = lines[:-1]
    return '\n'.join(lines)


def get_blog_sections_from_websearch(search_keyword, search_results, gpt_providers="gemini"):
    """
    Ask an LLM for a blog title and five sub titles based on web search results.

    Args:
        search_keyword (str): The keyword that was researched.
        search_results: The Google search result to base the titles on; it
            is interpolated into the prompt as text.
        gpt_providers (str): Provider selector; 'gemini' is checked first, then 'openai'.

    Returns:
        The parsed JSON object for Gemini, the raw response for OpenAI, or
        None when ``gpt_providers`` names neither provider.

    Raises:
        Exception: Whatever the provider call or JSON parsing raised, after logging it.
    """
    prompt = f"""
As a SEO expert and content writer, I will provide you with a search keyword and its google search result.
Your task is to write a blog title and 5 blog sub titles, from the given google search result.
The subtitles should be less than 40 characters and click worthy.
Do not explain, describe your response. Respond in json format, always name the key as 'blogSections'.
Web Research Keyword: "{search_keyword}"
Google search Result: "{search_results}"
"""
    if 'gemini' in gpt_providers:
        try:
            response = gemini_text_response(prompt)
            # The model may wrap its JSON in a Markdown code fence; remove
            # only lines that really are fence markers.
            if '```' in response:
                response = _strip_markdown_fence(response)
            return json.loads(response)
        except Exception as err:
            # Log only the original error. `response` may be unassigned
            # here, and as a string it has no `prompt_feedback` attribute.
            logger.error(f"Failed to get response from gemini: {err}")
            raise
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
            logger.error(f"Failed to get response from Openai: {err}")
            raise
    return None

View File

@@ -0,0 +1,41 @@
import sys
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def summarize_competitor_content(research_content, gpt_providers="openai"):
    """
    Summarize researched company/web page content into a short report with an LLM.

    Args:
        research_content (str): The content to summarize.
        gpt_providers (str): Provider selector; 'gemini' is checked first, then 'openai'.

    Returns:
        The provider's response, or None when ``gpt_providers`` names
        neither provider.

    Raises:
        Exception: Whatever the provider call raised, after logging it.
    """
    # Both providers get the same instructions; sending the bare content
    # (as the OpenAI path used to) gives the model no task to perform.
    prompt = f"""You are a helpful assistant writing a research report about a company. I will provide you with company details.
Summarize the given company details into multiple paragraphs.
Be extremely concise, professional, and factual as possible.
The first paragraph should be an introduction and summary of the company.
The second paragraph should include pros and cons of the company.
The third paragraph should be on their pricing model.
Include a conclusion, summarizing your research about the given company details.
Company details: '{research_content}'"""
    if 'gemini' in gpt_providers:
        try:
            response = gemini_text_response(prompt)
            return response
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise
    return None

View File

@@ -0,0 +1,185 @@
################################################################
#
#
#
##############################################################
import os
import json
from pathlib import Path
import sys
from typing import List, NamedTuple
from loguru import logger
from datetime import datetime
from ..gpt_providers.gemini_pro_text import gemini_text_response
from .tavily_ai_search import get_tavilyai_results
from .metaphor_basic_neural_web_search import metaphor_find_similar, metaphor_search_articles
from .google_serp_search import google_search
from .google_trends_researcher import do_google_trends_analysis
from .gpt_blog_sections import get_blog_sections_from_websearch
from .web_research_report import write_web_research_report
# Configure logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def gpt_web_researcher(search_keywords, time_range=None, include_domains=None, similar_url=None):
    """Run web research for the given keywords (work in progress).

    Currently only the Google Trends analysis is active; the Google, Tavily
    and Metaphor searches are kept below as commented-out WIP code.

    Args:
        search_keywords (str): Keywords to research.
        time_range: Time filter; only used by the disabled Metaphor search.
        include_domains (list, optional): Domains to restrict the search to.
            Any falsy value is treated as an empty list.
        similar_url: URL for similar-content search; only used by the
            disabled Metaphor search.

    Returns:
        None. The Google Trends result is printed.

    Side effects:
        Sets the SEARCH_SAVE_FILE environment variable to a timestamped path
        under ``<cwd>/workspace/web_research_reports``.
    """
    # Build a fresh list per call; a mutable default would be shared between calls.
    if not include_domains:
        include_domains = list()
    print(f"Web Research:Time Range - {time_range},Search Keywords - {search_keywords},Include URLs - {include_domains}")

    # TBD: Keeping the results directory as fixed, for now.
    os.environ["SEARCH_SAVE_FILE"] = os.path.join(os.getcwd(), "workspace", "web_research_reports",
            search_keywords.replace(" ", "_") + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

    # Collect all blog titles featuring in search results. This *may help in generating blog titles
    # closest to competing ones. All search blog titles, given keyword and keywords from analysis, give
    # llm a good context for the task of generating blog titles.
    blog_titles = []
    # Get a list of FAQs from search results.
    blog_faqs = None
    google_result = None
    tavily_result = None
    report = None
    # Pre-bind so the print below cannot hit a NameError when the analysis fails.
    important_keywords = None

#    try:
#        logger.info(f"Doing Google search for: {search_keywords}\n")
#        google_result = google_search(search_keywords)
#        blog_titles.append(extract_info(google_result, "titles"))
#    except Exception as err:
#        logger.error(f"Failed to do Google Serpapi research: {err}")
#        # Not failing, as tavily would do same and then GPT-V to search.
#
#    try:
#        # FIXME: Include the follow-up questions as blog FAQs.
#        logger.info(f"Doing Tavily AI search for: {search_keywords}")
#        tavily_result = get_tavilyai_results(search_keywords, include_domains)
#        blog_titles.append(tavily_extract_information(tavily_result, "titles"))
#    except Exception as err:
#        logger.error(f"Failed to do Tavily AI Search: {err}")

#    try:
#        logger.info(f"Start Semantic/Neural web search with Metahpor: {search_keywords}")
#        response_articles = metaphor_search_articles(
#                search_keywords,
#                include_domains=include_domains,
#                time_range=time_range,
#                similar_url=similar_url)
#        blog_titles.append(metaphor_extract_titles_or_text(response_articles, return_titles=True))
#    except Exception as err:
#        logger.error(f"Failed to do Metaphor search: {err}")
#    print(blog_titles)

    try:
        logger.info(f"Do Google Trends analysis for given keywords: {search_keywords}")
        important_keywords = do_google_trends_analysis(search_keywords)
    except Exception as err:
        logger.error(f"Failed to do google trends analysis: {err}")
    print(important_keywords)

    # Now that we have search results from given keywords. Generate blog title and subtopics suggestions.
    # 1. Return a list of related keywords along with search volumes.
    # 2. New blog titles to write on(niche, top) and blog sections.
    # 3. Competitors list, similar urls if given.
class Result(NamedTuple):
    """Typed record for one search result, as unpacked from a JSON dict."""

    url: str                        # page address
    id: str                         # result identifier
    title: str                      # page title
    score: float                    # score attached to the result
    published_date: str             # publication date, kept as a string
    author: str                     # author name
    text: str                       # page text (or its summary)
    highlights: List[str]           # highlighted excerpts
    highlight_scores: List[float]   # one score per highlight
def metaphor_extract_titles_or_text(json_data, return_titles=True):
    """
    Extract either the titles or the text from a list of search results.

    Each item may be a dict (JSON form) or an object exposing ``title`` /
    ``text`` attributes. Reading the field directly, rather than unpacking the
    item into a fixed tuple, means extra or missing fields do not raise.

    Args:
        json_data (list): Search results as dicts or attribute-style objects.
            None is treated as an empty list.
        return_titles (bool): If True, return titles. If False, return text.

    Returns:
        list: One entry per result; None where a result lacks the field.
    """
    field = "title" if return_titles else "text"
    extracted = []
    for item in json_data or []:
        if isinstance(item, dict):
            extracted.append(item.get(field))
        else:
            extracted.append(getattr(item, field, None))
    return extracted
def extract_info(json_data, info_type):
    """
    Pull one kind of information out of a search-result JSON dict.

    Args:
        json_data (dict): The JSON data.
        info_type (str): One of 'titles', 'peopleAlsoAsk' or 'relatedSearches'.

    Returns:
        list or None: The requested values (None for entries lacking the
        field), or None if ``info_type`` is not recognised.
    """
    # info_type -> (top-level key holding the entries, field read from each entry)
    sources = {
        "titles": ("organic", "title"),
        "peopleAlsoAsk": ("peopleAlsoAsk", "question"),
        "relatedSearches": ("relatedSearches", "query"),
    }
    if info_type not in sources:
        print("Invalid info_type. Please use 'titles', 'peopleAlsoAsk', or 'relatedSearches'.")
        return None
    section, field = sources[info_type]
    return [entry.get(field) for entry in json_data.get(section, [])]
def tavily_extract_information(json_data, keyword):
    """
    Extract information from a Tavily result dict based on the given keyword.

    Args:
        json_data (dict): The JSON data; must contain the key that the
            chosen keyword reads ('results', 'answer' or 'follow_up_questions').
        keyword (str): 'title' (or its alias 'titles', the spelling used by
            extract_info), 'content', 'answer' or 'follow-query'.

    Returns:
        list or str: The extracted information, or the string
        "Invalid keyword: <keyword>" when the keyword is not recognised.

    Raises:
        KeyError: If the key needed for the chosen keyword is absent.
    """
    # Accept 'titles' too, so the plural used elsewhere in this module does
    # not fall through to the "Invalid keyword" string.
    if keyword in ('title', 'titles'):
        return [result['title'] for result in json_data['results']]
    elif keyword == 'content':
        return [result['content'] for result in json_data['results']]
    elif keyword == 'answer':
        return json_data['answer']
    elif keyword == 'follow-query':
        return json_data['follow_up_questions']
    else:
        return f"Invalid keyword: {keyword}"
def compete_organic_results(query, report, organic_results):
    """Ask Gemini to rewrite a blog so it competes with the top organic results.

    Args:
        query: Topic of the blog; interpolated into the prompt.
        report: Existing blog content to rewrite.
        organic_results: Organic search results used as the competition.

    Returns:
        Whatever ``gemini_text_response`` returns for the prompt.
    """
    seo_rewrite_prompt = f""" As an SEO expert and copywriter, I will provide you with my blog content on topic '{query}', and
Top google search results.
Your task is to rewrite the given blog to make it compete against top position results.
Make sure, the new blog has high probability of ranking highest against given organic search result competitors.
Modify the given blog content following best SEO practises.
Make sure the blog is original, unique and highly readable.
Remember, Maintain and adopt the formatting, structure, style and tone of the provided blog content.
Include relevant emojis in your final blog for visual appeal. Use it sparingly.
Your response should be well-structured, objective, and critically acclaimed blog article based on provided texts.
Remember, your goal is to create a detailed blog article that will compete against given organic result competitors.
Do not provide explanations, suggestions for your response, reply only with your final response.
Take your time in crafting your content, do not rush to give the response.
Blog Content: '{report}'\n
Organic Search result: '{organic_results}'
"""
    return gemini_text_response(seo_rewrite_prompt)

View File

@@ -0,0 +1,38 @@
import sys
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def summarize_web_content(page_content, gpt_providers="openai"):
    """Briefly summarize the content of a web page with the chosen LLM provider.

    Args:
        page_content (str): Text of the web page.
        gpt_providers (str): Provider selector, matched by substring
            ('gemini' is checked first, then 'openai').

    Returns:
        Whatever the selected provider helper returns, or None when
        ``gpt_providers`` names neither supported provider.

    Raises:
        Exception: Any error raised by the provider call is logged and re-raised.
    """
    # The same instructions go to every provider: sending only the raw page
    # text (as the OpenAI path used to) gives the model no task to perform.
    prompt = f"""You are a helpful assistant that briefly summarizes the content of a webpage.
Summarize the given web page content below.
Web page content: '{page_content}'"""
    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise
    # Make the "no provider matched" outcome visible instead of silently returning None.
    logger.error(f"Unsupported gpt provider: {gpt_providers}")
    return None

View File

@@ -0,0 +1,53 @@
import sys
import json
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
# FIXME: Provide num_blogs, num_faqs as inputs.
def gpt_titles_faqs_google_search(search_keyword, search_results, gpt_providers="openai"):
    """Generate a blog title and FAQs from a keyword and its Google search results.

    Args:
        search_keyword (str): The web research keyword.
        search_results: Google search result (JSON) interpolated into the prompt.
        gpt_providers (str): Provider selector, matched by substring
            ('gemini' is checked first, then 'openai').

    Returns:
        For Gemini, the response parsed with ``json.loads``. For OpenAI, the
        raw value returned by ``openai_chatgpt`` (it is not parsed here).
        None when ``gpt_providers`` names neither supported provider.

    Raises:
        Exception: Provider errors and JSON decoding errors are logged and re-raised.
    """
    prompt = f"""
As a SEO expert and content writer, I will provide you with my web research keyword and its google search result in json format.
Your task is to write 1 blog title and 10 FAQs.
1). Your blog title should compete against all the provided search results.
2). Your FAQ should be based on 'People also ask' and 'Related Queries' from given result.
Always include answers for each FAQ, use your knowledge and confirm with snippets given in search result.
3). Respond in json data with 'blogTitles' and 'FAQs' as json keys. Do not explain, describe your response.
4). Follow best practises of SEO.
Web Research Keyword: "{search_keyword}"
Google search Result: "{search_results}"
"""
    logger.info("Generating blog title and FAQs from web search result.")
    if 'gemini' in gpt_providers:
        try:
            response = gemini_text_response(prompt)
            print(f"\n\n\n RESPONSE: {response}\n\n\n")
            return json.loads(_strip_markdown_fence(response))
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from Openai: {err}")
            raise
    # Make the "no provider matched" outcome visible instead of silently returning None.
    logger.error(f"Unsupported gpt provider: {gpt_providers}")
    return None


def _strip_markdown_fence(text):
    """Return ``text`` without a wrapping Markdown code fence, if it has one."""
    lines = text.strip().split('\n')
    # Drop a line only when it really is a fence. Blindly removing the first
    # and last lines would corrupt unfenced JSON that merely contains ``` in a value.
    if lines and lines[0].lstrip().startswith('```'):
        lines = lines[1:]
    if lines and lines[-1].strip().startswith('```'):
        lines = lines[:-1]
    return '\n'.join(lines)

View File

@@ -0,0 +1,223 @@
import os
import sys
import pandas as pd
from io import StringIO
from pathlib import Path
from metaphor_python import Metaphor
from datetime import datetime, timedelta
from loguru import logger
from tqdm import tqdm
from tabulate import tabulate
from collections import namedtuple
import textwrap
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from dotenv import load_dotenv
load_dotenv(Path('../../.env'))
from exa_py import Exa
from tenacity import (retry, stop_after_attempt, wait_random_exponential,)# for exponential backoff
from .gpt_summarize_web_content import summarize_web_content
from .gpt_competitor_analysis import summarize_competitor_content
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def _create_exa_client(api_key):
    """Construct the Exa client; retried with exponential backoff on failure."""
    return Exa(api_key)


def get_metaphor_client():
    """
    Get the Metaphor (Exa) client.

    The API key check is deliberately kept outside the retried helper: a
    missing environment variable cannot fix itself, so retrying it with
    backoff only delays the error and hides the ValueError behind tenacity's
    RetryError.

    Returns:
        Exa: A client built from the METAPHOR_API_KEY environment variable.

    Raises:
        ValueError: If METAPHOR_API_KEY is unset or empty.
    """
    api_key = os.environ.get('METAPHOR_API_KEY')
    if not api_key:
        raise ValueError("METAPHOR_API_KEY environment variable not set!")
    return _create_exa_client(api_key)
def metaphor_rag_search():
    """ Mainly used for researching blog sections.

    Stub: it only obtains the Metaphor client and performs no search yet.
    """
    _client = get_metaphor_client()
    return None
def metaphor_find_similar(similar_url):
    """
    Find pages similar to a URL and summarize each competitor found.

    For every similar result ("competitor"), a keyword search is run on its
    URL, the text of those results is concatenated, and a Gemini summary of
    it replaces the competitor's ``text``. The competitors are then printed
    (and saved) via ``print_search_result``.

    Args:
        similar_url (str): The URL to find similar content for.

    Returns:
        The response of the find-similar call, whose results carry the
        summarized text.

    Raises:
        Exception: If the find-similar request itself fails (logged, re-raised).
    """
    metaphor = get_metaphor_client()
    try:
        logger.info(f"Doing similar web search for url: {similar_url}")
        similar_response = metaphor.find_similar_and_contents(
            similar_url,
            highlights=True,
            num_results=10)
    except Exception as e:
        logger.error(f"Metaphor: Error in finding similar content: {e}")
        raise

    competitors = similar_response.results
    for acompetitor in tqdm(competitors, desc="Processing Competitors", unit="competitor"):
        try:
            # Separate variable: reusing the outer response name would change
            # what this function returns and let stale results leak in on failure.
            research_response = metaphor.search_and_contents(
                acompetitor.url,
                type="keyword",
                num_results=5
            )
        except Exception as err:
            logger.error(f"Failed to do metaphor keyword/url research: {err}")
            # Nothing was fetched for this competitor; do not summarize
            # content that belongs to a previous request.
            continue

        # Progress bar for the inner loop; a result without text contributes nothing.
        all_contents = "".join(
            r.text or ""
            for r in tqdm(research_response.results, desc=f"{acompetitor.url}", unit="research"))
        try:
            acompetitor.text = summarize_competitor_content(all_contents, "gemini")
        except Exception as err:
            logger.error(f"Failed to summarize_web_content: {err}")

    print_search_result(competitors)
    return similar_response
def metaphor_search_articles(query,
                             num_results=5,
                             use_autoprompt=True,
                             include_domains=None,
                             time_range=None,
                             similar_url=None):
    """
    Search for articles using the Metaphor API and summarize each result.

    Args:
        query (str): The search query.
        num_results (int): Number of results to retrieve.
        use_autoprompt (bool): Whether to use autoprompt.
        include_domains (list, optional): Domains to include. None means no
            restriction (an empty list is sent).
        time_range (str): How far back to look: "past day", "past week",
            "past month" or "past year" (the short forms "day", "week",
            "month", "year" are accepted too). Anything else means no date filter.
        similar_url (str, optional): If given, a similar-content search is
            also run for this URL after the main search.

    Returns:
        list: The search results, each with ``text`` replaced by a Gemini summary.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    # A fresh list per call; a mutable default would be shared between calls.
    include_domains = [] if include_domains is None else include_domains
    lookback_days = {
        "past day": 1, "day": 1,
        "past week": 7, "week": 7,
        "past month": 30, "month": 30,
        "past year": 365, "year": 365,
    }
    metaphor = get_metaphor_client()
    try:
        days = lookback_days.get(time_range)
        if days:
            start_published_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
        else:
            start_published_date = None

        logger.info(f"Metaphor web search with Date: {start_published_date} and Query: {query}")
        try:
            search_response = metaphor.search_and_contents(
                query,
                include_domains=include_domains,
                use_autoprompt=use_autoprompt,
                start_published_date=start_published_date,
                num_results=num_results
            )
        except Exception as err:
            logger.error(f"Failed in metaphor.search_and_contents: {err}")
            # Re-raise the real error; continuing would only fail later with
            # a NameError on the unbound search_response.
            raise

        # From each webpage, get a summary of the web page.
        contents_response = search_response.results
        for content in tqdm(contents_response, desc="Reading Web URL content:", unit="content"):
            content.text = summarize_web_content(content.text, "gemini")

        print_search_result(contents_response)
        if similar_url:
            logger.info(f"Doing similar/semantic search for URL: {similar_url}")
            metaphor_find_similar(similar_url)
        return contents_response
    except Exception as e:
        logger.error(f"Error in Metaphor searching articles: {e}")
        raise
def print_search_result(contents_response):
    """Print search results as a table and append the table to the results file.

    Args:
        contents_response: Iterable of result objects exposing ``url``,
            ``title``, ``published_date`` and ``text`` attributes.
    """
    # Tabulate the data
    table_headers = ["URL", "Title", "Published Date", "Summary"]
    table_data = [(result.url, result.title, result.published_date, result.text) for result in contents_response]

    table = tabulate(table_data,
                     headers=table_headers,
                     tablefmt="fancy_grid",
                     colalign=["left", "left", "left", "left"],
                     maxcolwidths=[20, 20, 10, 60])
    print(table)
    # Save the combined table to a file; a save failure must not hide the printed output.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
def save_in_file(table_content):
    """ Helper function to save search analysis in a file.

    Appends ``table_content`` followed by three newlines to the file named by
    the SEARCH_SAVE_FILE environment variable. Failures are logged, never raised.

    Args:
        table_content (str): Text to append.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("SEARCH_SAVE_FILE environment variable is not set; search results not saved.")
        return
    try:
        # The report directory may not exist yet on a first run.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Tables use box-drawing characters, so pin UTF-8 rather than relying
        # on the platform default encoding.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
def metaphor_scholar_search(query, include_domains=None, time_range="anytime"):
    """
    Search for papers through the Metaphor client.

    Args:
        query (str): The search query.
        include_domains (list): List of domains to include.
        time_range (str): "day", "week", "month" or "year"; any other value
            (e.g. "anytime") applies no date filter.

    Returns:
        The client's search response, or None if the search raised (the
        error is logged, not propagated).
    """
    client = get_metaphor_client()
    # (label, look-back window) pairs; an unmatched label leaves the start date unset.
    lookback_windows = (
        ("day", timedelta(days=1)),
        ("week", timedelta(weeks=1)),
        ("month", timedelta(weeks=4)),
        ("year", timedelta(days=365)),
    )
    try:
        start_published_date = None
        for label, window in lookback_windows:
            if time_range == label:
                start_published_date = (datetime.utcnow() - window).strftime('%Y-%m-%dT%H:%M:%SZ')
                break
        return client.search(query, include_domains=include_domains, start_published_date=start_published_date, use_autoprompt=True)
    except Exception as e:
        logger.error(f"Error in searching papers: {e}")

View File

@@ -0,0 +1,156 @@
"""
This Python script uses the Tavily AI service to perform advanced searches based on specified keywords and options. It retrieves Tavily AI search results, pretty-prints them using Rich and Tabulate, and provides additional information such as the answer to the search query and follow-up questions.
Features:
- Utilizes the Tavily AI service for advanced searches.
- Retrieves API keys from the environment variables loaded from a .env file.
- Configures logging with Loguru for informative messages.
- Implements a retry mechanism using Tenacity to handle transient failures during Tavily searches.
- Displays search results, including titles, snippets, and links, in a visually appealing table using Tabulate and Rich.
Usage:
- Ensure the necessary API keys are set in the .env file.
- Run the script to perform a Tavily AI search with specified keywords and options.
- The search results, including titles, snippets, and links, are displayed in a formatted table.
- Additional information, such as the answer to the search query and follow-up questions, is presented in separate tables.
Modifications:
- To modify the script, update the environment variables in the .env file with the required API keys.
- Adjust the search parameters, such as keywords and search depth, in the `get_tavilyai_results` function as needed.
- Customize logging configurations and table formatting according to preferences.
To-Do (TBD):
- Consider adding further enhancements or customization based on specific use cases.
Note: This script depends on external libraries such as Tavily, Rich, Tabulate, Loguru, and Tenacity. Install them using 'pip install tavily rich tabulate loguru tenacity' if not already installed.
"""
import os
from pathlib import Path
import sys
from dotenv import load_dotenv
from loguru import logger
from tavily import TavilyClient
from rich import print
from tabulate import tabulate
# Load environment variables from .env file
load_dotenv(Path('../../.env'))
from rich import print
# Configure logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from tenacity import retry, stop_after_attempt, wait_random_exponential
from .gpt_titles_faq import gpt_titles_faqs_google_search
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_tavilyai_results(keywords, include_urls, search_depth="advanced"):
    """
    Get Tavily AI search results based on specified keywords and options.

    Args:
        keywords (str): Keywords for Tavily AI search.
        include_urls (list): Domains to restrict the search to; passed to the
            client as ``include_domains``. A falsy value means no restriction.
        search_depth (str, optional): Search depth option (default is "advanced").

    Returns:
        dict: Tavily AI search results, or None if the search (or printing
        its result table) failed; the failure is logged.

    Raises:
        ValueError: If TAVILY_API_KEY is not set.
        SystemExit: If the Tavily client cannot be created.
    """
    # Run Tavily search
    logger.info(f"Running Tavily search on: {keywords}")

    # Retrieve API keys
    api_key = os.getenv('TAVILY_API_KEY')
    if not api_key:
        # Name the variable this function actually reads.
        raise ValueError("TAVILY_API_KEY environment variable is not set.")

    # Initialize Tavily client
    try:
        client = TavilyClient(api_key=api_key)
    except Exception as err:
        logger.error(f"Failed to create Tavily client. Check TAVILY_API_KEY: {err}")
        # sys.exit rather than the bare `exit` helper, which the site module
        # provides for interactive use and is not guaranteed to exist.
        sys.exit(1)

    # Only send include_domains when the caller supplied some.
    search_options = {"include_answer": True}
    if include_urls:
        search_options["include_domains"] = include_urls

    try:
        tavily_search_result = client.search(keywords, search_depth, **search_options)
        print_result_table(tavily_search_result)
        return tavily_search_result
    except Exception as err:
        # NOTE(review): swallowing the error here means @retry never sees a
        # failed search, so transient failures are not retried. Confirm
        # whether callers rely on the None return before re-raising.
        logger.error(f"Failed to do Tavily Research: {err}")
        return None
def print_result_table(output_data):
    """ Pretty print the Tavily AI search result.

    Prints, and appends to the results file, three tables: the individual
    results (title, snippet, link), the answer to the query, and the
    follow-up questions.

    Args:
        output_data (dict): Tavily response; the keys read are 'results',
            'query', 'answer' and 'follow_up_questions'.
    """
    # A missing or None 'results' entry is shown as an empty table instead of crashing.
    results = output_data.get("results") or []
    table_data = [
        [item.get("title", ""), item.get("content", ""), item.get("url", "")]
        for item in results
    ]
    _show_and_save_table(table_data,
                         ["Title", "Snippet", "Link"],
                         colalign=["left", "left", "left"],
                         maxcolwidths=[30, 60, 30])

    # Display the 'answer' in a table
    _show_and_save_table([[output_data.get("answer")]],
                         [f"The answer to search query: {output_data.get('query')}"],
                         maxcolwidths=[80])

    # Display the 'follow_up_questions' in a table
    _show_and_save_table([[output_data.get("follow_up_questions")]],
                         [f"Search Engine follow up questions for query: {output_data.get('query')}"],
                         maxcolwidths=[80])


def _show_and_save_table(rows, headers, **tabulate_options):
    """Render rows as a fancy_grid table, print it and append it to the results file."""
    table = tabulate(rows,
                     headers=headers,
                     tablefmt="fancy_grid",
                     **tabulate_options)
    print(table)
    # Saving is best-effort: a failure is logged and must not stop the remaining tables.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
def save_in_file(table_content):
    """ Helper function to save search analysis in a file.

    Appends ``table_content`` followed by three newlines to the file named by
    the SEARCH_SAVE_FILE environment variable. Failures are logged, never raised.

    Args:
        table_content (str): Text to append.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("SEARCH_SAVE_FILE environment variable is not set; search results not saved.")
        return
    try:
        # The report directory may not exist yet on a first run.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Tables use box-drawing characters, so pin UTF-8 rather than relying
        # on the platform default encoding.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")

View File

@@ -0,0 +1,23 @@
from langchain.adapters.openai import convert_openai_messages
from langchain.chat_models import ChatOpenAI
from ..gpt_providers.gemini_pro_text import gemini_text_response
def write_web_research_report(web_research, faq_questions, gpt_provider="gemini"):
    """Write a research report from web research data and related FAQ questions.

    Args:
        web_research: Web research data (JSON) interpolated into the prompt.
        faq_questions: Related FAQ questions interpolated into the prompt.
        gpt_provider (str): Provider selector, matched by substring. Only
            'gemini' is implemented.

    Returns:
        Whatever ``gemini_text_response`` returns for the prompt.

    Raises:
        NotImplementedError: For an 'openai' provider; no OpenAI report
            writer is defined in this module.
        ValueError: For any other provider.
    """
    if "gemini" in gpt_provider:
        # Each sentence ends with a space so the concatenated literals do not
        # run together ("...reports.I will provide...").
        # NOTE(review): the prompt is passed as a one-element list, as before;
        # confirm gemini_text_response expects a list rather than a str.
        prompt = ["You are an SEO and marketing expert, who writes unique, factual and comprehensive research reports. "
                  "I will provide you web research report as json data and a list of related FAQ questions. "
                  "Use given json as context for writing your research report. "
                  "Your sole purpose is to write well written, critically acclaimed, objective and structured research report. "
                  "Use the urls from json content to provide cititations and include it in referances section of your report. "
                  "Include appropriate emojis in your research report. "
                  "Format your report in MLA format and markdown style, with special focus on readibility. "
                  f"Do not provide explanations for your response.\nWeb research Report: \"\"\" {web_research} \"\"\"\n "
                  f"\nList of FAQ questions: \"\"\" {faq_questions} \"\"\"\n"]
        return gemini_text_response(prompt)
    if "openai" in gpt_provider:
        # TODO: implement the OpenAI report writer. This branch used to call
        # an undefined helper with an undefined prompt and died with a NameError.
        raise NotImplementedError("OpenAI research report generation is not implemented yet.")
    raise ValueError(f"Unsupported gpt provider: {gpt_provider}")

View File

@@ -0,0 +1,137 @@
import requests
from clint.textui import progress
from loguru import logger
def search_ydc_index(search_query, num_web_results=10, country="IN", api_key="<api-key>", timeout=60):
    """
    Search YDC Index API and retrieve results.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key (the default is only a placeholder).
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        dict: The API response as JSON, or ``{"error": <message>}`` if the
        request or its processing failed.
    """
    try:
        url = "https://api.ydc-index.io/search"
        querystring = {
            "query": search_query,
            "num_web_results": str(num_web_results),
            "country": country
        }
        headers = {"X-API-Key": api_key}
        with progress.Bar(expected_size=num_web_results, label="Searching YDC Index") as bar:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, params=querystring, stream=True, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            result_json = response.json()
            # The bar takes a numeric progress value. Handing it the result
            # list itself made the call fail, and the blanket except below
            # then replaced a successful response with an error dict.
            received = len(result_json.get("web_results", []))
            bar.show(min(received, num_web_results))
        return result_json
    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}
    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
def get_rag_results(search_query, num_web_results=10, country="IN", api_key="<api-key>", timeout=60):
    """
    Retrieve RAG results from the YDC Index API ``/rag`` endpoint.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key (the default is only a placeholder).
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        dict: The API response as JSON, or ``{"error": <message>}`` if the
        request or its processing failed.
    """
    try:
        url = "https://api.ydc-index.io/rag"
        querystring = {
            "query": search_query,
            "num_web_results": str(num_web_results),
            "country": country
        }
        headers = {"X-API-Key": api_key}
        with progress.Bar(expected_size=num_web_results, label="Fetching RAG Results") as bar:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, params=querystring, stream=True, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            result_json = response.json()
            # The bar takes a numeric progress value. Handing it the result
            # list itself made the call fail, and the blanket except below
            # then replaced a successful response with an error dict.
            received = len(result_json.get("web_results", []))
            bar.show(min(received, num_web_results))
        return result_json
    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}
    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
def get_news_results(query, spellcheck=True, api_key="<api-key>", timeout=60):
    """
    Retrieve news results from YDC Index API.

    Args:
        query (str): The search query.
        spellcheck (bool): Whether to enable spellcheck.
        api_key (str): YDC Index API key (the default is only a placeholder).
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        dict: The API response as JSON, or ``{"error": <message>}`` if the
        request or its processing failed.
    """
    try:
        url = "https://api.ydc-index.io/news"
        querystring = {
            "q": query,
            "spellcheck": str(spellcheck).lower()
        }
        headers = {"X-API-Key": api_key}
        with progress.Bar(expected_size=1, label="Fetching News Results") as bar:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, params=querystring, stream=True, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            result_json = response.json()
            # show() needs the progress value; called with no argument it
            # raised, and the blanket except below then replaced a successful
            # response with an error dict.
            bar.show(1)
        return result_json
    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}
    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
# Example usage. Guarded so that importing this module does not fire three
# network requests (with the placeholder API key) as a side effect.
if __name__ == "__main__":
    search_query = "Getting started with llamaindex"
    result = get_news_results(search_query)
    print(result)
    result = get_rag_results(search_query)
    print(result)
    result = search_ydc_index(search_query)
    print(result)

View File

@@ -0,0 +1,37 @@
import sys
from .gpt_providers.openai_chat_completion import openai_chatgpt
from .gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def generate_blog_faq(blog_article, gpt_providers="openai"):
    """
    Generate five Markdown FAQs from the given blog content.

    Args:
        blog_article (str): The blog content the FAQs are based on.
        gpt_providers (str): Provider selector, matched by substring
            ('gemini' is checked first, then 'openai').

    Returns:
        Whatever the selected provider helper returns, or None if the call
        failed (the error is logged) or no supported provider was named.
    """
    logger.info("Generating blog FAQs.")
    prompt = f"""As an expert writer, I will provide you with blog content below.
Your task is to write 5 FAQs based on the given blog content.
Always, write fact based answers. Use emojis where applicable.
You must reply in MARKDOWN format.
blog content: '{blog_article}' """

    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            # Building a SystemError without raising it discarded the failure
            # entirely; log it, matching the Gemini branch.
            logger.error(f"Failed to get response from Openai: {err}")
    return None

View File

@@ -1,8 +0,0 @@
{
"wordpress_url": "https://latestaitools.in/",
"wordpress_username": "username",
"wordpress_password": "password",
"image_dir": "path/to/image_dir",
"output_path": "path/to/output_path"
}

View File

@@ -1,18 +0,0 @@
def generate_topic_outline(blog_title, num_subtopics):
    """
    Given a blog title, ask the OpenAI helper for an outline of sub topics.

    Args:
        blog_title (str): Title of the blog to outline.
        num_subtopics (int): Number of sub topics to request.

    Returns:
        Whatever ``openai_chatgpt`` returns for the prompt.

    Raises:
        SystemError: If the OpenAI call fails; the original error is chained.
    """
    # TBD: Remove hardcoding, make dynamic
    prompt = f"""As a SEO expert, suggest only {num_subtopics} beginner-friendly and
insightful sub topics for the blog title: {blog_title}.
Respond with only answer and no description, explanations."""
    # The suggested {num_subtopics} outline should include few long-tailed keywords and most popular questions.
    # TBD: Include --niche
    logger.info(f"Prompt used for blog title Outline :\n{prompt}\n")

    # TBD: Add logic for which_provider and which_model
    try:
        response = openai_chatgpt(prompt)
    except Exception as err:
        # Actually raise: the exception object used to be built and dropped,
        # so the function went on to fail with UnboundLocalError on `response`.
        raise SystemError(f"Error in generating Blog Title: {err}") from err
    return response

View File

@@ -0,0 +1,39 @@
import sys
from .gpt_providers.openai_chat_completion import openai_chatgpt
from .gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def github_readme_blog(readme_content, gpt_providers="openai"):
    """Write a how-to blog in Markdown from a GitHub README.

    Args:
        readme_content (str): README text the guide is based on.
        gpt_providers (str): Provider selector, matched by substring
            ('gemini' is checked first, then 'openai').

    Returns:
        Whatever the selected provider helper returns. None if the OpenAI
        call failed (the error is logged) or no supported provider was named.

    Raises:
        SystemExit: If the Gemini call fails.
    """
    prompt = f"""As an expert programmer and teacher, Write an original, detailed and step-by-step guide, from the provided Text below.
Your guide should be original, engaging and help beginners get started easily.
Write new example codes and detailed comments on how to run them. Include appropriate emoji where applicable.
Include a referances section that links to more code examples.
Your response MUST be a how-to blog in markdown format.
Respond ONLY with your blog content.
Text: '{readme_content}'
"""
    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            sys.exit(1)
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            # Building a SystemError without raising it discarded the failure
            # entirely; log it so the None result can be explained.
            logger.error(f"Failed to get response from Openai: {err}")
    return None

View File

@@ -0,0 +1,140 @@
""" Package for writing getting-started and how to guides. """
import os
import sys
import datetime
import json
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from .scrape_github_readme import get_gh_details_vision, get_readme_content
from .scrape_github_readme import research_github_topics, check_if_already_written
from .github_getting_started import github_readme_blog
from .gpt_online_researcher import do_online_research
from .faqs_generator_blog import generate_blog_faq
from .get_blog_metadata import blog_metadata
from .save_blog_to_file import save_blog_to_file
from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
def blog_from_github(github_opts, flag):
    """ Module for writing getting started code examples from github.

    Args:
        github_opts (str): A GitHub URL when ``flag`` contains 'url', or the
            path of a file listing one GitHub URL per line when it contains 'csv'.
        flag (str): Input mode selector, matched by substring ('url' is
            checked first, then 'csv').

    Raises:
        SystemExit: If writing from a single URL fails. In 'csv' mode a
            failing URL is logged and the remaining URLs are still processed.
    """
    if 'url' in flag:
        try:
            write_from_url(github_opts)
        except Exception as err:
            logger.error(f"Failed to write from github url: {github_opts}: {err}")
            sys.exit(1)
    elif 'csv' in flag:
        gh_urls = []
        try:
            with open(github_opts, 'r') as file:
                # One URL per line; blank lines are not URLs, so skip them.
                gh_urls = [line.strip() for line in file if line.strip()]
        except FileNotFoundError:
            # github_opts is the path here; the former `file_path` name was
            # undefined and turned this handler into a NameError.
            logger.error(f"CSV File not found: {github_opts}")
        except Exception as e:
            logger.error(f"CSV: An error occurred: {str(e)}")

        for gh_url in gh_urls:
            try:
                write_from_url(gh_url)
            except Exception as err:
                logger.error(f"Failed to write blog from github: {err}")
def write_from_url(gh_url):
    """Write, save and record a getting-started blog for one GitHub repo.

    Flow: skip URLs already recorded (``check_if_already_written``); get repo
    details from a screenshot via ``get_gh_details_vision``; fetch the raw
    README (reported branch, else ``main``, then ``master``); turn it into a
    blog with ``github_readme_blog``; append FAQs built from online research
    when that research succeeds; compute metadata; save the blog; append the
    URL to ``papers_already_written_on.txt``.

    Args:
        gh_url: Repository URL. Its last two path segments are used as
            ``owner/repo`` and the last one as the repo name.

    Raises:
        Exception: Re-raised when metadata generation or the bookkeeping
            append fails.

    The process exits with status 1 when repo details, the README or saving
    the blog fail.
    """
    if check_if_already_written(gh_url):
        logger.error(f"Skipping, already written on url: {gh_url}")
        return
    logger.info(f"Writing getting started from url: {gh_url}")

    # A trailing slash would otherwise yield an empty repo name.
    url_parts = gh_url.rstrip("/").split("/")
    repo_name = url_parts[-1]

    # fixme: Remove the hardcoding, need add another option OR in config ?
    image_dir = os.path.join(os.getcwd(), "blog_images")
    generated_image_name = f"screenshot_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
    generated_image_filepath = os.path.join(image_dir, generated_image_name)

    try:
        logger.info(f"Getting github repo details from vision model: {generated_image_filepath}")
        gh_json = get_gh_details_vision(gh_url, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to get gemini vision details from GH repo image: {err}")
        sys.exit(1)

    howto_blog = "```" + f"\nGithub URL:{gh_url}\nStars:{gh_json.get('stars')}\n"
    howto_blog += f"Forks:{gh_json.get('forks')}\n"
    howto_blog += f"Description:{gh_json.get('about')}\nBranch:{gh_json.get('branch_name')}\n" + "```\n\n"

    # Direct link to the raw content of the README file.
    raw_readme_url_base = "https://raw.githubusercontent.com/" + "/".join(url_parts[-2:])
    branch_name = gh_json.get('branch_name') or "main"
    raw_readme_url = raw_readme_url_base + f"/{branch_name}/" + "README.md"
    logger.info(f"Using this url to fetch the README file: {raw_readme_url}")

    # Pre-bind so a failed first fetch falls through to the master retry
    # instead of raising UnboundLocalError.
    readme_content = None
    try:
        readme_content = get_readme_content(raw_readme_url)
    except Exception as err:
        logger.error(f"Failed to get README from URL: {raw_readme_url}: {err}")

    # If the readme is still None, try with master branch.
    if not readme_content:
        raw_readme_url = raw_readme_url_base + "/master/" + "README.md"
        logger.warning(f"Trying with master branch: {raw_readme_url}")
        readme_content = get_readme_content(raw_readme_url)
        if not readme_content:
            logger.error(f"Still failed to get the README: {readme_content}")
            sys.exit(1)

    # Create a getting-started blog, adapted from the GH url README.
    howto_blog += github_readme_blog(readme_content, "gemini")

    # Do online research for FAQs on the github url.
    research_report = None
    try:
        # Repo names are misnomers for other searches; including the
        # description skews results towards the project's home/paid pages,
        # so only the repo name is used as the query.
        online_query = f"{repo_name} "
        logger.info("Do web research with Tavily & Metaphor AI.")
        research_report = do_online_research(online_query, "gemini", gh_url)
    except Exception as err:
        logger.error(f"failed to do online research: {err}")

    # Generate FAQs only when there is a research report to build them from.
    if research_report:
        try:
            blog_faqs = generate_blog_faq(research_report, "gemini")
            howto_blog += f"\n\n## {repo_name} FAQs\n\n" + blog_faqs
        except Exception as err:
            logger.error(f"Failed to generate FAQs from web research_report: {err}")
    else:
        logger.warning("No online research report available, skipping FAQs.")

    logger.info(f"\n\nFinal Blog Content: {howto_blog}\n\n")

    try:
        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(howto_blog, "gemini")
    except Exception as err:
        logger.error(f"Failed to get blog metadata: {err}")
        raise

    try:
        save_blog_to_file(howto_blog, blog_title, blog_meta_desc, blog_tags,
                          blog_categories, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to save blog to a file: {err}")
        sys.exit(1)

    try:
        append_id_to_file(gh_url, "papers_already_written_on.txt")
    except Exception as err:
        logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
        raise

View File

@@ -0,0 +1,297 @@
import os
import sys
import datetime
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from .take_url_screenshot import take_screenshot
from .gpt_providers.gemini_image_details import gemini_get_img_info
def get_readme_content(url):
    """Fetch the raw README text served at ``url``.

    Args:
        url: Direct URL of the raw README file.

    Returns:
        The response body as text when the server answers HTTP 200,
        otherwise ``None``.

    If the request itself raises, the error is logged and the process exits
    with status 1.
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except Exception as err:
        # `response` is not bound when the request raised, so it must not be
        # referenced in this message.
        logger.error(f"Failed to fetch raw readme from {url}: {err}")
        sys.exit(1)

    if response.status_code == 200:
        logger.debug("Successfully fetched the README.md")
        return response.text

    logger.debug(f"README request to {url} returned HTTP {response.status_code}")
    return None
def get_gh_repo_metadata(github_url):
    """Scrape a GitHub repository page for its name, description and counters.

    Returns a dict with 'branch_name' (always None), 'name', 'about', 'stars',
    'watchers' and 'forks', plus 'topics' when a ``div.f6`` element is found.
    Counters are returned as strings with commas removed. A missing page
    element surfaces as AttributeError from the chained lookups.
    """
    logger.info("Scraping github with BS4 and requests.")
    # Download the target page and parse the returned HTML.
    soup = BeautifulSoup(requests.get(github_url).text, 'html.parser')
    repo = {}

    repo_name = soup.select_one('[itemprop="name"]').get_text().strip()

    # The default branch is looked up but not used in the result (see FIXME).
    branch_icon = soup.select_one('.octicon-git-branch')
    main_branch = branch_icon.find_next_sibling('span').get_text().strip()

    # Repo history header; selected but not used.
    boxheader_html_element = soup.select_one('.Box .Box-header')

    # Repo details live in the right-hand box.
    side_box = soup.select_one('.BorderGrid')
    about_text = side_box.select_one('h2').find_next_sibling('p').get_text().strip()

    def _counter(icon_selector):
        # Counter value shown in the <strong> that follows the given icon.
        icon = side_box.select_one(icon_selector)
        return icon.find_next_sibling('strong').get_text().strip().replace(',', '')

    star_count = _counter('.octicon-star')
    watcher_count = _counter('.octicon-eye')
    fork_count = _counter('.octicon-repo-forked')

    # Topic links sit inside the div with class "f6".
    topic_div = soup.find('div', class_='f6')
    if topic_div:
        repo['topics'] = [
            anchor.text.strip()
            for anchor in topic_div.find_all('a', class_='topic-tag-link')
        ]

    # FIXME: Unable to scrape branch name.
    repo.update({
        'branch_name': None,
        'name': repo_name,
        'about': about_text,
        'stars': star_count,
        'watchers': watcher_count,
        'forks': fork_count,
    })
    logger.info(f"Github Repo Details: {repo}")
    return repo
def get_gh_details_vision(github_url, generated_image_filepath):
    """Screenshot a GitHub page and ask the vision model for repo details.

    Args:
        github_url: URL of the repository page to capture.
        generated_image_filepath: Where the screenshot should be stored.

    Returns:
        A dict of repo details. It is parsed from the model's JSON reply, or
        comes from ``get_gh_repo_metadata`` when the model call fails or its
        reply holds no parseable JSON object.
    """
    logger.info(f"Take screenshot and pass it to gemini for repo details of {github_url}")
    generated_image_filepath = take_screenshot(github_url, generated_image_filepath)
    prompt = """From the given image of a github page, find out the number of stars, about, forks, last commit days, link url, topics and branch name. Return the result as json."""
    try:
        gh_details = gemini_get_img_info(prompt, generated_image_filepath)
        logger.info(f"Github Repo details, from vision model: {gh_details}")
    except Exception as err:
        logger.error(f"Failed to get gh images details: {err}")
        return get_gh_repo_metadata(github_url)

    # The reply is usually wrapped in a markdown code fence. Take the text
    # between the outermost braces rather than dropping a fixed first and
    # last line, which fails on unfenced replies or trailing text.
    start = gh_details.find('{')
    end = gh_details.rfind('}')
    try:
        if start == -1 or end < start:
            raise ValueError("no JSON object found in vision model reply")
        return json.loads(gh_details[start:end + 1])
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass.
        logger.error(f"Failed to parse vision model reply as json: {err}")
        return get_gh_repo_metadata(github_url)
def research_github_topics(topics):
    """Collect popular repos from a fixed list of GitHub topics into a CSV.

    For every topic in the built-in list, the topic page is downloaded and
    each listed repo with more than 5000 stars is kept, once per URL. The
    URLs are appended to ``github_url_to_write.csv``, one per line.

    Args:
        topics: Currently unused; the topic list is hard-coded below.
            NOTE(review): confirm with callers before wiring this in.

    Approach adapted from:
    https://www.kaggle.com/code/subhaskumarray/scraping-github-topics-with-their-repositories
    """
    gh_topics = ['ai', 'ai-tools', 'ai-assistant', 'ai-agents-framework', 'llm', 'multi-agent', 'fine-tuning', 'rag', 'generative', 'prompt-engineering', 'generative-ai', 'text-to-image-generation', 'llm-ops', 'retrieval-augmented-generation', 'langchain', 'gemini-api', 'vertex-ai', 'huggingface', 'auto-gpt', 'llmops', 'ai-toolkit', 'chatbot', 'chatgpt', 'code-assistant', 'text-to-video', 'llms', 'gpt-4']

    repo_urls = []
    seen_urls = set()  # O(1) duplicate check across topics
    for agh_topic in gh_topics:
        topic_url = f"https://github.com/topics/{agh_topic}"
        first_topic_repo_page = download_repo_page(topic_url)
        logger.info(f"Get details on github topic: {topic_url}")
        repo_tags = first_topic_repo_page.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
        star_tags = first_topic_repo_page.find_all('span', {'class': 'Counter js-social-count'})

        # zip stops at the shorter list, so a repo without a matching star
        # counter is skipped instead of raising IndexError.
        for repo_tag, star_tag in zip(repo_tags, star_tags):
            _username, _repo_name, stars, repo_url = get_repo_info(repo_tag, star_tag)
            # Store repos with more than 5000 stars, once per URL.
            if repo_url not in seen_urls and stars > 5000:
                seen_urls.add(repo_url)
                repo_urls.append(repo_url)

    df_repo_info = pd.DataFrame(repo_urls)
    csv_filename = 'github_url_to_write.csv'
    already_exists = os.path.isfile(csv_filename)
    # Append mode creates the file when missing. No header row is written, so
    # every line of the file is a URL (the frame's only column label is "0").
    df_repo_info.to_csv(csv_filename, mode='a', header=False, index=False)
    if already_exists:
        logger.info(f"Data appended to existing file: {csv_filename}")
def get_topic_titles(parsed_content):
    """Return the text of every topic-title ``<p>`` tag in the parsed page.

    Any failure is logged and results in ``None`` being returned.
    """
    try:
        title_tags = parsed_content.find_all(
            'p', {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'}
        )
        return [title_tag.text for title_tag in title_tags]
    except Exception as err:
        logger.error(f"Failed to get github topic titles: {err}")
def get_topic_desc(parsed_contents):
    """Return the stripped text of every topic-description ``<p>`` tag.

    Any failure is logged and results in ``None`` being returned.
    """
    try:
        desc_selector = 'f5 color-fg-muted mb-0 mt-1'
        topic_desc_tags = parsed_contents.find_all('p', {'class': desc_selector})
        # strip() trims the surrounding whitespace of each description.
        return [desc.text.strip() for desc in topic_desc_tags]
    except Exception as err:
        logger.error(f"Failed to get github topic desc: {err}")
def get_topic_url(parsed_contents):
    """Return absolute URLs for every topic link found in the parsed page.

    Each link's ``href`` is appended to ``http://github.com``. Any failure is
    logged and results in ``None`` being returned.
    """
    try:
        link_tags = parsed_contents.find_all(
            'a', {'class': 'no-underline flex-1 d-flex flex-column'}
        )
        site_root = 'http://github.com'
        return [site_root + link_tag['href'] for link_tag in link_tags]
    except Exception as err:
        logger.error(f"Failed to get github topic urls: {err}")
def download_repo_page(topic_url):
    """Download ``topic_url`` and return it parsed with BeautifulSoup.

    A non-200 status is logged, and the body that was returned is still
    parsed and handed back, so callers always receive a parsed document.
    """
    # A timeout keeps one stalled topic page from hanging the whole scrape.
    response = requests.get(topic_url, timeout=30)
    if response.status_code != 200:
        logger.error('There is some error in {}'.format(topic_url))
    return BeautifulSoup(response.text, 'html.parser')
def get_repo_info(repo_tags, star_tags):
    """Extract owner, repo name, star count and URL for one listed repo.

    Args:
        repo_tags: Element whose first two ``<a>`` children are the owner
            link and the repo link.
        star_tags: Element whose text is the star counter, e.g. ``"734"``,
            ``"1,234"`` or ``"12.3k"``.

    Returns:
        Tuple ``(username, repo_name, star_count, repo_url)`` with
        ``star_count`` as an int (0 when the counter text is empty).

    Raises:
        ValueError: If the counter text is not a number with an optional
            ``k``/``m`` suffix.
    """
    a_tags = repo_tags.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    # NOTE(review): if href starts with "/", this yields a double slash after
    # the host. Kept as-is because these URLs are compared verbatim elsewhere.
    base_url = 'http://github.com/'
    repo_url = base_url + a_tags[1]['href'].strip()

    def star_counts_converter(stars):
        # Convert counter text such as "12.3k" into an integer.
        stars = stars.strip().replace(',', '').lower()
        if not stars:
            return 0
        multiplier = {'k': 1_000, 'm': 1_000_000}.get(stars[-1])
        if multiplier is not None:
            # round() avoids float truncation (e.g. 4599.999 becoming 4599).
            return int(round(float(stars[:-1]) * multiplier))
        return int(stars)

    star_counts = star_counts_converter(star_tags.text.strip())
    return username, repo_name, star_counts, repo_url
def save_to_csv(topic_url, topic_name):
    """Scrape one topic's repositories and save them to ``<topic_name>.csv``.

    Nothing is scraped or written when the CSV already exists.

    Args:
        topic_url: URL of the GitHub topic page to scrape.
        topic_name: Used as the CSV file name (without extension).
    """
    file_name = topic_name + '.csv'
    if os.path.exists(file_name):
        logger.debug(f"The file {file_name} already exists. Skipping.")
        # Actually skip: without this return the file was re-scraped and
        # overwritten despite the message above.
        return
    # NOTE(review): topic_repo_details is not defined in the visible part of
    # this module; confirm where it comes from.
    topics_df = topic_repo_details(topic_url)
    topics_df.to_csv(file_name, index=None)
    logger.info(f"Successfully scraped topic {topic_name}")
def check_if_already_written(github_url, file_path='papers_already_written_on.txt'):
    """
    Tell whether a GitHub URL is already listed in the bookkeeping file.

    Both the URL and each line are compared after stripping surrounding
    whitespace; only a whole-line match counts.

    Args:
        github_url (str): GitHub URL string to look for.
        file_path (str): File holding one entry per line. Default is 'papers_already_written_on.txt'.

    Returns:
        bool: True if a line matches exactly, False otherwise (including when
        the file is missing or cannot be read).
    """
    try:
        with open(file_path, 'r') as handle:
            if any(github_url.strip() == entry.strip() for entry in handle):
                return True
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return False

View File

@@ -1,17 +0,0 @@
# Standalone script: runs one Google search through SerpApi and prints the
# "related_questions" section of the result.
import serpapi
import os
from dotenv import load_dotenv
# Read the .env file; the API key is then looked up as SERPAPI_KEY.
load_dotenv()
api_key = os.getenv('SERPAPI_KEY')
client = serpapi.Client(api_key=api_key)
# Fixed sample query, searched with English language / US settings.
result = client.search(
    q="Retrieval Augumented Generation RAG",
    engine="google",
    location="Austin, Texas",
    hl="en",
    gl="us",
)
# Raises KeyError when the result has no "related_questions" entry.
print(result["related_questions"]) # Get all the related questions

View File

@@ -1,181 +0,0 @@
################################################################
#
# GPT Researcher is an autonomous agent designed for comprehensive online research on a variety of tasks.
# The agent can produce detailed, factual and unbiased research reports, with customization options for
# focusing on relevant resources, outlines, and lessons. Inspired by the recent Plan-and-Solve and RAG papers,
# GPT Researcher addresses issues of speed, determinism and reliability, offering a more stable
# performance and increased speed through parallelized agent work, as opposed to synchronous operations.
#
# The main idea is to run "planner" and "execution" agents, whereas the planner generates questions to research,
# and the execution agents seek the most related information based on each generated research question.
# Finally, the planner filters and aggregates all related information and creates a research report.
#
# The agents leverage both gpt3.5-turbo and gpt-4-turbo (128K context) to complete a research task.
# We optimize for costs using each only when necessary.
# The average research task takes around 3 minutes to complete, and costs ~$0.1.
#
##############################################################
import os
from pathlib import Path
import logging
from tavily import TavilyClient
import serpapi
from dotenv import load_dotenv
load_dotenv(Path('../.env'))
from langchain.adapters.openai import convert_openai_messages
from langchain.chat_models import ChatOpenAI
import google.generativeai as genai
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
) # for exponential backoff
from .gpt_providers.gemini_pro_text import gemini_text_response
from .blog_proof_reader import blog_proof_editor
from .convert_content_to_markdown import convert_tomarkdown_format
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def do_online_research(query, gpt_provider="openai"):
    """Research ``query`` online and return a blog-style report.

    Steps: collect FAQ questions and organic results with ``google_search``
    (failures there are logged and tolerated), run a Tavily search, have the
    selected provider write a report from the Tavily results, then rewrite
    it with ``compete_organic_results``.

    Args:
        query: Search query / topic to research.
        gpt_provider: Provider selector, tested with ``in`` for "gemini"
            first, then "openai".

    Returns:
        The report text returned by ``compete_organic_results``.

    Tavily client/search failures and any other exception in the main
    section are logged and end the process via ``exit(1)``.
    """
    # Do a google search for the given keyword. The search results will give urls, questions for faq
    faq_questions = []
    organic_results = []
    report = ''
    try:
        faq_questions = google_search(query, "faq")
        logging.info(f"Google search FAQ questions: {faq_questions}")
        # Now, get top 10 google organic results and polish the content to compete for these keywords.
        organic_results = google_search(query, "organic_result")
    except Exception as err:
        logging.error(f"Failed to do Serpapi research: {err}")
        # Not failing, as tavily would do same and then GPT-V to search.
        #exit(1)
    try:
        # Retrieve API keys
        api_key = os.getenv('TAVILY_API_KEY')
        openai_api_key = os.getenv('OPENAI_API_KEY')
        # NOTE(review): the OpenAI key is required here even when
        # gpt_provider selects gemini.
        if not api_key or not openai_api_key:
            raise ValueError("API keys for Tavily or OpenAI are not set.")
        # Initialize Tavily client
        try:
            client = TavilyClient(api_key=api_key)
        except Exception as err:
            logging.error("Failed to create Tavily client. Check TAVILY_API_KEY")
            exit(1)
        # Run tavily search
        logging.info(f"Running Tavily search on: {query}")
        try:
            content = client.search(query, search_depth="advanced")["results"]
        except Exception as err:
            logging.error(f"Failed to do Tavily Research: {err}")
            exit(1)
        if "gemini" in gpt_provider:
            # Adjacent string literals concatenate, so this is a one-element
            # list holding the whole prompt.
            prompt = ["You are an AI critical thinker research assistant."
                "I will provide you with json content and a list of faq questions."
                "Use given json as context for writing your research report."
                "Your sole purpose is to write well written, critically acclaimed, objective and structured research report"
                "Important: Include and write code examples in your final report."
                "Include your own insights on the topic to make it comprehensive and detailed."
                "Use the urls from json content to provide cititations and include it in referances section of your report."
                "Include appropriate emojis in your research report."
                "Include FAQs relevant to your research report. Use the given faq questions. Write answers for each faq."
                "Format your report in MLA format and markdown style, with special focus on readibility."
                f"Do not provide explanations for your response.\njson content: \"\"\" {content} \"\"\"\n "
                f"\nList of FAQ questions: \"\"\" {faq_questions} \"\"\"\n"]
            report = gemini_text_response(prompt)
        elif "openai" in gpt_provider:
            # Setup prompt for GPT-4
            prompt = [{
                "role": "system",
                "content": ('You are an AI critical thinker research assistant. '
                    'Your sole purpose is to write well written, critically acclaimed, '
                    'objective and structured reports on given text.')
            }, {
                "role": "user",
                "content": (f'Information: """{content}"""\n\n'
                    f'Using the above information, answer the following '
                    f'query: "{query}" in a detailed report --'
                    f'Please use MLA format and markdown syntax.')
            }]
            report = openai_research_report(prompt)
        # Rewrite the report against the organic search results collected above.
        report = compete_organic_results(query, report, organic_results)
        return report
    except Exception as e:
        logging.error(f"Failed in online research: {e}")
        exit(1)
def openai_research_report(query):
    """Generate a research report with OpenAI GPT-4.

    Args:
        query: Despite the name, the OpenAI-style chat messages (a list of
            ``{"role": ..., "content": ...}`` dicts) that make up the prompt.

    Returns:
        The text content of the model's reply.

    The API key is read from the ``OPENAI_API_KEY`` environment variable. A
    failed model call is logged and ends the process via ``exit(1)``.
    """
    # Run GPT-4
    logging.info("Generating Research report with GPT-4...")
    # The messages arrive in `query`; no `prompt` name exists in this scope.
    lc_messages = convert_openai_messages(query)
    try:
        openai_api_key = os.getenv('OPENAI_API_KEY')
        report = ChatOpenAI(model='gpt-4', openai_api_key=openai_api_key).invoke(lc_messages).content
        return report
    except Exception as err:
        logging.error(f"Failed to generate do_online_research with ChatOpenAI: {err}")
        exit(1)
def compete_organic_results(query, report, organic_results):
    """Rewrite ``report`` so it can compete with the given organic results.

    Builds an SEO rewrite prompt from the topic, the current blog text and the
    Google organic results, and returns whatever ``gemini_text_response``
    produces for it.
    """
    seo_prompt = f""" As an SEO expert and copywriter, I will provide you with my blog content on topic '{query}', and
Top google search results.
Your task is to rewrite the given blog to make it compete against top position results.
Make sure, the new blog has high probability of ranking highest against given organic search result competitors.
Modify the given blog content following best SEO practises.
Make sure the blog is original, unique and highly readable.
Remember, Maintain and adopt the formatting, structure, style and tone of the provided blog content.
Include relevant emojis in your final blog for visual appeal. Use it sparingly.
Your response should be well-structured, objective, and critically acclaimed blog article based on provided texts.
Remember, your goal is to create a detailed blog article that will compete against given organic result competitors.
Do not provide explanations, suggestions for your response, reply only with your final response.
Take your time in crafting your content, do not rush to give the response.
Blog Content: '{report}'\n
Organic Search result: '{organic_results}'
"""
    return gemini_text_response(seo_prompt)
def google_search(query, flag="faq"):
    """Run a Google search through SerpApi and return one slice of the result.

    With 'faq' in ``flag``: titles from 'inline_people_also_search_for' plus
    questions from 'related_questions'; when both are empty, the queries from
    'related_searches'. With 'organic_result' in ``flag``: the
    'organic_results' list. Any other flag yields ``None``. A failed search is
    logged and ends the process via ``exit(1)``.
    """
    try:
        serp_client = serpapi.Client(api_key=os.getenv('SERPAPI_KEY'))
        result = serp_client.search(q=query, engine="google", hl="en")
    except Exception as err:
        logging.error(f"Failed in Google Search: {err}")
        exit(1)

    if 'faq' in flag:
        # Missing sections simply contribute nothing.
        also_searched = [entry['title'] for entry in result.get('inline_people_also_search_for', [])]
        people_ask = [entry['question'] for entry in result.get('related_questions', [])]
        if also_searched or people_ask:
            return also_searched + people_ask
        # Fall back to related searches when neither section has entries.
        return [entry['query'] for entry in result.get('related_searches', [])]

    if 'organic_result' in flag:
        return result.get('organic_results', [])

View File

@@ -17,10 +17,11 @@ from tenacity import (
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def gemini_arxiv_img_info(img_path):
def gemini_get_img_info(prompt, img_path):
""" Get image details from arxiv papers. """
logging.info(f"Get image details from Gemini Pro.")
try:
genai.configure(api_key=os.getenv("API_KEY"))
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
except Exception as e:
logging.error(f"Could not load gemini API key: {e}")
raise e
@@ -35,19 +36,19 @@ def gemini_arxiv_img_info(img_path):
safety_settings = [{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
"threshold": "BLOCK_NONE"
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
"threshold": "BLOCK_NONE"
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
"threshold": "BLOCK_NONE"
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
"threshold": "BLOCK_NONE"
},]
try:
@@ -67,13 +68,12 @@ def gemini_arxiv_img_info(img_path):
"data": Path(img_path).read_bytes()
},]
prompt_parts = [
"As scholar on evaluating research papers, I will provide you with an image from a research paper. Your task is to explain the image in details so that I can use it in a blog article. Explain the key findings and conclusions from the image. Your description should be in simple terms to explain to a wider audience. Explain key findings from the given image.",
image_parts[0],]
prompt_parts = [f"{prompt}", image_parts[0],]
try:
response = model.generate_content(prompt_parts)
return response.text
except Exception as e:
logging.error(f"Could not generate gemini content: {e}")
logging.error(f"Gemini is blocking this request: {response.prompt_feedback.block_reason}")
logging.error(f"Gemini Vision, Failed to give image Details: {e}\n{response.prompt_feedback}")
raise e

View File

@@ -32,11 +32,9 @@ def gemini_text_response(prompt):
model = genai.GenerativeModel(model_name="gemini-pro", generation_config=generation_config)
try:
response = model.generate_content(prompt)
except Exception as err:
logger.error(f"Failed to get response from Gemini: {err}. Retrying.")
# Try with minstral.
print(f"\n\n\n--MINSTRAL--\n\n\n\n")
response = mistral_text_response(prompt)
return response
#response = mistral_text_response(prompt)
#return response
return response.text

View File

@@ -93,13 +93,6 @@ def blog_arxiv_url_list(file_path):
# Read already written IDs
written_ids = read_written_ids('papers_already_written_on.txt')
# Write blogs on each of arxiv_id from the file.
for arxiv_id in extracted_ids:
# Check if we have already written on this research_paper. For this, all arxiv ids are written in
# a file called 'papers_already_written_on.txt'. If arxiv ID is found in this file, skip writing again.
# YUP, use a DB. KISS for now.
written_ids = read_written_ids('papers_already_written_on.txt')
# Loop through extracted IDs
for arxiv_id in extracted_ids:
if arxiv_id not in written_ids:
@@ -178,8 +171,8 @@ def blog_postprocessing(arxiv_id, research_review):
save_blog_to_file(research_review, blog_title, blog_meta_desc, blog_tags,\
blog_categories, generated_image_filepath)
except Exception as err:
logger.__repr__ror(f"Failed to save blog to a file: {err}")
raise err
logger.error(f"Failed to save blog to a file: {err}")
sys.exit(1)
def take_paper_screenshot(arxiv_url):

View File

@@ -1,71 +0,0 @@
import os
import datetime
from selenium import webdriver
from PIL import Image
import shutil
from screenshotone import Client, TakeOptions
from pathlib import Path
from dotenv import load_dotenv
load_dotenv(Path('../.env'))
def screenshot_api(url, generated_image_filepath):
    """Capture ``url`` with the screenshotone API and store it as a PNG.

    The image is written to ``generated_image_filepath`` and shown with the
    default viewer. If anything in that path fails, the error is printed and
    the capture is retried with ``take_screenshot``.

    Returns:
        The path of the stored screenshot.
    """
    try:
        # create API client
        client = Client(os.getenv('SCREENSHOTONE_ACCESS_KEY'), os.getenv('SCREENSHOTONE_SECRET_KEY'))
        # set up options
        options = (TakeOptions.url(url)
                   .format("png")
                   .viewport_width(1024)
                   .viewport_height(768)
                   .block_cookie_banners(True)
                   .block_chats(True))
        # render a screenshot and download the image as stream
        image = client.take(options)
        # store the screenshot in the target file
        with open(generated_image_filepath, 'wb') as result_file:
            shutil.copyfileobj(image, result_file)
        # Display the screenshot; the context manager releases the file
        # handle that Image.open keeps.
        with Image.open(generated_image_filepath) as preview:
            preview.show()
    except Exception as err:
        print(f"Failed in screenshotone api: {err}")
        generated_image_filepath = take_screenshot(url, generated_image_filepath)
    return generated_image_filepath
def take_screenshot(url, generated_image_filepath):
    """Capture ``url`` with a Chrome webdriver and store it as a PNG.

    The screenshot is written to ``generated_image_filepath`` and shown with
    the default viewer.

    Returns:
        The path of the stored screenshot.
    """
    # Create a webdriver instance
    driver = webdriver.Chrome()
    try:
        # Navigate to the given url
        driver.get(url)
        # Set a fixed window size (you can adjust this as needed)
        driver.set_window_size(800, 600)
        # Take a screenshot of the webpage
        screenshot = driver.get_screenshot_as_png()
    finally:
        # Always shut the browser down, even when a step above raised.
        driver.quit()
    # Save the screenshot to a file
    with open(generated_image_filepath, "wb") as f:
        f.write(screenshot)
    # Display the screenshot; the context manager releases the file handle.
    with Image.open(generated_image_filepath) as preview:
        preview.show()
    return generated_image_filepath

View File

@@ -0,0 +1,113 @@
import os
import sys
import datetime
import subprocess
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from selenium import webdriver
from PIL import Image
import shutil
from screenshotone import Client, TakeOptions
from pathlib import Path
from dotenv import load_dotenv
load_dotenv(Path('../.env'))
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def screenshot_api(url, generated_image_filepath):
    """Capture ``url`` with the screenshotone API and store it as a PNG.

    The image is written to ``generated_image_filepath``, shown with the
    default viewer for two seconds and then closed. If anything in that path
    fails, the error is logged and the capture is retried with
    ``take_screenshot``.

    Returns:
        The path of the stored screenshot.
    """
    try:
        # create API client
        client = Client(os.getenv('SCREENSHOTONE_ACCESS_KEY'), os.getenv('SCREENSHOTONE_SECRET_KEY'))
        # set up options
        options = (TakeOptions.url(url)
                   .format("png")
                   .viewport_width(1024)
                   .viewport_height(768)
                   .block_cookie_banners(True)
                   .block_chats(True))
        # render a screenshot and download the image as stream
        image = client.take(options)
        # store the screenshot in the target file
        with open(generated_image_filepath, 'wb') as result_file:
            shutil.copyfileobj(image, result_file)
        # Display the screenshot; the context manager closes the image even
        # if showing it fails.
        with Image.open(generated_image_filepath) as preview:
            preview.show()
            # Wait for 2 seconds (adjust the delay as needed)
            sleep(2)
    except Exception as err:
        # Use the module logger like the rest of this file.
        logger.error(f"Failed in screenshotone api: {err}")
        generated_image_filepath = take_screenshot(url, generated_image_filepath)
    return generated_image_filepath
def take_screenshot(url, generated_image_filepath):
    """Capture ``url`` with headless Chrome and store it as a PNG.

    The screenshot is written to ``generated_image_filepath``, shown with the
    default viewer for two seconds, and the viewer is then closed on a
    best-effort basis. The webdriver is always quit.

    Returns:
        The path of the stored screenshot.
    """
    # Create a webdriver instance in headless mode
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    logger.debug(f"Taking screenshot of url: {url}")
    try:
        # Navigate to the given url
        driver.get(url)
        # Optionally, increase the delay to ensure all content is loaded
        sleep(2)
        # Explicitly wait for the page to load (adjust timeout as needed)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        # Set a larger window size
        driver.set_window_size(1200, 800)
        # Take a screenshot of the webpage
        screenshot = driver.get_screenshot_as_png()
        # Save the screenshot to a file
        with open(generated_image_filepath, "wb") as f:
            f.write(screenshot)
        # Display the screenshot; the context manager releases the file handle.
        with Image.open(generated_image_filepath) as preview:
            preview.show()
            # Wait for 2 seconds (adjust the delay as needed)
            sleep(2)
        # Close the image window using subprocess (platform-dependent).
        # NOTE(review): `pkill -f display` matches every process whose command
        # line contains "display"; adjust to your platform and viewer.
        # If using macOS, you can use the following:
        # subprocess.run(["osascript", "-e", 'tell application "Preview" to close every window'])
        # If using Windows, you can use the following:
        # subprocess.run(["taskkill", "/F", "/IM", "Microsoft.Photos.exe"])
        try:
            subprocess.run(["pkill", "-f", "display"])
        except OSError as err:
            # pkill is absent on some platforms; the screenshot is already
            # saved, so closing the viewer must not fail the call.
            logger.debug(f"Could not close the image viewer: {err}")
        logger.debug(f"Screenshot successfully stored at: {generated_image_filepath}")
        return generated_image_filepath
    finally:
        # Close the webdriver instance
        driver.quit()