WIP - Under maintenance - Web research working.
This commit is contained in:
@@ -258,6 +258,15 @@ def arxiv_bibtex(arxiv_id):
|
||||
#search = GoogleSearch(params)
|
||||
#results = search.get_dict()
|
||||
|
||||
#from llmsherpa.readers import LayoutPDFReader
|
||||
|
||||
#llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
|
||||
#pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" # also allowed is a file path e.g. /home/downloads/xyz.pdf
|
||||
#pdf_reader = LayoutPDFReader(llmsherpa_api_url)
|
||||
#doc = pdf_reader.read_pdf(pdf_url)
|
||||
|
||||
|
||||
|
||||
|
||||
def extract_arxiv_ids_from_line(line):
|
||||
"""
|
||||
302
lib/ai_web_researcher/google_serp_search.py
Normal file
302
lib/ai_web_researcher/google_serp_search.py
Normal file
@@ -0,0 +1,302 @@
|
||||
"""
|
||||
This Python script performs Google searches using various services such as SerpApi, Serper.dev, and more. It displays the search results, including organic results, People Also Ask, and Related Searches, in formatted tables. The script also utilizes GPT to generate titles and FAQs for the Google search results.
|
||||
|
||||
Features:
|
||||
- Utilizes SerpApi, Serper.dev, and other services for Google searches.
|
||||
- Displays organic search results, including position, title, link, and snippet.
|
||||
- Presents People Also Ask questions and snippets in a formatted table.
|
||||
- Includes Related Searches in the combined table with People Also Ask.
|
||||
- Configures logging with Loguru for informative messages.
|
||||
- Uses Rich and Tabulate for visually appealing and formatted tables.
|
||||
|
||||
Usage:
|
||||
- Ensure the necessary API keys are set in the .env file.
|
||||
- Run the script to perform a Google search with the specified query.
|
||||
- View the displayed tables with organic results, People Also Ask, and Related Searches.
|
||||
- Additional information, such as generated titles and FAQs using GPT, is presented.
|
||||
|
||||
Modifications:
|
||||
- Update the environment variables in the .env file with the required API keys.
|
||||
- Customize the search parameters, such as location and language, in the functions as needed.
|
||||
- Adjust logging configurations, table formatting, and other aspects based on preferences.
|
||||
|
||||
To-Do (TBD):
|
||||
- Consider adding further enhancements or customization based on specific use cases.
|
||||
|
||||
Note: This script depends on external libraries such as SerpApi, Loguru, Rich, and Tabulate. Install them using 'pip install serpapi loguru rich tabulate' if not already installed.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
import json
|
||||
import requests
|
||||
from clint.textui import progress
|
||||
#from serpapi import GoogleSearch
|
||||
from loguru import logger
|
||||
from tabulate import tabulate
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
from dotenv import load_dotenv
|
||||
# Load environment variables from .env file
|
||||
load_dotenv(Path('../../.env'))
|
||||
logger.add(
|
||||
sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
from .gpt_titles_faq import gpt_titles_faqs_google_search
|
||||
|
||||
#from tenacity import retry, stop_after_attempt, wait_random_exponential
|
||||
#@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
|
||||
|
||||
|
||||
#FIXME: Accept language, country and time frame to search for.
|
||||
def google_search(query):
    """
    Perform a Google search for the given query.

    SerpApi is called first, but its result is currently discarded (see the
    FIXME log line). The Serper.dev response is the one that is displayed via
    process_search_results() and returned to the caller.

    Args:
        query (str): The search query.

    Returns:
        dict | None: The Serper.dev JSON response, or None when the
        Serper.dev search failed or returned no data.
    """
    try:
        perform_serpapi_google_search(query)
        logger.info(f"FIXME: Google serapi: {query}")
        #return process_search_results(search_result)
    except Exception as err:
        logger.error(f"ERROR: Check Here: https://serpapi.com/. Your requests may be over. {err}")

    # Bound before the try so the final return cannot raise UnboundLocalError
    # when the Serper.dev call itself raises.
    search_result = None

    # Retry with serper.dev
    try:
        logger.info("Trying Google search with Serper.dev: https://serper.dev/api-key")
        search_result = perform_serperdev_google_search(query)
        # perform_serperdev_google_search returns None on a non-200 response;
        # there is nothing to tabulate in that case.
        if search_result is not None:
            process_search_results(search_result)
    except Exception as err:
        logger.error(f"Failed to do Google search with serper.dev: {err}")

    return search_result
|
||||
|
||||
# # Retry with BROWSERLESS API
|
||||
# try:
|
||||
# search_result = perform_browserless_google_search(query)
|
||||
# #return process_search_results(search_result, flag)
|
||||
# except Exception as err:
|
||||
# logger.error("FIXME: Failed to do Google search with BROWSERLESS API.")
|
||||
# logger.debug("FIXME: Trying with dataforSEO API.")
|
||||
#
|
||||
# # Retry with dataforSEO API
|
||||
# try:
|
||||
# logger.info("Perform SERP with Data for SEO.")
|
||||
# #search_result = perform_dataforseo_google_search(query)
|
||||
# #return process_search_results(search_result, flag)
|
||||
# except Exception as err:
|
||||
# logger.error("FIXME: Failed to do Google search with dataforSEO API.")
|
||||
# logger.debug("All retries failed. Giving up.")
|
||||
# raise
|
||||
|
||||
|
||||
|
||||
def perform_serpapi_google_search(query, location="in"):
    """
    Perform a Google search using the SerpApi service.

    The API key is read from the SERPAPI_KEY environment variable.

    Args:
        query (str): The search query.
        location (str, optional): The location for the search (default is "in").

    Returns:
        dict | None: A dictionary containing the search results, or None when
        the key is missing or the request fails (the error is logged).
    """
    try:
        # Read the key once and reuse it; the request below previously
        # referenced an undefined `api_key` name.
        api_key = os.getenv("SERPAPI_KEY")
        if not api_key:
            raise ValueError("SERPAPI_KEY key is required for SerpApi")

        # NOTE(review): `from serpapi import GoogleSearch` is commented out at
        # the top of this module, so this name is unresolved until that import
        # is restored; the resulting NameError is caught and logged below.
        search = GoogleSearch({
            "q": query,
            "location": location,
            "api_key": api_key,
        })
        # Get search results as a dictionary
        return search.get_dict()

    except ValueError as ve:
        # Handle missing API key error
        logger.info(f"SERPAPI ValueError: {ve}")
    except Exception as e:
        # Handle other exceptions
        logger.info(f"SERPAPI An error occurred: {e}")

    return None
|
||||
|
||||
|
||||
def perform_serperdev_google_search(query, gl="in", hl="en", num=5, timeout=30):
    """
    Perform a Google search using the Serper API.

    The API key is read from the SERPER_API_KEY environment variable.

    Args:
        query (str): The search query.
        gl (str, optional): Value sent as the "gl" payload field (default "in").
        hl (str, optional): Value sent as the "hl" payload field (default "en").
        num (int, optional): Value sent as the "num" payload field (default 5).
        timeout (float, optional): Seconds to wait for the HTTP request before
            giving up (default 30).

    Returns:
        dict | None: The JSON response from the Serper API, or None when the
        API answers with a non-200 status (the error is logged).

    Raises:
        ValueError: If SERPER_API_KEY is not set.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    logger.info("Doing serper.dev google search.")
    serper_api_key = os.getenv('SERPER_API_KEY')

    # Check if the API key is available
    if not serper_api_key:
        raise ValueError("SERPER_API_KEY is missing. Set it in the .env file.")

    # Serper API endpoint URL
    url = "https://google.serper.dev/search"

    payload = json.dumps({
        "q": query,
        "gl": gl,
        "hl": hl,
        "num": num,
        "autocorrect": True,
        "page": 1,
        "type": "search",
        "engine": "google"
    })

    # Request headers with API key
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }

    # The bar is only a visual "busy" marker; it is never advanced.
    with progress.Bar(label="Searching", expected_size=100):
        # A timeout keeps a stalled connection from hanging the caller forever.
        # The body is read in full right away, so streaming is not needed.
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)

        if response.status_code == 200:
            # Parse and return the JSON response
            return response.json()

        logger.error(f"Error: {response.status_code}, {response.text}")
        return None
|
||||
|
||||
|
||||
|
||||
def perform_browserless_google_search():
    """Placeholder for a Browserless-backed Google search; does nothing and returns None."""
    return None
|
||||
|
||||
def perform_dataforseo_google_search():
    """Placeholder for a DataForSEO-backed Google search; does nothing and returns None."""
    return None
|
||||
|
||||
|
||||
|
||||
def _fancy_table(rows, headers, colalign, maxcolwidths):
    """Render rows as a fancy_grid table, omitting per-column options when there are no rows."""
    if not rows:
        # NOTE(review): tabulate appears to size colalign/maxcolwidths from the
        # data rows and raises IndexError when there are none (seen with
        # tabulate 0.9.0 - confirm for the pinned version).
        return tabulate(rows, headers=headers, tablefmt="fancy_grid")
    return tabulate(
        rows,
        headers=headers,
        tablefmt="fancy_grid",
        colalign=colalign,
        maxcolwidths=maxcolwidths,
    )


def process_search_results(search_results):
    """
    Print the search results as tables and append the tables to the save file.

    Args:
        search_results (dict): The search results JSON. The keys read here are
            "searchParameters", "organic", "peopleAlsoAsk" and "relatedSearches";
            all of them are optional.

    Returns:
        dict: The `search_results` argument, unchanged.
    """
    logger.info(f"Google Search Parameters: {search_results.get('searchParameters', {})}")
    # NOTE(review): raw dump of the whole response; kept for parity with the
    # previous behaviour, consider demoting to a debug log.
    print(search_results)

    # Organic results. `.get` with a default keeps a response without an
    # "organic" section from raising KeyError.
    organic_data = [
        [
            result.get("position", ""),
            result.get("title", ""),
            result.get("link", ""),
            result.get("snippet", ""),
        ]
        for result in search_results.get("organic", [])
    ]
    organic_table = _fancy_table(
        organic_data,
        headers=["Rank", "Title", "Link", "Snippet"],
        colalign=["center", "left", "left", "left"],
        maxcolwidths=[5, 25, 35, 50],
    )

    print("\n\n📢❗🚨 Google search Organic Results:")
    print(organic_table)

    # People Also Ask: best effort, a malformed section only costs this table.
    people_also_ask_data = []
    try:
        for question in search_results.get("peopleAlsoAsk", []):
            people_also_ask_data.append([
                question.get("title", ""),
                question.get("snippet", ""),
                question.get("link", ""),
            ])
    except Exception as people_also_ask_err:
        logger.error(f"Error processing 'peopleAlsoAsk': {people_also_ask_err}")
        people_also_ask_data = []

    related_searches_data = [
        [entry.get("query", "")]
        for entry in search_results.get("relatedSearches", [])
    ]

    if people_also_ask_data:
        # Related Searches become a fourth column next to People Also Ask.
        # Iterate over the longer of the two lists so related searches beyond
        # the number of questions are not silently dropped.
        row_count = max(len(people_also_ask_data), len(related_searches_data))
        combined_data = []
        for i in range(row_count):
            question_cells = people_also_ask_data[i] if i < len(people_also_ask_data) else ["", "", ""]
            related_cell = related_searches_data[i][0] if i < len(related_searches_data) else ""
            combined_data.append(question_cells + [related_cell])
        combined_table = _fancy_table(
            combined_data,
            headers=["Question", "Snippet", "Link", "Related Search"],
            colalign=["left", "left", "left", "left"],
            maxcolwidths=[20, 50, 20, 30],
        )
    else:
        combined_table = _fancy_table(
            related_searches_data,
            headers=["Related Search"],
            colalign=["left"],
            maxcolwidths=[60],
        )

    print("\n\n📢❗🚨 People Also Ask & Related Searches:")
    print(combined_table)

    # Persisting the tables is best effort; a failure must not lose the result.
    try:
        save_in_file(organic_table)
        save_in_file(combined_table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")

    return search_results
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """
    Append search analysis text to the file named by SEARCH_SAVE_FILE.

    Failures are logged, never raised, so saving stays best effort.

    Args:
        table_content (str): Text (typically a rendered table) to append.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        # Without this guard open(None) fails with an obscure TypeError.
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return

    try:
        # utf-8 is explicit because the tables contain box-drawing characters
        # and emoji that a platform default encoding may not be able to encode.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
530
lib/ai_web_researcher/google_trends_researcher.py
Normal file
530
lib/ai_web_researcher/google_trends_researcher.py
Normal file
@@ -0,0 +1,530 @@
|
||||
"""
|
||||
This Python script analyzes Google search keywords by fetching auto-suggestions, performing keyword clustering, and visualizing Google Trends data. It uses various libraries such as pytrends, requests_html, tqdm, and more.
|
||||
|
||||
Features:
|
||||
- Fetches auto-suggestions for a given search keyword from Google.
|
||||
- Performs keyword clustering using K-means algorithm based on TF-IDF vectors.
|
||||
- Visualizes Google Trends data, including interest over time and interest by region.
|
||||
- Retrieves related queries and topics for a set of search keywords.
|
||||
- Utilizes visualization libraries such as Matplotlib, Plotly, and Rich for displaying results.
|
||||
- Incorporates logging for error handling and informative messages.
|
||||
|
||||
Usage:
|
||||
- Provide a search term or a list of search terms for analysis.
|
||||
- Run the script to fetch auto-suggestions, perform clustering, and visualize Google Trends data.
|
||||
- Explore the displayed results, including top keywords in each cluster and related topics.
|
||||
|
||||
Modifications:
|
||||
- Customize the search terms in the 'do_google_trends_analysis' function.
|
||||
- Adjust the number of clusters for keyword clustering and other parameters as needed.
|
||||
- Explore further visualizations and analyses based on the generated data.
|
||||
|
||||
Note: Ensure that the required libraries are installed using 'pip install pytrends requests_html tqdm tabulate plotly rich'.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import numpy as np
|
||||
import sys
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.cluster import KMeans
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.metrics import silhouette_score, silhouette_samples
|
||||
from rich.console import Console
|
||||
from rich.progress import Progress
|
||||
import urllib
|
||||
import json
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import plotly.express as px
|
||||
import plotly.io as pio
|
||||
import logging
|
||||
from requests_html import HTML, HTMLSession
|
||||
from urllib.parse import quote_plus
|
||||
from tqdm import tqdm
|
||||
from tabulate import tabulate
|
||||
from pytrends.request import TrendReq
|
||||
import wordcloud
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
from loguru import logger
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def fetch_google_trends_interest_overtime(keyword):
    """
    Fetch Google Trends interest over time for one keyword and plot it.

    The request uses hl='en-US', tz=360, timeframe 'today 1-y' and geo 'US'.

    Args:
        keyword (str): Keyword to query.

    Returns:
        pd.DataFrame: The interest-over-time data with its index reset into a
        column, or an empty DataFrame if any step fails (the error is logged).
    """
    try:
        trend_client = TrendReq(hl='en-US', tz=360)
        trend_client.build_payload([keyword], timeframe='today 1-y', geo='US')

        # 1. Interest Over Time
        interest = trend_client.interest_over_time().reset_index()

        # Visualization using Matplotlib
        plt.figure(figsize=(10, 6))
        plt.plot(interest['date'], interest[keyword], label=keyword)
        plt.title(f'Interest Over Time for "{keyword}"')
        plt.xlabel('Date')
        plt.ylabel('Interest')
        plt.legend()
        plt.show()
    except Exception as e:
        logging.error(f"Error in fetch_google_trends_data: {e}")
        return pd.DataFrame()
    else:
        return interest
|
||||
|
||||
|
||||
def plot_interest_by_region(kw_list):
    """
    Print and plot the ten regions with the highest Google Trends interest.

    Regions are ranked by the first keyword in `kw_list`; every keyword is
    drawn in the bar chart.

    Args:
        kw_list (list[str]): Keywords to query.

    Returns:
        None. Errors are printed rather than raised.
    """
    try:
        trends = TrendReq()
        trends.build_payload(kw_list=kw_list)
        keyword_label = ' '.join(kw_list)

        data = trends.interest_by_region()
        # Sort by an actual keyword column. The space-joined label only names
        # a column when exactly one keyword was passed.
        data = data.sort_values(by=kw_list[0], ascending=False)

        print("\n📢❗🚨 ")
        print(f"Top 10 regions with highest interest for keyword: {keyword_label}")
        top_regions = data.head(10)
        print(top_regions)

        # The style has to be selected before the figure is drawn to affect it.
        plt.style.use('fivethirtyeight')
        top_regions.reset_index().plot(x="geoName", y=list(kw_list),
                                       figsize=(20, 15), kind="bar")
        plt.show()
        # FIXME: Send this image to vision GPT for analysis.

    except Exception as e:
        print(f"Error plotting interest by region: {e}")

    return None
|
||||
|
||||
|
||||
|
||||
|
||||
def get_related_queries_and_save_csv(keywords, hl='en-US', tz=360, cat=0, timeframe='today 12-m'):
    """
    Get related queries for the given search keywords, print them and save the table.

    Only the entry for the first keyword of the response is used.

    Args:
        keywords (list): List of search keywords.
        hl (str): Language parameter, default is 'en-US'.
        tz (int): Timezone parameter, default is 360.
        cat (int): Category parameter, default is 0.
        timeframe (str): Timeframe parameter, default is 'today 12-m'.

    Returns:
        pd.DataFrame | None: Top queries followed by rising queries, stacked
        row-wise. None if the lookup fails (the error is printed).
    """
    def _labelled(frame, query_label):
        # Side-by-side display copy with a distinguishing query column name.
        if frame.empty:
            return pd.DataFrame(columns=[query_label, 'value'])
        labelled = frame.copy()
        labelled.columns = [query_label, 'value']
        return labelled

    try:
        # Build model
        pytrends = TrendReq(hl=hl, tz=tz)
        pytrends.build_payload(kw_list=keywords, cat=cat, timeframe=timeframe)

        # Get related queries
        data = pytrends.related_queries()
        first_keyword_queries = list(data.values())[0]

        # pd.DataFrame(None) is an empty frame, so a missing 'top' or 'rising'
        # section (presumably None from pytrends - confirm) becomes an empty
        # table instead of a crash.
        df_top_queries = pd.DataFrame(first_keyword_queries['top'])
        df_rising_queries = pd.DataFrame(first_keyword_queries['rising'])

        # Stack the rows. `top + rising` on DataFrames adds them element-wise,
        # which concatenates query strings and sums values.
        top_rising_queries = pd.concat([df_top_queries, df_rising_queries], ignore_index=True)

        all_queries_df = pd.concat(
            [_labelled(df_top_queries, 'Top query'), _labelled(df_rising_queries, 'Rising query')],
            axis=1,
        )
        #all_queries_df.to_csv('related_queries.csv', index=False)

        # Display additional information with emojis and bold formatting
        print("\n📢❗🚨 ")
        print("\n\033[1m🔝 Top\033[0m: The most popular search queries. Scoring is on a relative scale where a value of 100 is the most commonly searched query, 50 is a query searched half as often, and a value of 0 is a query searched for less than 1% as often as the most popular query.\n")
        print("\n\033[1m🚀 Rising\033[0m: Queries with the biggest increase in search frequency since the last time period. Results marked 'Breakout' had a tremendous increase, probably because these queries are new and had few (if any) prior searches.\n")

        queries_table = tabulate(all_queries_df, headers='keys', tablefmt='fancy_grid')
        print(queries_table)

        # Save the rendered table (text), not the DataFrame object.
        try:
            save_in_file(queries_table)
        except Exception as save_results_err:
            logger.error(f"Failed to save search results: {save_results_err}")
        return top_rising_queries

    except Exception as e:
        print(f"get_related_queries_and_save_csv: ERROR: An error occurred: {e}")
        return None
|
||||
|
||||
|
||||
def get_related_topics_and_save_csv(search_keywords):
    """
    Fetch related topics for the search keywords and print the first ten rows.

    Only the entry for the first keyword of the response is used. The 'top'
    and 'rising' tables are trimmed of bookkeeping columns, prefixed (except
    for 'topic_title') and placed side by side.

    Args:
        search_keywords (list): List of search keywords.

    Returns:
        pd.DataFrame: Combined top/rising topics, or an empty DataFrame if
        anything fails (the error is printed).
    """
    # FIXME:Exclude specified columns
    columns_to_exclude = ['hasData', 'value', 'topic_mid', 'link']
    try:
        trend_client = TrendReq(hl='en-US', tz=360)
        trend_client.build_payload(search_keywords, cat=0, timeframe='today 12-m')

        first_keyword_topics = list(trend_client.related_topics().values())[0]
        raw_sections = (
            (first_keyword_topics['top'], 'Top- '),
            (first_keyword_topics['rising'], 'Rising- '),
        )

        section_frames = []
        for raw_section, prefix in raw_sections:
            frame = pd.DataFrame(raw_section).drop(columns=columns_to_exclude, errors='ignore')
            # 'topic_title' keeps its name in both sections; every other
            # column gets the section prefix.
            frame.columns = [
                name if name == 'topic_title' else prefix + name
                for name in frame.columns
            ]
            section_frames.append(frame)

        all_topics_df = pd.concat(section_frames, axis=1)
        #all_topics_df.to_csv('related_topics.csv', index=False)

        print(f"\n\n 📢❗🚨 Rising and Trending Keywords for {search_keywords}\n")
        print("\033[1m🔝 Top\033[0m: The most popular search topics.")
        print("\033[1m🚀 Rising\033[0m: Topics experiencing a significant increase in search frequency since the last time period. Topics marked :pile_of_poop:'Breakout' had a tremendous surge, likely because they are new and had few prior searches.")

        pd.set_option('display.max_rows', all_topics_df.shape[0]+1)
        print(all_topics_df.head(10))
        #print(tabulate(all_topics_df, headers='keys', tablefmt='fancy_grid'))
        return all_topics_df

    except Exception as e:
        print(f"ERROR: An error occurred: {e}")
        return pd.DataFrame()
|
||||
|
||||
|
||||
def get_source(url, timeout=30):
    """
    GET a URL through a requests_html session.

    Args:
        url (str): Address to fetch.
        timeout (float, optional): Seconds to wait for the request (default 30).

    Returns:
        The response object on success, or None when the request fails or the
        server answers with an error status (the error is logged).
    """
    try:
        # The context manager closes the session (and its pooled connections)
        # once the response has been received; one session was leaked per call
        # before.
        with HTMLSession() as session:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            return response
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
|
||||
|
||||
|
||||
|
||||
def get_results(query):
    """
    Fetch Google auto-suggestions for a query.

    Args:
        query (str): Search text; it is URL-encoded before being sent.

    Returns:
        The decoded JSON payload, or None when the request fails, the response
        is falsy, or the body is not valid JSON (errors are logged).
    """
    try:
        encoded_query = urllib.parse.quote_plus(query)
        response = get_source(f"https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q={encoded_query}")
        if not response:
            return None
        response.raise_for_status()
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding JSON response: {e}")
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
|
||||
|
||||
|
||||
|
||||
def format_results(results):
    """
    Pair each suggested term with its relevance score.

    Terms are read from results[1] and scores from
    results[4]['google:suggestrelevance'] at the same position.

    Args:
        results: Decoded suggestion payload (indexable as described above).

    Returns:
        list[dict]: One {'term', 'relevance'} dict per term, or an empty list
        when the payload lacks the expected key or index (the error is logged).
    """
    try:
        return [
            {'term': term, 'relevance': results[4]['google:suggestrelevance'][position]}
            for position, term in enumerate(results[1])
        ]
    except (KeyError, IndexError) as e:
        logging.error(f"Error parsing search results: {e}")
        return []
|
||||
|
||||
|
||||
|
||||
def get_expanded_term_suffixes():
    """Return the lowercase letters 'a' through 'z' as a new list of one-character strings."""
    return [chr(code_point) for code_point in range(ord('a'), ord('z') + 1)]
|
||||
|
||||
|
||||
|
||||
def get_expanded_term_prefixes():
    """Return the question and comparison prefixes used to expand a query."""
    # For shopping, review type blogs.
    #return ['discount *', 'pricing *', 'cheap', 'best price *', 'lowest price', 'best value', 'sale', 'affordable', 'promo', 'budget''what *', 'where *', 'how to *', 'why *', 'buy*', 'how much*','best *', 'worse *', 'rent*', 'sale*', 'offer*','vs*','or*']
    question_prefixes = ['what *', 'where *', 'how to *', 'why *']
    comparison_prefixes = ['best *', 'vs*', 'or*']
    return question_prefixes + comparison_prefixes
|
||||
|
||||
|
||||
|
||||
def get_expanded_terms(query):
    """
    Build the list of search terms derived from a query.

    The list starts with the query itself, followed by "<prefix> <query>" for
    every expansion prefix and "<query> <suffix>" for every expansion suffix.

    Args:
        query (str): Root search text.

    Returns:
        list[str]: The expanded terms, or an empty list on error (logged).
    """
    try:
        prefixed_terms = [f"{prefix} {query}" for prefix in get_expanded_term_prefixes()]
        suffixed_terms = [f"{query} {suffix}" for suffix in get_expanded_term_suffixes()]
        return [query, *prefixed_terms, *suffixed_terms]
    except Exception as e:
        logging.error(f"Error in get_expanded_terms: {e}")
        return []
|
||||
|
||||
|
||||
|
||||
def get_expanded_suggestions(query):
    """
    Collect auto-suggestions for every expanded form of a query.

    Args:
        query (str): Root search text.

    Returns:
        list[dict]: Formatted suggestions from all expanded terms, ordered by
        descending 'relevance' (missing relevance counts as 0), or an empty
        list on error (logged).
    """
    try:
        collected = []
        for term in tqdm(get_expanded_terms(query), desc="📢❗🚨 Fetching Google AutoSuggestions", unit="term"):
            raw_results = get_results(term)
            if not raw_results:
                continue
            collected.extend(format_results(raw_results))

        # Stable sort: suggestions with equal relevance keep fetch order.
        collected.sort(key=lambda suggestion: suggestion.get('relevance', 0), reverse=True)
        return collected
    except Exception as e:
        logging.error(f"Error in get_expanded_suggestions: {e}")
        return []
|
||||
|
||||
|
||||
|
||||
def get_suggestions_for_keyword(search_term):
    """
    Return de-duplicated auto-suggestions for a search term as a DataFrame.

    Args:
        search_term (str): Root keyword to expand.

    Returns:
        pd.DataFrame | None: Columns 'Keywords' and 'Relevance', one row per
        distinct keyword; empty (but with both columns) when no suggestions
        were found. None if an unexpected error occurs (logged).
    """
    try:
        expanded_results = get_expanded_suggestions(search_term)
        # Naming the source keys keeps an empty suggestion list a valid
        # two-column frame; without them the rename below raises ValueError.
        expanded_results_df = pd.DataFrame(expanded_results, columns=['term', 'relevance'])
        expanded_results_df.columns = ['Keywords', 'Relevance']
        #expanded_results_df.to_csv('results.csv', index=False)
        pd.set_option('display.max_rows', expanded_results_df.shape[0]+1)
        expanded_results_df.drop_duplicates('Keywords', inplace=True)

        return expanded_results_df
    except Exception as e:
        logging.error(f"get_suggestions_for_keyword: Error in main: {e}")
        return None
|
||||
|
||||
|
||||
|
||||
def perform_keyword_clustering(expanded_results_df, num_clusters=5):
    """
    Cluster keywords with K-means on their TF-IDF vectors.

    The input frame is modified in place: 'Keywords' is lower-cased and a
    'cluster_label' column is added.

    Args:
        expanded_results_df (pd.DataFrame): Frame with a 'Keywords' column.
        num_clusters (int): Requested number of clusters; capped at the number
            of keywords.

    Returns:
        pd.DataFrame: The same frame with 'cluster_label' added, or an empty
        DataFrame if clustering fails (the error is logged).
    """
    try:
        # Preprocessing: Convert the keywords to lowercase
        expanded_results_df['Keywords'] = expanded_results_df['Keywords'].str.lower()

        # Vectorization: fit a TF-IDF vectorizer to the keywords
        vectorizer = TfidfVectorizer()
        tfidf_vectors = vectorizer.fit_transform(expanded_results_df['Keywords'])

        # K-means cannot form more clusters than there are samples; cap the
        # request instead of discarding a short keyword list.
        sample_count = tfidf_vectors.shape[0]
        effective_clusters = min(num_clusters, sample_count)

        kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(tfidf_vectors)

        # Add cluster labels to the DataFrame
        expanded_results_df['cluster_label'] = cluster_labels

        # The silhouette score is only defined for 2..n_samples-1 distinct
        # labels. Skip it otherwise rather than losing the clustering.
        label_count = len(set(cluster_labels))
        if 2 <= label_count <= sample_count - 1:
            silhouette_avg = silhouette_score(tfidf_vectors, cluster_labels)
            print(f"Silhouette Score: {silhouette_avg}")
        else:
            logging.warning(
                f"Silhouette score skipped: {label_count} cluster(s) for {sample_count} keyword(s)"
            )

        # Visualize cluster quality using a silhouette plot
        #visualize_silhouette(tfidf_vectors, cluster_labels)

        return expanded_results_df
    except Exception as e:
        logging.error(f"Error in perform_keyword_clustering: {e}")
        return pd.DataFrame()
|
||||
|
||||
|
||||
|
||||
def visualize_silhouette(X, labels):
    """
    Print the mean silhouette score and draw a per-cluster silhouette plot.

    Args:
        X: Sample matrix passed to silhouette_score / silhouette_samples;
            X.shape[0] is used as the sample count.
        labels: Cluster label per sample. It is compared element-wise with
            each label (`labels == cluster_id`), so it presumably needs to be
            a NumPy array - verify against callers.

    Errors are logged rather than raised.
    """
    try:
        mean_score = silhouette_score(X, labels)
        print(f"Silhouette Score: {mean_score}")

        cluster_ids = set(labels)
        band_gap = 10  # blank rows between consecutive cluster bands

        # Single axes holding the silhouette plot
        fig, axis = plt.subplots(1, 1, figsize=(8, 6))
        axis.set_xlim([-0.1, 1])
        axis.set_ylim([0, X.shape[0] + (len(cluster_ids) + 1) * band_gap])

        # Silhouette value of every individual sample
        per_sample_scores = silhouette_samples(X, labels)

        band_start = band_gap
        for cluster_id in cluster_ids:
            # Sorted scores of the samples assigned to this cluster
            band_scores = np.sort(per_sample_scores[labels == cluster_id])
            band_size = band_scores.shape[0]
            band_end = band_start + band_size

            shade = plt.cm.nipy_spectral(float(cluster_id) / len(cluster_ids))
            axis.fill_betweenx(
                np.arange(band_start, band_end),
                0,
                band_scores,
                facecolor=shade,
                edgecolor=shade,
                alpha=0.7,
            )

            # Cluster number at the vertical middle of its band
            axis.text(-0.05, band_start + 0.5 * band_size, str(cluster_id))

            # Next band starts after the gap
            band_start = band_end + band_gap

        axis.set_title("Silhouette plot for KMeans clustering")
        axis.set_xlabel("Silhouette coefficient values")
        axis.set_ylabel("Cluster label")

        # Vertical marker at the mean silhouette score
        axis.axvline(x=mean_score, color="red", linestyle="--")

        plt.show()
    except Exception as e:
        logging.error(f"Error in visualize_silhouette: {e}")
|
||||
|
||||
|
||||
|
||||
def print_and_return_top_keywords(expanded_results_df, num_clusters=5):
    """
    Display and return top keywords in each cluster.

    For every cluster the five rows with the highest 'Relevance' are kept.
    The combined table is printed and saved via save_in_file().

    Args:
        expanded_results_df (pd.DataFrame): DataFrame containing expanded keywords, relevance, and cluster labels.
        num_clusters (int or str): Number of clusters (labels 0..n-1 are used) or 'all'.

    Returns:
        pd.DataFrame: DataFrame with top keywords for each cluster.
    """
    if num_clusters == 'all':
        unique_clusters = expanded_results_df['cluster_label'].unique()
    else:
        unique_clusters = range(int(num_clusters))

    # Collect the per-cluster slices and concatenate once, rather than growing
    # a frame that starts out empty.
    per_cluster_top = [
        expanded_results_df[expanded_results_df['cluster_label'] == cluster_id]
        .sort_values(by='Relevance', ascending=False)
        .head(5)
        for cluster_id in unique_clusters
    ]
    # pd.concat rejects an empty list, e.g. when num_clusters is 0.
    top_keywords_df = pd.concat(per_cluster_top) if per_cluster_top else pd.DataFrame()

    print("\n📢❗🚨 GTop Keywords for All Clusters:")
    table = tabulate(top_keywords_df, headers='keys', tablefmt='fancy_grid')

    # Save the rendered table text; the file writer expects a string, not a
    # DataFrame.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    print(table)
    return top_keywords_df
|
||||
|
||||
|
||||
def generate_wordcloud(keywords):
    """
    Generate and display a word cloud from a list of keywords.

    Args:
        keywords (list): List of keywords.
    """
    # Convert the list of keywords to a string
    text = ' '.join(keywords)
    if not text.strip():
        # NOTE(review): WordCloud.generate presumably rejects text without any
        # words - confirm; either way there is nothing to draw.
        logging.warning("generate_wordcloud: no keywords to draw")
        return

    # Only the `wordcloud` module is imported in this file, so the class is
    # reached through it. The local name must differ from the module name or
    # the module lookup would fail inside this function.
    cloud_image = wordcloud.WordCloud(width=800, height=400, background_color='white').generate(text)

    # figsize is in inches; (600, 200) would request an enormous canvas.
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()
|
||||
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """
    Append search analysis to the file named by SEARCH_SAVE_FILE.

    Failures are logged, never raised, so saving stays best effort.

    Args:
        table_content: Content to save; anything that is not already a string
            is converted with str() first.
    """
    # This module's import block does not import `os`, so bring it in here.
    import os

    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        # Without this guard open(None) fails with an obscure TypeError.
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return

    try:
        # Append mode, matching the save_in_file helper in
        # google_serp_search.py: several results are saved per run and "w"
        # would leave only the last one. utf-8 is explicit because the tables
        # contain box-drawing characters.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(str(table_content))
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
|
||||
|
||||
def do_google_trends_analysis(search_term):
    """
    Collect target keywords for a search term from suggestions and related topics.

    Auto-suggestions are clustered and the top keywords per cluster are kept.
    Related-topic titles from Google Trends are then appended. The combined
    keywords are printed as a four-column table and drawn as a word cloud.

    Args:
        search_term: Root keyword; it is converted to a string.

    Returns:
        list[str] | None: Cluster keywords followed by topic titles, or None
        if any step fails (the error is logged).
    """
    search_term = [f"{search_term}"]
    cluster_keywords = []
    try:
        for asearch_term in search_term:
            #FIXME: Lets work with a single root keyword.
            suggestions_df = get_suggestions_for_keyword(asearch_term)

            result_df = perform_keyword_clustering(suggestions_df)
            # Display top keywords in each cluster
            top_keywords = print_and_return_top_keywords(result_df)
            cluster_keywords.extend(
                str(keyword) for keyword in top_keywords['Keywords'].tolist() if keyword
            )

        # FIXME: Get result from vision GPT. Fetch and visualize Google Trends data
        # FIXME: Plot Interest Over time.
        result_df = get_related_topics_and_save_csv(search_term)

        # 'topic_title' may be one column or two (top and rising side by side).
        # Flattening the underlying array handles both, and dropna() removes
        # the padding NaNs that would otherwise turn into 'nan' keywords.
        topic_titles = []
        if 'topic_title' in result_df.columns:
            flat_titles = pd.Series(result_df['topic_title'].to_numpy().ravel()).dropna()
            topic_titles = [str(title) for title in flat_titles if title]

        print(f"\nRising and Top keywords: {', '.join(topic_titles)}")
        print(f"\n\n📢❗🚨 Important keywords to target: {', '.join(cluster_keywords)}\n\n")

        # Keep the keywords as a list throughout. Joining to a string and
        # splitting back glued the last keyword to the first title, and the
        # later second split on a list always raised.
        all_the_keywords = cluster_keywords + topic_titles

        # Split the list into rows of `chunk_size` keywords
        chunk_size = 4
        chunks = [
            all_the_keywords[start:start + chunk_size]
            for start in range(0, len(all_the_keywords), chunk_size)
        ]
        if chunks:
            # Pad the last row so every row matches the column count.
            chunks[-1] = chunks[-1] + [""] * (chunk_size - len(chunks[-1]))
        combined_df = pd.DataFrame(chunks, columns=[f'K📢eyword Col{i + 1}' for i in range(chunk_size)])

        # Print the table
        print(tabulate(combined_df, headers='keys', tablefmt='fancy_grid'))

        print(all_the_keywords)
        generate_wordcloud(all_the_keywords)
        return all_the_keywords
    except Exception as e:
        logging.error(f"Error in main: {e}")
        return None
|
||||
49
lib/ai_web_researcher/gpt_blog_sections.py
Normal file
49
lib/ai_web_researcher/gpt_blog_sections.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import sys
|
||||
import json
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
# FIXME: Provide num_blogs, num_faqs as inputs.
|
||||
def get_blog_sections_from_websearch(search_keyword, search_results, gpt_providers="gemini"):
    """Ask an LLM for a blog title and five sub titles based on search results.

    Args:
        search_keyword (str): The keyword that was researched.
        search_results: Google search result to give the model as context.
        gpt_providers (str): Provider selector; must contain 'gemini' or 'openai'.

    Returns:
        For gemini, the response parsed with ``json.loads``; for openai, the
        raw value returned by ``openai_chatgpt``; None for any other provider.

    Raises:
        Exception: Whatever the provider call or the JSON parsing raised.
    """
    prompt = f"""
As a SEO expert and content writer, I will provide you with a search keyword and its google search result.
Your task is to write a blog title and 5 blog sub titles, from the given google search result.
The subtitles should be less than 40 characters and click worthy.
Do not explain, describe your response. Respond in json format, always name the key as 'blogSections'.

Web Research Keyword: "{search_keyword}"
Google search Result: "{search_results}"
"""

    if 'gemini' in gpt_providers:
        # Bound up-front so the error handler can inspect it safely even when
        # the provider call itself is what failed.
        response = None
        try:
            response = gemini_text_response(prompt)
            if '```' in response and '\n' in response:
                response = response.strip().split('\n')
                # Remove the first and last lines (the markdown code fences)
                response = '\n'.join(response[1:-1])
            response = json.loads(response)
            return response
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            # Only response objects carry prompt_feedback; a str or None does
            # not, and touching it used to replace the real error.
            prompt_feedback = getattr(response, "prompt_feedback", None)
            if prompt_feedback is not None:
                logger.error(f"Gemini Error: {prompt_feedback}")
            raise
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
            logger.error(f"Failed to get response from Openai: {err}")
            raise
|
||||
41
lib/ai_web_researcher/gpt_competitor_analysis.py
Normal file
41
lib/ai_web_researcher/gpt_competitor_analysis.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def summarize_competitor_content(research_content, gpt_providers="openai"):
    """Summarize competitor research text with the selected LLM provider.

    Gemini receives a company-report prompt; OpenAI receives the bare page
    content. Any other provider value yields None. Provider errors are logged
    and re-raised.
    """
    page_prompt = f""" Web page content: {research_content} """

    if 'gemini' in gpt_providers:
        company_prompt = f"""You are a helpful assistant writing a research report about a company. I will provide you with company details.
Summarize the given company details into multiple paragraphs.
Be extremely concise, professional, and factual as possible.
The first paragraph should be an introduction and summary of the company.
The second paragraph should include pros and cons of the company.
The third paragraph should be on their pricing model.
Include a conclusion, summarizing your research about the given company details.
Company details: '{research_content}'"""
        try:
            return gemini_text_response(company_prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' not in gpt_providers:
        return None

    try:
        logger.info("Calling OpenAI LLM.")
        return openai_chatgpt(page_prompt)
    except Exception as err:
        logger.error(f"failed to get response from Openai: {err}")
        raise err
|
||||
185
lib/ai_web_researcher/gpt_online_researcher.py
Normal file
185
lib/ai_web_researcher/gpt_online_researcher.py
Normal file
@@ -0,0 +1,185 @@
|
||||
################################################################
|
||||
#
|
||||
#
|
||||
#
|
||||
##############################################################
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import List, NamedTuple
|
||||
from loguru import logger
|
||||
from datetime import datetime
|
||||
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
from .tavily_ai_search import get_tavilyai_results
|
||||
from .metaphor_basic_neural_web_search import metaphor_find_similar, metaphor_search_articles
|
||||
from .google_serp_search import google_search
|
||||
from .google_trends_researcher import do_google_trends_analysis
|
||||
from .gpt_blog_sections import get_blog_sections_from_websearch
|
||||
from .web_research_report import write_web_research_report
|
||||
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def gpt_web_researcher(search_keywords, time_range=None, include_domains=None, similar_url=None):
    """Run the web-research pipeline for the given search keywords.

    Only the Google Trends step is active right now; the Google SERP, Tavily
    and Metaphor steps are kept below as commented-out work in progress.

    Args:
        search_keywords (str): Keywords to research; also used in the report file name.
        time_range: Forwarded to the (currently disabled) Metaphor search.
        include_domains (list, optional): Domains to restrict searches to.
        similar_url: Forwarded to the (currently disabled) Metaphor search.

    Side effects:
        Sets the ``SEARCH_SAVE_FILE`` environment variable to a timestamped
        path under ``<cwd>/workspace/web_research_reports``.
    """
    # None replaces the shared mutable default; an empty list is substituted here.
    if not include_domains:
        include_domains = list()
    print(f"Web Research:Time Range - {time_range},Search Keywords - {search_keywords},Include URLs - {include_domains}")

    # TBD: Keeping the results directory as fixed, for now.
    os.environ["SEARCH_SAVE_FILE"] = os.path.join(os.getcwd(), "workspace", "web_research_reports",
            search_keywords.replace(" ", "_") + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

    # Collect all blog titles featuring in search results. This *may help in generating blog titles
    # closest to competing ones. All search blog titles, given keyword and keywords from analysis, give
    # llm a good context for the task of generating blog titles.
    blog_titles = []
    # Get a list of FAQs from search results.
    blog_faqs = None
    google_result = None
    tavily_result = None
    report = None
    # Bound before the try so the print below cannot hit a NameError when the
    # trends analysis fails.
    important_keywords = None

#    try:
#        logger.info(f"Doing Google search for: {search_keywords}\n")
#        google_result = google_search(search_keywords)
#        blog_titles.append(extract_info(google_result, "titles"))
#    except Exception as err:
#        logger.error(f"Failed to do Google Serpapi research: {err}")
#        # Not failing, as tavily would do same and then GPT-V to search.
#
#    try:
#        # FIXME: Include the follow-up questions as blog FAQs.
#        logger.info(f"Doing Tavily AI search for: {search_keywords}")
#        tavily_result = get_tavilyai_results(search_keywords, include_domains)
#        blog_titles.append(tavily_extract_information(tavily_result, "titles"))
#    except Exception as err:
#        logger.error(f"Failed to do Tavily AI Search: {err}")

#    try:
#        logger.info(f"Start Semantic/Neural web search with Metahpor: {search_keywords}")
#        response_articles = metaphor_search_articles(
#                search_keywords,
#                include_domains=include_domains,
#                time_range=time_range,
#                similar_url=similar_url)
#        blog_titles.append(metaphor_extract_titles_or_text(response_articles, return_titles=True))
#    except Exception as err:
#        logger.error(f"Failed to do Metaphor search: {err}")
#    print(blog_titles)

    try:
        logger.info(f"Do Google Trends analysis for given keywords: {search_keywords}")
        important_keywords = do_google_trends_analysis(search_keywords)
    except Exception as err:
        logger.error(f"Failed to do google trends analysis: {err}")
    print(important_keywords)
    # Now that we have search results from given keywords. Generate blog title and subtopics suggestions.
    # 1. Return a list of related keywords along with search volumes.
    # 2. New blog titles to write on(niche, top) and blog sections.
    # 3. Competitors list, similar urls if given.
|
||||
|
||||
|
||||
# Record describing one search result; metaphor_extract_titles_or_text builds
# these from dicts with exactly these keys.
Result = NamedTuple("Result", [
    ("url", str),
    ("id", str),
    ("title", str),
    ("score", float),
    ("published_date", str),
    ("author", str),
    ("text", str),
    ("highlights", List[str]),
    ("highlight_scores", List[float]),
])
|
||||
|
||||
|
||||
def metaphor_extract_titles_or_text(json_data, return_titles=True):
    """
    Pull one field out of every result record.

    Each item of ``json_data`` is expanded into a ``Result`` (so it must
    provide exactly the Result fields as keys), then either its title or its
    text is collected.

    Args:
        json_data (list): Mappings whose keys match the Result fields.
        return_titles (bool): Truthy selects titles, falsy selects text.

    Returns:
        list: The selected field of every record, in input order.
    """
    records = [Result(**item) for item in json_data]
    wanted = "title" if return_titles else "text"
    return [getattr(record, wanted) for record in records]
|
||||
|
||||
|
||||
def extract_info(json_data, info_type):
    """
    Collect one kind of entry from a search-result mapping.

    Args:
        json_data (dict): Search result with optional 'organic',
            'peopleAlsoAsk' and 'relatedSearches' lists of dicts.
        info_type (str): One of 'titles', 'peopleAlsoAsk', 'relatedSearches'.

    Returns:
        list or None: The requested values (None for entries missing the
        field), or None after printing a hint when info_type is unknown.
    """
    # (selector, top-level section, field read from each entry)
    lookups = (
        ("titles", "organic", "title"),
        ("peopleAlsoAsk", "peopleAlsoAsk", "question"),
        ("relatedSearches", "relatedSearches", "query"),
    )
    for selector, section, field in lookups:
        if info_type == selector:
            return [entry.get(field) for entry in json_data.get(section, [])]

    print("Invalid info_type. Please use 'titles', 'peopleAlsoAsk', or 'relatedSearches'.")
    return None
|
||||
|
||||
|
||||
def tavily_extract_information(json_data, keyword):
    """
    Extract information from a Tavily search result based on a keyword.

    Args:
        json_data (dict): Tavily result with 'results', 'answer' and
            'follow_up_questions' keys.
        keyword (str): 'title' (or its alias 'titles'), 'content', 'answer'
            or 'follow-query'.

    Returns:
        list or str: Titles or contents of all results, the answer, the
        follow-up questions, or the string "Invalid keyword: <keyword>" for
        an unknown keyword.

    Raises:
        KeyError: If json_data lacks the key the selected keyword reads.
    """
    # 'titles' is accepted as well: it is the selector extract_info uses and
    # the one the research pipeline passes; rejecting it put an error string
    # where a list of titles was expected.
    if keyword in ('title', 'titles'):
        return [result['title'] for result in json_data['results']]
    elif keyword == 'content':
        return [result['content'] for result in json_data['results']]
    elif keyword == 'answer':
        return json_data['answer']
    elif keyword == 'follow-query':
        return json_data['follow_up_questions']
    else:
        return f"Invalid keyword: {keyword}"
|
||||
|
||||
|
||||
def compete_organic_results(query, report, organic_results):
    """Ask Gemini to rewrite a blog so it competes with the top organic results.

    The blog content and the organic search results are embedded in one
    prompt; whatever ``gemini_text_response`` returns is passed back unchanged.
    """
    rewrite_prompt = f""" As an SEO expert and copywriter, I will provide you with my blog content on topic '{query}', and
Top google search results.
Your task is to rewrite the given blog to make it compete against top position results.
Make sure, the new blog has high probability of ranking highest against given organic search result competitors.
Modify the given blog content following best SEO practises.
Make sure the blog is original, unique and highly readable.
Remember, Maintain and adopt the formatting, structure, style and tone of the provided blog content.
Include relevant emojis in your final blog for visual appeal. Use it sparingly.
Your response should be well-structured, objective, and critically acclaimed blog article based on provided texts.

Remember, your goal is to create a detailed blog article that will compete against given organic result competitors.
Do not provide explanations, suggestions for your response, reply only with your final response.
Take your time in crafting your content, do not rush to give the response.
Blog Content: '{report}'\n
Organic Search result: '{organic_results}'
"""
    return gemini_text_response(rewrite_prompt)
|
||||
38
lib/ai_web_researcher/gpt_summarize_web_content.py
Normal file
38
lib/ai_web_researcher/gpt_summarize_web_content.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import sys
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def summarize_web_content(page_content, gpt_providers="openai"):
    """Summarize a web page's text with the selected LLM provider.

    Gemini gets an explicit summarisation instruction; OpenAI gets the bare
    page content. Any other provider value yields None. Provider errors are
    logged and re-raised.
    """
    plain_prompt = f"""
Web page content: {page_content}
"""

    if 'gemini' in gpt_providers:
        summary_prompt = f"""You are a helpful assistant that briefly summarizes the content of a webpage.
Summarize the given web page content below.
Web page content: '{page_content}'"""
        try:
            return gemini_text_response(summary_prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' not in gpt_providers:
        return None

    try:
        logger.info("Calling OpenAI LLM.")
        return openai_chatgpt(plain_prompt)
    except Exception as err:
        logger.error(f"failed to get response from Openai: {err}")
        raise err
|
||||
53
lib/ai_web_researcher/gpt_titles_faq.py
Normal file
53
lib/ai_web_researcher/gpt_titles_faq.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import sys
|
||||
import json
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
# FIXME: Provide num_blogs, num_faqs as inputs.
|
||||
def gpt_titles_faqs_google_search(search_keyword, search_results, gpt_providers="openai"):
    """Generate one blog title and ten FAQs from a Google search result.

    For gemini the reply is stripped of a surrounding markdown fence (first
    and last line) when it contains one, then parsed as JSON. For openai the
    reply is returned as received. Other providers yield None. Errors are
    logged and re-raised.
    """
    prompt = f"""
As a SEO expert and content writer, I will provide you with my web research keyword and its google search result in json format.
Your task is to write 1 blog title and 10 FAQs.

1). Your blog title should compete against all the provided search results.
2). Your FAQ should be based on 'People also ask' and 'Related Queries' from given result.
Always include answers for each FAQ, use your knowledge and confirm with snippets given in search result.
3). Respond in json data with 'blogTitles' and 'FAQs' as json keys. Do not explain, describe your response.
4). Follow best practises of SEO.

Web Research Keyword: "{search_keyword}"
Google search Result: "{search_results}"
"""
    logger.info("Generating blog title and FAQs from web search result.")

    if 'gemini' in gpt_providers:
        try:
            reply = gemini_text_response(prompt)
            print(f"\n\n\n RESPONSE: {reply}\n\n\n")
            fenced = '```' in reply and '\n' in reply
            if fenced:
                # Drop the opening and closing fence lines.
                reply_lines = reply.strip().split('\n')
                reply = '\n'.join(reply_lines[1:-1])
            # NOTE(review): assumes parsing applies to fenced and unfenced
            # replies alike — confirm against the original layout.
            return json.loads(reply)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' not in gpt_providers:
        return None

    try:
        logger.info("Calling OpenAI LLM.")
        return openai_chatgpt(prompt)
    except Exception as err:
        logger.error(f"Failed to get response from Openai: {err}")
        raise err
|
||||
223
lib/ai_web_researcher/metaphor_basic_neural_web_search.py
Normal file
223
lib/ai_web_researcher/metaphor_basic_neural_web_search.py
Normal file
@@ -0,0 +1,223 @@
|
||||
import os
|
||||
import sys
|
||||
import pandas as pd
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
|
||||
from metaphor_python import Metaphor
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
from tabulate import tabulate
|
||||
from collections import namedtuple
|
||||
import textwrap
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../../.env'))
|
||||
|
||||
from exa_py import Exa
|
||||
|
||||
from tenacity import (retry, stop_after_attempt, wait_random_exponential,)# for exponential backoff
|
||||
from .gpt_summarize_web_content import summarize_web_content
|
||||
from .gpt_competitor_analysis import summarize_competitor_content
|
||||
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def _create_exa_client(api_key):
    """Build the Exa client, retrying with exponential backoff if that fails."""
    return Exa(api_key)


def get_metaphor_client():
    """
    Get the Metaphor (Exa) API client.

    The API key is read from the ``METAPHOR_API_KEY`` environment variable.
    The key check is deliberately outside the retried helper: a missing key is
    not transient, so retrying it only delayed the failure and replaced the
    ValueError with a retry error.

    Returns:
        Exa: A client instance created with the configured API key.

    Raises:
        ValueError: If METAPHOR_API_KEY is not set or empty.
    """
    METAPHOR_API_KEY = os.environ.get('METAPHOR_API_KEY')
    if not METAPHOR_API_KEY:
        raise ValueError("METAPHOR_API_KEY environment variable not set!")
    return _create_exa_client(METAPHOR_API_KEY)
|
||||
|
||||
|
||||
def metaphor_rag_search():
    """Mainly used for researching blog sections.

    Stub: it only obtains the API client and returns None.
    """
    get_metaphor_client()
|
||||
|
||||
|
||||
|
||||
def metaphor_find_similar(similar_url):
    """
    Find pages similar to a URL and summarise each one as a competitor.

    For every similar page a keyword search on its URL is run, the text of
    those results is concatenated and summarised with Gemini, and the summary
    replaces the page's ``text``. The competitors are then printed and saved
    through ``print_search_result``.

    Args:
        similar_url (str): The URL to find similar content for.

    Returns:
        The response of the similar-content search, whose results carry the
        summaries written above.

    Raises:
        Exception: If the similar-content search itself fails. Failures for an
        individual competitor are logged and that competitor is left as is.
    """
    metaphor = get_metaphor_client()
    try:
        logger.info(f"Doing similar web search for url: {similar_url}")
        similar_response = metaphor.find_similar_and_contents(
                similar_url,
                highlights=True,
                num_results=10)
    except Exception as e:
        logger.error(f"Metaphor: Error in finding similar content: {e}")
        raise

    competitors = similar_response.results
    for acompetitor in tqdm(competitors, desc="Processing Competitors", unit="competitor"):
        try:
            # Kept in its own variable: reusing the outer response name made
            # the function return the last competitor's search instead.
            research_response = metaphor.search_and_contents(
                    acompetitor.url,
                    type="keyword",
                    num_results=5
            ).results
        except Exception as err:
            logger.error(f"Failed to do metaphor keyword/url research: {err}")
            # Skip this competitor; falling through would summarise results
            # left over from a previous request.
            continue

        # Add a progress bar for the inner loop; a result without text
        # contributes nothing instead of raising on concatenation.
        all_contents = "".join(
                r.text or ""
                for r in tqdm(research_response, desc=f"{acompetitor.url}", unit="research"))
        try:
            acompetitor.text = summarize_competitor_content(all_contents, "gemini")
        except Exception as err:
            logger.error(f"Failed to summarize_web_content: {err}")

    print_search_result(competitors)
    return similar_response
|
||||
|
||||
|
||||
|
||||
def metaphor_search_articles(query,
        num_results=5,
        use_autoprompt=True,
        include_domains=None,
        time_range=None,
        similar_url=None):
    """
    Search for articles using the Metaphor API and summarise each hit.

    Args:
        query (str): The search query.
        num_results (int): Number of results to retrieve.
        use_autoprompt (bool): Whether to use autoprompt.
        include_domains (list): List of domains to include (None means no restriction list).
        time_range (str): "past day", "past week", "past month" or
            "past year"; any other value disables the date filter.
        similar_url (str): When given, a similar-content search for this URL
            is run as well (its result is not returned).

    Returns:
        list: The search results, each with ``text`` replaced by its summary.

    Raises:
        Exception: Any failure of the search or summarisation, after logging.
    """
    metaphor = get_metaphor_client()
    # None replaces the shared mutable default; the API still receives a list.
    if include_domains is None:
        include_domains = []
    days_back_by_range = {"past day": 1, "past week": 7, "past month": 30, "past year": 365}
    try:
        days_back = days_back_by_range.get(time_range) if isinstance(time_range, str) else None
        if days_back is None:
            start_published_date = None
        else:
            start_published_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        logger.info(f"Metaphor web search with Date: {start_published_date} and Query: {query}")
        try:
            search_response = metaphor.search_and_contents(
                    query,
                    include_domains=include_domains,
                    # Honour the caller's choice; this was hard-coded to True.
                    use_autoprompt=use_autoprompt,
                    start_published_date=start_published_date,
                    num_results=num_results
            )
        except Exception as err:
            logger.error(f"Failed in metaphor.search_and_contents: {err}")
            # Re-raise: continuing without a response only produced an
            # UnboundLocalError that hid this error.
            raise

        # From each webpage, get a summary of the web page.
        contents_response = search_response.results
        for content in tqdm(contents_response, desc="Reading Web URL content:", unit="content"):
            summarized_content = summarize_web_content(content.text, "gemini")
            content.text = summarized_content

        print_search_result(contents_response)

        if similar_url:
            logger.info(f"Doing similar/semantic search for URL: {similar_url}")
            metaphor_find_similar(similar_url)
        return contents_response

    except Exception as e:
        logger.error(f"Error in Metaphor searching articles: {e}")
        raise
|
||||
|
||||
|
||||
def print_search_result(contents_response):
    """Print search results as a table and hand the table to ``save_in_file``.

    Args:
        contents_response: Iterable of result objects exposing ``url``,
            ``title``, ``published_date`` and ``text`` attributes.
    """
    # Tabulate the data (the unused namedtuple that was rebuilt on every call
    # has been removed; rows are plain tuples).
    table_headers = ["URL", "Title", "Published Date", "Summary"]
    table_data = [(result.url, result.title, result.published_date, result.text) for result in contents_response]

    table = tabulate(table_data,
        headers=table_headers,
        tablefmt="fancy_grid",
        colalign=["left", "left", "left", "left"],
        maxcolwidths=[20, 20, 10, 60])
    print(table)
    # Save the combined table to a file
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """Append search analysis output to the report file.

    The destination path is read from the ``SEARCH_SAVE_FILE`` environment
    variable. Saving is best-effort: every failure is logged and swallowed.

    Args:
        table_content (str): Rendered text (for example a tabulate table).
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        # Without this guard open(None) fails with an opaque TypeError.
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return
    try:
        # UTF-8 is explicit because fancy_grid tables use box-drawing
        # characters that a platform default codec may not encode.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
|
||||
|
||||
def metaphor_scholar_search(query, include_domains=None, time_range="anytime"):
    """
    Search for papers using the Metaphor API.

    Args:
        query (str): The search query.
        include_domains (list): List of domains to include.
        time_range (str): "day", "week", "month" or "year" limit results to
            that look-back window (UTC); any other value applies no date filter.

    Returns:
        The API search response, or None when the search fails (the error is
        logged, not raised).
    """
    client = get_metaphor_client()
    lookback = {
        "day": timedelta(days=1),
        "week": timedelta(weeks=1),
        "month": timedelta(weeks=4),
        "year": timedelta(days=365),
    }
    try:
        window = lookback.get(time_range) if isinstance(time_range, str) else None
        since = None
        if window is not None:
            since = (datetime.utcnow() - window).strftime('%Y-%m-%dT%H:%M:%SZ')

        return client.search(query, include_domains=include_domains, start_published_date=since, use_autoprompt=True)
    except Exception as e:
        logger.error(f"Error in searching papers: {e}")
|
||||
156
lib/ai_web_researcher/tavily_ai_search.py
Normal file
156
lib/ai_web_researcher/tavily_ai_search.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""
|
||||
This Python script uses the Tavily AI service to perform advanced searches based on specified keywords and options. It retrieves Tavily AI search results, pretty-prints them using Rich and Tabulate, and provides additional information such as the answer to the search query and follow-up questions.
|
||||
|
||||
Features:
|
||||
- Utilizes the Tavily AI service for advanced searches.
|
||||
- Retrieves API keys from the environment variables loaded from a .env file.
|
||||
- Configures logging with Loguru for informative messages.
|
||||
- Implements a retry mechanism using Tenacity to handle transient failures during Tavily searches.
|
||||
- Displays search results, including titles, snippets, and links, in a visually appealing table using Tabulate and Rich.
|
||||
|
||||
Usage:
|
||||
- Ensure the necessary API keys are set in the .env file.
|
||||
- Run the script to perform a Tavily AI search with specified keywords and options.
|
||||
- The search results, including titles, snippets, and links, are displayed in a formatted table.
|
||||
- Additional information, such as the answer to the search query and follow-up questions, is presented in separate tables.
|
||||
|
||||
Modifications:
|
||||
- To modify the script, update the environment variables in the .env file with the required API keys.
|
||||
- Adjust the search parameters, such as keywords and search depth, in the `get_tavilyai_results` function as needed.
|
||||
- Customize logging configurations and table formatting according to preferences.
|
||||
|
||||
To-Do (TBD):
|
||||
- Consider adding further enhancements or customization based on specific use cases.
|
||||
|
||||
Note: This script depends on external libraries such as Tavily, Rich, Tabulate, Loguru, Tenacity, and python-dotenv. Install them using 'pip install tavily-python rich tabulate loguru tenacity python-dotenv' if not already installed.
|
||||
"""
|
||||
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from tavily import TavilyClient
|
||||
from rich import print
|
||||
from tabulate import tabulate
|
||||
# Load environment variables from .env file
|
||||
load_dotenv(Path('../../.env'))
|
||||
from rich import print
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
from tenacity import retry, stop_after_attempt, wait_random_exponential
|
||||
|
||||
from .gpt_titles_faq import gpt_titles_faqs_google_search
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def _run_tavily_search(client, keywords, search_depth, include_urls):
    """Run one Tavily search; retried with exponential backoff on any error."""
    if include_urls:
        return client.search(keywords, search_depth, include_answer=True, include_domains=include_urls)
    return client.search(keywords, search_depth, include_answer=True)


def get_tavilyai_results(keywords, include_urls, search_depth="advanced"):
    """
    Get Tavily AI search results based on specified keywords and options.

    The search call is retried on failure; configuration problems are not,
    because a missing key or a client that cannot be built is not transient.

    Args:
        keywords (str): Keywords for Tavily AI search.
        include_urls: Domains to restrict the search to; falsy means no restriction.
        search_depth (str, optional): Search depth option (default is "advanced").

    Returns:
        dict: Tavily AI search results, or None when the search (after
        retries) or the printing of its results fails; the error is logged.

    Raises:
        ValueError: If TAVILY_API_KEY is not set.
        Exception: If the Tavily client cannot be created.
    """
    # Run Tavily search
    logger.info(f"Running Tavily search on: {keywords}")

    # Retrieve API keys
    api_key = os.getenv('TAVILY_API_KEY')
    if not api_key:
        # Only the Tavily key is checked here, so the message names only it.
        raise ValueError("TAVILY_API_KEY environment variable is not set.")

    # Initialize Tavily client
    try:
        client = TavilyClient(api_key=api_key)
    except Exception as err:
        logger.error(f"Failed to create Tavily client. Check TAVILY_API_KEY: {err}")
        # Raise instead of exit(1): a library helper must not terminate the
        # whole process; the caller decides how to react.
        raise

    try:
        tavily_search_result = _run_tavily_search(client, keywords, search_depth, include_urls)
        print_result_table(tavily_search_result)
        return tavily_search_result
    except Exception as err:
        logger.error(f"Failed to do Tavily Research: {err}")
        return None
|
||||
|
||||
|
||||
def print_result_table(output_data):
    """Pretty print a Tavily search result and save every table that is shown.

    Three tables are produced in order: the result list (title, snippet,
    link), the answer, and the follow-up questions. Save failures are logged
    and do not stop the remaining tables.
    """
    def _show_and_store(rendered):
        # Print one table, then try to persist it.
        print(rendered)
        try:
            save_in_file(rendered)
        except Exception as save_results_err:
            logger.error(f"Failed to save search results: {save_results_err}")

    # Result list: one row per hit.
    rows = [
        [hit.get("title", ""), hit.get("content", ""), hit.get("url", "")]
        for hit in output_data.get("results")
    ]
    _show_and_store(tabulate(rows,
                             headers=["Title", "Snippet", "Link"],
                             tablefmt="fancy_grid",
                             colalign=["left", "left", "left"],
                             maxcolwidths=[30, 60, 30]))

    # Single-cell tables for the answer and the follow-up questions.
    single_cell_tables = (
        ("The answer to search query", "answer"),
        ("Search Engine follow up questions for query", "follow_up_questions"),
    )
    for heading, field in single_cell_tables:
        header_row = [f"{heading}: {output_data.get('query')}"]
        cell_rows = [[output_data.get(field)]]
        _show_and_store(tabulate(cell_rows,
                                 headers=header_row,
                                 tablefmt="fancy_grid",
                                 maxcolwidths=[80]))
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """Append search analysis content to the file named by SEARCH_SAVE_FILE.

    Failures are logged, never raised, so that saving stays best effort.

    Args:
        table_content (str): Rendered table text to append.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        # Without this guard open(None) fails with an opaque TypeError.
        logger.error("SEARCH_SAVE_FILE is not set; search content was not saved.")
        return

    try:
        # The tables use Unicode box-drawing characters, so do not rely on the
        # platform default encoding.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
23
lib/ai_web_researcher/web_research_report.py
Normal file
23
lib/ai_web_researcher/web_research_report.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from langchain.adapters.openai import convert_openai_messages
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
|
||||
def write_web_research_report(web_research, faq_questions, gpt_provider="gemini"):
    """Write a research report from web research data with the chosen LLM.

    Args:
        web_research: Web research results (JSON-like) used as report context.
        faq_questions: Related FAQ questions to pass along to the model.
        gpt_provider (str): Provider selector; must contain "gemini" or "openai".

    Returns:
        The report as returned by the selected provider.

    Raises:
        ValueError: If gpt_provider names neither supported provider.
    """
    # Built before branching so that every provider sees the same prompt.
    prompt = ["You are an SEO and marketing expert, who writes unique, factual and comprehensive research reports."
              "I will provide you web research report as json data and a list of related FAQ questions."
              "Use given json as context for writing your research report."
              "Your sole purpose is to write well written, critically acclaimed, objective and structured research report"
              "Use the urls from json content to provide cititations and include it in referances section of your report."
              "Include appropriate emojis in your research report."
              "Format your report in MLA format and markdown style, with special focus on readibility."
              f"Do not provide explanations for your response.\nWeb research Report: \"\"\" {web_research} \"\"\"\n "
              f"\nList of FAQ questions: \"\"\" {faq_questions} \"\"\"\n"]

    if "gemini" in gpt_provider:
        return gemini_text_response(prompt)

    if "openai" in gpt_provider:
        # NOTE(review): the original called an undefined openai_research_report();
        # this uses the langchain helpers this file already imports, with the
        # default ChatOpenAI model. Confirm the desired model and settings.
        lc_messages = convert_openai_messages([{"role": "user", "content": prompt[0]}])
        return ChatOpenAI().invoke(lc_messages).content

    raise ValueError(f"Unsupported gpt_provider: {gpt_provider!r}")
|
||||
137
lib/ai_web_researcher/you_web_reseacher.py
Normal file
137
lib/ai_web_researcher/you_web_reseacher.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import requests
|
||||
from clint.textui import progress
|
||||
from loguru import logger
|
||||
|
||||
|
||||
|
||||
def search_ydc_index(search_query, num_web_results=10, country="IN", api_key="<api-key>"):
    """
    Search YDC Index API and retrieve results.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key.

    Returns:
        dict: The response from the YDC Index API in JSON format, or
        {"error": "<message>"} when the request or its handling fails.
    """
    try:
        url = "https://api.ydc-index.io/search"

        querystring = {
            "query": search_query,
            "num_web_results": str(num_web_results),
            "country": country
        }

        headers = {"X-API-Key": api_key}

        # Bar needs a positive expected size to compute its fill ratio.
        expected = max(num_web_results, 1)
        with progress.Bar(expected_size=expected, label="Searching YDC Index") as bar:
            # A timeout keeps a stalled connection from blocking forever.
            response = requests.get(url, headers=headers, params=querystring, timeout=30)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            result_json = response.json()
            # Bar.show() expects a numeric progress value, not the result list.
            bar.show(min(len(result_json.get("web_results", [])), expected))

        return result_json

    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
|
||||
|
||||
def get_rag_results(search_query, num_web_results=10, country="IN", api_key="<api-key>"):
    """
    Retrieve RAG (retrieval-augmented generation) results from YDC Index API.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key.

    Returns:
        dict: The response from the YDC Index API in JSON format, or
        {"error": "<message>"} when the request or its handling fails.
    """
    try:
        url = "https://api.ydc-index.io/rag"

        querystring = {
            "query": search_query,
            "num_web_results": str(num_web_results),
            "country": country
        }

        headers = {"X-API-Key": api_key}

        # Bar needs a positive expected size to compute its fill ratio.
        expected = max(num_web_results, 1)
        with progress.Bar(expected_size=expected, label="Fetching RAG Results") as bar:
            # A timeout keeps a stalled connection from blocking forever.
            response = requests.get(url, headers=headers, params=querystring, timeout=30)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            result_json = response.json()
            # Bar.show() expects a numeric progress value, not the result list.
            bar.show(min(len(result_json.get("web_results", [])), expected))

        return result_json

    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
|
||||
|
||||
|
||||
def get_news_results(query, spellcheck=True, api_key="<api-key>"):
    """
    Retrieve news results from YDC Index API.

    Args:
        query (str): The search query.
        spellcheck (bool): Whether to enable spellcheck.
        api_key (str): YDC Index API key.

    Returns:
        dict: The response from the YDC Index API in JSON format, or
        {"error": "<message>"} when the request or its handling fails.
    """
    try:
        url = "https://api.ydc-index.io/news"

        querystring = {
            "q": query,
            "spellcheck": str(spellcheck).lower()
        }

        headers = {"X-API-Key": api_key}

        with progress.Bar(expected_size=1, label="Fetching News Results") as bar:
            # A timeout keeps a stalled connection from blocking forever.
            response = requests.get(url, headers=headers, params=querystring, timeout=30)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            result_json = response.json()
            # Bar.show() requires the progress value; 1 of 1 means "done".
            bar.show(1)

        return result_json

    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
|
||||
|
||||
|
||||
# Example usage. Guarded so that importing this module does not fire three
# network requests with the placeholder API key.
if __name__ == "__main__":
    search_query = "Getting started with llamaindex"
    result = get_news_results(search_query)
    print(result)
    result = get_rag_results(search_query)
    print(result)
    result = search_ydc_index(search_query)
    print(result)
|
||||
37
lib/blog_sections/faqs_generator_blog.py
Normal file
37
lib/blog_sections/faqs_generator_blog.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import sys
|
||||
|
||||
from .gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def generate_blog_faq(blog_article, gpt_providers="openai"):
    """
    Given blog content, generate 5 FAQs for it in markdown.

    Args:
        blog_article (str): Blog content the FAQs are based on.
        gpt_providers (str): Provider selector; "gemini" or "openai".

    Returns:
        The provider's response, or None when the gemini call fails or the
        provider is not recognised (both cases are logged).

    Raises:
        SystemError: If the OpenAI call fails.
    """
    logger.info("Generating blog FAQs.")
    prompt = f"""As an expert writer, I will provide you with blog content below.
        Your task is to write 5 FAQs based on the given blog content.
        Always, write fact based answers. Use emojis where applicable.
        You must reply in MARKDOWN format.
        blog content: '{blog_article}' """

    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            return None
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            # The original built this exception but never raised it, which hid
            # the failure behind a silent None.
            raise SystemError(f"Failed to get response from Openai: {err}") from err

    logger.error(f"Unsupported gpt provider for FAQ generation: {gpt_providers}")
    return None
|
||||
@@ -1,8 +0,0 @@
|
||||
{
|
||||
"wordpress_url": "https://latestaitools.in/",
|
||||
"wordpress_username": "username",
|
||||
"wordpress_password": "password",
|
||||
"image_dir": "path/to/image_dir",
|
||||
"output_path": "path/to/output_path"
|
||||
}
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
def generate_topic_outline(blog_title, num_subtopics):
    """
    Given a blog title generate an outline for it

    Args:
        blog_title (str): Title to suggest sub topics for.
        num_subtopics (int): Number of sub topics to ask the model for.

    Returns:
        The response of openai_chatgpt() for the outline prompt.

    Raises:
        SystemError: If the OpenAI call fails.
    """
    # TBD: Remove hardcoding, make dynamic
    prompt = f"""As a SEO expert, suggest only {num_subtopics} beginner-friendly and
        insightful sub topics for the blog title: {blog_title}.
        Respond with only answer and no description, explanations."""

    # The suggested {num_subtopics} outline should include few long-tailed keywords and most popular questions.
    # TBD: Include --niche
    logger.info(f"Prompt used for blog title Outline :\n{prompt}\n")
    # TBD: Add logic for which_provider and which_model
    try:
        return openai_chatgpt(prompt)
    except Exception as err:
        # The original never raised this and then returned an unbound
        # `response`, so callers saw an UnboundLocalError instead.
        raise SystemError(f"Error in generating Blog Title: {err}") from err
|
||||
39
lib/github_blogs/github_getting_started.py
Normal file
39
lib/github_blogs/github_getting_started.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import sys
|
||||
|
||||
from .gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
|
||||
def github_readme_blog(readme_content, gpt_providers="openai"):
    """Write a getting-started, how-to blog in markdown from README text.

    Args:
        readme_content (str): README text the guide is adapted from.
        gpt_providers (str): Provider selector; "gemini" or "openai".

    Returns:
        The provider's response, or None when the provider is not recognised.

    Raises:
        SystemError: If the OpenAI call fails.
        SystemExit: If the gemini call fails (exit status 1).
    """
    prompt = f"""As an expert programmer and teacher, Write an original, detailed and step-by-step guide, from the provided Text below.
        Your guide should be original, engaging and help beginners get started easily.
        Write new example codes and detailed comments on how to run them. Include appropriate emoji where applicable.
        Include a referances section that links to more code examples.
        Your response MUST be a how-to blog in markdown format.
        Respond ONLY with your blog content.

        Text: '{readme_content}'
        """
    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            sys.exit(1)
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            # The original built this exception but never raised it.
            raise SystemError(f"Failed to get response from Openai: {err}") from err

    logger.error(f"Unsupported gpt provider for README blog: {gpt_providers}")
    return None
|
||||
140
lib/github_blogs/main_getting_started_blogs.py
Normal file
140
lib/github_blogs/main_getting_started_blogs.py
Normal file
@@ -0,0 +1,140 @@
|
||||
""" Package for writing getting-started and how to guides. """
|
||||
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import json
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
from .scrape_github_readme import get_gh_details_vision, get_readme_content
|
||||
from .scrape_github_readme import research_github_topics, check_if_already_written
|
||||
from .github_getting_started import github_readme_blog
|
||||
from .gpt_online_researcher import do_online_research
|
||||
from .faqs_generator_blog import generate_blog_faq
|
||||
from .get_blog_metadata import blog_metadata
|
||||
from .save_blog_to_file import save_blog_to_file
|
||||
from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
|
||||
|
||||
|
||||
|
||||
def blog_from_github(github_opts, flag):
    """Write getting-started blogs from one GitHub URL or a file of URLs.

    Args:
        github_opts (str): A GitHub repository URL when flag contains 'url',
            or the path of a file with one URL per line when it contains 'csv'.
        flag (str): Input mode selector; 'url' or 'csv'.

    Raises:
        SystemExit: In 'url' mode, if writing the blog fails (exit status 1).
    """
    if 'url' in flag:
        try:
            write_from_url(github_opts)
        except Exception as err:
            logger.error(f"Failed to write from github url: {github_opts}: {err}")
            sys.exit(1)
    elif 'csv' in flag:
        gh_urls = []
        try:
            with open(github_opts, 'r') as file:
                # One URL per line; blank lines are not URLs and are skipped.
                gh_urls = [line.strip() for line in file if line.strip()]
        except FileNotFoundError:
            # The original referenced an undefined `file_path` here.
            logger.error(f"CSV File not found: {github_opts}")
        except Exception as e:
            logger.error(f"CSV: An error occurred: {str(e)}")

        for gh_url in gh_urls:
            try:
                write_from_url(gh_url)
            except Exception as err:
                # Keep going: one bad repository must not stop the batch.
                logger.error(f"Failed to write blog from github: {err}")
|
||||
|
||||
|
||||
|
||||
def write_from_url(gh_url):
    """Write, enrich and save a getting-started blog for one GitHub URL.

    Steps: skip URLs already written on, collect repository details from a
    screenshot, fetch the README, generate the how-to blog, append FAQs from
    online research, derive metadata, save the blog and record the URL.

    Args:
        gh_url (str): GitHub repository URL.

    Raises:
        SystemExit: If repository details, the README or saving fail.
        Exception: Re-raised when metadata generation or recording the URL fails.
    """
    if check_if_already_written(gh_url):
        logger.error(f"Skipping, already written on url: {gh_url}")
        return
    logger.info(f"Writing getting started from url: {gh_url}")

    # "owner/repo" are the last two path segments; tolerate a trailing slash.
    repo_path_parts = gh_url.rstrip("/").split("/")[-2:]
    repo_name = repo_path_parts[-1]

    # fixme: Remove the hardcoding, need add another option OR in config ?
    image_dir = os.path.join(os.getcwd(), "blog_images")
    generated_image_name = f"screenshot_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
    generated_image_filepath = os.path.join(image_dir, generated_image_name)
    try:
        logger.info(f"Getting github repo details from vision model: {generated_image_filepath}")
        gh_json = get_gh_details_vision(gh_url, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to get gemini vision details from GH repo image: {err}")
        sys.exit(1)

    # String to store the blog content, starting with a repo summary block.
    howto_blog = "```" + f"\nGithub URL:{gh_url}\nStars:{gh_json.get('stars')}\n"
    howto_blog += f"Forks:{gh_json.get('forks')}\n"
    howto_blog += f"Description:{gh_json.get('about')}\nBranch:{gh_json.get('branch_name')}\n" + "```\n\n"

    # Direct link to the raw content of the README file.
    raw_readme_url_base = "https://raw.githubusercontent.com/" + "/".join(repo_path_parts)
    branch_name = gh_json.get('branch_name') or "main"
    raw_readme_url = f"{raw_readme_url_base}/{branch_name}/README.md"
    logger.info(f"Using this url to fetch the README file: {raw_readme_url}")

    # Pre-bound so a failed first attempt falls through to the master retry
    # instead of raising a NameError.
    readme_content = None
    try:
        readme_content = get_readme_content(raw_readme_url)
    except Exception as err:
        logger.error(f"Failed to get README from URL: {raw_readme_url}: {err}")

    # If the readme is still None, try with master branch.
    if not readme_content:
        raw_readme_url = raw_readme_url_base + "/master/README.md"
        logger.warning(f"Trying with master branch: {raw_readme_url}")
        readme_content = get_readme_content(raw_readme_url)
        if not readme_content:
            logger.error(f"Still failed to get the README: {readme_content}")
            sys.exit(1)

    # Create a getting-started blog, adapted from the GH url README.
    howto_blog += github_readme_blog(readme_content, "gemini")

    # Do online research for faqs on the github url.
    research_report = None
    try:
        # Repo names are misnomers for others search, include its description too.
        # Which, skews the result favourably towards its home/paid pages.
        #online_query = f"{repo_name} " + gh_json.get('about')
        online_query = f"{repo_name} "
        logger.info("Do web research with Tavily & Metaphor AI.")
        research_report = do_online_research(online_query, "gemini", gh_url)
    except Exception as err:
        logger.error(f"failed to do online research: {err}")

    # Generate FAQs from the online research report, when there is one.
    if research_report:
        try:
            blog_faqs = generate_blog_faq(research_report, "gemini")
            howto_blog += f"\n\n## {repo_name} FAQs\n\n" + blog_faqs
        except Exception as err:
            logger.error(f"Failed to generate FAQs from web research_report: {err}")
    else:
        logger.warning("No web research report available; skipping FAQ generation.")

    logger.info(f"\n\nFinal Blog Content: {howto_blog}\n\n")

    try:
        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(howto_blog, "gemini")
    except Exception as err:
        logger.error(f"Failed to get blog metadata: {err}")
        raise

    try:
        save_blog_to_file(howto_blog, blog_title, blog_meta_desc, blog_tags,
                          blog_categories, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to save blog to a file: {err}")
        sys.exit(1)

    try:
        append_id_to_file(gh_url, "papers_already_written_on.txt")
    except Exception as err:
        logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
        raise
|
||||
297
lib/github_blogs/scrape_github_readme.py
Normal file
297
lib/github_blogs/scrape_github_readme.py
Normal file
@@ -0,0 +1,297 @@
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import pandas as pd
|
||||
|
||||
import json
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
from .take_url_screenshot import take_screenshot
|
||||
from .gpt_providers.gemini_image_details import gemini_get_img_info
|
||||
|
||||
|
||||
|
||||
def get_readme_content(url):
    """Fetch the raw README text from the given URL.

    Args:
        url (str): Direct URL of the raw README file.

    Returns:
        str | None: The response text for an HTTP 200 reply, otherwise None.

    Raises:
        SystemExit: If the request itself fails (exit status 1).
    """
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, timeout=30)
    except Exception as err:
        # `response` is not bound when the request raises, so it must not be
        # referenced in this message.
        logger.error(f"Failed to fetch raw readme from {url}: {err}")
        sys.exit(1)

    logger.debug(f"README fetch status for {url}: {response.status_code}")
    if response.status_code == 200:
        logger.debug("Successfully fetched the README.md")
        return response.text
    return None
|
||||
|
||||
|
||||
def get_gh_repo_metadata(github_url):
    """Scrape repo details like stars, watchers, forks and topics from GitHub.

    Every field is looked up defensively: when the page markup does not
    contain the expected element, the field is None instead of the whole
    function failing with an AttributeError.

    Args:
        github_url (str): URL of the GitHub repository page.

    Returns:
        dict: Keys 'branch_name' (always None, see FIXME), 'name', 'about',
        'stars', 'watchers', 'forks' and, when the topics container is
        found, 'topics'.
    """
    logger.info("Scraping github with BS4 and requests.")
    # Download the target page and parse the HTML returned by the server.
    page = requests.get(github_url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    def text_of(element):
        """Return the stripped text of an element, or None if it is missing."""
        return element.get_text().strip() if element is not None else None

    def text_after(container, anchor_selector, sibling_tag):
        """Return the text of the sibling tag that follows the selected anchor."""
        anchor = container.select_one(anchor_selector) if container is not None else None
        sibling = anchor.find_next_sibling(sibling_tag) if anchor is not None else None
        return text_of(sibling)

    def counter_after(container, icon_selector):
        """Return the counter next to an icon with thousands separators removed."""
        raw = text_after(container, icon_selector, 'strong')
        return raw.replace(',', '') if raw is not None else None

    # The repo details live in the right-hand box of the page.
    sidebar = soup.select_one('.BorderGrid')

    repo = {}

    # Find the div with class "f6" containing topic links.
    topic_div = soup.find('div', class_='f6')
    if topic_div:
        topic_links = topic_div.find_all('a', class_='topic-tag-link')
        repo['topics'] = [link.text.strip() for link in topic_links]

    # FIXME: Unable to scrape branch name.
    repo['branch_name'] = None
    repo['name'] = text_of(soup.select_one('[itemprop="name"]'))
    repo['about'] = text_after(sidebar, 'h2', 'p')
    repo['stars'] = counter_after(sidebar, '.octicon-star')
    repo['watchers'] = counter_after(sidebar, '.octicon-eye')
    repo['forks'] = counter_after(sidebar, '.octicon-repo-forked')
    logger.info(f"Github Repo Details: {repo}")
    return repo
|
||||
|
||||
|
||||
def get_gh_details_vision(github_url, generated_image_filepath):
    """Take a screenshot of the url and feed it to a vision model for repo details.

    Falls back to HTML scraping via get_gh_repo_metadata() when the vision
    call fails.

    Args:
        github_url (str): URL of the GitHub repository page.
        generated_image_filepath (str): Where the screenshot should be stored.

    Returns:
        dict: Repository details parsed from the model response, or the
        scraped details when the vision call fails.

    Raises:
        json.JSONDecodeError: If the model response contains no valid JSON.
    """
    logger.info(f"Take screenshot and pass it to gemini for repo details of {github_url}")

    generated_image_filepath = take_screenshot(github_url, generated_image_filepath)
    prompt = """From the given image of a github page, find out the number of stars, about, forks, last commit days, link url, topics and branch name. Return the result as json."""

    try:
        gh_details = gemini_get_img_info(prompt, generated_image_filepath)
        logger.info(f"Github Repo details, from vision model: {gh_details}")
    except Exception as err:
        logger.error(f"Failed to get gh images details: {err}")
        # The scraper already returns a dict, so no parsing is needed.
        return get_gh_repo_metadata(github_url)

    # NOTE(review): presumably the model returns text; anything else is
    # passed through untouched - confirm against gemini_get_img_info.
    if not isinstance(gh_details, str):
        return gh_details

    # The model usually wraps its JSON in a markdown code fence. Parse only the
    # outermost {...} so fenced, prose-wrapped and bare replies all work.
    start = gh_details.find('{')
    end = gh_details.rfind('}')
    if start != -1 and end > start:
        gh_details = gh_details[start:end + 1]
    return json.loads(gh_details)
|
||||
|
||||
|
||||
def research_github_topics(topics):
    """Scrape github topics of interest for top repos to write on.

    For each topic page the listed repositories are inspected and the URLs
    of repositories with more than 5000 stars are collected (deduplicated)
    and written to 'github_url_to_write.csv'.

    Approach adapted from:
    https://www.kaggle.com/code/subhaskumarray/scraping-github-topics-with-their-repositories

    Args:
        topics: Currently unused; the topic list below is hard-coded.
            NOTE(review): wire this parameter in once callers are checked.
    """
    gh_topics = ['ai', 'ai-tools', 'ai-assistant', 'ai-agents-framework', 'llm', 'multi-agent', 'fine-tuning', 'rag', 'generative', 'prompt-engineering', 'generative-ai', 'text-to-image-generation', 'llm-ops', 'retrieval-augmented-generation', 'langchain', 'gemini-api', 'vertex-ai', 'huggingface', 'auto-gpt', 'llmops', 'ai-toolkit', 'chatbot', 'chatgpt', 'code-assistant', 'text-to-video', 'llms', 'gpt-4']

    repo_urls = []
    seen_urls = set()  # O(1) duplicate check across topics
    for agh_topic in gh_topics:
        topic_url = f"https://github.com/topics/{agh_topic}"
        first_topic_repo_page = download_repo_page(topic_url)
        logger.info(f"Get details on github topic: {topic_url}")
        repo_tags = first_topic_repo_page.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
        star_tags = first_topic_repo_page.find_all('span', {'class': 'Counter js-social-count'})

        # zip() stops at the shorter list, so a page where the two selectors
        # match a different number of elements cannot raise an IndexError.
        for repo_tag, star_tag in zip(repo_tags, star_tags):
            _username, _repo_name, stars, repo_url = get_repo_info(repo_tag, star_tag)
            # Store repos with more than 5000 stars, once.
            if repo_url not in seen_urls and stars > 5000:
                seen_urls.add(repo_url)
                repo_urls.append(repo_url)

    df_repo_info = pd.DataFrame(repo_urls)

    csv_filename = 'github_url_to_write.csv'
    if os.path.isfile(csv_filename):
        # Append to the existing file
        df_repo_info.to_csv(csv_filename, mode='a', header=False, index=False)
        logger.info(f"Data appended to existing file: {csv_filename}")
    else:
        # Create a new file.
        # NOTE(review): this writes a header row ("0"); a line-by-line reader
        # would treat it as a URL - confirm how the file is consumed.
        df_repo_info.to_csv(csv_filename, index=False)
|
||||
|
||||
|
||||
def get_topic_titles(parsed_content):
    """Collect the text of every topic title paragraph on a topics page.

    Args:
        parsed_content: Parsed page exposing find_all(tag, attrs).

    Returns:
        list: Title texts in page order; None (after logging) on any error.
    """
    try:
        title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
        return [
            title_tag.text
            for title_tag in parsed_content.find_all('p', {'class': title_class})
        ]
    except Exception as err:
        logger.error(f"Failed to get github topic titles: {err}")
|
||||
|
||||
|
||||
def get_topic_desc(parsed_contents):
    """Collect the trimmed description of every topic on a topics page.

    Args:
        parsed_contents: Parsed page exposing find_all(tag, attrs).

    Returns:
        list: Description texts in page order with surrounding whitespace
        removed; None (after logging) on any error.
    """
    try:
        desc_selector = 'f5 color-fg-muted mb-0 mt-1'
        topic_desc_tags = parsed_contents.find_all('p', {'class': desc_selector})
        # strip() trims the extra whitespace around each description.
        return [desc.text.strip() for desc in topic_desc_tags]
    except Exception as err:
        logger.error(f"Failed to get github topic desc: {err}")
|
||||
|
||||
|
||||
def get_topic_url(parsed_contents):
    """Build the absolute URL of every topic linked on a topics page.

    Args:
        parsed_contents: Parsed page exposing find_all(tag, attrs).

    Returns:
        list: Topic URLs in page order; None (after logging) on any error.
    """
    try:
        link_class = 'no-underline flex-1 d-flex flex-column'
        github_root = 'http://github.com'
        return [
            github_root + anchor['href']
            for anchor in parsed_contents.find_all('a', {'class': link_class})
        ]
    except Exception as err:
        logger.error(f"Failed to get github topic urls: {err}")
|
||||
|
||||
|
||||
def download_repo_page(topic_url):
    """Download a GitHub topic page and return it parsed.

    A non-200 reply is logged, and the body is still parsed so that callers
    always receive a document to search.

    Args:
        topic_url (str): URL of the topic page.

    Returns:
        BeautifulSoup: Parsed contents of the response body.
    """
    # A timeout keeps a stalled connection from blocking forever.
    response = requests.get(topic_url, timeout=30)
    if response.status_code != 200:
        logger.error(f"There is some error in {topic_url}: HTTP {response.status_code}")
    return BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
|
||||
def get_repo_info(repo_tags, star_tags):
    """Return all info for one repository listed on a topic page.

    Args:
        repo_tags: Heading element of the repo card; its first two <a>
            children are read as the user name and the repo name/link.
        star_tags: Element whose text is the star counter (e.g. "1.2k").

    Returns:
        tuple: (username, repo_name, star_counts, repo_url).
    """
    a_tags = repo_tags.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    base_url = 'http://github.com/'
    # Drop a leading slash on the href so the join does not yield "//".
    repo_url = base_url + a_tags[1]['href'].strip().lstrip('/')

    def star_counts_converter(stars):
        """Convert a star counter such as '987', '1,234' or '8.2k' to an int."""
        multipliers = {'k': 1_000, 'm': 1_000_000}
        text = stars.strip().replace(',', '').lower()
        factor = multipliers.get(text[-1:])
        if factor:
            # round(), not int(): truncating a binary float product can come
            # out one below the intended value.
            return round(float(text[:-1]) * factor)
        return int(text)

    star_counts = star_counts_converter(star_tags.text.strip())
    return username, repo_name, star_counts, repo_url
|
||||
|
||||
|
||||
def save_to_csv(topic_url, topic_name):
    """Scrape one topic's repositories into '<topic_name>.csv' unless it exists.

    Args:
        topic_url (str): URL of the topic page to scrape.
        topic_name (str): Topic name, used as the CSV file name stem.
    """
    file_name = topic_name + '.csv'
    if os.path.exists(file_name):
        logger.debug(f"The file {file_name} already exists. Skipping.")
        # Actually skip: without this return the topic was re-scraped and the
        # existing file overwritten.
        return
    # NOTE(review): topic_repo_details is not defined in the visible part of
    # this file - confirm where it comes from.
    topics_df = topic_repo_details(topic_url)
    topics_df.to_csv(file_name, index=None)
    logger.info(f"Successfully scraped topic {topic_name}")
|
||||
|
||||
|
||||
def check_if_already_written(github_url, file_path='papers_already_written_on.txt'):
    """
    Tell whether a GitHub URL already appears as a whole line in a file.

    Both the URL and each line are compared with surrounding whitespace
    removed. A missing or unreadable file is reported on stdout and treated
    as "not written yet".

    Args:
        github_url (str): GitHub URL string to check.
        file_path (str): Path to the file containing lines to check against. Default is 'papers_already_written_on.txt'.

    Returns:
        bool: True if an exact match is found, False otherwise.
    """
    try:
        with open(file_path, 'r') as handle:
            if any(github_url.strip() == entry.strip() for entry in handle):
                return True
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return False
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
import serpapi
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
# The SerpApi key comes from the environment (loaded from .env above).
api_key = os.getenv('SERPAPI_KEY')

client = serpapi.Client(api_key=api_key)

# Google search parameters for the example query.
search_params = {
    "q": "Retrieval Augumented Generation RAG",
    "engine": "google",
    "location": "Austin, Texas",
    "hl": "en",
    "gl": "us",
}
result = client.search(**search_params)

# Show the related ("People also ask") questions of the search.
print(result["related_questions"])
|
||||
@@ -1,181 +0,0 @@
|
||||
################################################################
|
||||
#
|
||||
# GPT Researcher is an autonomous agent designed for comprehensive online research on a variety of tasks.
|
||||
# The agent can produce detailed, factual and unbiased research reports, with customization options for
|
||||
# focusing on relevant resources, outlines, and lessons. Inspired by the recent Plan-and-Solve and RAG papers,
|
||||
# GPT Researcher addresses issues of speed, determinism and reliability, offering a more stable
|
||||
# performance and increased speed through parallelized agent work, as opposed to synchronous operations.
|
||||
#
|
||||
# The main idea is to run "planner" and "execution" agents, whereas the planner generates questions to research,
|
||||
# and the execution agents seek the most related information based on each generated research question.
|
||||
# Finally, the planner filters and aggregates all related information and creates a research report.
|
||||
#
|
||||
# The agents leverage both gpt3.5-turbo and gpt-4-turbo (128K context) to complete a research task.
|
||||
# We optimize for costs using each only when necessary.
|
||||
# The average research task takes around 3 minutes to complete, and costs ~$0.1.
|
||||
#
|
||||
##############################################################
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
from tavily import TavilyClient
|
||||
import serpapi
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../.env'))
|
||||
|
||||
from langchain.adapters.openai import convert_openai_messages
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
import google.generativeai as genai
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_random_exponential,
|
||||
) # for exponential backoff
|
||||
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
from .blog_proof_reader import blog_proof_editor
|
||||
from .convert_content_to_markdown import convert_tomarkdown_format
|
||||
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def do_online_research(query, gpt_provider="openai"):
    """Research `query` on the web and return a blog-style report.

    Steps:
      1. Google search (via `google_search`) for FAQ questions and organic results.
         A failure here is logged and ignored; the Tavily search still runs.
      2. Tavily "advanced" search for the research content.
      3. Report generation with Gemini or OpenAI, selected by `gpt_provider`.
      4. Rewrite of the report against the organic results
         (`compete_organic_results`).

    Args:
        query: Keyword(s) or title to research.
        gpt_provider: Provider selector; matched by substring, "gemini" is
            checked before "openai".

    Returns:
        The final report as returned by `compete_organic_results`.

    Raises:
        SystemExit: With status 1 when the Tavily search, report generation or
            any other step after the Google search fails.
    """
    faq_questions = []
    organic_results = []
    report = ''

    # Google search gives FAQ questions and the organic results to compete with.
    try:
        faq_questions = google_search(query, "faq")
        logging.info(f"Google search FAQ questions: {faq_questions}")
        organic_results = google_search(query, "organic_result")
    except Exception as err:
        # Not failing: the Tavily search below still provides research content.
        logging.error(f"Failed to do Serpapi research: {err}")

    use_gemini = "gemini" in gpt_provider
    use_openai = not use_gemini and "openai" in gpt_provider

    try:
        tavily_api_key = os.getenv('TAVILY_API_KEY')
        if not tavily_api_key:
            raise ValueError("API key for Tavily is not set.")
        # The OpenAI key is only needed when OpenAI actually writes the report.
        if use_openai and not os.getenv('OPENAI_API_KEY'):
            raise ValueError("API key for OpenAI is not set.")

        try:
            client = TavilyClient(api_key=tavily_api_key)
        except Exception:
            logging.error("Failed to create Tavily client. Check TAVILY_API_KEY")
            # `exit` is injected by the `site` module; SystemExit is equivalent
            # and always available.
            raise SystemExit(1)

        logging.info(f"Running Tavily search on: {query}")
        try:
            content = client.search(query, search_depth="advanced")["results"]
        except Exception as err:
            logging.error(f"Failed to do Tavily Research: {err}")
            raise SystemExit(1)

        if use_gemini:
            # Adjacent literals are concatenated into ONE string, so every
            # fragment ends with a space to keep the sentences apart.
            prompt = ["You are an AI critical thinker research assistant. "
                      "I will provide you with json content and a list of faq questions. "
                      "Use given json as context for writing your research report. "
                      "Your sole purpose is to write well written, critically acclaimed, objective and structured research report. "
                      "Important: Include and write code examples in your final report. "
                      "Include your own insights on the topic to make it comprehensive and detailed. "
                      "Use the urls from json content to provide cititations and include it in referances section of your report. "
                      "Include appropriate emojis in your research report. "
                      "Include FAQs relevant to your research report. Use the given faq questions. Write answers for each faq. "
                      "Format your report in MLA format and markdown style, with special focus on readibility. "
                      f"Do not provide explanations for your response.\njson content: \"\"\" {content} \"\"\"\n "
                      f"\nList of FAQ questions: \"\"\" {faq_questions} \"\"\"\n"]
            report = gemini_text_response(prompt)

        elif use_openai:
            # OpenAI-style chat messages for GPT-4.
            prompt = [{
                "role": "system",
                "content": ('You are an AI critical thinker research assistant. '
                            'Your sole purpose is to write well written, critically acclaimed, '
                            'objective and structured reports on given text.')
            }, {
                "role": "user",
                "content": (f'Information: """{content}"""\n\n'
                            f'Using the above information, answer the following '
                            f'query: "{query}" in a detailed report --'
                            f'Please use MLA format and markdown syntax.')
            }]
            report = openai_research_report(prompt)

        else:
            # Previously silent: the rewrite step then works from an empty draft.
            logging.warning(f"Unknown gpt_provider '{gpt_provider}'; no research report was generated.")

        report = compete_organic_results(query, report, organic_results)
        return report
    except Exception as e:
        logging.error(f"Failed in online research: {e}")
        raise SystemExit(1)
|
||||
|
||||
|
||||
def openai_research_report(query):
    """Generate a research report with OpenAI GPT-4.

    Args:
        query: OpenAI-style chat messages (a list of ``{"role": ..., "content": ...}``
            dicts), as built by `do_online_research`.

    Returns:
        The text content of the GPT-4 response.

    Raises:
        SystemExit: With status 1 when the ChatOpenAI call fails.
    """
    logging.info("Generating Research report with GPT-4...")
    # The messages arrive in the `query` parameter; the previous body read an
    # undefined name `prompt` and raised NameError on every call.
    lc_messages = convert_openai_messages(query)
    # The key was likewise read from an undefined local; fetch it here.
    openai_api_key = os.getenv('OPENAI_API_KEY')
    try:
        report = ChatOpenAI(model='gpt-4', openai_api_key=openai_api_key).invoke(lc_messages).content
        return report
    except Exception as err:
        logging.error(f"Failed to generate do_online_research with ChatOpenAI: {err}")
        raise SystemExit(1) from err
|
||||
|
||||
|
||||
def compete_organic_results(query, report, organic_results):
    """Rewrite a blog so that it competes with the top Google organic results.

    Args:
        query: Topic of the blog.
        report: Current blog content.
        organic_results: Organic search results to compete against.

    Returns:
        Whatever `gemini_text_response` returns for the rewrite prompt.
    """
    # The prompt text is sent verbatim, hence the unindented continuation lines.
    rewrite_prompt = f""" As an SEO expert and copywriter, I will provide you with my blog content on topic '{query}', and
Top google search results.
Your task is to rewrite the given blog to make it compete against top position results.
Make sure, the new blog has high probability of ranking highest against given organic search result competitors.
Modify the given blog content following best SEO practises.
Make sure the blog is original, unique and highly readable.
Remember, Maintain and adopt the formatting, structure, style and tone of the provided blog content.
Include relevant emojis in your final blog for visual appeal. Use it sparingly.
Your response should be well-structured, objective, and critically acclaimed blog article based on provided texts.

Remember, your goal is to create a detailed blog article that will compete against given organic result competitors.
Do not provide explanations, suggestions for your response, reply only with your final response.
Take your time in crafting your content, do not rush to give the response.
Blog Content: '{report}'\n
Organic Search result: '{organic_results}'
"""
    return gemini_text_response(rewrite_prompt)
|
||||
|
||||
|
||||
def google_search(query, flag="faq"):
    """Run a Google search through SerpApi and return one part of the result.

    Args:
        query: Search query.
        flag: Which part to return, matched by substring: "faq" (checked first)
            or "organic_result".

    Returns:
        For "faq": a list of strings built from the 'inline_people_also_search_for'
        titles plus the 'related_questions' questions, or the 'related_searches'
        queries when both of those are empty.
        For "organic_result": the 'organic_results' entry (empty list if missing).
        For any other flag: an empty list.

    Raises:
        Exception: Whatever the SerpApi client raised; the error is logged first.
    """
    try:
        api_key = os.getenv('SERPAPI_KEY')
        client = serpapi.Client(api_key=api_key)
        result = client.search(
            q=query,
            engine="google",
            hl="en",
        )
    except Exception as err:
        logging.error(f"Failed in Google Search: {err}")
        # Re-raise instead of exiting: do_online_research catches this and
        # carries on with the Tavily search, which exit(1) made impossible.
        raise

    if 'faq' in flag:
        # Any of these keys may be missing from the result.
        related_search = [item['title'] for item in result.get('inline_people_also_search_for', [])]
        related_questions = [item['question'] for item in result.get('related_questions', [])]

        faq_questions = related_search + related_questions
        if not faq_questions:
            # Fall back to the related searches when nothing better is available.
            faq_questions = [item['query'] for item in result.get('related_searches', [])]
        return faq_questions

    if 'organic_result' in flag:
        return result.get('organic_results', [])

    # An unknown flag used to fall through and return None implicitly.
    logging.warning(f"Unknown google_search flag '{flag}'; returning no results.")
    return []
|
||||
@@ -17,10 +17,11 @@ from tenacity import (
|
||||
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
|
||||
def gemini_arxiv_img_info(img_path):
|
||||
def gemini_get_img_info(prompt, img_path):
|
||||
""" Get image details from arxiv papers. """
|
||||
logging.info(f"Get image details from Gemini Pro.")
|
||||
try:
|
||||
genai.configure(api_key=os.getenv("API_KEY"))
|
||||
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
||||
except Exception as e:
|
||||
logging.error(f"Could not load gemini API key: {e}")
|
||||
raise e
|
||||
@@ -35,19 +36,19 @@ def gemini_arxiv_img_info(img_path):
|
||||
|
||||
safety_settings = [{
|
||||
"category": "HARM_CATEGORY_HARASSMENT",
|
||||
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
||||
"threshold": "BLOCK_NONE"
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_HATE_SPEECH",
|
||||
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
||||
"threshold": "BLOCK_NONE"
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
||||
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
||||
"threshold": "BLOCK_NONE"
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
||||
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
||||
"threshold": "BLOCK_NONE"
|
||||
},]
|
||||
|
||||
try:
|
||||
@@ -67,13 +68,12 @@ def gemini_arxiv_img_info(img_path):
|
||||
"data": Path(img_path).read_bytes()
|
||||
},]
|
||||
|
||||
prompt_parts = [
|
||||
"As scholar on evaluating research papers, I will provide you with an image from a research paper. Your task is to explain the image in details so that I can use it in a blog article. Explain the key findings and conclusions from the image. Your description should be in simple terms to explain to a wider audience. Explain key findings from the given image.",
|
||||
image_parts[0],]
|
||||
prompt_parts = [f"{prompt}", image_parts[0],]
|
||||
|
||||
try:
|
||||
response = model.generate_content(prompt_parts)
|
||||
return response.text
|
||||
except Exception as e:
|
||||
logging.error(f"Could not generate gemini content: {e}")
|
||||
logging.error(f"Gemini is blocking this request: {response.prompt_feedback.block_reason}")
|
||||
logging.error(f"Gemini Vision, Failed to give image Details: {e}\n{response.prompt_feedback}")
|
||||
raise e
|
||||
@@ -32,11 +32,9 @@ def gemini_text_response(prompt):
|
||||
model = genai.GenerativeModel(model_name="gemini-pro", generation_config=generation_config)
|
||||
try:
|
||||
response = model.generate_content(prompt)
|
||||
|
||||
except Exception as err:
|
||||
logger.error(f"Failed to get response from Gemini: {err}. Retrying.")
|
||||
# Try with minstral.
|
||||
print(f"\n\n\n--MINSTRAL--\n\n\n\n")
|
||||
response = mistral_text_response(prompt)
|
||||
return response
|
||||
#response = mistral_text_response(prompt)
|
||||
#return response
|
||||
return response.text
|
||||
|
||||
@@ -93,13 +93,6 @@ def blog_arxiv_url_list(file_path):
|
||||
# Read already written IDs
|
||||
written_ids = read_written_ids('papers_already_written_on.txt')
|
||||
|
||||
# Write blogs on each of arxiv_id from the file.
|
||||
for arxiv_id in extracted_ids:
|
||||
# Check if we have already written on this research_paper. For this, all arxiv ids are written in
|
||||
# a file called 'papers_already_written_on.txt'. If arxiv ID is found in this file, skip writing again.
|
||||
# YUP, use a DB. KISS for now.
|
||||
written_ids = read_written_ids('papers_already_written_on.txt')
|
||||
|
||||
# Loop through extracted IDs
|
||||
for arxiv_id in extracted_ids:
|
||||
if arxiv_id not in written_ids:
|
||||
@@ -178,8 +171,8 @@ def blog_postprocessing(arxiv_id, research_review):
|
||||
save_blog_to_file(research_review, blog_title, blog_meta_desc, blog_tags,\
|
||||
blog_categories, generated_image_filepath)
|
||||
except Exception as err:
|
||||
logger.__repr__ror(f"Failed to save blog to a file: {err}")
|
||||
raise err
|
||||
logger.error(f"Failed to save blog to a file: {err}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def take_paper_screenshot(arxiv_url):
|
||||
@@ -1,71 +0,0 @@
|
||||
import os
|
||||
import datetime
|
||||
|
||||
from selenium import webdriver
|
||||
from PIL import Image
|
||||
import shutil
|
||||
from screenshotone import Client, TakeOptions
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../.env'))
|
||||
|
||||
|
||||
def screenshot_api(url, generated_image_filepath):
    """Capture a webpage screenshot with the screenshotone API.

    The PNG is written to `generated_image_filepath` and then displayed. If
    anything in that sequence raises, the error is printed and
    `take_screenshot` is used instead.

    Returns:
        The path of the stored screenshot.
    """
    try:
        # API client built from the credentials in the environment.
        access_key = os.getenv('SCREENSHOTONE_ACCESS_KEY')
        secret_key = os.getenv('SCREENSHOTONE_SECRET_KEY')
        api_client = Client(access_key, secret_key)

        # 1024x768 PNG with cookie banners and chat widgets blocked.
        take_options = TakeOptions.url(url)
        take_options = take_options.format("png")
        take_options = take_options.viewport_width(1024)
        take_options = take_options.viewport_height(768)
        take_options = take_options.block_cookie_banners(True)
        take_options = take_options.block_chats(True)

        # Render the screenshot and copy the returned stream to disk.
        png_stream = api_client.take(take_options)
        with open(generated_image_filepath, 'wb') as out_file:
            shutil.copyfileobj(png_stream, out_file)

        # Show the stored screenshot.
        preview = Image.open(generated_image_filepath)
        preview.show()
    except Exception as err:
        print(f"Failed in screenshotone api: {err}")
        generated_image_filepath = take_screenshot(url, generated_image_filepath)

    return generated_image_filepath
|
||||
|
||||
def take_screenshot(url, generated_image_filepath):
    """Take a screenshot of `url` with Selenium Chrome and save it as a PNG.

    Args:
        url: Page to capture.
        generated_image_filepath: Destination file for the PNG.

    Returns:
        `generated_image_filepath`.
    """
    # Create a webdriver instance
    driver = webdriver.Chrome()
    try:
        # Navigate to the given url
        driver.get(url)

        # Set a fixed window size (you can adjust this as needed)
        driver.set_window_size(800, 600)

        # Take a screenshot of the webpage
        screenshot = driver.get_screenshot_as_png()
    finally:
        # Always release the browser, even when navigation or capture fails;
        # previously an exception here left the driver running.
        driver.quit()

    # Save the screenshot to a file
    with open(generated_image_filepath, "wb") as f:
        f.write(screenshot)

    # Display the screenshot using Image.show
    image = Image.open(generated_image_filepath)
    image.show()

    return generated_image_filepath
|
||||
113
lib/utils/take_url_screenshot.py
Normal file
113
lib/utils/take_url_screenshot.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import subprocess
|
||||
|
||||
from time import sleep
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from PIL import Image
|
||||
|
||||
from selenium import webdriver
|
||||
from PIL import Image
|
||||
import shutil
|
||||
from screenshotone import Client, TakeOptions
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../.env'))
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def screenshot_api(url, generated_image_filepath):
    """Capture a webpage screenshot with the screenshotone API.

    The PNG is written to `generated_image_filepath` and previewed for two
    seconds. If any step raises, the error is logged and `take_screenshot`
    (Selenium) is used as a fallback.

    Args:
        url: Page to capture.
        generated_image_filepath: Destination file for the PNG.

    Returns:
        The path of the stored screenshot.
    """
    try:
        # create API client
        client = Client(os.getenv('SCREENSHOTONE_ACCESS_KEY'), os.getenv('SCREENSHOTONE_SECRET_KEY'))

        # set up options
        options = (TakeOptions.url(url)
                   .format("png")
                   .viewport_width(1024)
                   .viewport_height(768)
                   .block_cookie_banners(True)
                   .block_chats(True))

        # render a screenshot and download the image as stream
        image_stream = client.take(options)

        # store the screenshot in the requested file
        with open(generated_image_filepath, 'wb') as result_file:
            shutil.copyfileobj(image_stream, result_file)

        # Preview the stored screenshot. Leaving the `with` block releases the
        # PIL image (also on error); it does not close the viewer window.
        with Image.open(generated_image_filepath) as preview:
            preview.show()
            # Wait for 2 seconds (adjust the delay as needed)
            sleep(2)

    except Exception as err:
        # Use the module logger rather than print, like the rest of this file.
        logger.error(f"Failed in screenshotone api: {err}")
        generated_image_filepath = take_screenshot(url, generated_image_filepath)

    return generated_image_filepath
|
||||
|
||||
|
||||
def take_screenshot(url, generated_image_filepath):
    """Take a screenshot of `url` with headless Chrome and save it as a PNG.

    The saved image is previewed for two seconds, after which a best-effort
    attempt is made to close the viewer.

    Args:
        url: Page to capture.
        generated_image_filepath: Destination file for the PNG.

    Returns:
        `generated_image_filepath`.
    """
    # Create a webdriver instance in headless mode
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    logger.debug(f"Taking screenshot of url: {url}")

    try:
        # Size the window before navigating so the page is laid out at its
        # final size instead of being resized right before the capture.
        driver.set_window_size(1200, 800)

        # Navigate to the given url
        driver.get(url)

        # Optionally, increase the delay to ensure all content is loaded
        sleep(2)

        # Explicitly wait for the page to load (adjust timeout as needed)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Take a screenshot of the webpage
        screenshot = driver.get_screenshot_as_png()

        # Save the screenshot to a file
        with open(generated_image_filepath, "wb") as f:
            f.write(screenshot)

        # Preview the screenshot for 2 seconds (adjust the delay as needed)
        with Image.open(generated_image_filepath) as preview:
            preview.show()
            sleep(2)

        # Close the viewer (platform-dependent, best effort).
        # `-x` matches only a process named exactly "display"; the former
        # `-f display` matched any process whose command line contains
        # "display" and could kill unrelated programs.
        # macOS/Windows need a different command (e.g. osascript / taskkill).
        try:
            subprocess.run(["pkill", "-x", "display"])
        except OSError as err:
            # pkill is not available on this platform; the screenshot is
            # already saved, so this must not fail the call.
            logger.debug(f"Could not close the image viewer: {err}")

        logger.debug(f"Screenshot successfully stored at: {generated_image_filepath}")
        return generated_image_filepath
    finally:
        # Close the webdriver instance
        driver.quit()
|
||||
Reference in New Issue
Block a user