WIP - Under maintenance - Web research working.

This commit is contained in:
AjaySi
2024-02-05 15:15:07 +05:30
parent fd7053fb4b
commit 2a3315f211
96 changed files with 4320 additions and 565 deletions

View File

@@ -0,0 +1,332 @@
####################################################
#
# FIXME: Gotta use this lib: https://github.com/monk1337/resp/tree/main
# https://github.com/danielnsilva/semanticscholar
# https://github.com/shauryr/S2QA
#
####################################################
import os
import sys
import re
import pandas as pd
import arxiv
import PyPDF2
import io
import requests
from bs4 import BeautifulSoup
import urllib.parse
from loguru import logger
# Swap loguru's default sink for a colourised stdout sink whose lines carry
# the level and the file:line:function of the call site.
_LOG_FORMAT = "<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
logger.remove()
logger.add(sys.stdout, colorize=True, format=_LOG_FORMAT)
def fetch_arxiv_data(query, max_results=10):
    """Search arXiv and return the matching papers as plain rows.

    Args:
        query (str): Search expression handed to ``arxiv.Search``.
        max_results (int): Maximum number of results requested.

    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
        result, requested with the ``SubmittedDate`` sort criterion.

    Raises:
        Exception: Any failure is printed and then re-raised to the caller.
    """
    try:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        # A client with default settings is enough for a one-off query.
        papers = arxiv.Client().results(search)
        return [
            [paper.title, paper.published, paper.entry_id, paper.summary, paper.pdf_url]
            for paper in papers
        ]
    except Exception as e:
        print("An error occurred while fetching data from arXiv:", e)
        raise e
def create_dataframe(data, column_names):
    """Build a DataFrame from ``data`` labelled with ``column_names``.

    Construction errors are printed rather than raised; an empty DataFrame
    is returned in that case.
    """
    try:
        return pd.DataFrame(data, columns=column_names)
    except Exception as e:
        print("An error occurred while creating DataFrame:", e)
    return pd.DataFrame()
def _extract_pdf_text(pdf_filename):
    """Return the concatenated text of every page in the PDF at ``pdf_filename``."""
    page_texts = []
    with open(pdf_filename, 'rb') as f:
        for page in PyPDF2.PdfReader(f).pages:
            try:
                page_text = page.extract_text()
            except UnicodeDecodeError as err:
                # FIXME: Handle any UnicodeDecodeError that arises during text extraction
                logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                continue
            # Pages that yield no text are skipped.
            if page_text:
                page_texts.append(page_text + '\n')
    return ''.join(page_texts)


def _strip_trailing_sections(pdf_text):
    """Remove reference / acknowledgement material from extracted PDF text."""
    # NOTE(review): with IGNORECASE | DOTALL this cuts everything from the
    # FIRST occurrence of the word "references" anywhere in the text, not
    # only from a section heading. Kept unchanged; confirm that is intended.
    pattern = r'References\s*.*'
    pdf_text = re.sub(pattern, '', pdf_text, flags=re.IGNORECASE | re.DOTALL)
    for section in ('Acknowledgements', 'References', 'Bibliography'):
        # Section title plus the text after it, up to the next newline that is
        # followed by two letters (or the end of the document).
        pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
        pdf_text = re.sub(pattern, '', pdf_text, flags=re.DOTALL | re.IGNORECASE)
    return pdf_text


def get_arxiv_main_content(url):
    """
    Returns the main content of an arXiv paper.

    The HTML page at ``url`` is tried first. If that request or its parsing
    raises, the paper's PDF is downloaded through the arXiv API and its text
    is extracted instead.

    Args:
        url (str): The URL of the arXiv paper; its last path segment is used
            as the arXiv id for the PDF fallback.
    Returns:
        str: The main content of the paper, ``"Main content not found."`` when
        the HTML page lacks the expected container, or
        ``"Failed to retrieve content."`` when the PDF fallback fails too.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the caller; a
        # timeout error falls through to the PDF path like any other failure.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, "html.parser")
        main_content = soup.find('div', class_='ltx_page_content')
        if not main_content:
            logger.warning("Main content not found in the page.")
            return "Main content not found."
        # Remove specific section with class 'package-alerts ltx_document'
        alert_section = main_content.find('div', class_='package-alerts ltx_document')
        if alert_section:
            alert_section.decompose()
        # Optional: Remove abstract and authors if present
        for element_id in ["abs", "authors"]:
            element = main_content.find(id=element_id)
            if element:
                element.decompose()
        return main_content.text.strip()
    except Exception as html_error:
        # Could not access the arXiv HTML content; read the PDF instead.
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")
        try:
            # Tolerate a trailing slash, which would otherwise give an empty id.
            arxiv_id = url.rstrip('/').split('/')[-1]
            paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
            pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")
            try:
                pdf_text = _extract_pdf_text(pdf_filename)
            finally:
                # Always delete the temporary download, even if reading failed.
                os.remove(pdf_filename)
            return _strip_trailing_sections(pdf_text)
        except Exception as pdf_error:
            logger.error(f"Failed to process PDF: {pdf_error}")
            return "Failed to retrieve content."
def download_image(image_url, base_url, folder="images"):
    """Download one image into ``folder``.

    Args:
        image_url (str): Absolute or relative image URL. ``data:image`` URIs
            are skipped.
        base_url (str): URL against which a scheme-less ``image_url`` is
            resolved.
        folder (str): Destination directory, created when missing.

    Returns:
        bool: True when the image was written, False when it was skipped or
        the download / write failed.
    """
    # Skip downloading if the image URL is a data URI
    if image_url.startswith('data:image'):
        print(f"Skipping download of data URI image: {image_url}")
        return False
    os.makedirs(folder, exist_ok=True)
    # Resolve relative paths. urljoin (unlike plain concatenation) also copes
    # with root-relative ("/img/a.png") and protocol-relative ("//cdn/a.png").
    if not urllib.parse.urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        image_url = urllib.parse.urljoin(base_url, image_url)
    # Derive the file name from the URL path only, so a query string or
    # fragment never ends up in the name on disk.
    image_name = os.path.basename(urllib.parse.urlparse(image_url).path)
    if not image_name:
        print(f"Error downloading {image_url}: URL has no file name")
        return False
    try:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True
    except (requests.RequestException, OSError) as e:
        # OSError covers failures while writing the file to disk.
        print(f"Error downloading {image_url}: {str(e)}")
        return False
def scrape_images_from_arxiv(url):
    """Return the ``src`` value of every ``<img>`` tag found on the page at ``url``.

    Request failures are printed and produce an empty list.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {url}: {str(e)}")
        return []
    image_urls = []
    for tag in BeautifulSoup(page.text, 'html.parser').find_all('img'):
        # Tags without a src attribute are ignored.
        if 'src' in tag.attrs:
            image_urls.append(tag['src'])
    return image_urls
def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    The paper's metadata is fetched from the arXiv export API and rendered as
    an ``@MISC`` entry keyed by ``<first author surname><two-digit year>``.

    Args:
        arxiv_id: The arXiv ID of the paper.
    Returns:
        A string containing the BibTeX entry.
    Raises:
        IndexError: If the API response contains no ``<entry>`` element.
        Exception: Network and XML parsing errors propagate unchanged.
    """
    import urllib.request
    import xml.dom.minidom

    # The context manager closes the socket even when parsing fails.
    with urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}') as usock:
        xmldoc = xml.dom.minidom.parse(usock)

    entries = xmldoc.getElementsByTagName("entry")
    if not entries:
        raise IndexError(f"No arXiv entry found for id {arxiv_id!r}")
    entry = entries[0]

    date = entry.getElementsByTagName("updated")[0].firstChild.data
    text_year = date[:4]
    title = entry.getElementsByTagName("title")[0]
    # Collapse internal whitespace so a wrapped title becomes one line.
    text_title = ' '.join(title.firstChild.data.split())

    authorlist = []
    # Fallback citation-key prefix for an entry that lists no authors.
    text_first_author_surname = "arxiv"
    for position, person_name in enumerate(entry.getElementsByTagName("author")):
        name = person_name.getElementsByTagName("name")[0]
        name_parts = name.firstChild.data.split()
        if not name_parts:
            continue
        text_surname = name_parts[-1]
        text_given_name = ' '.join(name_parts[:-1])
        # A single-word name has no given part; avoid a dangling "Surname, ".
        if text_given_name:
            authorlist.append(f"{text_surname}, {text_given_name}")
        else:
            authorlist.append(text_surname)
        if position == 0:
            text_first_author_surname = text_surname

    # Construct the BibTeX entry. Every field value is wrapped in braces;
    # a bare `author = A and B` is not valid BibTeX.
    bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
    bibtex += f" author = {{{' and '.join(authorlist)}}},\n"
    bibtex += f" title = {{{text_title}}},\n"
    bibtex += f" year = {{{text_year}}},\n"
    bibtex += f" eprint = {{{arxiv_id}}},\n"
    bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
    bibtex += "}"
    return bibtex
#from serpapi import GoogleSearch
#params = {
# "api_key": "os.getenv(SERPER_API_KEY)",
# "engine": "google_scholar",
# "q": "llm",
# "hl": "en",
# "as_ylo": "2023",
# "as_yhi": "2024"
#}
#search = GoogleSearch(params)
#results = search.get_dict()
#from llmsherpa.readers import LayoutPDFReader
#llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
#pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" # also allowed is a file path e.g. /home/downloads/xyz.pdf
#pdf_reader = LayoutPDFReader(llmsherpa_api_url)
#doc = pdf_reader.read_pdf(pdf_url)
def extract_arxiv_ids_from_line(line):
    """
    Pull the first arXiv identifier out of a line of text.

    Only ``arxiv.org/abs/<digits>.<digits>`` links are recognised; an optional
    ``vN`` version suffix is kept when present.

    Args:
        line (str): Text that may contain an arXiv abstract URL.
    Returns:
        str: The identifier (with version suffix if any), or None if the line
        has no such URL.
    """
    found = re.search(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?', line)
    if found is None:
        return None
    base_id, version = found.groups()
    return base_id + (version or '')
def read_written_ids(file_path):
    """
    Read already written arXiv IDs from a file.

    Each non-blank line is treated as one ID, with surrounding whitespace
    removed. Blank lines are skipped so they do not appear in the result as an
    empty-string ID. Read errors are logged, and whatever was collected up to
    that point is returned.

    Args:
        file_path (str): Path to the file containing written IDs.
    Returns:
        set: A set of arXiv IDs (empty when the file is missing).
    """
    written_ids = set()
    try:
        with open(file_path, 'r') as file:
            for line in file:
                arxiv_id = line.strip()
                if arxiv_id:
                    written_ids.add(arxiv_id)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
    except Exception as e:
        logger.error(f"Error while reading the file: {e}")
    return written_ids
def append_id_to_file(arxiv_id, output_file_path):
    """
    Append one arXiv ID, followed by a newline, to ``output_file_path``.

    The file is opened in append mode, so it is created when it does not
    exist; a log line records which of the two cases applied. Any error is
    logged rather than raised.

    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        if os.path.exists(output_file_path):
            logger.info(f"Appending to existing file: {output_file_path}")
        else:
            logger.info(f"File does not exist. Creating new file: {output_file_path}")
        # Append mode covers both cases: it creates the file on first use.
        with open(output_file_path, 'a') as outfile:
            outfile.write(arxiv_id + '\n')
    except Exception as e:
        logger.error(f"Error while appending to file: {e}")

View File

@@ -0,0 +1,76 @@
import sys
from .gpt_providers.openai_chat_completion import openai_chatgpt
from .gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
# Swap loguru's default sink for a colourised stdout sink whose lines carry
# the level and the file:line:function of the call site.
_LOG_FORMAT = "<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
logger.remove()
logger.add(sys.stdout, colorize=True, format=_LOG_FORMAT)
def blog_with_research(report, blog, gpt_providers="openai"):
    """Combine the given online research and gpt blog content.

    Args:
        report: Research report text interpolated into the prompt.
        blog: Blog content text interpolated into the prompt.
        gpt_providers (str): Provider selector. A value containing ``gemini``
            uses the Gemini prompt and client; otherwise a value containing
            ``openai`` uses the OpenAI prompt and client.

    Returns:
        The provider's response, or None when ``gpt_providers`` names neither
        supported provider (a warning is logged in that case).

    Raises:
        Exception: Provider errors are logged and re-raised.
    """
    # Prompt text is left-aligned on purpose: the lines are part of the
    # string sent to the model. Only the prompt that is actually sent is built.
    if 'gemini' in gpt_providers:
        prompt = f"""You are an expert copywriter specializing in content optimization for SEO.
You are world famous writer, known for your originality and engaging content.
I will provide you with a 'research report' and a 'blog content' on the same topic.
Your task is to transform and combine the given research and blog content into a blog article.
Your blog should be highly detailed and well formatted.
Include a section in your blog on the highlights section of blog content.
Do not miss out any details from provided content. Always, include figures, data, results from given content.
It is important that your blog is original and unique. It should be highly readable and SEO optimized.
Research report: '{report}'
Blog content: '{blog}'
"""
        try:
            response = gemini_text_response(prompt)
            return response
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' in gpt_providers:
        prompt = f"""
You are an expert copywriter specializing in content optimization for SEO.
I will provide you with a 'research report' and a 'blog content' on the same topic.
Your task is to transform and combine the given research and blog content into a well-structured markdown, unique
and engaging blog article.
Your objectives include:
1. Master the report and blog content: Understand main ideas, key points, and the core message.
2. Sentence Structure: Rephrase while preserving logical flow and coherence.
3. Identify Main Keywords: Determine the primary topic and combine the articles on the main topic.
4. REMEMBER: From the research report, include links and cititations to make your article more authoratative.
5. Write Code snippets: Check if given report is on programming, then write code snippets where applicable.
6. Optimize for SEO: Generate high quality informative content.
Implement SEO best practises with appropriate keyword density.
7. Craft Engaging and Informative Article: Provide value and insight to readers.
8. Proofread: Important to Check for grammar, spelling, and punctuation errors.
9. Use Creative and Human-like Style: Incorporate contractions, idioms, transitional phrases,
interjections, and colloquialisms. Avoid repetitive phrases and unnatural sentence structures.
10. Blog Structuring: Include an Introduction, subtopics and use bullet points or
numbered lists if appropriate. Important to include FAQs, Conclusion and Referances.
11. Ensure Uniqueness: Guarantee the article is plagiarism-free. Write in unique, informative style.
12. Punctuation: Use appropriate question marks at the end of questions.
13. Pass AI Detection Tools: Create content that easily passes AI plagiarism detection tools.
14. REMEMBER: Use the formatting style of given research report and include highlights, citations, referances in combined article.
Follow these guidelines to combine and write a new, unique, and informative blog article
that will rank well in search engine results and engage readers effectively.
Create a blog post, in markdown, from the given research report and blog content below.
Research report: {report}
Blog content: {blog}
"""
        try:
            logger.info("Calling OpenAI LLM.")
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise err

    # Neither provider matched: make the silent None visible in the logs.
    logger.warning(f"Unsupported gpt_providers value {gpt_providers!r}; no LLM was called.")
    return None

View File

@@ -0,0 +1,108 @@
import re #additional import for regex
import os
import json
import requests
from openai import OpenAI
# NOTE(review): the environment variable name uses hyphens
# ('OPENAI-API-KEY'); confirm this is the name actually set in the
# environment and not a typo for OPENAI_API_KEY.
client = OpenAI(
    api_key=os.getenv('OPENAI-API-KEY')
)
# Target URL can be a website url or it can google search
query = "kedarkanta trek"
target_url = f"https://www.google.com/search?q={query}&gl=us"
# Fail fast on a hung connection or an HTTP error so that an error page is
# never forwarded to the (paid) LLM call below.
response = requests.get(target_url, timeout=30)
response.raise_for_status()
html_text = response.text
# Remove unnecessary part to prevent HUGE TOKEN cost!
# Drop <head>, <script> and <style> elements together with their content.
for tag_name in ("head", "script", "style"):
    html_text = re.sub(rf'<{tag_name}.*?>.*?</{tag_name}>', '', html_text, flags=re.DOTALL)
def _serp_tool(name, description, item_properties):
    """Build one function-tool spec whose single `data` argument is an array of objects."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                'type': 'object',
                'properties': {
                    'data': {
                        'type': 'array',
                        'items': {
                            'type': 'object',
                            'properties': item_properties,
                        },
                    },
                },
            },
        },
    }


# Tool that receives the organic results.
_organic_results_tool = _serp_tool(
    "parse_organic_results",
    "Parse organic results from Google SERP raw HTML data nicely",
    {
        'title': {'type': 'string'},
        'original_url': {'type': 'string'},
        'snippet': {'type': 'string'},
        'position': {'type': 'integer'},
    },
)
# Tool that receives the "people also ask" entries.
_people_also_ask_tool = _serp_tool(
    "parse_people_also_ask_section",
    "Parse `people also ask` section from Google SERP raw HTML",
    {
        'question': {'type': 'string'},
        'original_url': {'type': 'string'},
        'answer': {'type': 'string'},
    },
)

# Send the stripped HTML to the model and let it decide which tools to call.
completion = client.chat.completions.create(
    model="gpt-4-1106-preview",
    messages=[
        {"role": "system", "content": "You are a master at scraping Google results data. Scrape two things: 1st. Scrape top 10 organic results data and 2nd. Scrape people_also_ask section from Google search result page."},
        {"role": "user", "content": html_text},
    ],
    tools=[_organic_results_tool, _people_also_ask_tool],
    tool_choice="auto",
)
# Group the `data` payload of every tool call by tool name. With
# tool_choice="auto" the model may return no tool calls, a single one, or the
# two in either order, so positional indexing is not safe.
parsed_sections = {}
for tool_call in completion.choices[0].message.tool_calls or []:
    try:
        argument_dict = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError as err:
        print(f"Could not decode arguments of {tool_call.function.name}: {err}")
        continue
    parsed_sections.setdefault(tool_call.function.name, []).extend(argument_dict.get('data', []))

# Organic_results
organic_results = parsed_sections.get('parse_organic_results', [])
print('Organic results:')
for result in organic_results:
    # The tool schema marks no field as required, so read them defensively.
    print(f"Blog Title: {result.get('title')}")
    print(f"Blog URL: {result.get('original_url')}")
    print(f"Blog Snippet: {result.get('snippet')}")
    print(f"Blog Position: {result.get('position')}")
    print('---')

# People also ask
people_also_ask = parsed_sections.get('parse_people_also_ask_section', [])
print('People also ask:')
for result in people_also_ask:
    print(f"People_Also_Ask: Question: {result.get('question')}")
    print(f"People_Also_Ask: URL: {result.get('original_url')}")
    # The original line lacked the f-prefix and printed the placeholder text.
    print(f"People_Also_Ask: Answer: {result.get('answer')}")
    print('---')

View File

@@ -0,0 +1,302 @@
"""
This Python script performs Google searches using various services such as SerpApi, Serper.dev, and more. It displays the search results, including organic results, People Also Ask, and Related Searches, in formatted tables. The script also utilizes GPT to generate titles and FAQs for the Google search results.
Features:
- Utilizes SerpApi, Serper.dev, and other services for Google searches.
- Displays organic search results, including position, title, link, and snippet.
- Presents People Also Ask questions and snippets in a formatted table.
- Includes Related Searches in the combined table with People Also Ask.
- Configures logging with Loguru for informative messages.
- Uses Rich and Tabulate for visually appealing and formatted tables.
Usage:
- Ensure the necessary API keys are set in the .env file.
- Run the script to perform a Google search with the specified query.
- View the displayed tables with organic results, People Also Ask, and Related Searches.
- Additional information, such as generated titles and FAQs using GPT, is presented.
Modifications:
- Update the environment variables in the .env file with the required API keys.
- Customize the search parameters, such as location and language, in the functions as needed.
- Adjust logging configurations, table formatting, and other aspects based on preferences.
To-Do (TBD):
- Consider adding further enhancements or customization based on specific use cases.
Note: This script depends on external libraries such as SerpApi, Loguru, Rich, and Tabulate. Install them using 'pip install serpapi loguru rich tabulate' if not already installed.
"""
import os
from pathlib import Path
import sys
import pandas as pd
import json
import requests
from clint.textui import progress
#from serpapi import GoogleSearch
from loguru import logger
from tabulate import tabulate
# Drop loguru's default sink; the replacement sink is registered below.
logger.remove()
from dotenv import load_dotenv
# Read API keys and other settings from the .env file two directories up
# (the path is relative, so it is resolved against the working directory).
load_dotenv(Path('../../.env'))
# Colourised stdout sink whose lines carry the level and call-site location.
_LOG_FORMAT = "<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
logger.add(sys.stdout, colorize=True, format=_LOG_FORMAT)
from .gpt_titles_faq import gpt_titles_faqs_google_search
#from tenacity import retry, stop_after_attempt, wait_random_exponential
#@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
#FIXME: Accept language, country and time frame to search for.
def google_search(query):
    """
    Perform a Google search for the given query.

    SerpApi is called first, but its result is currently discarded (see the
    FIXME log line), so Serper.dev is always queried and its response is
    what gets displayed and returned.

    Args:
        query (str): The search query.
    Returns:
        dict | None: The raw Serper.dev response, or None when that search
        failed. Failures are logged, not raised.
    """
    # Bound up front so the final return cannot fail with an unbound local
    # when the Serper.dev call raises.
    search_result = None
    try:
        perform_serpapi_google_search(query)
        logger.info(f"FIXME: Google serapi: {query}")
        #return process_search_results(search_result)
    except Exception as err:
        logger.error(f"ERROR: Check Here: https://serpapi.com/. Your requests may be over. {err}")
    # Retry with serper.dev
    try:
        logger.info("Trying Google search with Serper.dev: https://serper.dev/api-key")
        search_result = perform_serperdev_google_search(query)
        process_search_results(search_result)
    except Exception as err:
        logger.error(f"Failed to do Google search with serper.dev: {err}")
    return search_result
# # Retry with BROWSERLESS API
# try:
# search_result = perform_browserless_google_search(query)
# #return process_search_results(search_result, flag)
# except Exception as err:
# logger.error("FIXME: Failed to do Google search with BROWSERLESS API.")
# logger.debug("FIXME: Trying with dataforSEO API.")
#
# # Retry with dataforSEO API
# try:
# logger.info("Perform SERP with Data for SEO.")
# #search_result = perform_dataforseo_google_search(query)
# #return process_search_results(search_result, flag)
# except Exception as err:
# logger.error("FIXME: Failed to do Google search with dataforSEO API.")
# logger.debug("All retries failed. Giving up.")
# raise
def perform_serpapi_google_search(query, location="in"):
    """
    Perform a Google search using the SerpApi service.

    The API key is read from the ``SERPAPI_KEY`` environment variable.
    Every failure is logged and swallowed.

    Args:
        query (str): The search query.
        location (str, optional): The location for the search (default is "in").
    Returns:
        dict | None: The search results, or None when the key is missing or
        the search failed.
    """
    try:
        # Read the key once and pass it on; the original referenced an
        # undefined `api_key` name.
        api_key = os.getenv("SERPAPI_KEY")
        if not api_key:
            raise ValueError("SERPAPI_KEY key is required for SerpApi")
        # NOTE(review): `GoogleSearch` comes from the `serpapi` import that is
        # commented out at the top of this file; until it is restored this
        # line raises NameError, which the handler below logs.
        search = GoogleSearch({
            "q": query,
            "location": location,
            "api_key": api_key
        })
        # Get search results as a dictionary
        return search.get_dict()
    except ValueError as ve:
        # Handle missing API key error
        logger.info(f"SERPAPI ValueError: {ve}")
    except Exception as e:
        # Handle other exceptions
        logger.info(f"SERPAPI An error occurred: {e}")
    return None
def perform_serperdev_google_search(query):
    """
    Run a Google search through the Serper.dev API.

    Args:
        query (str): The search query.
    Returns:
        dict: The decoded JSON response on HTTP 200, otherwise None (the
        status and body are logged).
    Raises:
        ValueError: If ``SERPER_API_KEY`` is not set in the environment.
    """
    logger.info("Doing serper.dev google search.")
    serper_api_key = os.getenv('SERPER_API_KEY')
    if not serper_api_key:
        raise ValueError("SERPER_API_KEY is missing. Set it in the .env file.")

    # FIXME: Expose options to end user. Request payload
    search_options = {
        "q": query,
        "gl": "in",
        "hl": "en",
        "num": 5,
        "autocorrect": True,
        "page": 1,
        "type": "search",
        "engine": "google"
    }
    payload = json.dumps(search_options)
    request_headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }

    # The progress bar only wraps the request; it is never advanced.
    with progress.Bar(label="Searching", expected_size=100) as bar:
        response = requests.post(
            "https://google.serper.dev/search",
            headers=request_headers,
            data=payload,
            stream=True,
        )
        if response.status_code != 200:
            logger.error(f"Error: {response.status_code}, {response.text}")
            return None
        return response.json()
def perform_browserless_google_search():
    """Placeholder for a Browserless-backed search; currently does nothing."""
    return None
def perform_dataforseo_google_search():
    """Placeholder for a DataForSEO-backed search; currently does nothing."""
    return None
def process_search_results(search_results):
    """
    Print (and try to save) formatted tables for a search response.

    Two tables are produced: the organic results, and People Also Ask
    combined with Related Searches (or Related Searches alone when there are
    no People Also Ask entries).

    Args:
        search_results (dict): The search results JSON. The keys read are
            ``searchParameters``, ``organic``, ``peopleAlsoAsk`` and
            ``relatedSearches``; all are optional.
    Returns:
        dict: ``search_results``, unchanged.
    """
    logger.info(f"Google Search Parameters: {search_results.get('searchParameters', {})}")
    print(search_results)

    # Organic results. `.get` keeps a response without an "organic" key from
    # raising KeyError.
    organic_data = [
        [
            result.get("position", ""),
            result.get("title", ""),
            result.get("link", ""),
            result.get("snippet", ""),
        ]
        for result in search_results.get("organic", [])
    ]
    organic_table = tabulate(organic_data,
                             headers=["Rank", "Title", "Link", "Snippet"],
                             tablefmt="fancy_grid",
                             colalign=["center", "left", "left", "left"],
                             maxcolwidths=[5, 25, 35, 50])
    print("\n\n📢❗🚨 Google search Organic Results:")
    print(organic_table)

    # People Also Ask rows: [question, snippet, link].
    people_also_ask_data = []
    try:
        for question in search_results.get("peopleAlsoAsk", []):
            people_also_ask_data.append([
                question.get("title", ""),
                question.get("snippet", ""),
                question.get("link", ""),
            ])
    except Exception as people_also_ask_err:
        logger.error(f"Error processing 'peopleAlsoAsk': {people_also_ask_err}")
        people_also_ask_data = []

    related_queries = [
        item.get("query", "") for item in search_results.get("relatedSearches", [])
    ]

    if people_also_ask_data:
        # Related Searches become a fourth column. Rows are padded to the
        # longer of the two lists so that no related search is dropped when
        # there are fewer People Also Ask entries.
        row_count = max(len(people_also_ask_data), len(related_queries))
        combined_data = []
        for i in range(row_count):
            paa_row = people_also_ask_data[i] if i < len(people_also_ask_data) else ["", "", ""]
            related = related_queries[i] if i < len(related_queries) else ""
            combined_data.append(paa_row + [related])
        combined_table = tabulate(
            combined_data,
            headers=["Question", "Snippet", "Link", "Related Search"],
            tablefmt="fancy_grid",
            colalign=["left", "left", "left", "left"],
            maxcolwidths=[20, 50, 20, 30]
        )
    else:
        combined_table = tabulate(
            [[related] for related in related_queries],
            headers=["Related Search"],
            tablefmt="fancy_grid",
            colalign=["left"],
            maxcolwidths=[60]
        )
    print("\n\n📢❗🚨 People Also Ask & Related Searches:")
    print(combined_table)

    # Save both tables to a file; a failure here must not lose the results.
    try:
        save_in_file(organic_table)
        save_in_file(combined_table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    return search_results
def save_in_file(table_content):
    """Append ``table_content`` to the file named by ``SEARCH_SAVE_FILE``.

    Args:
        table_content (str): Text to append; three newlines are written
            after it.

    Nothing is written when the environment variable is unset. All errors
    are logged, not raised.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        # Report the real cause instead of the TypeError from open(None).
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return
    try:
        # The tables contain emoji and box-drawing characters, which the
        # platform default encoding may not be able to represent.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")

View File

@@ -0,0 +1,530 @@
"""
This Python script analyzes Google search keywords by fetching auto-suggestions, performing keyword clustering, and visualizing Google Trends data. It uses various libraries such as pytrends, requests_html, tqdm, and more.
Features:
- Fetches auto-suggestions for a given search keyword from Google.
- Performs keyword clustering using K-means algorithm based on TF-IDF vectors.
- Visualizes Google Trends data, including interest over time and interest by region.
- Retrieves related queries and topics for a set of search keywords.
- Utilizes visualization libraries such as Matplotlib, Plotly, and Rich for displaying results.
- Incorporates logging for error handling and informative messages.
Usage:
- Provide a search term or a list of search terms for analysis.
- Run the script to fetch auto-suggestions, perform clustering, and visualize Google Trends data.
- Explore the displayed results, including top keywords in each cluster and related topics.
Modifications:
- Customize the search terms in the 'do_google_trends_analysis' function.
- Adjust the number of clusters for keyword clustering and other parameters as needed.
- Explore further visualizations and analyses based on the generated data.
Note: Ensure that the required libraries are installed using 'pip install pytrends requests_html tqdm tabulate plotly rich'.
"""
import requests
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, silhouette_samples
from rich.console import Console
from rich.progress import Progress
import urllib
import json
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import logging
from requests_html import HTML, HTMLSession
from urllib.parse import quote_plus
from tqdm import tqdm
from tabulate import tabulate
from pytrends.request import TrendReq
import wordcloud
logging.basicConfig(level=logging.INFO)
from loguru import logger
# Swap loguru's default sink for a colourised stdout sink whose lines carry
# the level and the file:line:function of the call site.
_LOG_FORMAT = "<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
logger.remove()
logger.add(sys.stdout, colorize=True, format=_LOG_FORMAT)
def fetch_google_trends_interest_overtime(keyword):
    """Fetch and plot one year of US Google Trends interest for ``keyword``.

    Returns:
        pd.DataFrame: The interest-over-time data with the index reset, or an
        empty DataFrame when anything fails (the error is logged).
    """
    try:
        trend_client = TrendReq(hl='en-US', tz=360)
        trend_client.build_payload([keyword], timeframe='today 1-y', geo='US')
        # 1. Interest Over Time
        interest = trend_client.interest_over_time().reset_index()

        # Line chart of interest against date.
        plt.figure(figsize=(10, 6))
        plt.plot(interest['date'], interest[keyword], label=keyword)
        plt.title(f'Interest Over Time for "{keyword}"')
        plt.xlabel('Date')
        plt.ylabel('Interest')
        plt.legend()
        plt.show()
        return interest
    except Exception as e:
        logging.error(f"Error in fetch_google_trends_data: {e}")
    return pd.DataFrame()
def plot_interest_by_region(kw_list):
    """Print and plot the ten regions with the highest Google Trends interest.

    Regions are ranked by the first keyword; every keyword in ``kw_list`` is
    drawn in the bar chart.

    Args:
        kw_list (list): Search keywords.

    Returns:
        None. Errors are printed, not raised.
    """
    try:
        from pytrends.request import TrendReq
        import matplotlib.pyplot as plt
        keywords = list(kw_list)
        trends = TrendReq()
        trends.build_payload(kw_list=keywords)
        data = trends.interest_by_region()
        # Sort by a real column. The space-joined keyword string is only a
        # column name when exactly one keyword was given.
        data = data.sort_values(by=keywords[0], ascending=False)
        label = ' '.join(keywords)
        print("\n📢❗🚨 ")
        print(f"Top 10 regions with highest interest for keyword: {label}")
        data = data.head(10) #Top 10
        print(data)
        # The style must be selected before plotting to affect this figure.
        plt.style.use('fivethirtyeight')
        data.reset_index().plot(x="geoName", y=keywords,
                                figsize=(20,15), kind="bar")
        plt.show()
        # FIXME: Send this image to vision GPT for analysis.
    except Exception as e:
        print(f"Error plotting interest by region: {e}")
    return None
def get_related_queries_and_save_csv(keywords, hl='en-US', tz=360, cat=0, timeframe='today 12-m'):
    """
    Get related queries for the given search keywords, print them and try to save them.

    Only the sections of the first keyword in the response are used.

    Args:
        keywords (list): List of search keywords.
        hl (str): Language parameter, default is 'en-US'.
        tz (int): Timezone parameter, default is 360.
        cat (int): Category parameter, default is 0.
        timeframe (str): Timeframe parameter, default is 'today 12-m'.
    Returns:
        pd.DataFrame: The top queries followed by the rising queries, stacked
        as rows with the response's own column names. None when an error
        occurred (it is printed).
    """
    try:
        # Build model
        pytrends = TrendReq(hl=hl, tz=tz)
        pytrends.build_payload(kw_list=keywords, cat=cat, timeframe=timeframe)
        # Get related queries
        data = pytrends.related_queries()
        first_keyword_sections = list(data.values())[0]
        # pd.DataFrame(None) is an empty frame, so a missing section is safe.
        top_queries = pd.DataFrame(first_keyword_sections['top'])
        rising_queries = pd.DataFrame(first_keyword_sections['rising'])

        # Stack the two sections as rows. `top + rising` on DataFrames adds
        # them element-wise instead of concatenating.
        non_empty = [frame for frame in (top_queries, rising_queries) if not frame.empty]
        top_rising_queries = (
            pd.concat(non_empty, ignore_index=True) if non_empty else pd.DataFrame()
        )

        # Side-by-side display frame with section-specific query headers.
        df_top_queries = top_queries.copy()
        df_rising_queries = rising_queries.copy()
        if df_top_queries.shape[1] == 2:
            df_top_queries.columns = ['Top query', 'value']
        if df_rising_queries.shape[1] == 2:
            df_rising_queries.columns = ['Rising query', 'value']
        all_queries_df = pd.concat([df_top_queries, df_rising_queries], axis=1)
        #all_queries_df.to_csv('related_queries.csv', index=False)

        # Display additional information with emojis and bold formatting
        print("\n📢❗🚨 ")
        print("\n\033[1m🔝 Top\033[0m: The most popular search queries. Scoring is on a relative scale where a value of 100 is the most commonly searched query, 50 is a query searched half as often, and a value of 0 is a query searched for less than 1% as often as the most popular query.\n")
        print("\n\033[1m🚀 Rising\033[0m: Queries with the biggest increase in search frequency since the last time period. Results marked 'Breakout' had a tremendous increase, probably because these queries are new and had few (if any) prior searches.\n")
        # Display the DataFrame using tabulate
        queries_table = tabulate(all_queries_df, headers='keys', tablefmt='fancy_grid')
        print(queries_table)
        # Save the rendered table.
        # NOTE(review): assumes save_in_file expects text (it is not defined
        # in the visible part of this file) — confirm against its definition.
        try:
            save_in_file(queries_table)
        except Exception as save_results_err:
            logger.error(f"Failed to save search results: {save_results_err}")
        return top_rising_queries
    except Exception as e:
        print(f"get_related_queries_and_save_csv: ERROR: An error occurred: {e}")
def get_related_topics_and_save_csv(search_keywords):
    """
    Fetch the related topics for ``search_keywords`` and print the first ten rows.

    Only the sections of the first keyword in the response are used.

    Args:
        search_keywords (list): List of search keywords.
    Returns:
        pd.DataFrame: Top and rising topics side by side, or an empty
        DataFrame when an error occurred (it is printed).
    """
    # FIXME:Exclude specified columns
    columns_to_exclude = ['hasData', 'value', 'topic_mid', 'link']

    def _prepare(section, prefix):
        # Drop the excluded columns and prefix every column except 'topic_title'.
        frame = pd.DataFrame(section).drop(columns=columns_to_exclude, errors='ignore')
        frame.columns = [col if col == 'topic_title' else prefix + col for col in frame.columns]
        return frame

    try:
        trend_client = TrendReq(hl='en-US', tz=360)
        trend_client.build_payload(search_keywords, cat=0, timeframe='today 12-m')
        first_keyword_topics = list(trend_client.related_topics().values())[0]
        top_section = first_keyword_topics['top']
        rising_section = first_keyword_topics['rising']

        all_topics_df = pd.concat(
            [_prepare(top_section, 'Top- '), _prepare(rising_section, 'Rising- ')],
            axis=1,
        )
        #all_topics_df.to_csv('related_topics.csv', index=False)
        print(f"\n\n 📢❗🚨 Rising and Trending Keywords for {search_keywords}\n")
        print("\033[1m🔝 Top\033[0m: The most popular search topics.")
        print("\033[1m🚀 Rising\033[0m: Topics experiencing a significant increase in search frequency since the last time period. Topics marked :pile_of_poop:'Breakout' had a tremendous surge, likely because they are new and had few prior searches.")
        # Let pandas print every row of this frame.
        pd.set_option('display.max_rows', all_topics_df.shape[0]+1)
        print(all_topics_df.head(10))
        #print(tabulate(all_topics_df, headers='keys', tablefmt='fancy_grid'))
        return all_topics_df
    except Exception as e:
        print(f"ERROR: An error occurred: {e}")
    return pd.DataFrame()
def get_source(url):
    """Fetch *url* through an HTMLSession.

    Returns:
        The response object, or None when the request raises a
        ``requests`` exception (including a bad HTTP status).
    """
    try:
        page = HTMLSession().get(url)
        # Turn 4xx/5xx answers into an HTTPError handled below.
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
    return page
def get_results(query):
    """Query the Google suggest endpoint for *query* and return the decoded JSON.

    Returns:
        The parsed JSON payload, or None when the request fails, the response
        is falsy, or the body is not valid JSON.
    """
    try:
        encoded_query = urllib.parse.quote_plus(query)
        response = get_source(f"https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q={encoded_query}")
        if not response:
            return None
        response.raise_for_status()
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding JSON response: {e}")
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
def format_results(results):
    """Pair each suggested term with its relevance score.

    Args:
        results: Decoded suggest payload; index 1 holds the suggested terms and
            index 4 holds a dict whose 'google:suggestrelevance' list carries
            the score at the same position.

    Returns:
        list[dict]: ``{'term': ..., 'relevance': ...}`` entries, or an empty
        list when the payload lacks the expected keys/indices.
    """
    try:
        # The relevance lookup stays inside the comprehension so an empty term
        # list never touches results[4].
        return [
            {'term': term, 'relevance': results[4]['google:suggestrelevance'][position]}
            for position, term in enumerate(results[1])
        ]
    except (KeyError, IndexError) as e:
        logging.error(f"Error parsing search results: {e}")
        return []
def get_expanded_term_suffixes():
    """Return the single-letter suffixes 'a'..'z' appended to a query when expanding it."""
    return list('abcdefghijklmnopqrstuvwxyz')
def get_expanded_term_prefixes():
    """Return the question/comparison prefixes placed in front of a query when expanding it."""
    # For shopping, review type blogs.
    #return ['discount *', 'pricing *', 'cheap', 'best price *', 'lowest price', 'best value', 'sale', 'affordable', 'promo', 'budget''what *', 'where *', 'how to *', 'why *', 'buy*', 'how much*','best *', 'worse *', 'rent*', 'sale*', 'offer*','vs*','or*']
    question_prefixes = ['what *', 'where *', 'how to *', 'why *']
    comparison_prefixes = ['best *', 'vs*', 'or*']
    return question_prefixes + comparison_prefixes
def get_expanded_terms(query):
    """Build the list of query variants to send to the suggest endpoint.

    The list starts with *query* itself, followed by "<prefix> <query>" for
    every prefix and "<query> <suffix>" for every suffix.

    Returns:
        list[str]: The expanded terms, or an empty list if anything fails.
    """
    try:
        prefixes = get_expanded_term_prefixes()
        suffixes = get_expanded_term_suffixes()
        prefixed = [f"{prefix} {query}" for prefix in prefixes]
        suffixed = [f"{query} {suffix}" for suffix in suffixes]
        return [query, *prefixed, *suffixed]
    except Exception as e:
        logging.error(f"Error in get_expanded_terms: {e}")
        return []
def get_expanded_suggestions(query):
    """Collect autosuggestions for every expanded variant of *query*.

    Returns:
        list[dict]: All formatted suggestions sorted by 'relevance' in
        descending order, or an empty list if anything fails.
    """
    try:
        collected = []
        progress = tqdm(get_expanded_terms(query), desc="📢❗🚨 Fetching Google AutoSuggestions", unit="term")
        for term in progress:
            raw = get_results(term)
            if not raw:
                # Nothing usable came back for this variant.
                continue
            collected.extend(format_results(raw))
        collected.sort(key=lambda item: item.get('relevance', 0), reverse=True)
        return collected
    except Exception as e:
        logging.error(f"Error in get_expanded_suggestions: {e}")
        return []
def get_suggestions_for_keyword(search_term):
    """Return de-duplicated autosuggestions for *search_term* as a DataFrame.

    Args:
        search_term (str): Root keyword to expand and look up.

    Returns:
        pd.DataFrame: Columns 'Keywords' and 'Relevance'. The frame is empty
        (but still has both columns) when no suggestions were found or an
        error occurred, so callers always receive a DataFrame.
    """
    result_columns = ['Keywords', 'Relevance']
    try:
        expanded_results = get_expanded_suggestions(search_term)
        if not expanded_results:
            # An empty list yields a column-less frame; renaming it would raise.
            logging.warning(f"get_suggestions_for_keyword: No suggestions found for '{search_term}'")
            return pd.DataFrame(columns=result_columns)

        # Select and rename by key instead of relying on dict insertion order.
        expanded_results_df = pd.DataFrame(expanded_results, columns=['term', 'relevance'])
        expanded_results_df = expanded_results_df.rename(
            columns={'term': 'Keywords', 'relevance': 'Relevance'})
        #expanded_results_df.to_csv('results.csv', index=False)
        pd.set_option('display.max_rows', expanded_results_df.shape[0]+1)
        return expanded_results_df.drop_duplicates('Keywords')
    except Exception as e:
        logging.error(f"get_suggestions_for_keyword: Error in main: {e}")
        return pd.DataFrame(columns=result_columns)
def perform_keyword_clustering(expanded_results_df, num_clusters=5):
    """Cluster the keywords with TF-IDF + K-means and label each row.

    The input frame is modified in place: 'Keywords' is lower-cased and a
    'cluster_label' column is added. The silhouette score is printed.

    Args:
        expanded_results_df (pd.DataFrame): Frame with a 'Keywords' column.
        num_clusters (int): Number of K-means clusters.

    Returns:
        pd.DataFrame: The same frame with cluster labels, or an empty
        DataFrame if any step fails.
    """
    try:
        # Normalise case before vectorising.
        lowered = expanded_results_df['Keywords'].str.lower()
        expanded_results_df['Keywords'] = lowered

        # TF-IDF representation of every keyword.
        keyword_matrix = TfidfVectorizer().fit_transform(expanded_results_df['Keywords'])

        # Fixed random_state keeps the clustering reproducible.
        labels = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(keyword_matrix)
        expanded_results_df['cluster_label'] = labels

        # Report cluster quality.
        silhouette_avg = silhouette_score(keyword_matrix, labels)
        print(f"Silhouette Score: {silhouette_avg}")
        # Visualize cluster quality using a silhouette plot
        #visualize_silhouette(keyword_matrix, labels)
        return expanded_results_df
    except Exception as e:
        logging.error(f"Error in perform_keyword_clustering: {e}")
        return pd.DataFrame()
def visualize_silhouette(X, labels):
    """Print the average silhouette score and show a per-cluster silhouette plot.

    Args:
        X: Feature matrix that was clustered.
        labels: Cluster label per row of *X* (compared element-wise with each
            cluster id, so presumably a NumPy array — confirm with callers).

    Any exception is logged and swallowed.
    """
    try:
        silhouette_avg = silhouette_score(X, labels)
        print(f"Silhouette Score: {silhouette_avg}")

        cluster_ids = set(labels)
        cluster_count = len(cluster_ids)
        band_gap = 10  # blank rows separating two cluster bands

        fig, axis = plt.subplots(1, 1, figsize=(8, 6))
        axis.set_xlim([-0.1, 1])
        axis.set_ylim([0, X.shape[0] + (cluster_count + 1) * band_gap])

        # Silhouette coefficient of every individual sample.
        per_sample_scores = silhouette_samples(X, labels)

        band_start = band_gap
        for cluster_id in cluster_ids:
            # Sorted scores of the samples that belong to this cluster.
            cluster_scores = np.sort(per_sample_scores[labels == cluster_id])
            band_height = cluster_scores.shape[0]
            band_end = band_start + band_height

            shade = plt.cm.nipy_spectral(float(cluster_id) / cluster_count)
            axis.fill_betweenx(np.arange(band_start, band_end),
                               0, cluster_scores,
                               facecolor=shade, edgecolor=shade, alpha=0.7)
            # Put the cluster number at the vertical middle of its band.
            axis.text(-0.05, band_start + 0.5 * band_height, str(cluster_id))

            band_start = band_end + band_gap

        axis.set_title("Silhouette plot for KMeans clustering")
        axis.set_xlabel("Silhouette coefficient values")
        axis.set_ylabel("Cluster label")
        # Dashed red line marks the average score over all samples.
        axis.axvline(x=silhouette_avg, color="red", linestyle="--")
        plt.show()
    except Exception as e:
        logging.error(f"Error in visualize_silhouette: {e}")
def print_and_return_top_keywords(expanded_results_df, num_clusters=5):
    """
    Display, save and return the top keywords of each cluster.

    For every cluster the five rows with the highest 'Relevance' are kept.
    The resulting table is printed and handed to ``save_in_file``.

    Args:
        expanded_results_df (pd.DataFrame): DataFrame containing expanded keywords, relevance, and cluster labels.
        num_clusters (int or str): Number of clusters or 'all'.

    Returns:
        pd.DataFrame: DataFrame with top keywords for each cluster.
    """
    if num_clusters == 'all':
        unique_clusters = expanded_results_df['cluster_label'].unique()
    else:
        unique_clusters = range(int(num_clusters))

    # Collect the per-cluster slices first and concatenate once.
    per_cluster_top = [
        expanded_results_df[expanded_results_df['cluster_label'] == cluster_id]
        .sort_values(by='Relevance', ascending=False)
        .head(5)
        for cluster_id in unique_clusters
    ]
    # pd.concat rejects an empty list, e.g. when num_clusters is 0.
    top_keywords_df = pd.concat(per_cluster_top) if per_cluster_top else pd.DataFrame()

    print("\n📢❗🚨 GTop Keywords for All Clusters:")
    table = tabulate(top_keywords_df, headers='keys', tablefmt='fancy_grid')
    print(table)

    # Save the rendered table (a string); passing the DataFrame itself made
    # file.write() fail, so nothing was ever saved.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    return top_keywords_df
def generate_wordcloud(keywords):
    """
    Generate and display a word cloud from a list of keywords.

    Args:
        keywords (list): List of keywords. Blank entries are ignored; if
            nothing remains, no figure is drawn.
    """
    # Convert the list of keywords to a single string, skipping blanks.
    text = ' '.join(str(keyword) for keyword in keywords if str(keyword).strip())
    if not text.strip():
        # WordCloud cannot be generated from empty text.
        logger.warning("generate_wordcloud: No keywords given, skipping word cloud.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # figsize is in inches; the former (600, 200) requested a gigantic canvas.
    # (12, 6) keeps the 2:1 aspect ratio of the 800x400 cloud image.
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def save_in_file(table_content):
    """Append search analysis output to the file named by SEARCH_SAVE_FILE.

    Content is appended rather than overwritten, so successive analysis steps
    writing to the same report file do not wipe each other's output. Errors
    are logged and never raised.

    Args:
        table_content: Text to save; non-string values are converted with ``str()``.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("SEARCH_SAVE_FILE environment variable is not set, search content not saved.")
        return
    try:
        # Make sure the target directory exists before opening the file.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # UTF-8 is required for the emojis and box-drawing characters in the tables.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(str(table_content))
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
def do_google_trends_analysis(search_term):
    """Collect the important keywords and related topics for a search term.

    Steps: fetch autosuggestions, cluster them, take the top keywords per
    cluster, add the Google Trends related-topic titles, print everything as
    a table and show a word cloud.

    Args:
        search_term (str): Root keyword to analyse.

    Returns:
        list[str]: Top keywords followed by related-topic titles; an empty
        list if the analysis fails.
    """
    search_terms = [f"{search_term}"]
    chunk_size = 4  # number of keyword columns in the printed table
    try:
        keywords = []
        for a_search_term in search_terms:
            #FIXME: Lets work with a single root keyword.
            suggestions_df = get_suggestions_for_keyword(a_search_term)
            clustered_df = perform_keyword_clustering(suggestions_df)
            # Display top keywords in each cluster
            top_keywords = print_and_return_top_keywords(clustered_df)
            keywords.extend(str(keyword).strip() for keyword in top_keywords['Keywords'].tolist())

        # FIXME: Get result from vision GPT. Fetch and visualize Google Trends data
        #trends_data = fetch_google_trends_interest_overtime("llamaindex")
        # FIXME: Plot Interest Over time.
        #result_df = plot_interest_by_region(search_term)

        topics_df = get_related_topics_and_save_csv(search_terms)
        topic_titles = []
        if 'topic_title' in topics_df.columns:
            # 'topic_title' can occur twice (top + rising), so the selection may
            # be two-dimensional; flatten it row by row and skip missing values.
            raw_titles = topics_df['topic_title'].to_numpy().ravel().tolist()
            topic_titles = [str(title).strip() for title in raw_titles
                            if pd.notna(title) and str(title).strip()]
        print(f"\nRising and Top keywords: {', '.join(topic_titles)}")
        print(f"\n\n📢❗🚨 Important keywords to target: {', '.join(keywords)}\n\n")

        all_the_keywords = keywords + topic_titles

        # Split into rows of `chunk_size` keywords, padding the last row so
        # every row has the same number of cells.
        rows = [all_the_keywords[i:i + chunk_size] for i in range(0, len(all_the_keywords), chunk_size)]
        rows = [row + [None] * (chunk_size - len(row)) for row in rows]
        combined_df = pd.DataFrame(rows, columns=[f'K📢eyword Col{i + 1}' for i in range(chunk_size)])
        print(tabulate(combined_df, headers='keys', tablefmt='fancy_grid'))

        # all_the_keywords is already a list here; calling .split(',') on it
        # raised and made the whole function return None.
        if all_the_keywords:
            generate_wordcloud(all_the_keywords)
        return all_the_keywords
    except Exception as e:
        logging.error(f"Error in main: {e}")
        return []

View File

@@ -0,0 +1,49 @@
import sys
import json
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
# FIXME: Provide num_blogs, num_faqs as inputs.
def get_blog_sections_from_websearch(search_keyword, search_results, gpt_providers="gemini"):
    """Ask an LLM for a blog title and sub titles based on web search results.

    Args:
        search_keyword (str): Keyword the web research was done for.
        search_results: Google search result to base the sections on.
        gpt_providers (str): Provider selector; checked for 'gemini' first, then 'openai'.

    Returns:
        The parsed JSON for gemini, the raw ``openai_chatgpt`` return value for
        openai, or None if neither provider name matches.

    Raises:
        Exception: Whatever the provider call or JSON parsing raised, after logging.
    """
    prompt = f"""
As a SEO expert and content writer, I will provide you with a search keyword and its google search result.
Your task is to write a blog title and 5 blog sub titles, from the given google search result.
The subtitles should be less than 40 characters and click worthy.
Do not explain, describe your response. Respond in json format, always name the key as 'blogSections'.
Web Research Keyword: "{search_keyword}"
Google search Result: "{search_results}"
"""
    if 'gemini' in gpt_providers:
        # Bound before the try so the error handler can inspect it safely.
        response = None
        try:
            response = gemini_text_response(prompt)
            if '```' in response and '\n' in response:
                # Drop the first and last lines (the markdown code fence).
                fenced_lines = response.strip().split('\n')
                response = '\n'.join(fenced_lines[1:-1])
            return json.loads(response)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            # Only log prompt feedback if the response object actually carries
            # it; reading it unconditionally hid the real error behind a new one.
            feedback = getattr(response, 'prompt_feedback', None)
            if feedback is not None:
                logger.error(f"Gemini Error: {feedback}")
            raise
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from Openai: {err}")
            raise
    return None

View File

@@ -0,0 +1,41 @@
import sys
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def summarize_competitor_content(research_content, gpt_providers="openai"):
    """Summarize competitor/company research content with the chosen LLM provider.

    Args:
        research_content (str): Text gathered about the competitor.
        gpt_providers (str): Provider selector; checked for 'gemini' first, then 'openai'.

    Returns:
        The provider's response, or None if neither provider name matches.

    Raises:
        Exception: Whatever the provider call raised, after logging it.
    """
    if 'gemini' in gpt_providers:
        gemini_prompt = f"""You are a helpful assistant writing a research report about a company. I will provide you with company details.
Summarize the given company details into multiple paragraphs.
Be extremely concise, professional, and factual as possible.
The first paragraph should be an introduction and summary of the company.
The second paragraph should include pros and cons of the company.
The third paragraph should be on their pricing model.
Include a conclusion, summarizing your research about the given company details.
Company details: '{research_content}'"""
        try:
            return gemini_text_response(gemini_prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' in gpt_providers:
        # The OpenAI path only sends the raw content.
        openai_prompt = f""" Web page content: {research_content} """
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(openai_prompt)
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise err

    return None

View File

@@ -0,0 +1,185 @@
################################################################
#
#
#
##############################################################
import os
import json
from pathlib import Path
import sys
from typing import List, NamedTuple
from loguru import logger
from datetime import datetime
from ..gpt_providers.gemini_pro_text import gemini_text_response
from .tavily_ai_search import get_tavilyai_results
from .metaphor_basic_neural_web_search import metaphor_find_similar, metaphor_search_articles
from .google_serp_search import google_search
from .google_trends_researcher import do_google_trends_analysis
from .gpt_blog_sections import get_blog_sections_from_websearch
from .web_research_report import write_web_research_report
# Configure logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def gpt_web_researcher(search_keywords, time_range=None, include_domains=None, similar_url=None):
    """Run web research for the given keywords (currently only the Google Trends analysis).

    Sets the SEARCH_SAVE_FILE environment variable to a timestamped report
    path under ``<cwd>/workspace/web_research_reports``. The Google, Tavily
    and Metaphor searches are disabled (commented out) for now.

    Args:
        search_keywords (str): Keywords to research.
        time_range (str, optional): Time range, only used by the disabled Metaphor search.
        include_domains (list, optional): Domains to restrict the search to.
        similar_url (str, optional): URL, only used by the disabled Metaphor search.
    """
    print(f"Web Research:Time Range - {time_range},Search Keywords - {search_keywords},Include URLs - {include_domains}")
    # A fresh list per call instead of a shared mutable default argument.
    if not include_domains:
        include_domains = list()

    # TBD: Keeping the results directory as fixed, for now.
    os.environ["SEARCH_SAVE_FILE"] = os.path.join(
        os.getcwd(), "workspace", "web_research_reports",
        search_keywords.replace(" ", "_") + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

    # Collect all blog titles featuring in search results. This *may help in generating blog titles
    # closest to competing ones. All search blog titles, given keyword and keywords from analysis, give
    # llm a good context for the task of generating blog titles.
    blog_titles = []
    # Get a list of FAQs from search results.
    blog_faqs = None
    google_result = None
    tavily_result = None
    report = None
    # Defined up front so the print below works even if the analysis fails.
    important_keywords = None

    # try:
    #     logger.info(f"Doing Google search for: {search_keywords}\n")
    #     google_result = google_search(search_keywords)
    #     blog_titles.append(extract_info(google_result, "titles"))
    # except Exception as err:
    #     logger.error(f"Failed to do Google Serpapi research: {err}")
    #     # Not failing, as tavily would do same and then GPT-V to search.
    #
    # try:
    #     # FIXME: Include the follow-up questions as blog FAQs.
    #     logger.info(f"Doing Tavily AI search for: {search_keywords}")
    #     tavily_result = get_tavilyai_results(search_keywords, include_domains)
    #     blog_titles.append(tavily_extract_information(tavily_result, "titles"))
    # except Exception as err:
    #     logger.error(f"Failed to do Tavily AI Search: {err}")
    # try:
    #     logger.info(f"Start Semantic/Neural web search with Metahpor: {search_keywords}")
    #     response_articles = metaphor_search_articles(
    #         search_keywords,
    #         include_domains=include_domains,
    #         time_range=time_range,
    #         similar_url=similar_url)
    #     blog_titles.append(metaphor_extract_titles_or_text(response_articles, return_titles=True))
    # except Exception as err:
    #     logger.error(f"Failed to do Metaphor search: {err}")
    # print(blog_titles)

    try:
        logger.info(f"Do Google Trends analysis for given keywords: {search_keywords}")
        important_keywords = do_google_trends_analysis(search_keywords)
    except Exception as err:
        logger.error(f"Failed to do google trends analysis: {err}")
    print(important_keywords)

    # Now that we have search results from given keywords. Generate blog title and subtopics suggestions.
    # 1. Return a list of related keywords along with search volumes.
    # 2. New blog titles to write on(niche, top) and blog sections.
    # 3. Competitors list, similar urls if given.
class Result(NamedTuple):
    """Typed record for one search result, built from a result dict via ``Result(**result)``.

    NOTE(review): the field meanings below are inferred from their names only;
    confirm them against the search API's response schema.
    """
    url: str  # presumably the address of the result page
    id: str  # presumably the search API's identifier for the result
    title: str  # read by metaphor_extract_titles_or_text when titles are requested
    score: float  # presumably a relevance/similarity score — TODO confirm
    published_date: str  # kept as a string; format not visible here
    author: str
    text: str  # read by metaphor_extract_titles_or_text when text is requested
    highlights: List[str]
    highlight_scores: List[float]  # presumably one score per entry in highlights — TODO confirm
def metaphor_extract_titles_or_text(json_data, return_titles=True):
    """
    Pull either the titles or the texts out of a list of result dicts.

    Each dict is first unpacked into a ``Result`` so that unexpected or
    missing keys fail loudly.

    Args:
        json_data (list): List of result dicts whose keys match the Result fields.
        return_titles (bool): If True, return titles. If False, return text.

    Returns:
        list: List of titles or text.
    """
    wanted_field = 'title' if return_titles else 'text'
    parsed_results = [Result(**item) for item in json_data]
    return [getattr(parsed, wanted_field) for parsed in parsed_results]
def extract_info(json_data, info_type):
    """
    Extract one kind of information from a search-result JSON dict.

    Args:
        json_data (dict): The JSON data.
        info_type (str): One of 'titles', 'peopleAlsoAsk' or 'relatedSearches'.

    Returns:
        list or None: The requested values (missing sections give an empty
        list), or None if *info_type* is not recognised.
    """
    # info_type -> (section holding the items, key to read from each item)
    lookup = {
        "titles": ("organic", "title"),
        "peopleAlsoAsk": ("peopleAlsoAsk", "question"),
        "relatedSearches": ("relatedSearches", "query"),
    }
    if info_type not in lookup:
        print("Invalid info_type. Please use 'titles', 'peopleAlsoAsk', or 'relatedSearches'.")
        return None

    section, field = lookup[info_type]
    return [entry.get(field) for entry in json_data.get(section, [])]
def tavily_extract_information(json_data, keyword):
    """
    Extract one kind of information from a Tavily search-result dict.

    Args:
        json_data (dict): The JSON data.
        keyword (str): One of 'title', 'content', 'answer' or 'follow-query'.

    Returns:
        list or str: The extracted information; for an unknown keyword the
        string "Invalid keyword: <keyword>" is returned.
    """
    # Lambdas keep each lookup lazy, so only the requested keys are touched.
    extractors = {
        'title': lambda data: [hit['title'] for hit in data['results']],
        'content': lambda data: [hit['content'] for hit in data['results']],
        'answer': lambda data: data['answer'],
        'follow-query': lambda data: data['follow_up_questions'],
    }
    if keyword in extractors:
        return extractors[keyword](json_data)
    return f"Invalid keyword: {keyword}"
def compete_organic_results(query, report, organic_results):
    """Rewrite a blog so it can compete with the given organic Google search results.

    Args:
        query (str): Topic of the blog.
        report (str): Current blog content.
        organic_results: Top organic search results to compete against.

    Returns:
        The response of ``gemini_text_response`` for the rewrite prompt.
    """
    rewrite_prompt = f""" As an SEO expert and copywriter, I will provide you with my blog content on topic '{query}', and
Top google search results.
Your task is to rewrite the given blog to make it compete against top position results.
Make sure, the new blog has high probability of ranking highest against given organic search result competitors.
Modify the given blog content following best SEO practises.
Make sure the blog is original, unique and highly readable.
Remember, Maintain and adopt the formatting, structure, style and tone of the provided blog content.
Include relevant emojis in your final blog for visual appeal. Use it sparingly.
Your response should be well-structured, objective, and critically acclaimed blog article based on provided texts.
Remember, your goal is to create a detailed blog article that will compete against given organic result competitors.
Do not provide explanations, suggestions for your response, reply only with your final response.
Take your time in crafting your content, do not rush to give the response.
Blog Content: '{report}'\n
Organic Search result: '{organic_results}'
"""
    return gemini_text_response(rewrite_prompt)

View File

@@ -0,0 +1,38 @@
import sys
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def summarize_web_content(page_content, gpt_providers="openai"):
    """Summarize the content of a web page with the chosen LLM provider.

    Args:
        page_content (str): Text of the web page.
        gpt_providers (str): Provider selector; checked for 'gemini' first, then 'openai'.

    Returns:
        The provider's response, or None if neither provider name matches.

    Raises:
        Exception: Whatever the provider call raised, after logging it.
    """
    if 'gemini' in gpt_providers:
        gemini_prompt = f"""You are a helpful assistant that briefly summarizes the content of a webpage.
Summarize the given web page content below.
Web page content: '{page_content}'"""
        try:
            return gemini_text_response(gemini_prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' in gpt_providers:
        # The OpenAI path only sends the raw page content.
        openai_prompt = f"""
Web page content: {page_content}
"""
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(openai_prompt)
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise err

    return None

View File

@@ -0,0 +1,53 @@
import sys
import json
from ..gpt_providers.openai_chat_completion import openai_chatgpt
from ..gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
# FIXME: Provide num_blogs, num_faqs as inputs.
def gpt_titles_faqs_google_search(search_keyword, search_results, gpt_providers="openai"):
    """Ask an LLM for one blog title and ten FAQs based on a Google search result.

    Args:
        search_keyword (str): Keyword the web research was done for.
        search_results: Google search result (JSON) handed to the model.
        gpt_providers (str): Provider selector; checked for 'gemini' first, then 'openai'.

    Returns:
        The parsed JSON for gemini, the raw ``openai_chatgpt`` return value for
        openai, or None if neither provider name matches.

    Raises:
        Exception: Whatever the provider call or JSON parsing raised, after logging.
    """
    prompt = f"""
As a SEO expert and content writer, I will provide you with my web research keyword and its google search result in json format.
Your task is to write 1 blog title and 10 FAQs.
1). Your blog title should compete against all the provided search results.
2). Your FAQ should be based on 'People also ask' and 'Related Queries' from given result.
Always include answers for each FAQ, use your knowledge and confirm with snippets given in search result.
3). Respond in json data with 'blogTitles' and 'FAQs' as json keys. Do not explain, describe your response.
4). Follow best practises of SEO.
Web Research Keyword: "{search_keyword}"
Google search Result: "{search_results}"
"""
    logger.info("Generating blog title and FAQs from web search result.")

    if 'gemini' in gpt_providers:
        try:
            raw_reply = gemini_text_response(prompt)
            print(f"\n\n\n RESPONSE: {raw_reply}\n\n\n")
            if '```' in raw_reply and '\n' in raw_reply:
                # Drop the first and last lines (the markdown code fence).
                reply_lines = raw_reply.strip().split('\n')
                raw_reply = '\n'.join(reply_lines[1:-1])
            return json.loads(raw_reply)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from Openai: {err}")
            raise err

    return None

View File

@@ -0,0 +1,223 @@
import os
import sys
import pandas as pd
from io import StringIO
from pathlib import Path
from metaphor_python import Metaphor
from datetime import datetime, timedelta
from loguru import logger
from tqdm import tqdm
from tabulate import tabulate
from collections import namedtuple
import textwrap
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from dotenv import load_dotenv
load_dotenv(Path('../../.env'))
from exa_py import Exa
from tenacity import (retry, stop_after_attempt, wait_random_exponential,)# for exponential backoff
from .gpt_summarize_web_content import summarize_web_content
from .gpt_competitor_analysis import summarize_competitor_content
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def _create_metaphor_client(api_key):
    """Instantiate the Exa client; tenacity retries with exponential backoff if that raises."""
    return Exa(api_key)


def get_metaphor_client():
    """
    Get the Metaphor (Exa) client.

    The API key check sits outside the retry wrapper: a missing key cannot
    fix itself, so it fails immediately with ValueError instead of being
    retried with backoff and surfacing as a tenacity RetryError.

    Returns:
        Exa: A client created with the METAPHOR_API_KEY environment variable.

    Raises:
        ValueError: If METAPHOR_API_KEY is not set or empty.
    """
    METAPHOR_API_KEY = os.environ.get('METAPHOR_API_KEY')
    if not METAPHOR_API_KEY:
        raise ValueError("METAPHOR_API_KEY environment variable not set!")
    return _create_metaphor_client(METAPHOR_API_KEY)
def metaphor_rag_search():
    """Mainly used for researching blog sections.

    Work in progress: for now this only creates a Metaphor client and returns None.
    """
    client = get_metaphor_client()
def metaphor_find_similar(similar_url):
    """
    Find pages similar to a URL and summarize each of them as a competitor.

    For every similar result a keyword search on its URL is run, the returned
    texts are concatenated and summarized, and the summary replaces the
    result's ``text``. The competitors are then printed/saved as a table.

    Args:
        similar_url (str): The URL to find similar content for.

    Returns:
        The response of the similar-content search, with each result's text
        replaced by its summary where summarization succeeded.

    Raises:
        Exception: Re-raised if the similar-content search itself fails.
    """
    metaphor = get_metaphor_client()
    try:
        logger.info(f"Doing similar web search for url: {similar_url}")
        similar_response = metaphor.find_similar_and_contents(
            similar_url,
            highlights=True,
            num_results=10)
    except Exception as e:
        logger.error(f"Metaphor: Error in finding similar content: {e}")
        raise

    competitors = similar_response.results
    for acompetitor in tqdm(competitors, desc="Processing Competitors", unit="competitor"):
        # Use a separate name for the per-competitor search so a failure cannot
        # fall back to stale results and the similar-search response is kept.
        try:
            research_response = metaphor.search_and_contents(
                acompetitor.url,
                type="keyword",
                num_results=5
            )
        except Exception as err:
            logger.error(f"Failed to do metaphor keyword/url research: {err}")
            continue

        # Add a progress bar for the inner loop; tolerate results without text.
        all_contents = "".join(
            r.text or ""
            for r in tqdm(research_response.results, desc=f"{acompetitor.url}", unit="research")
        )
        if not all_contents:
            # Nothing to summarize; leave the competitor's text untouched.
            continue
        try:
            acompetitor.text = summarize_competitor_content(all_contents, "gemini")
        except Exception as err:
            logger.error(f"Failed to summarize_web_content: {err}")

    print_search_result(competitors)
    return similar_response
def metaphor_search_articles(query,
                             num_results=5,
                             use_autoprompt=True,
                             include_domains=None,
                             time_range=None,
                             similar_url=None):
    """
    Search for articles using the Metaphor API and summarize each result.

    Args:
        query (str): The search query.
        num_results (int): Number of results to retrieve.
        use_autoprompt (bool): Whether to use autoprompt.
        include_domains (list, optional): List of domains to include.
        time_range (str, optional): "past day", "past week", "past month" or
            "past year" (the short forms "day", "week", "month", "year" are
            accepted too). Anything else means no date restriction.
        similar_url (str, optional): If given, a similar-content search is
            run for this URL as well.

    Returns:
        list: The search results, each with its ``text`` replaced by a summary.

    Raises:
        Exception: Re-raised after logging if the search or summarization fails.
    """
    metaphor = get_metaphor_client()
    # Look-back window in days for every supported time range.
    lookback_days = {
        "past day": 1, "day": 1,
        "past week": 7, "week": 7,
        "past month": 30, "month": 30,
        "past year": 365, "year": 365,
    }
    try:
        days = lookback_days.get(time_range)
        if days is None:
            start_published_date = None
        else:
            start_published_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')

        logger.info(f"Metaphor web search with Date: {start_published_date} and Query: {query}")
        # A failing search propagates to the handler below; it used to be
        # swallowed here and resurfaced as an UnboundLocalError.
        search_response = metaphor.search_and_contents(
            query,
            # Pass None rather than an empty list when no domains are given.
            include_domains=include_domains or None,
            use_autoprompt=use_autoprompt,
            start_published_date=start_published_date,
            num_results=num_results
        )

        # From each webpage, get a summary of the web page.
        contents_response = search_response.results
        for content in tqdm(contents_response, desc="Reading Web URL content:", unit="content"):
            content.text = summarize_web_content(content.text, "gemini")

        print_search_result(contents_response)
        if similar_url:
            logger.info(f"Doing similar/semantic search for URL: {similar_url}")
            metaphor_find_similar(similar_url)
        return contents_response
    except Exception as e:
        logger.error(f"Error in Metaphor searching articles: {e}")
        raise
def print_search_result(contents_response):
    """Print search results as a table and save the same table via ``save_in_file``.

    Args:
        contents_response: Iterable of results exposing ``url``, ``title``,
            ``published_date`` and ``text`` attributes.
    """
    # Tabulate the data
    table_headers = ["URL", "Title", "Published Date", "Summary"]
    table_data = [(result.url, result.title, result.published_date, result.text)
                  for result in contents_response]
    table = tabulate(table_data,
                     headers=table_headers,
                     tablefmt="fancy_grid",
                     colalign=["left", "left", "left", "left"],
                     maxcolwidths=[20, 20, 10, 60])
    print(table)

    # Save the combined table to a file; a failed save must not abort the search.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
def save_in_file(table_content):
    """Append search analysis output to the file named by SEARCH_SAVE_FILE.

    Errors are logged and never raised.

    Args:
        table_content: Text to save; non-string values are converted with ``str()``.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("SEARCH_SAVE_FILE environment variable is not set, search content not saved.")
        return
    try:
        # Make sure the target directory exists before opening the file.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # UTF-8 is required for the box-drawing characters of the fancy_grid tables.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(str(table_content))
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
def metaphor_scholar_search(query, include_domains=None, time_range="anytime"):
    """
    Search for papers using the Metaphor API.

    Args:
        query (str): The search query.
        include_domains (list): List of domains to include.
        time_range (str): "day", "week", "month" or "year"; any other value
            (e.g. "anytime") disables the date filter.

    Returns:
        The response of the client's ``search`` call, or None if an error
        occurred (the error is logged, not raised).
    """
    client = get_metaphor_client()
    try:
        # Work out how far back to look; None means no restriction.
        lookback = None
        if time_range == "day":
            lookback = timedelta(days=1)
        elif time_range == "week":
            lookback = timedelta(weeks=1)
        elif time_range == "month":
            lookback = timedelta(weeks=4)
        elif time_range == "year":
            lookback = timedelta(days=365)

        start_published_date = None
        if lookback is not None:
            start_published_date = (datetime.utcnow() - lookback).strftime('%Y-%m-%dT%H:%M:%SZ')

        return client.search(query, include_domains=include_domains, start_published_date=start_published_date, use_autoprompt=True)
    except Exception as e:
        logger.error(f"Error in searching papers: {e}")

View File

@@ -0,0 +1,156 @@
"""
This Python script uses the Tavily AI service to perform advanced searches based on specified keywords and options. It retrieves Tavily AI search results, pretty-prints them using Rich and Tabulate, and provides additional information such as the answer to the search query and follow-up questions.
Features:
- Utilizes the Tavily AI service for advanced searches.
- Retrieves API keys from the environment variables loaded from a .env file.
- Configures logging with Loguru for informative messages.
- Implements a retry mechanism using Tenacity to handle transient failures during Tavily searches.
- Displays search results, including titles, snippets, and links, in a visually appealing table using Tabulate and Rich.
Usage:
- Ensure the necessary API keys are set in the .env file.
- Run the script to perform a Tavily AI search with specified keywords and options.
- The search results, including titles, snippets, and links, are displayed in a formatted table.
- Additional information, such as the answer to the search query and follow-up questions, is presented in separate tables.
Modifications:
- To modify the script, update the environment variables in the .env file with the required API keys.
- Adjust the search parameters, such as keywords and search depth, in the `get_tavilyai_results` function as needed.
- Customize logging configurations and table formatting according to preferences.
To-Do (TBD):
- Consider adding further enhancements or customization based on specific use cases.
Note: This script depends on external libraries such as Tavily, Rich, Tabulate, Loguru, Tenacity, and python-dotenv. Install them using 'pip install tavily-python rich tabulate loguru tenacity python-dotenv' if not already installed.
"""
import os
from pathlib import Path
import sys
from dotenv import load_dotenv
from loguru import logger
from tavily import TavilyClient
from rich import print
from tabulate import tabulate
# Load environment variables from .env file
# NOTE(review): the path is relative, so it presumably resolves against the
# process's current working directory rather than this file's location — confirm.
load_dotenv(Path('../../.env'))
# NOTE(review): this repeats the `from rich import print` import made a few lines above.
from rich import print
# Configure logger
# Clear the logger's existing handlers, then register a single colorized stdout
# sink whose format shows level, file:line:function and the message.
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from tenacity import retry, stop_after_attempt, wait_random_exponential
from .gpt_titles_faq import gpt_titles_faqs_google_search
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_tavilyai_results(keywords, include_urls, search_depth="advanced"):
    """
    Run a Tavily AI search, pretty-print the outcome and return the raw response.

    Args:
        keywords (str): Search query sent to Tavily.
        include_urls: Domain filter forwarded unchanged as Tavily's
            ``include_domains`` option; a falsy value disables the filter.
            NOTE(review): Tavily presumably expects a list of domains here,
            confirm how callers supply this value.
        search_depth (str, optional): Tavily search depth (default "advanced").

    Returns:
        The response returned by ``TavilyClient.search``, or ``None`` when the
        search or the printing of its results fails (the failure is logged).

    Raises:
        ValueError: If the TAVILY_API_KEY environment variable is not set.
            NOTE(review): the whole function is wrapped in ``@retry``, so this
            error is presumably retried as well and may reach callers wrapped
            by tenacity — confirm.
        SystemExit: If the Tavily client cannot be constructed.
    """
    logger.info(f"Running Tavily search on: {keywords}")

    # Only the Tavily key is needed here; the message names exactly that variable.
    api_key = os.getenv('TAVILY_API_KEY')
    if not api_key:
        raise ValueError("TAVILY_API_KEY is not set in the environment.")

    try:
        client = TavilyClient(api_key=api_key)
    except Exception as err:
        logger.error(f"Failed to create Tavily client. Check TAVILY_API_KEY: {err}")
        # sys.exit instead of the site-provided exit() builtin, which is not
        # guaranteed to exist in every interpreter setup.
        sys.exit(1)

    # Build the optional arguments once instead of duplicating the search call.
    search_options = {"include_answer": True}
    if include_urls:
        search_options["include_domains"] = include_urls

    try:
        tavily_search_result = client.search(keywords, search_depth, **search_options)
        print_result_table(tavily_search_result)
        return tavily_search_result
    except Exception as err:
        # NOTE(review): search failures are swallowed here, so the @retry
        # decorator never sees them and does not retry a failed search.
        logger.error(f"Failed to do Tavily Research: {err}")
        return None
def print_result_table(output_data):
    """
    Pretty-print a Tavily AI search result and append each table to the save file.

    Three tables are produced, in this order: the individual search results
    (title, snippet, link), the answer to the query, and the follow-up
    questions. Each table is printed and then handed to ``save_in_file``.

    Args:
        output_data (dict): Tavily search response. The keys read are
            "results" (items with "title", "content" and "url"), "query",
            "answer" and "follow_up_questions".
    """
    query = output_data.get('query')

    # A missing or None "results" entry is treated as "no results" instead of
    # raising TypeError while iterating.
    result_rows = [
        [item.get("title", ""), item.get("content", ""), item.get("url", "")]
        for item in output_data.get("results") or []
    ]
    if result_rows:
        _print_and_save_table(result_rows,
                              ["Title", "Snippet", "Link"],
                              colalign=["left", "left", "left"],
                              maxcolwidths=[30, 60, 30])
    else:
        logger.warning("Tavily response contains no search results to display.")

    # Display the 'answer' in a table
    _print_and_save_table([[output_data.get("answer")]],
                          [f"The answer to search query: {query}"],
                          maxcolwidths=[80])

    # Display the 'follow_up_questions' in a table
    _print_and_save_table([[output_data.get("follow_up_questions")]],
                          [f"Search Engine follow up questions for query: {query}"],
                          maxcolwidths=[80])


def _print_and_save_table(rows, headers, **tabulate_options):
    """Render rows as a fancy_grid table, print it and append it to the save file."""
    table = tabulate(rows, headers=headers, tablefmt="fancy_grid", **tabulate_options)
    print(table)
    # Saving is best-effort: a failure is logged and must not abort the printing.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
def save_in_file(table_content):
    """
    Append search analysis output to the file named by SEARCH_SAVE_FILE.

    The content is followed by three newlines so consecutive entries stay
    visually separated. Saving is best-effort: problems are logged and never
    raised to the caller.

    Args:
        table_content (str): Text (e.g. a rendered table) to append.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        # Without a target path there is nothing to open; say so explicitly
        # instead of failing inside open() with an unrelated-looking error.
        logger.warning("SEARCH_SAVE_FILE is not set; search content was not saved.")
        return

    try:
        # Explicit UTF-8 so non-ASCII table content does not depend on the
        # platform's default encoding.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")

View File

@@ -0,0 +1,23 @@
from langchain.adapters.openai import convert_openai_messages
from langchain.chat_models import ChatOpenAI
from ..gpt_providers.gemini_pro_text import gemini_text_response
def write_web_research_report(web_research, faq_questions, gpt_provider="gemini"):
    """
    Ask an LLM provider to write a research report from web research data.

    Args:
        web_research: Web research results that are embedded in the prompt as
            context (the prompt describes them to the model as JSON data).
        faq_questions: Related FAQ questions that are embedded in the prompt.
        gpt_provider (str, optional): Provider selector. A value containing
            "gemini" uses ``gemini_text_response``; a value containing
            "openai" uses ``openai_research_report``. Defaults to "gemini".

    Returns:
        Whatever the selected provider function returns for the prompt.

    Raises:
        ValueError: If gpt_provider names neither "gemini" nor "openai".
    """
    if "gemini" not in gpt_provider and "openai" not in gpt_provider:
        raise ValueError(f"Unsupported gpt_provider: {gpt_provider!r}")

    # The prompt is built before choosing a provider so both branches can use it.
    # Each fragment ends with a space because adjacent string literals are
    # concatenated verbatim and would otherwise run the sentences together.
    prompt = ["You are an SEO and marketing expert, who writes unique, factual and comprehensive research reports. "
              "I will provide you web research report as json data and a list of related FAQ questions. "
              "Use given json as context for writing your research report. "
              "Your sole purpose is to write well written, critically acclaimed, objective and structured research report. "
              "Use the urls from json content to provide cititations and include it in referances section of your report. "
              "Include appropriate emojis in your research report. "
              "Format your report in MLA format and markdown style, with special focus on readibility. "
              f"Do not provide explanations for your response.\nWeb research Report: \"\"\" {web_research} \"\"\"\n "
              f"\nList of FAQ questions: \"\"\" {faq_questions} \"\"\"\n"]

    if "gemini" in gpt_provider:
        return gemini_text_response(prompt)

    # NOTE(review): openai_research_report is neither defined nor imported in
    # the visible part of this module — confirm where it is meant to come from.
    return openai_research_report(prompt)

View File

@@ -0,0 +1,137 @@
import requests
from clint.textui import progress
from loguru import logger
def search_ydc_index(search_query, num_web_results=10, country="IN", api_key="<api-key>", timeout=30):
    """
    Search YDC Index API and retrieve results.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key, sent in the X-API-Key header.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: The decoded JSON response on success, or ``{"error": <message>}``
        when the request or its processing fails. Failures are logged and
        never raised.
    """
    try:
        url = "https://api.ydc-index.io/search"
        querystring = {
            "query": search_query,
            "num_web_results": str(num_web_results),
            "country": country
        }
        headers = {"X-API-Key": api_key}
        with progress.Bar(expected_size=num_web_results, label="Searching YDC Index") as bar:
            # A timeout keeps an unresponsive server from blocking the caller forever.
            response = requests.get(url, headers=headers, params=querystring,
                                    stream=True, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            result_json = response.json()
            # The progress bar needs a numeric progress value, so pass the
            # number of web results (capped at the bar size), not the list.
            web_results = result_json.get("web_results") or []
            bar.show(min(len(web_results), num_web_results))
            return result_json
    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}
    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
def get_rag_results(search_query, num_web_results=10, country="IN", api_key="<api-key>", timeout=30):
    """
    Retrieve results from the YDC Index API ``/rag`` endpoint.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key, sent in the X-API-Key header.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: The decoded JSON response on success, or ``{"error": <message>}``
        when the request or its processing fails. Failures are logged and
        never raised.
    """
    try:
        url = "https://api.ydc-index.io/rag"
        querystring = {
            "query": search_query,
            "num_web_results": str(num_web_results),
            "country": country
        }
        headers = {"X-API-Key": api_key}
        with progress.Bar(expected_size=num_web_results, label="Fetching RAG Results") as bar:
            # A timeout keeps an unresponsive server from blocking the caller forever.
            response = requests.get(url, headers=headers, params=querystring,
                                    stream=True, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            result_json = response.json()
            # The progress bar needs a numeric progress value, so pass the
            # number of web results (capped at the bar size), not the list.
            web_results = result_json.get("web_results") or []
            bar.show(min(len(web_results), num_web_results))
            return result_json
    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}
    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
def get_news_results(query, spellcheck=True, api_key="<api-key>", timeout=30):
    """
    Retrieve news results from YDC Index API.

    Args:
        query (str): The search query.
        spellcheck (bool): Whether to enable spellcheck; sent as "true"/"false".
        api_key (str): YDC Index API key, sent in the X-API-Key header.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: The decoded JSON response on success, or ``{"error": <message>}``
        when the request or its processing fails. Failures are logged and
        never raised.
    """
    try:
        url = "https://api.ydc-index.io/news"
        querystring = {
            "q": query,
            "spellcheck": str(spellcheck).lower()
        }
        headers = {"X-API-Key": api_key}
        with progress.Bar(expected_size=1, label="Fetching News Results") as bar:
            # A timeout keeps an unresponsive server from blocking the caller forever.
            response = requests.get(url, headers=headers, params=querystring,
                                    stream=True, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            result_json = response.json()
            # The bar tracks a single step; show() requires the progress value.
            bar.show(1)
            return result_json
    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}
    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
# Example usage
# NOTE(review): these statements sit at module top level, so they run — and
# issue three HTTP requests using the placeholder default api_key — whenever
# this module is imported. Consider moving them under an
# `if __name__ == "__main__":` guard.
search_query = "Getting started with llamaindex"
# News endpoint.
result = get_news_results(search_query)
print(result)
# RAG endpoint.
result = get_rag_results(search_query)
print(result)
# Web search endpoint.
result = search_ydc_index(search_query)
print(result)