WIP - Under maintenance - Web research working.
This commit is contained in:
332
lib/ai_web_researcher/arxiv_schlorly_research.py
Normal file
332
lib/ai_web_researcher/arxiv_schlorly_research.py
Normal file
@@ -0,0 +1,332 @@
|
||||
####################################################
|
||||
#
|
||||
# FIXME: Gotta use this lib: https://github.com/monk1337/resp/tree/main
|
||||
# https://github.com/danielnsilva/semanticscholar
|
||||
# https://github.com/shauryr/S2QA
|
||||
#
|
||||
####################################################
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
import arxiv
|
||||
import PyPDF2
|
||||
import io
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib.parse
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def fetch_arxiv_data(query, max_results=10):
    """Search arXiv and return one row of metadata per matching paper.

    Args:
        query (str): Search expression handed to ``arxiv.Search``.
        max_results (int): Maximum number of results to request.

    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
        result, requested with ``arxiv.SortCriterion.SubmittedDate`` ordering.

    Raises:
        Exception: Any error raised by the arxiv client is printed and
        re-raised to the caller.
    """
    try:
        search_request = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        api_client = arxiv.Client()

        # Flatten each result object into a plain list of the fields we keep.
        return [
            [paper.title, paper.published, paper.entry_id, paper.summary, paper.pdf_url]
            for paper in api_client.results(search_request)
        ]
    except Exception as e:
        print("An error occurred while fetching data from arXiv:", e)
        raise
|
||||
|
||||
|
||||
def create_dataframe(data, column_names):
    """Build a DataFrame from row data.

    Args:
        data: Row-oriented data accepted by ``pandas.DataFrame``.
        column_names: Column labels for the resulting frame.

    Returns:
        pd.DataFrame: The constructed frame, or an empty DataFrame when
        construction fails (the error is printed, not raised).
    """
    try:
        return pd.DataFrame(data, columns=column_names)
    except Exception as build_error:
        print("An error occurred while creating DataFrame:", build_error)
    return pd.DataFrame()
|
||||
|
||||
|
||||
def _strip_reference_sections(pdf_text):
    """Remove reference/acknowledgement sections from extracted PDF text."""
    # Drop 'References' (any case) and everything that follows it.
    pdf_text = re.sub(r'References\s*.*', '', pdf_text, flags=re.IGNORECASE | re.DOTALL)

    for section in ('Acknowledgements', 'References', 'Bibliography'):
        # Match the section title and the text after it until the next
        # upper-case heading or the end of the document.
        # NOTE(review): re.IGNORECASE also applies to [A-Z]{2,}, so the
        # look-ahead matches lower-case words too — confirm this is intended.
        pattern = r'(' + re.escape(section) + r'\s*.*?)(?=\n[A-Z]{2,}|$)'
        pdf_text = re.sub(pattern, '', pdf_text, flags=re.DOTALL | re.IGNORECASE)
    return pdf_text


def _read_arxiv_pdf_text(arxiv_id):
    """Download the paper's PDF, return its extracted text, and delete the file."""
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
    pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")

    page_texts = []
    try:
        with open(pdf_filename, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                try:
                    page_text = page.extract_text()
                except UnicodeDecodeError as err:
                    # Skip pages that cannot be decoded; keep the rest.
                    logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                    continue
                if page_text:
                    page_texts.append(page_text + '\n')
    finally:
        # Always clean up the temporary download, even when parsing fails.
        if os.path.exists(pdf_filename):
            os.remove(pdf_filename)

    return ''.join(page_texts)


def get_arxiv_main_content(url):
    """
    Returns the main content of an arXiv paper.

    The HTML page at ``url`` is tried first; if fetching or parsing it raises,
    the paper's PDF is downloaded through the arXiv API and its text is
    extracted instead.

    Args:
        url (str): The URL of the arXiv paper. Its last path segment is used
            as the arXiv ID for the PDF fallback.

    Returns:
        str: The main content of the paper as a string. Returns
        "Main content not found." when the HTML page has no
        'ltx_page_content' div, and "Failed to retrieve content." when the
        PDF fallback also fails.
    """
    try:
        # A timeout keeps a stalled connection from blocking forever; a
        # timeout error falls through to the PDF fallback like any other.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(response.content, "html.parser")

        main_content = soup.find('div', class_='ltx_page_content')
        if not main_content:
            logger.warning("Main content not found in the page.")
            return "Main content not found."

        # Remove the conversion-alert section if present.
        alert_section = main_content.find('div', class_='package-alerts ltx_document')
        if alert_section:
            alert_section.decompose()

        # Remove abstract and authors if present.
        for element_id in ["abs", "authors"]:
            element = main_content.find(id=element_id)
            if element:
                element.decompose()

        return main_content.text.strip()

    except Exception as html_error:
        # Could not access the arXiv HTML content; fall back to the PDF.
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")

    try:
        # rstrip('/') so a trailing slash does not yield an empty ID.
        arxiv_id = url.rstrip('/').split('/')[-1]
        return _strip_reference_sections(_read_arxiv_pdf_text(arxiv_id))
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."
|
||||
|
||||
|
||||
def download_image(image_url, base_url, folder="images"):
    """Download one image into ``folder``.

    Args:
        image_url (str): Absolute or relative image URL. ``data:image`` URIs
            are skipped.
        base_url (str): Base URL used to resolve ``image_url`` when it has no
            scheme.
        folder (str): Destination directory; created if missing.

    Returns:
        bool: True when the image was saved, False when it was skipped or the
        download/write failed (the error is printed).
    """
    # Skip downloading if the image URL is a data URI
    if image_url.startswith('data:image'):
        print(f"Skipping download of data URI image: {image_url}")
        return False

    # exist_ok avoids a race between the existence check and the creation.
    os.makedirs(folder, exist_ok=True)

    # Resolve scheme-less URLs against the base. urljoin handles plain
    # relative paths, root-relative paths ("/img/x.png") and
    # protocol-relative URLs ("//host/x.png"), which plain string
    # concatenation got wrong.
    if not urllib.parse.urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        image_url = urllib.parse.urljoin(base_url, image_url)

    try:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()

        # Name the file after the URL *path* so query strings and fragments
        # do not end up in the file name; fall back to a fixed name when the
        # path ends with '/'.
        image_name = urllib.parse.urlparse(image_url).path.rsplit('/', 1)[-1] or "image"
        with open(os.path.join(folder, image_name), 'wb') as file:
            file.write(response.content)
        return True

    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {image_url}: {str(e)}")
        return False
|
||||
|
||||
|
||||
def scrape_images_from_arxiv(url):
    """Collect the ``src`` attribute of every ``<img>`` tag on a page.

    Args:
        url (str): Page URL to fetch.

    Returns:
        list: The ``src`` values in document order, or an empty list when the
        request fails (the error is printed).
    """
    try:
        # Timeout so a stalled server cannot hang the caller indefinitely;
        # requests.Timeout is a RequestException and is handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        # src=True restricts the match to <img> tags that carry a src attribute.
        return [img['src'] for img in soup.find_all('img', src=True)]

    except requests.RequestException as e:
        print(f"Error fetching page {url}: {str(e)}")
        return []
|
||||
|
||||
|
||||
def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    The paper's metadata is fetched from the arXiv export API and formatted
    as a ``@MISC`` entry keyed by the first author's surname plus the
    two-digit year of the entry's ``updated`` date.

    Args:
        arxiv_id: The arXiv ID of the paper.

    Returns:
        A string containing the BibTeX entry.

    Raises:
        IndexError: If the API response contains no entry for ``arxiv_id``.
        Exception: Network and XML parsing errors propagate unchanged.
    """
    import urllib.parse
    import urllib.request
    import xml.dom.minidom

    # Download the XML; the context manager closes the socket even when
    # parsing fails.
    query_url = f'http://export.arxiv.org/api/query?id_list={urllib.parse.quote(str(arxiv_id))}'
    with urllib.request.urlopen(query_url) as usock:
        xmldoc = xml.dom.minidom.parse(usock)

    # Parse the XML
    entries = xmldoc.getElementsByTagName("entry")
    if not entries:
        raise IndexError(f"No arXiv entry found for id: {arxiv_id}")
    entry = entries[0]

    date = entry.getElementsByTagName("updated")[0].firstChild.data
    text_year = date[:4]

    title = entry.getElementsByTagName("title")[0]
    # Collapse internal whitespace so a wrapped title stays on one line.
    text_title = ' '.join(title.firstChild.data.split())

    authorlist = []
    surnames = []
    for person_name in entry.getElementsByTagName("author"):
        name = person_name.getElementsByTagName("name")[0]
        name_parts = name.firstChild.data.split()
        if not name_parts:
            continue
        text_surname = name_parts[-1]
        text_given_name = ' '.join(name_parts[:-1])
        surnames.append(text_surname)
        # Single-word names have no given name; avoid a dangling "Name, ".
        if text_given_name:
            authorlist.append(f"{text_surname}, {text_given_name}")
        else:
            authorlist.append(text_surname)

    # Fall back to a placeholder key when the entry lists no authors.
    text_first_author_surname = surnames[0] if surnames else "Unknown"
    text_authors = ' and '.join(authorlist)

    # Construct the BibTeX entry. Every field value, including the author
    # list, is wrapped in braces as BibTeX requires.
    bibtex = f"@MISC{{{text_first_author_surname}{text_year[-2:]},\n"
    bibtex += f" author = {{{text_authors}}},\n"
    bibtex += f" title = {{{text_title}}},\n"
    bibtex += f" year = {{{text_year}}},\n"
    bibtex += f" eprint = {{{arxiv_id}}},\n"
    bibtex += f" url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
    bibtex += "}"

    return bibtex
|
||||
|
||||
|
||||
#from serpapi import GoogleSearch
|
||||
#params = {
|
||||
# "api_key": "os.getenv(SERPER_API_KEY)",
|
||||
# "engine": "google_scholar",
|
||||
# "q": "llm",
|
||||
# "hl": "en",
|
||||
# "as_ylo": "2023",
|
||||
# "as_yhi": "2024"
|
||||
#}
|
||||
#search = GoogleSearch(params)
|
||||
#results = search.get_dict()
|
||||
|
||||
#from llmsherpa.readers import LayoutPDFReader
|
||||
|
||||
#llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
|
||||
#pdf_url = "https://arxiv.org/pdf/1910.13461.pdf" # also allowed is a file path e.g. /home/downloads/xyz.pdf
|
||||
#pdf_reader = LayoutPDFReader(llmsherpa_api_url)
|
||||
#doc = pdf_reader.read_pdf(pdf_url)
|
||||
|
||||
|
||||
|
||||
|
||||
def extract_arxiv_ids_from_line(line):
    """
    Pull the arXiv identifier out of an ``arxiv.org/abs/...`` URL in a line.

    Args:
        line (str): Text that may contain an arXiv abstract URL.

    Returns:
        str: The first ID found, including its ``vN`` version suffix when one
        is present, or None if the line contains no such URL.
    """
    found = re.search(r'arxiv\.org\/abs\/(\d+\.\d+)(v\d+)?', line)
    if found is None:
        return None

    # The version group is optional; treat a missing one as an empty suffix.
    base_id, version = found.groups(default='')
    return base_id + version
|
||||
|
||||
|
||||
def read_written_ids(file_path):
    """
    Load the arXiv IDs already recorded in a file.

    Args:
        file_path (str): Path to a text file with one ID per line.

    Returns:
        set: The whitespace-stripped lines of the file. Empty when the file
        is missing or unreadable (the problem is logged, not raised).
    """
    written_ids = set()
    try:
        with open(file_path, 'r') as handle:
            # update() adds line by line, so IDs read before a mid-file
            # error are still returned.
            written_ids.update(line.strip() for line in handle)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
    except Exception as read_error:
        logger.error(f"Error while reading the file: {read_error}")
    return written_ids
|
||||
|
||||
|
||||
def append_id_to_file(arxiv_id, output_file_path):
    """
    Append one arXiv ID, followed by a newline, to a file.

    The file is opened in append mode, so it is created when it does not
    exist yet. Errors are logged and not raised.

    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        # Only the log message depends on whether the file already exists;
        # append mode covers both cases for the write itself.
        if os.path.exists(output_file_path):
            logger.info(f"Appending to existing file: {output_file_path}")
        else:
            logger.info(f"File does not exist. Creating new file: {output_file_path}")

        with open(output_file_path, 'a') as outfile:
            outfile.write(arxiv_id + '\n')

    except Exception as e:
        logger.error(f"Error while appending to file: {e}")
|
||||
76
lib/ai_web_researcher/combine_research_and_blog.py
Normal file
76
lib/ai_web_researcher/combine_research_and_blog.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import sys
|
||||
|
||||
from .gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def blog_with_research(report, blog, gpt_providers="openai"):
    """Combine the given online research and gpt blog content.

    Args:
        report (str): Research report text to merge into the article.
        blog (str): Existing blog content on the same topic.
        gpt_providers (str): Provider selector. A value containing 'gemini'
            uses Gemini; otherwise a value containing 'openai' uses OpenAI.

    Returns:
        The provider's response, or None when ``gpt_providers`` names neither
        supported provider (an error is logged in that case).

    Raises:
        Exception: Errors from the provider call are logged and re-raised.
    """
    if 'gemini' in gpt_providers:
        prompt = f"""You are an expert copywriter specializing in content optimization for SEO.
        You are world famous writer, known for your originality and engaging content.
        I will provide you with a 'research report' and a 'blog content' on the same topic.
        Your task is to transform and combine the given research and blog content into a blog article.
        Your blog should be highly detailed and well formatted.
        Include a section in your blog on the highlights section of blog content.
        Do not miss out any details from provided content. Always, include figures, data, results from given content.
        It is important that your blog is original and unique. It should be highly readable and SEO optimized.

        Research report: '{report}'
        Blog content: '{blog}'
        """
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise

    if 'openai' in gpt_providers:
        # Built only on this branch; the Gemini path uses its own prompt.
        prompt = f"""
        You are an expert copywriter specializing in content optimization for SEO.
        I will provide you with a 'research report' and a 'blog content' on the same topic.
        Your task is to transform and combine the given research and blog content into a well-structured markdown, unique
        and engaging blog article.

        Your objectives include:
        1. Master the report and blog content: Understand main ideas, key points, and the core message.
        2. Sentence Structure: Rephrase while preserving logical flow and coherence.
        3. Identify Main Keywords: Determine the primary topic and combine the articles on the main topic.
        4. REMEMBER: From the research report, include links and cititations to make your article more authoratative.
        5. Write Code snippets: Check if given report is on programming, then write code snippets where applicable.
        6. Optimize for SEO: Generate high quality informative content.
        Implement SEO best practises with appropriate keyword density.
        7. Craft Engaging and Informative Article: Provide value and insight to readers.
        8. Proofread: Important to Check for grammar, spelling, and punctuation errors.
        9. Use Creative and Human-like Style: Incorporate contractions, idioms, transitional phrases,
        interjections, and colloquialisms. Avoid repetitive phrases and unnatural sentence structures.
        10. Blog Structuring: Include an Introduction, subtopics and use bullet points or
        numbered lists if appropriate. Important to include FAQs, Conclusion and Referances.
        11. Ensure Uniqueness: Guarantee the article is plagiarism-free. Write in unique, informative style.
        12. Punctuation: Use appropriate question marks at the end of questions.
        13. Pass AI Detection Tools: Create content that easily passes AI plagiarism detection tools.
        14. REMEMBER: Use the formatting style of given research report and include highlights, citations, referances in combined article.

        Follow these guidelines to combine and write a new, unique, and informative blog article
        that will rank well in search engine results and engage readers effectively.

        Create a blog post, in markdown, from the given research report and blog content below.
        Research report: {report}
        Blog content: {blog}
        """
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise

    # Previously this case fell off the end of the function without a trace.
    logger.error(f"Unsupported gpt_providers value: {gpt_providers!r}. Expected 'openai' or 'gemini'.")
    return None
|
||||
108
lib/ai_web_researcher/google_search_gpt_vision.py
Normal file
108
lib/ai_web_researcher/google_search_gpt_vision.py
Normal file
@@ -0,0 +1,108 @@
|
||||
import re #additional import for regex
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from openai import OpenAI
|
||||
|
||||
# Script: fetch a Google results page, strip it down, and ask an OpenAI chat
# model to extract the organic results and the "people also ask" section via
# tool calls.
from urllib.parse import urlencode

client = OpenAI(
    # Prefer the conventional variable name; keep the original hyphenated
    # name as a fallback for existing setups.
    api_key=os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI-API-KEY')
)

# Target URL can be a website url or it can google search
query = "kedarkanta trek"
# urlencode escapes spaces and other reserved characters in the query.
target_url = f"https://www.google.com/search?{urlencode({'q': query, 'gl': 'us'})}"
response = requests.get(target_url, timeout=30)
# Fail early on an HTTP error instead of sending an error page to the model.
response.raise_for_status()
html_text = response.text

# Remove unnecessary part to prevent HUGE TOKEN cost!
# Remove everything between <head> and </head>
html_text = re.sub(r'<head.*?>.*?</head>', '', html_text, flags=re.DOTALL)
# Remove all occurrences of content between <script> and </script>
html_text = re.sub(r'<script.*?>.*?</script>', '', html_text, flags=re.DOTALL)
# Remove all occurrences of content between <style> and </style>
html_text = re.sub(r'<style.*?>.*?</style>', '', html_text, flags=re.DOTALL)

completion = client.chat.completions.create(
    model="gpt-4-1106-preview",
    messages=[
        {"role": "system", "content": "You are a master at scraping Google results data. Scrape two things: 1st. Scrape top 10 organic results data and 2nd. Scrape people_also_ask section from Google search result page."},
        {"role": "user", "content": html_text}
    ],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "parse_organic_results",
                "description": "Parse organic results from Google SERP raw HTML data nicely",
                "parameters": {
                    'type': 'object',
                    'properties': {
                        'data': {
                            'type': 'array',
                            'items': {
                                'type': 'object',
                                'properties': {
                                    'title': {'type': 'string'},
                                    'original_url': {'type': 'string'},
                                    'snippet': {'type': 'string'},
                                    'position': {'type': 'integer'}
                                }
                            }
                        }
                    }
                }
            }
        },
        {
            "type": "function",
            "function": {
                "name": "parse_people_also_ask_section",
                "description": "Parse `people also ask` section from Google SERP raw HTML",
                "parameters": {
                    'type': 'object',
                    'properties': {
                        'data': {
                            'type': 'array',
                            'items': {
                                'type': 'object',
                                'properties': {
                                    'question': {'type': 'string'},
                                    'original_url': {'type': 'string'},
                                    'answer': {'type': 'string'},
                                }
                            }
                        }
                    }
                }
            }
        }
    ],
    tool_choice="auto"
)

# With tool_choice="auto" the model may return zero, one or two tool calls in
# any order, so look results up by function name instead of by position.
tool_data_by_name = {}
for tool_call in completion.choices[0].message.tool_calls or []:
    try:
        argument_dict = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError as err:
        print(f"Could not parse arguments for {tool_call.function.name}: {err}")
        continue
    tool_data_by_name[tool_call.function.name] = argument_dict.get('data', [])

# Organic_results
organic_results = tool_data_by_name.get("parse_organic_results", [])

print('Organic results:')
for result in organic_results:
    # The schema marks no field as required, so tolerate missing keys.
    print(f"Blog Title: {result.get('title', '')}")
    print(f"Blog URL: {result.get('original_url', '')}")
    print(f"Blog Snippet: {result.get('snippet', '')}")
    print(f"Blog Position: {result.get('position', '')}")
    print('---')

# People also ask
people_also_ask = tool_data_by_name.get("parse_people_also_ask_section", [])

print('People also ask:')
for result in people_also_ask:
    print(f"People_Also_Ask: Question: {result.get('question', '')}")
    print(f"People_Also_Ask: URL: {result.get('original_url', '')}")
    print(f"People_Also_Ask: Answer: {result.get('answer', '')}")
    print('---')
|
||||
302
lib/ai_web_researcher/google_serp_search.py
Normal file
302
lib/ai_web_researcher/google_serp_search.py
Normal file
@@ -0,0 +1,302 @@
|
||||
"""
|
||||
This Python script performs Google searches using various services such as SerpApi, Serper.dev, and more. It displays the search results, including organic results, People Also Ask, and Related Searches, in formatted tables. The script also utilizes GPT to generate titles and FAQs for the Google search results.
|
||||
|
||||
Features:
|
||||
- Utilizes SerpApi, Serper.dev, and other services for Google searches.
|
||||
- Displays organic search results, including position, title, link, and snippet.
|
||||
- Presents People Also Ask questions and snippets in a formatted table.
|
||||
- Includes Related Searches in the combined table with People Also Ask.
|
||||
- Configures logging with Loguru for informative messages.
|
||||
- Uses Rich and Tabulate for visually appealing and formatted tables.
|
||||
|
||||
Usage:
|
||||
- Ensure the necessary API keys are set in the .env file.
|
||||
- Run the script to perform a Google search with the specified query.
|
||||
- View the displayed tables with organic results, People Also Ask, and Related Searches.
|
||||
- Additional information, such as generated titles and FAQs using GPT, is presented.
|
||||
|
||||
Modifications:
|
||||
- Update the environment variables in the .env file with the required API keys.
|
||||
- Customize the search parameters, such as location and language, in the functions as needed.
|
||||
- Adjust logging configurations, table formatting, and other aspects based on preferences.
|
||||
|
||||
To-Do (TBD):
|
||||
- Consider adding further enhancements or customization based on specific use cases.
|
||||
|
||||
Note: This script depends on external libraries such as SerpApi, Loguru, Rich, and Tabulate. Install them using 'pip install serpapi loguru rich tabulate' if not already installed.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
import json
|
||||
import requests
|
||||
from clint.textui import progress
|
||||
#from serpapi import GoogleSearch
|
||||
from loguru import logger
|
||||
from tabulate import tabulate
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
from dotenv import load_dotenv
|
||||
# Load environment variables from .env file
|
||||
load_dotenv(Path('../../.env'))
|
||||
logger.add(
|
||||
sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
from .gpt_titles_faq import gpt_titles_faqs_google_search
|
||||
|
||||
#from tenacity import retry, stop_after_attempt, wait_random_exponential
|
||||
#@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
|
||||
|
||||
|
||||
#FIXME: Accept language, country and time frame to search for.
|
||||
def google_search(query):
    """
    Perform a Google search for the given query.

    SerpApi is called first, but its result is currently not used (see the
    FIXME log line). The search is then run through Serper.dev, whose result
    is displayed/saved via ``process_search_results`` and returned.

    Args:
        query (str): The search query.

    Returns:
        dict: The Serper.dev search result, or None when that search failed.
    """
    try:
        perform_serpapi_google_search(query)
        logger.info(f"FIXME: Google serapi: {query}")
        #return process_search_results(search_result)
    except Exception as err:
        logger.error(f"ERROR: Check Here: https://serpapi.com/. Your requests may be over. {err}")

    # Initialised up front so the final return cannot hit an unbound name
    # when the Serper.dev call raises.
    search_result = None

    # Retry with serper.dev
    try:
        logger.info("Trying Google search with Serper.dev: https://serper.dev/api-key")
        search_result = perform_serperdev_google_search(query)
        # perform_serperdev_google_search returns None on a non-200 response.
        if search_result is not None:
            process_search_results(search_result)
    except Exception as err:
        logger.error(f"Failed to do Google search with serper.dev: {err}")

    return search_result
|
||||
|
||||
# # Retry with BROWSERLESS API
|
||||
# try:
|
||||
# search_result = perform_browserless_google_search(query)
|
||||
# #return process_search_results(search_result, flag)
|
||||
# except Exception as err:
|
||||
# logger.error("FIXME: Failed to do Google search with BROWSERLESS API.")
|
||||
# logger.debug("FIXME: Trying with dataforSEO API.")
|
||||
#
|
||||
# # Retry with dataforSEO API
|
||||
# try:
|
||||
# logger.info("Perform SERP with Data for SEO.")
|
||||
# #search_result = perform_dataforseo_google_search(query)
|
||||
# #return process_search_results(search_result, flag)
|
||||
# except Exception as err:
|
||||
# logger.error("FIXME: Failed to do Google search with dataforSEO API.")
|
||||
# logger.debug("All retries failed. Giving up.")
|
||||
# raise
|
||||
|
||||
|
||||
|
||||
def perform_serpapi_google_search(query, location="in"):
    """
    Perform a Google search using the SerpApi service.

    The API key is read from the ``SERPAPI_KEY`` environment variable.

    Args:
        query (str): The search query.
        location (str, optional): The location for the search (default is "in").

    Returns:
        dict: A dictionary containing the search results, or None when the
        key is missing, the serpapi package is unavailable, or the request
        fails (the problem is logged, not raised).
    """
    try:
        # Check if API key is provided
        api_key = os.getenv("SERPAPI_KEY")
        if not api_key:
            raise ValueError("SERPAPI_KEY key is required for SerpApi")

        # Imported here because the module-level import is commented out;
        # a missing package is reported by the generic handler below.
        from serpapi import GoogleSearch

        search = GoogleSearch({
            "q": query,
            "location": location,
            "api_key": api_key
        })
        # Get search results as a dictionary
        return search.get_dict()

    except ValueError as ve:
        # Handle missing API key error
        logger.info(f"SERPAPI ValueError: {ve}")
    except Exception as e:
        # Handle other exceptions
        logger.info(f"SERPAPI An error occurred: {e}")
    return None
|
||||
|
||||
|
||||
def perform_serperdev_google_search(query):
    """
    Perform a Google search using the Serper API.

    The API key is read from the ``SERPER_API_KEY`` environment variable.

    Args:
        query (str): The search query.

    Returns:
        dict: The JSON response from the Serper API, or None when the API
        answers with a non-200 status (the response is logged).

    Raises:
        ValueError: If ``SERPER_API_KEY`` is not set.
        requests.RequestException: On connection errors or timeouts.
    """
    logger.info("Doing serper.dev google search.")
    serper_api_key = os.getenv('SERPER_API_KEY')

    # Check if the API key is available
    if not serper_api_key:
        raise ValueError("SERPER_API_KEY is missing. Set it in the .env file.")

    # Serper API endpoint URL
    url = "https://google.serper.dev/search"

    # FIXME: Expose options to end user. Request payload
    payload = json.dumps({
        "q": query,
        "gl": "in",
        "hl": "en",
        "num": 5,
        "autocorrect": True,
        "page": 1,
        "type": "search",
        "engine": "google"
    })

    # Request headers with API key
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }

    # Send a POST request to the Serper API inside the progress-bar context.
    with progress.Bar(label="Searching", expected_size=100):
        # Timeout so a stalled connection cannot block the caller forever.
        response = requests.post(url, headers=headers, data=payload, stream=True, timeout=30)
        if response.status_code == 200:
            # Parse and return the JSON response
            return response.json()

        logger.error(f"Error: {response.status_code}, {response.text}")
        return None
|
||||
|
||||
|
||||
|
||||
def perform_browserless_google_search():
    """Placeholder for a Browserless-based Google search; does nothing yet."""
    return None
|
||||
|
||||
def perform_dataforseo_google_search():
    """Placeholder for a DataForSEO-based Google search; does nothing yet."""
    return None
|
||||
|
||||
|
||||
|
||||
def process_search_results(search_results):
    """
    Print the search results as tables and save those tables to a file.

    Two tables are built with ``tabulate``: the organic results, and the
    "People Also Ask" questions combined with the related searches (or the
    related searches alone when there are no questions). Both are printed
    and passed to ``save_in_file``.

    Args:
        search_results (dict): The search results JSON. The keys read are
            'searchParameters', 'organic', 'peopleAlsoAsk' and
            'relatedSearches'; each may be absent.

    Returns:
        dict: ``search_results``, unchanged.
    """
    logger.info(f"Google Search Parameters: {search_results.get('searchParameters', {})}")
    # Raw dump of the response, kept from the original implementation.
    print(search_results)

    # Organic results. A missing 'organic' key yields an empty table rather
    # than a KeyError.
    organic_data = [
        [
            result.get("position", ""),
            result.get("title", ""),
            result.get("link", ""),
            result.get("snippet", ""),
        ]
        for result in search_results.get("organic", [])
    ]

    organic_headers = ["Rank", "Title", "Link", "Snippet"]
    organic_table = tabulate(organic_data,
                             headers=organic_headers,
                             tablefmt="fancy_grid",
                             colalign=["center", "left", "left", "left"],
                             maxcolwidths=[5, 25, 35, 50])

    print("\n\n📢❗🚨 Google search Organic Results:")
    print(organic_table)

    # People Also Ask; a malformed section is logged and treated as empty.
    people_also_ask_data = []
    try:
        for question in search_results.get("peopleAlsoAsk", []):
            people_also_ask_data.append([
                question.get("title", ""),
                question.get("snippet", ""),
                question.get("link", ""),
            ])
    except Exception as people_also_ask_err:
        logger.error(f"Error processing 'peopleAlsoAsk': {people_also_ask_err}")
        people_also_ask_data = []

    related_searches_data = [
        [related.get("query", "")]
        for related in search_results.get("relatedSearches", [])
    ]
    related_searches_headers = ["Related Search"]

    if people_also_ask_data:
        # Add Related Searches as a column to People Also Ask; rows beyond
        # the number of related searches get an empty cell.
        combined_data = [
            row + [related_searches_data[i][0] if i < len(related_searches_data) else ""]
            for i, row in enumerate(people_also_ask_data)
        ]
        combined_headers = ["Question", "Snippet", "Link", "Related Search"]
        combined_table = tabulate(
            combined_data,
            headers=combined_headers,
            tablefmt="fancy_grid",
            colalign=["left", "left", "left", "left"],
            maxcolwidths=[20, 50, 20, 30]
        )
    else:
        combined_table = tabulate(
            related_searches_data,
            headers=related_searches_headers,
            tablefmt="fancy_grid",
            colalign=["left"],
            maxcolwidths=[60]
        )

    print("\n\n📢❗🚨 People Also Ask & Related Searches:")
    print(combined_table)

    # Save both tables to a file; a failure here must not lose the results.
    try:
        save_in_file(organic_table)
        save_in_file(combined_table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    return search_results
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """Helper function to save search analysis in a file.

    Appends ``table_content`` followed by three newlines to the file named
    by the ``SEARCH_SAVE_FILE`` environment variable. Problems are logged
    and not raised.

    Args:
        table_content (str): Text to append.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        # Without this guard the failure surfaced as an opaque open(None) error.
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return

    try:
        # Explicit UTF-8: the tables contain box-drawing characters that the
        # platform default encoding may not be able to represent.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
530
lib/ai_web_researcher/google_trends_researcher.py
Normal file
530
lib/ai_web_researcher/google_trends_researcher.py
Normal file
@@ -0,0 +1,530 @@
|
||||
"""
|
||||
This Python script analyzes Google search keywords by fetching auto-suggestions, performing keyword clustering, and visualizing Google Trends data. It uses various libraries such as pytrends, requests_html, tqdm, and more.
|
||||
|
||||
Features:
|
||||
- Fetches auto-suggestions for a given search keyword from Google.
|
||||
- Performs keyword clustering using K-means algorithm based on TF-IDF vectors.
|
||||
- Visualizes Google Trends data, including interest over time and interest by region.
|
||||
- Retrieves related queries and topics for a set of search keywords.
|
||||
- Utilizes visualization libraries such as Matplotlib, Plotly, and Rich for displaying results.
|
||||
- Incorporates logging for error handling and informative messages.
|
||||
|
||||
Usage:
|
||||
- Provide a search term or a list of search terms for analysis.
|
||||
- Run the script to fetch auto-suggestions, perform clustering, and visualize Google Trends data.
|
||||
- Explore the displayed results, including top keywords in each cluster and related topics.
|
||||
|
||||
Modifications:
|
||||
- Customize the search terms in the 'do_google_trends_analysis' function.
|
||||
- Adjust the number of clusters for keyword clustering and other parameters as needed.
|
||||
- Explore further visualizations and analyses based on the generated data.
|
||||
|
||||
Note: Ensure that the required libraries are installed using 'pip install pytrends requests_html tqdm tabulate plotly rich'.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import numpy as np
|
||||
import sys
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.cluster import KMeans
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.metrics import silhouette_score, silhouette_samples
|
||||
from rich.console import Console
|
||||
from rich.progress import Progress
|
||||
import urllib
|
||||
import json
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import plotly.express as px
|
||||
import plotly.io as pio
|
||||
import logging
|
||||
from requests_html import HTML, HTMLSession
|
||||
from urllib.parse import quote_plus
|
||||
from tqdm import tqdm
|
||||
from tabulate import tabulate
|
||||
from pytrends.request import TrendReq
|
||||
import wordcloud
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
from loguru import logger
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def fetch_google_trends_interest_overtime(keyword):
    """
    Fetch one year of US Google Trends "interest over time" for a keyword and plot it.

    Args:
        keyword (str): Search term to look up.

    Returns:
        pd.DataFrame: Interest-over-time data with the date index reset into a
            'date' column. Empty when the lookup fails or Google Trends has no
            data for the keyword.
    """
    try:
        pytrends = TrendReq(hl='en-US', tz=360)
        pytrends.build_payload([keyword], timeframe='today 1-y', geo='US')

        # 1. Interest Over Time
        data = pytrends.interest_over_time()
        if data.empty:
            # Nothing to plot; report it explicitly instead of failing on a missing column.
            logging.warning(f"No Google Trends interest data found for '{keyword}'")
            return pd.DataFrame()
        data = data.reset_index()

        # Visualization using Matplotlib
        plt.figure(figsize=(10, 6))
        plt.plot(data['date'], data[keyword], label=keyword)
        plt.title(f'Interest Over Time for "{keyword}"')
        plt.xlabel('Date')
        plt.ylabel('Interest')
        plt.legend()
        plt.show()

        return data
    except Exception as e:
        logging.error(f"Error in fetch_google_trends_interest_overtime: {e}")
        return pd.DataFrame()
|
||||
|
||||
|
||||
def plot_interest_by_region(kw_list):
    """
    Print and plot the ten regions with the highest Google Trends interest.

    Args:
        kw_list (list): Search keywords. Regions are ranked by the first keyword
            and every keyword is drawn in the bar chart.

    Returns:
        None: The result is only printed and shown as a bar chart.
    """
    try:
        keywords = [kw_list] if isinstance(kw_list, str) else list(kw_list)
        label = ' '.join(keywords)

        trends = TrendReq()
        trends.build_payload(kw_list=keywords)
        data = trends.interest_by_region()
        # The result has one column per keyword, so rank on a real column
        # rather than on the space-joined label (which only exists for one keyword).
        data = data.sort_values(by=keywords[0], ascending=False)
        print("\n📢❗🚨 ")
        print(f"Top 10 regions with highest interest for keyword: {label}")
        data = data.head(10)  # Top 10
        print(data)

        # The style has to be selected before the chart is drawn to take effect.
        plt.style.use('fivethirtyeight')
        data.reset_index().plot(x="geoName", y=keywords,
                                figsize=(20, 15), kind="bar")
        plt.show()
        # FIXME: Send this image to vision GPT for analysis.

    except Exception as e:
        print(f"Error plotting interest by region: {e}")
    return None
|
||||
|
||||
|
||||
|
||||
|
||||
def get_related_queries_and_save_csv(keywords, hl='en-US', tz=360, cat=0, timeframe='today 12-m'):
    """
    Fetch Google Trends related queries, print them and save the table.

    Only the entry for the first keyword in the pytrends result is used.

    Args:
        keywords (list): List of search keywords.
        hl (str): Language parameter, default is 'en-US'.
        tz (int): Timezone parameter, default is 360.
        cat (int): Category parameter, default is 0.
        timeframe (str): Timeframe parameter, default is 'today 12-m'.

    Returns:
        pd.DataFrame: Top queries followed by rising queries, one per row.
            Empty when the lookup fails.
    """
    try:
        # Build model
        pytrends = TrendReq(hl=hl, tz=tz)
        pytrends.build_payload(kw_list=keywords, cat=cat, timeframe=timeframe)

        # Get related queries
        data = pytrends.related_queries()
        first_keyword_data = next(iter(data.values()), None) or {}

        # Each category is either a DataFrame or None when there is no data.
        frames = {}
        for category in ('top', 'rising'):
            frame = first_keyword_data.get(category)
            if frame is None:
                frame = pd.DataFrame(columns=['query', 'value'])
            frames[category] = pd.DataFrame(frame).reset_index(drop=True)

        # Row-wise concatenation; `top + rising` would add the frames element-wise.
        top_rising_queries = pd.concat([frames['top'], frames['rising']], ignore_index=True)

        # Side-by-side view; rename the query columns so they stay distinguishable.
        df_top_queries = frames['top'].rename(columns={'query': 'Top query'})
        df_rising_queries = frames['rising'].rename(columns={'query': 'Rising query'})
        all_queries_df = pd.concat([df_top_queries, df_rising_queries], axis=1)

        # Display additional information with emojis and bold formatting
        print("\n📢❗🚨 ")
        print("\n\033[1m🔝 Top\033[0m: The most popular search queries. Scoring is on a relative scale where a value of 100 is the most commonly searched query, 50 is a query searched half as often, and a value of 0 is a query searched for less than 1% as often as the most popular query.\n")
        print("\n\033[1m🚀 Rising\033[0m: Queries with the biggest increase in search frequency since the last time period. Results marked 'Breakout' had a tremendous increase, probably because these queries are new and had few (if any) prior searches.\n")

        # Display the DataFrame using tabulate
        table = tabulate(all_queries_df, headers='keys', tablefmt='fancy_grid')
        print(table)

        # Save the rendered table (text), not the DataFrame object.
        try:
            save_in_file(table)
        except Exception as save_results_err:
            logger.error(f"Failed to save search results: {save_results_err}")
        return top_rising_queries

    except Exception as e:
        print(f"get_related_queries_and_save_csv: ERROR: An error occurred: {e}")
        return pd.DataFrame()
|
||||
|
||||
|
||||
def get_related_topics_and_save_csv(search_keywords):
    """
    Look up Google Trends related topics and print the first ten rows.

    Only the entry for the first keyword in the pytrends result is used. The
    'top' and 'rising' tables are placed side by side; both keep their
    'topic_title' column name, every other kept column is prefixed.

    Args:
        search_keywords (list): List of search keywords.

    Returns:
        pd.DataFrame: Combined top/rising topics, or an empty DataFrame on error.
    """
    try:
        trends_client = TrendReq(hl='en-US', tz=360)
        trends_client.build_payload(search_keywords, cat=0, timeframe='today 12-m')

        first_entry = list(trends_client.related_topics().values())[0]
        top_raw = first_entry['top']
        rising_raw = first_entry['rising']

        # FIXME:Exclude specified columns
        hidden_columns = ['hasData', 'value', 'topic_mid', 'link']

        labelled_frames = []
        for prefix, raw in (('Top- ', top_raw), ('Rising- ', rising_raw)):
            frame = pd.DataFrame(raw).drop(columns=hidden_columns, errors='ignore')
            frame.columns = [
                name if name == 'topic_title' else prefix + name
                for name in frame.columns
            ]
            labelled_frames.append(frame)

        all_topics_df = pd.concat(labelled_frames, axis=1)
        #all_topics_df.to_csv('related_topics.csv', index=False)

        print(f"\n\n 📢❗🚨 Rising and Trending Keywords for {search_keywords}\n")
        print("\033[1m🔝 Top\033[0m: The most popular search topics.")
        print("\033[1m🚀 Rising\033[0m: Topics experiencing a significant increase in search frequency since the last time period. Topics marked :pile_of_poop:'Breakout' had a tremendous surge, likely because they are new and had few prior searches.")

        # Let pandas print every row of this table, then show the first ten.
        pd.set_option('display.max_rows', all_topics_df.shape[0]+1)
        print(all_topics_df.head(10))
        #print(tabulate(all_topics_df, headers='keys', tablefmt='fancy_grid'))
        return all_topics_df

    except Exception as e:
        print(f"ERROR: An error occurred: {e}")
        return pd.DataFrame()
|
||||
|
||||
|
||||
def get_source(url, timeout=30):
    """
    Download a URL with requests_html and return the response.

    Args:
        url (str): Address to fetch.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        The response object on success, or None when the request fails or the
        server answers with an error status.
    """
    try:
        # The context manager closes the session once the body has been
        # downloaded, so repeated calls do not leave sessions open.
        with HTMLSession() as session:
            response = session.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
|
||||
|
||||
|
||||
|
||||
def get_results(query):
    """
    Query the Google suggest endpoint for a term and return the decoded JSON.

    Args:
        query (str): Search term; it is URL-encoded before the request.

    Returns:
        The parsed JSON payload, or None when the request or decoding fails.
    """
    try:
        encoded_query = urllib.parse.quote_plus(query)
        response = get_source(f"https://suggestqueries.google.com/complete/search?output=chrome&hl=en&q={encoded_query}")
        if not response:
            return None
        response.raise_for_status()
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding JSON response: {e}")
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during HTTP request: {e}")
        return None
|
||||
|
||||
|
||||
|
||||
def format_results(results):
    """
    Convert a raw suggest payload into a list of term/relevance records.

    The code reads the suggested terms from ``results[1]`` and the matching
    scores from ``results[4]['google:suggestrelevance']``.

    Args:
        results (list): Decoded suggest payload.

    Returns:
        list: Dicts with 'term' and 'relevance' keys. Empty when the payload
            is missing, malformed or contains no suggestions.
    """
    try:
        terms = results[1]
        if not terms:
            return []
        relevance = results[4]['google:suggestrelevance']
        return [
            {'term': term, 'relevance': relevance[index]}
            for index, term in enumerate(terms)
        ]
    except (KeyError, IndexError, TypeError) as e:
        # TypeError covers a None payload or a payload of the wrong shape.
        logging.error(f"Error parsing search results: {e}")
        return []
|
||||
|
||||
|
||||
|
||||
def get_expanded_term_suffixes():
    """Return the letters 'a' to 'z', used as suffixes when expanding a query."""
    return list('abcdefghijklmnopqrstuvwxyz')
|
||||
|
||||
|
||||
|
||||
def get_expanded_term_prefixes():
    """Return the prefixes placed in front of a query when expanding it."""
    # Alternative set for shopping, review type blogs (disabled):
    #return ['discount *', 'pricing *', 'cheap', 'best price *', 'lowest price', 'best value', 'sale', 'affordable', 'promo', 'budget''what *', 'where *', 'how to *', 'why *', 'buy*', 'how much*','best *', 'worse *', 'rent*', 'sale*', 'offer*','vs*','or*']
    question_prefixes = ['what *', 'where *', 'how to *', 'why *']
    comparison_prefixes = ['best *', 'vs*', 'or*']
    return question_prefixes + comparison_prefixes
|
||||
|
||||
|
||||
|
||||
def get_expanded_terms(query):
    """
    Build the list of query variations to send to the suggest endpoint.

    Args:
        query (str): Root search term.

    Returns:
        list: The query itself, then "<prefix> <query>" for every prefix, then
            "<query> <suffix>" for every suffix. Empty list on error.
    """
    try:
        prefixes = get_expanded_term_prefixes()
        suffixes = get_expanded_term_suffixes()
        prefixed = [f"{prefix} {query}" for prefix in prefixes]
        suffixed = [f"{query} {suffix}" for suffix in suffixes]
        return [query, *prefixed, *suffixed]
    except Exception as e:
        logging.error(f"Error in get_expanded_terms: {e}")
        return []
|
||||
|
||||
|
||||
|
||||
def get_expanded_suggestions(query):
    """
    Collect suggestions for every expanded variation of a query.

    Args:
        query (str): Root search term.

    Returns:
        list: Suggestion dicts sorted by 'relevance', highest first. Empty
            list on error.
    """
    try:
        collected = []
        progress = tqdm(get_expanded_terms(query), desc="📢❗🚨 Fetching Google AutoSuggestions", unit="term")
        for term in progress:
            raw = get_results(term)
            if not raw:
                continue
            collected.extend(format_results(raw))
        # Entries without a relevance score sort as 0.
        collected.sort(key=lambda item: item.get('relevance', 0), reverse=True)
        return collected
    except Exception as e:
        logging.error(f"Error in get_expanded_suggestions: {e}")
        return []
|
||||
|
||||
|
||||
|
||||
def get_suggestions_for_keyword(search_term):
    """
    Fetch expanded suggestions for a term as a de-duplicated DataFrame.

    Args:
        search_term (str): Root search term.

    Returns:
        pd.DataFrame: Columns 'Keywords' and 'Relevance', one row per unique
            keyword. An empty frame with the same columns when nothing was
            found or an error occurred.
    """
    columns = ['Keywords', 'Relevance']
    try:
        expanded_results = get_expanded_suggestions(search_term)
        # Naming the source keys keeps both columns present even when no
        # suggestion came back; a bare DataFrame([]) has no columns to rename.
        expanded_results_df = pd.DataFrame(expanded_results, columns=['term', 'relevance'])
        expanded_results_df.columns = columns
        #expanded_results_df.to_csv('results.csv', index=False)
        pd.set_option('display.max_rows', expanded_results_df.shape[0]+1)
        expanded_results_df.drop_duplicates('Keywords', inplace=True)
        return expanded_results_df
    except Exception as e:
        logging.error(f"get_suggestions_for_keyword: Error fetching suggestions: {e}")
        return pd.DataFrame(columns=columns)
|
||||
|
||||
|
||||
|
||||
def perform_keyword_clustering(expanded_results_df, num_clusters=5):
    """
    Cluster keywords with K-means on their TF-IDF vectors.

    Args:
        expanded_results_df (pd.DataFrame): Must contain a 'Keywords' column.
            The input frame is not modified.
        num_clusters (int): Requested number of clusters; capped at the number
            of keywords.

    Returns:
        pd.DataFrame: Copy of the input with lower-cased 'Keywords' and a new
            'cluster_label' column, or an empty DataFrame on error / no input.
    """
    try:
        if expanded_results_df is None or expanded_results_df.empty:
            logging.warning("perform_keyword_clustering: no keywords to cluster")
            return pd.DataFrame()

        # Work on a copy so the caller's frame is left untouched.
        clustered_df = expanded_results_df.copy()

        # Preprocessing: Convert the keywords to lowercase
        clustered_df['Keywords'] = clustered_df['Keywords'].str.lower()

        # Vectorization: fit a TF-IDF vectorizer on the keywords
        vectorizer = TfidfVectorizer()
        tfidf_vectors = vectorizer.fit_transform(clustered_df['Keywords'])

        # K-means cannot form more clusters than there are samples.
        n_samples = tfidf_vectors.shape[0]
        n_clusters = max(1, min(int(num_clusters), n_samples))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(tfidf_vectors)

        # Add cluster labels to the DataFrame
        clustered_df['cluster_label'] = cluster_labels

        # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
        # skip it otherwise instead of discarding the clustering.
        n_labels = len(set(cluster_labels))
        if 1 < n_labels < n_samples:
            silhouette_avg = silhouette_score(tfidf_vectors, cluster_labels)
            print(f"Silhouette Score: {silhouette_avg}")
        else:
            logging.info("Silhouette score skipped: not enough distinct clusters/samples")

        # Visualize cluster quality using a silhouette plot
        #visualize_silhouette(tfidf_vectors, cluster_labels)

        return clustered_df
    except Exception as e:
        logging.error(f"Error in perform_keyword_clustering: {e}")
        return pd.DataFrame()
|
||||
|
||||
|
||||
|
||||
def visualize_silhouette(X, labels):
    """
    Print the mean silhouette score and draw a per-cluster silhouette plot.

    Args:
        X: Feature matrix that was clustered (its row count sizes the y axis).
        labels: Cluster label per row of X.
        # NOTE(review): `labels == i` is used as a mask, so labels is presumably
        # a NumPy array - confirm against callers.

    Errors are logged and swallowed; nothing is returned.
    """
    try:
        silhouette_avg = silhouette_score(X, labels)
        print(f"Silhouette Score: {silhouette_avg}")

        band_gap = 10  # blank rows left between two cluster bands

        # Single axes holding the silhouette plot
        fig, ax1 = plt.subplots(1, 1, figsize=(8, 6))
        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, X.shape[0] + (len(set(labels)) + 1) * band_gap])

        # Silhouette score of every individual sample
        sample_silhouette_values = silhouette_samples(X, labels)

        y_lower = band_gap
        for cluster_id in set(labels):
            # Sorted scores of the samples that belong to this cluster
            cluster_values = sample_silhouette_values[labels == cluster_id]
            cluster_values.sort()

            cluster_size = cluster_values.shape[0]
            y_upper = y_lower + cluster_size

            color = plt.cm.nipy_spectral(float(cluster_id) / len(set(labels)))
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, cluster_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Write the cluster number next to the middle of its band
            ax1.text(-0.05, y_lower + 0.5 * cluster_size, str(cluster_id))

            # The next band starts after a gap
            y_lower = y_upper + band_gap

        ax1.set_title("Silhouette plot for KMeans clustering")
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # Dashed vertical line at the average silhouette score
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        plt.show()
    except Exception as e:
        logging.error(f"Error in visualize_silhouette: {e}")
|
||||
|
||||
|
||||
|
||||
def print_and_return_top_keywords(expanded_results_df, num_clusters=5):
    """
    Display, save and return the top five keywords of each cluster.

    Args:
        expanded_results_df (pd.DataFrame): DataFrame containing 'Keywords',
            'Relevance' and 'cluster_label' columns.
        num_clusters (int or str): Number of clusters (labels 0..n-1 are
            inspected) or 'all' to use every label present in the data.

    Returns:
        pd.DataFrame: The five highest-'Relevance' rows of every cluster.
    """
    if num_clusters == 'all':
        unique_clusters = expanded_results_df['cluster_label'].unique()
    else:
        unique_clusters = range(int(num_clusters))

    # Collect the per-cluster slices first and concatenate once.
    per_cluster_top = [
        expanded_results_df[expanded_results_df['cluster_label'] == cluster_id]
        .sort_values(by='Relevance', ascending=False)
        .head(5)
        for cluster_id in unique_clusters
    ]
    top_keywords_df = pd.concat(per_cluster_top) if per_cluster_top else pd.DataFrame()

    print("\n📢❗🚨 Top Keywords for All Clusters:")
    table = tabulate(top_keywords_df, headers='keys', tablefmt='fancy_grid')
    # Save the rendered table; the file writer needs text, not a DataFrame.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
    print(table)
    return top_keywords_df
|
||||
|
||||
|
||||
def generate_wordcloud(keywords):
    """
    Generate and display a word cloud from a list of keywords.

    Args:
        keywords (list): List of keywords.

    Returns:
        None. Nothing is drawn when the keyword list has no text.
    """
    # The file only does `import wordcloud`, which does not bind the WordCloud
    # class name, so import it here.
    from wordcloud import WordCloud

    # Convert the list of keywords to a string
    text = ' '.join(str(keyword) for keyword in keywords)
    if not text.strip():
        logging.warning("generate_wordcloud: no keywords given, skipping word cloud")
        return

    # Generate word cloud (named `cloud` so the `wordcloud` module is not shadowed)
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # figsize is in inches; the previous (600, 200) asked for an enormous canvas.
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
|
||||
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """
    Append search analysis output to the file named by SEARCH_SAVE_FILE.

    Args:
        table_content: Text to store; non-string values (e.g. a DataFrame)
            are converted with str().

    Nothing is returned. Failures are logged, never raised.
    """
    # Imported locally because this module has no top-level `import os`.
    import os

    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("Error occurred while writing to the file: SEARCH_SAVE_FILE is not set")
        return
    try:
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Append so the sections written during one research run accumulate
        # instead of each call overwriting the previous one.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(str(table_content))
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
|
||||
|
||||
def do_google_trends_analysis(search_term):
    """
    Collect keyword ideas for a search term from autosuggest and Google Trends.

    The top clustered autosuggest keywords and the related topic titles are
    gathered, printed as a table and drawn as a word cloud.

    Args:
        search_term (str): Root keyword to analyse.

    Returns:
        list: Keyword strings - suggestion keywords first, then related topic
            titles. Empty when nothing could be collected.
    """
    search_terms = [f"{search_term}"]
    suggestion_keywords = []
    topic_titles = []

    # FIXME: Lets work with a single root keyword.
    for a_search_term in search_terms:
        try:
            suggestions_df = get_suggestions_for_keyword(a_search_term)
            result_df = perform_keyword_clustering(suggestions_df)
            # Display top keywords in each cluster
            top_keywords = print_and_return_top_keywords(result_df)
            suggestion_keywords.extend(
                str(keyword)
                for keyword in top_keywords['Keywords'].tolist()
                if pd.notna(keyword) and str(keyword)
            )
        except Exception as e:
            # Keep going: related topics may still produce keywords.
            logging.error(f"Error collecting keyword suggestions for '{a_search_term}': {e}")

    # FIXME: Get result from vision GPT. Fetch and visualize Google Trends data
    #   trends_data = fetch_google_trends_interest_overtime("llamaindex")
    # FIXME: Plot Interest Over time.
    #   result_df = plot_interest_by_region(search_term)

    try:
        topics_df = get_related_topics_and_save_csv(search_terms)
        # The topics frame may hold one or two 'topic_title' columns (top and
        # rising); select by mask so the result is always two-dimensional.
        is_title_column = [column == 'topic_title' for column in topics_df.columns]
        title_values = topics_df.loc[:, is_title_column].to_numpy().ravel()
        # Skip the NaN padding that appears when the two lists differ in length.
        topic_titles = [str(title) for title in title_values if pd.notna(title) and str(title)]
    except Exception as e:
        logging.error(f"Error collecting related topics for '{search_term}': {e}")

    print(f"\nRising and Top keywords: {', '.join(topic_titles)}")
    print(f"\n\n📢❗🚨 Important keywords to target: {', '.join(suggestion_keywords)}\n\n")

    all_the_keywords = suggestion_keywords + topic_titles

    # Display is best effort and must not discard the collected keywords.
    try:
        # Split the list into rows of `chunk_size` keywords
        chunk_size = 4
        chunks = [all_the_keywords[i:i + chunk_size] for i in range(0, len(all_the_keywords), chunk_size)]
        if chunks:
            # Pad the last row so every row matches the column count.
            chunks[-1] = chunks[-1] + [None] * (chunk_size - len(chunks[-1]))
        combined_df = pd.DataFrame(chunks, columns=[f'K📢eyword Col{i + 1}' for i in range(chunk_size)])

        # Print the table
        print(tabulate(combined_df, headers='keys', tablefmt='fancy_grid'))

        if all_the_keywords:
            generate_wordcloud(all_the_keywords)
    except Exception as e:
        logging.error(f"Error displaying Google Trends keywords: {e}")

    return all_the_keywords
|
||||
49
lib/ai_web_researcher/gpt_blog_sections.py
Normal file
49
lib/ai_web_researcher/gpt_blog_sections.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import sys
|
||||
import json
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
# FIXME: Provide num_blogs, num_faqs as inputs.
|
||||
def get_blog_sections_from_websearch(search_keyword, search_results, gpt_providers="gemini"):
    """
    Ask an LLM for a blog title and sub titles based on web search results.

    Args:
        search_keyword (str): Keyword the web research was done for.
        search_results: Search results that are embedded in the prompt.
        gpt_providers (str): Provider selector; 'gemini' or 'openai' is
            matched by substring.

    Returns:
        For gemini, the JSON-decoded response; for openai, the raw response
        of openai_chatgpt. None when the provider is not recognised.

    Raises:
        Exception: Whatever the provider call or JSON decoding raises, after
            it has been logged.
    """
    prompt = f"""
        As a SEO expert and content writer, I will provide you with a search keyword and its google search result.
        Your task is to write a blog title and 5 blog sub titles, from the given google search result.
        The subtitles should be less than 40 characters and click worthy.
        Do not explain, describe your response. Respond in json format, always name the key as 'blogSections'.

        Web Research Keyword: "{search_keyword}"
        Google search Result: "{search_results}"
        """

    if 'gemini' in gpt_providers:
        try:
            response = gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise

        try:
            text = response.strip()
            if text.startswith('```'):
                # Drop the opening fence line (e.g. ```json) and, when
                # present, the closing fence line.
                lines = text.split('\n')[1:]
                if lines and lines[-1].strip().startswith('```'):
                    lines = lines[:-1]
                text = '\n'.join(lines)
            return json.loads(text)
        except Exception as err:
            # Log the text that could not be parsed; no attributes are assumed
            # on the response object.
            logger.error(f"Failed to get response from gemini: {err}")
            logger.error(f"Gemini Error: unparsable response: {response!r}")
            raise
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            response = openai_chatgpt(prompt)
            return response
        except Exception as err:
            logger.error(f"Failed to get response from Openai: {err}")
            raise

    logger.error(f"Unsupported gpt provider: {gpt_providers}")
    return None
|
||||
41
lib/ai_web_researcher/gpt_competitor_analysis.py
Normal file
41
lib/ai_web_researcher/gpt_competitor_analysis.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def summarize_competitor_content(research_content, gpt_providers="openai"):
    """
    Ask an LLM to summarize researched content about a competitor.

    Args:
        research_content: Text that is embedded in the prompt.
        gpt_providers (str): Provider selector; 'gemini' or 'openai' is
            matched by substring, gemini taking precedence.

    Returns:
        The provider's response, or None when no known provider is selected.

    Raises:
        Exception: Re-raised from the provider call after logging.
    """
    if 'gemini' in gpt_providers:
        gemini_prompt = f"""You are a helpful assistant writing a research report about a company. I will provide you with company details.
            Summarize the given company details into multiple paragraphs.
            Be extremely concise, professional, and factual as possible.
            The first paragraph should be an introduction and summary of the company.
            The second paragraph should include pros and cons of the company.
            The third paragraph should be on their pricing model.
            Include a conclusion, summarizing your research about the given company details.
            Company details: '{research_content}'"""
        try:
            return gemini_text_response(gemini_prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise err

    if 'openai' in gpt_providers:
        # The OpenAI prompt carries only the content, without instructions.
        openai_prompt = f""" Web page content: {research_content} """
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(openai_prompt)
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise err

    return None
|
||||
185
lib/ai_web_researcher/gpt_online_researcher.py
Normal file
185
lib/ai_web_researcher/gpt_online_researcher.py
Normal file
@@ -0,0 +1,185 @@
|
||||
################################################################
|
||||
#
|
||||
#
|
||||
#
|
||||
##############################################################
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import List, NamedTuple
|
||||
from loguru import logger
|
||||
from datetime import datetime
|
||||
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
from .tavily_ai_search import get_tavilyai_results
|
||||
from .metaphor_basic_neural_web_search import metaphor_find_similar, metaphor_search_articles
|
||||
from .google_serp_search import google_search
|
||||
from .google_trends_researcher import do_google_trends_analysis
|
||||
from .gpt_blog_sections import get_blog_sections_from_websearch
|
||||
from .web_research_report import write_web_research_report
|
||||
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def gpt_web_researcher(search_keywords, time_range=None, include_domains=None, similar_url=None):
    """
    Run web research for the given keywords (work in progress).

    Currently only the Google Trends analysis is active; the Google, Tavily
    and Metaphor searches are disabled below. The report file path is
    published through the SEARCH_SAVE_FILE environment variable.

    Args:
        search_keywords (str): Keywords to research.
        time_range: Passed to the (disabled) Metaphor search.
        include_domains (list): Domains to restrict the (disabled) searches
            to; None or empty means no restriction.
        similar_url: Passed to the (disabled) Metaphor search.

    Returns:
        None. The collected keywords are printed.
    """
    # A fresh list per call; a list default would be shared between calls.
    if not include_domains:
        include_domains = list()
    print(f"Web Research:Time Range - {time_range},Search Keywords - {search_keywords},Include URLs - {include_domains}")

    # TBD: Keeping the results directory as fixed, for now.
    os.environ["SEARCH_SAVE_FILE"] = os.path.join(os.getcwd(), "workspace", "web_research_reports",
            search_keywords.replace(" ", "_") + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

    # Collect all blog titles featuring in search results. This *may help in generating blog titles
    # closest to competing ones. All search blog titles, given keyword and keywords from analysis, give
    # llm a good context for the task of generating blog titles.
    blog_titles = []
    # Get a list of FAQs from search results.
    blog_faqs = None
    google_result = None
    tavily_result = None
    report = None
    # Bound up front so the print below works even when the analysis fails.
    important_keywords = None

    # try:
    #     logger.info(f"Doing Google search for: {search_keywords}\n")
    #     google_result = google_search(search_keywords)
    #     blog_titles.append(extract_info(google_result, "titles"))
    # except Exception as err:
    #     logger.error(f"Failed to do Google Serpapi research: {err}")
    #     # Not failing, as tavily would do same and then GPT-V to search.
    #
    # try:
    #     # FIXME: Include the follow-up questions as blog FAQs.
    #     logger.info(f"Doing Tavily AI search for: {search_keywords}")
    #     tavily_result = get_tavilyai_results(search_keywords, include_domains)
    #     blog_titles.append(tavily_extract_information(tavily_result, "titles"))
    # except Exception as err:
    #     logger.error(f"Failed to do Tavily AI Search: {err}")

    # try:
    #     logger.info(f"Start Semantic/Neural web search with Metahpor: {search_keywords}")
    #     response_articles = metaphor_search_articles(
    #             search_keywords,
    #             include_domains=include_domains,
    #             time_range=time_range,
    #             similar_url=similar_url)
    #     blog_titles.append(metaphor_extract_titles_or_text(response_articles, return_titles=True))
    # except Exception as err:
    #     logger.error(f"Failed to do Metaphor search: {err}")
    # print(blog_titles)

    try:
        logger.info(f"Do Google Trends analysis for given keywords: {search_keywords}")
        important_keywords = do_google_trends_analysis(search_keywords)
    except Exception as err:
        logger.error(f"Failed to do google trends analysis: {err}")
    print(important_keywords)
    # Now that we have search results from given keywords. Generate blog title and subtopics suggestions.
    # 1. Return a list of related keywords along with search volumes.
    # 2. New blog titles to write on(niche, top) and blog sections.
    # 3. Competitors list, similar urls if given.
|
||||
|
||||
|
||||
class Result(NamedTuple):
    """
    One search result record.

    Instances are built with ``Result(**result)`` in
    metaphor_extract_titles_or_text, so every field below must be present as a
    key in each incoming record.
    """
    # NOTE(review): field meanings are inferred from their names - confirm
    # against the search provider's response format.
    url: str
    id: str
    title: str
    score: float
    published_date: str
    author: str
    text: str
    highlights: List[str]
    highlight_scores: List[float]
|
||||
|
||||
|
||||
def metaphor_extract_titles_or_text(json_data, return_titles=True):
    """
    Pull either the titles or the texts out of a list of result records.

    Args:
        json_data (list): Records (mappings) whose keys match the fields of
            Result; each one is unpacked into a Result.
        return_titles (bool): True to collect titles, False to collect text.

    Returns:
        list: The selected field of every record, in input order.
    """
    parsed_results = [Result(**entry) for entry in json_data]
    wanted_field = "title" if return_titles else "text"
    return [getattr(item, wanted_field) for item in parsed_results]
|
||||
|
||||
|
||||
def extract_info(json_data, info_type):
    """
    Extract one kind of information from a search result payload.

    Args:
        json_data (dict): The JSON data.
        info_type (str): 'titles' (reads 'title' from 'organic'),
            'peopleAlsoAsk' (reads 'question') or 'relatedSearches'
            (reads 'query').

    Returns:
        list or None: The requested values (None for entries lacking the
            field), or None after printing a message if the type is invalid.
    """
    # (info_type, section of the payload, field read from every entry)
    extraction_rules = (
        ("titles", "organic", "title"),
        ("peopleAlsoAsk", "peopleAlsoAsk", "question"),
        ("relatedSearches", "relatedSearches", "query"),
    )
    for known_type, section, field in extraction_rules:
        if info_type == known_type:
            return [entry.get(field) for entry in json_data.get(section, [])]

    print("Invalid info_type. Please use 'titles', 'peopleAlsoAsk', or 'relatedSearches'.")
    return None
|
||||
|
||||
|
||||
def tavily_extract_information(json_data, keyword):
    """
    Extract information from a Tavily response based on the given keyword.

    Args:
        json_data (dict): The JSON data.
        keyword (str): 'title' (or 'titles', the spelling extract_info uses),
            'content', 'answer' or 'follow-query'.

    Returns:
        list or str: Titles/contents of json_data['results'], the 'answer'
            value, the 'follow_up_questions' value, or an
            "Invalid keyword: ..." message for anything else.

    Raises:
        KeyError: If the payload lacks the key needed for the keyword.
    """
    if keyword in ('title', 'titles'):
        return [result['title'] for result in json_data['results']]
    elif keyword == 'content':
        return [result['content'] for result in json_data['results']]
    elif keyword == 'answer':
        return json_data['answer']
    elif keyword == 'follow-query':
        return json_data['follow_up_questions']
    else:
        return f"Invalid keyword: {keyword}"
|
||||
|
||||
|
||||
def compete_organic_results(query, report, organic_results):
    """
    Ask Gemini to rewrite a blog so it competes with the top organic results.

    Args:
        query: Topic of the blog, embedded in the prompt.
        report: Current blog content, embedded in the prompt.
        organic_results: Google organic search results, embedded in the prompt.

    Returns:
        The response of gemini_text_response for the rewrite prompt.
    """
    rewrite_prompt = f""" As an SEO expert and copywriter, I will provide you with my blog content on topic '{query}', and
        Top google search results.
        Your task is to rewrite the given blog to make it compete against top position results.
        Make sure, the new blog has high probability of ranking highest against given organic search result competitors.
        Modify the given blog content following best SEO practises.
        Make sure the blog is original, unique and highly readable.
        Remember, Maintain and adopt the formatting, structure, style and tone of the provided blog content.
        Include relevant emojis in your final blog for visual appeal. Use it sparingly.
        Your response should be well-structured, objective, and critically acclaimed blog article based on provided texts.

        Remember, your goal is to create a detailed blog article that will compete against given organic result competitors.
        Do not provide explanations, suggestions for your response, reply only with your final response.
        Take your time in crafting your content, do not rush to give the response.
        Blog Content: '{report}'\n
        Organic Search result: '{organic_results}'
        """
    return gemini_text_response(rewrite_prompt)
|
||||
38
lib/ai_web_researcher/gpt_summarize_web_content.py
Normal file
38
lib/ai_web_researcher/gpt_summarize_web_content.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import sys
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def summarize_web_content(page_content, gpt_providers="openai"):
    """Summarize the text of a web page with the selected LLM provider.

    Args:
        page_content (str): Text content of the web page to summarize.
        gpt_providers (str): Provider selector, matched by substring.
            "gemini" is checked before "openai".

    Returns:
        The response of the selected provider, returned unchanged.

    Raises:
        ValueError: If ``gpt_providers`` names neither "gemini" nor "openai"
            (previously this case silently returned None).
        Exception: Any provider failure is logged and re-raised.
    """
    if 'gemini' in gpt_providers:
        prompt = f"""You are a helpful assistant that briefly summarizes the content of a webpage.
    Summarize the given web page content below.
    Web page content: '{page_content}'"""
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise

    if 'openai' in gpt_providers:
        # NOTE(review): this prompt carries only the page text and no
        # summarization instruction; presumably openai_chatgpt supplies
        # one itself -- confirm.
        prompt = f"""
    Web page content: {page_content}
    """
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            logger.error(f"failed to get response from Openai: {err}")
            raise

    raise ValueError(
        f"Unsupported gpt_providers value: {gpt_providers!r} (expected 'gemini' or 'openai')"
    )
|
||||
53
lib/ai_web_researcher/gpt_titles_faq.py
Normal file
53
lib/ai_web_researcher/gpt_titles_faq.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import sys
|
||||
import json
|
||||
|
||||
from ..gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
# FIXME: Provide num_blogs, num_faqs as inputs.
|
||||
def _extract_fenced_payload(text):
    """Return the content of a Markdown code fence in *text*, or *text* itself if unfenced."""
    text = text.strip()
    opening = text.find('```')
    if opening == -1:
        return text
    # The payload starts on the line after the opening fence (``` or ```json).
    payload_start = text.find('\n', opening)
    if payload_start == -1:
        return text
    closing = text.rfind('```')
    if closing <= payload_start:
        # Only an opening fence was emitted (e.g. truncated answer).
        closing = len(text)
    return text[payload_start + 1:closing].strip()


def gpt_titles_faqs_google_search(search_keyword, search_results, gpt_providers="openai"):
    """Generate a blog title and FAQs from a keyword and its Google search results.

    Args:
        search_keyword (str): The web research keyword.
        search_results: Google search results for the keyword; interpolated
            into the prompt as-is.
        gpt_providers (str): Provider selector, matched by substring.
            "gemini" is checked before "openai".

    Returns:
        For gemini: the response parsed with ``json.loads`` after any
        Markdown code fence is removed. For openai: the raw value returned
        by ``openai_chatgpt`` (it is not parsed here).

    Raises:
        ValueError: If ``gpt_providers`` names neither "gemini" nor "openai"
            (previously this case silently returned None).
        Exception: Provider or JSON-parsing failures are logged and re-raised.
    """
    use_gemini = 'gemini' in gpt_providers
    if not use_gemini and 'openai' not in gpt_providers:
        raise ValueError(
            f"Unsupported gpt_providers value: {gpt_providers!r} (expected 'gemini' or 'openai')"
        )

    prompt = f"""
    As a SEO expert and content writer, I will provide you with my web research keyword and its google search result in json format.
    Your task is to write 1 blog title and 10 FAQs.

    1). Your blog title should compete against all the provided search results.
    2). Your FAQ should be based on 'People also ask' and 'Related Queries' from given result.
    Always include answers for each FAQ, use your knowledge and confirm with snippets given in search result.
    3). Respond in json data with 'blogTitles' and 'FAQs' as json keys. Do not explain, describe your response.
    4). Follow best practises of SEO.

    Web Research Keyword: "{search_keyword}"
    Google search Result: "{search_results}"
    """
    logger.info("Generating blog title and FAQs from web search result.")

    if use_gemini:
        try:
            response = gemini_text_response(prompt)
            logger.debug(f"Raw gemini response: {response}")
            # Models often wrap JSON in ```json ... ```; keep only the payload
            # instead of blindly dropping the first and last lines.
            return json.loads(_extract_fenced_payload(response))
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise

    try:
        logger.info("Calling OpenAI LLM.")
        return openai_chatgpt(prompt)
    except Exception as err:
        logger.error(f"Failed to get response from Openai: {err}")
        raise
|
||||
223
lib/ai_web_researcher/metaphor_basic_neural_web_search.py
Normal file
223
lib/ai_web_researcher/metaphor_basic_neural_web_search.py
Normal file
@@ -0,0 +1,223 @@
|
||||
import os
|
||||
import sys
|
||||
import pandas as pd
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
|
||||
from metaphor_python import Metaphor
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
from tabulate import tabulate
|
||||
from collections import namedtuple
|
||||
import textwrap
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../../.env'))
|
||||
|
||||
from exa_py import Exa
|
||||
|
||||
from tenacity import (retry, stop_after_attempt, wait_random_exponential,)# for exponential backoff
|
||||
from .gpt_summarize_web_content import summarize_web_content
|
||||
from .gpt_competitor_analysis import summarize_competitor_content
|
||||
|
||||
|
||||
def get_metaphor_client():
    """
    Build the Exa (formerly Metaphor) API client from the environment.

    The previous tenacity ``@retry`` wrapper was removed: the only error
    raised here is the missing-key ValueError, which retrying cannot fix and
    which was only reported after several back-off waits as a RetryError.

    Returns:
        Exa: A client created with the METAPHOR_API_KEY value.

    Raises:
        ValueError: If the METAPHOR_API_KEY environment variable is unset or empty.
    """
    api_key = os.environ.get('METAPHOR_API_KEY')
    if not api_key:
        raise ValueError("METAPHOR_API_KEY environment variable not set!")
    return Exa(api_key)
|
||||
|
||||
|
||||
def metaphor_rag_search():
    """Placeholder for blog-section research.

    Currently it only obtains the client and returns None.
    """
    get_metaphor_client()
    return None
|
||||
|
||||
|
||||
|
||||
def metaphor_find_similar(similar_url):
    """
    Find pages similar to a URL and summarize what each competitor publishes.

    For every similar page a keyword search on its URL is run, the text of
    those results is concatenated, and the competitor's ``text`` is replaced
    with an LLM summary. Competitors whose research or summarization fails
    are logged and left unchanged.

    Args:
        similar_url (str): The URL to find similar content for.

    Returns:
        The response of the find-similar call, whose ``results`` are the
        competitors processed above.

    Raises:
        Exception: Re-raised when the initial find-similar request fails.
    """
    metaphor = get_metaphor_client()
    try:
        logger.info(f"Doing similar web search for url: {similar_url}")
        similar_response = metaphor.find_similar_and_contents(
            similar_url,
            highlights=True,
            num_results=10)
    except Exception as e:
        logger.error(f"Metaphor: Error in finding similar content: {e}")
        raise

    competitors = similar_response.results
    for acompetitor in tqdm(competitors, desc="Processing Competitors", unit="competitor"):
        try:
            # Kept in its own variable so the find-similar response is not
            # overwritten and stale results are never reused.
            research_response = metaphor.search_and_contents(
                acompetitor.url,
                type="keyword",
                num_results=5
            )
        except Exception as err:
            logger.error(f"Failed to do metaphor keyword/url research: {err}")
            continue

        # Results without text contribute nothing instead of raising on None.
        all_contents = "".join(
            r.text or ""
            for r in tqdm(research_response.results, desc=f"{acompetitor.url}", unit="research")
        )
        try:
            acompetitor.text = summarize_competitor_content(all_contents, "gemini")
        except Exception as err:
            logger.error(f"Failed to summarize_web_content: {err}")

    print_search_result(competitors)
    return similar_response
|
||||
|
||||
|
||||
|
||||
def metaphor_search_articles(query,
                             num_results=5,
                             use_autoprompt=True,
                             include_domains=None,
                             time_range=None,
                             similar_url=None):
    """
    Search for articles using the Metaphor API and summarize each result.

    Args:
        query (str): The search query.
        num_results (int): Number of results to retrieve.
        use_autoprompt (bool): Whether to use autoprompt.
        include_domains (list): List of domains to include; None means an empty list.
        time_range (str): Publication window: "past day", "past week",
            "past month" or "past year". Any other value disables the date filter.
        similar_url (str): When given, a similar-content search is also run
            for this URL; its result is not returned.

    Returns:
        The list of search results, each with ``text`` replaced by an LLM summary.

    Raises:
        Exception: Any search or summarization failure is logged and re-raised.
    """
    # Avoid a shared mutable default while sending the same request as before.
    if include_domains is None:
        include_domains = []
    lookback_days = {"past day": 1, "past week": 7, "past month": 30, "past year": 365}

    metaphor = get_metaphor_client()
    try:
        days = lookback_days.get(time_range)
        if days is None:
            start_published_date = None
        else:
            start_published_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')

        logger.info(f"Metaphor web search with Date: {start_published_date} and Query: {query}")
        try:
            search_response = metaphor.search_and_contents(
                query,
                include_domains=include_domains,
                use_autoprompt=use_autoprompt,
                start_published_date=start_published_date,
                num_results=num_results
            )
        except Exception as err:
            logger.error(f"Failed in metaphor.search_and_contents: {err}")
            # Re-raise: continuing would only fail later on the unbound
            # search_response and hide the real cause.
            raise

        # From each webpage, get a summary of the web page.
        contents_response = search_response.results
        for content in tqdm(contents_response, desc="Reading Web URL content:", unit="content"):
            content.text = summarize_web_content(content.text, "gemini")

        print_search_result(contents_response)

        if similar_url:
            logger.info(f"Doing similar/semantic search for URL: {similar_url}")
            metaphor_find_similar(similar_url)
        return contents_response

    except Exception as e:
        logger.error(f"Error in Metaphor searching articles: {e}")
        raise
|
||||
|
||||
|
||||
def print_search_result(contents_response):
    """Print search results as a table and append the table to the save file.

    Args:
        contents_response: Iterable of result objects exposing ``url``,
            ``title``, ``published_date`` and ``text`` attributes.

    A failure to save the table is logged and does not propagate.
    """
    table_headers = ["URL", "Title", "Published Date", "Summary"]
    table_data = [
        (result.url, result.title, result.published_date, result.text)
        for result in contents_response
    ]

    table = tabulate(table_data,
                     headers=table_headers,
                     tablefmt="fancy_grid",
                     colalign=["left", "left", "left", "left"],
                     maxcolwidths=[20, 20, 10, 60])
    print(table)

    # Saving is best-effort: a persistence problem must not hide the results.
    try:
        save_in_file(table)
    except Exception as save_results_err:
        logger.error(f"Failed to save search results: {save_results_err}")
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """Append search analysis text to the file named by SEARCH_SAVE_FILE.

    Args:
        table_content (str): Text to append; three newlines are written after it.

    Errors are logged and never raised, so saving stays best-effort.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("SEARCH_SAVE_FILE environment variable not set; search content was not saved.")
        return
    try:
        # fancy_grid tables contain box-drawing characters, so the encoding is
        # pinned to UTF-8 instead of relying on the platform default.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
|
||||
|
||||
def metaphor_scholar_search(query, include_domains=None, time_range="anytime"):
    """
    Search for papers using the Metaphor API.

    Args:
        query (str): The search query.
        include_domains (list): List of domains to include.
        time_range (str): Time range for published articles ("day", "week",
            "month", "year"). Any other value, including "anytime", disables
            the date filter.

    Returns:
        The response of the client's ``search`` call, or None when the search
        fails (the error is logged, not raised).
    """
    # Local import: the module only imports datetime and timedelta.
    from datetime import timezone

    lookbacks = {
        "day": timedelta(days=1),
        "week": timedelta(weeks=1),
        "month": timedelta(weeks=4),
        "year": timedelta(days=365),
    }

    client = get_metaphor_client()
    try:
        lookback = lookbacks.get(time_range)
        if lookback is None:
            start_published_date = None
        else:
            # datetime.utcnow() is deprecated; an aware UTC "now" formats to
            # the same string.
            start_published_date = (datetime.now(timezone.utc) - lookback).strftime('%Y-%m-%dT%H:%M:%SZ')

        return client.search(query,
                             include_domains=include_domains,
                             start_published_date=start_published_date,
                             use_autoprompt=True)
    except Exception as e:
        logger.error(f"Error in searching papers: {e}")
        return None
|
||||
156
lib/ai_web_researcher/tavily_ai_search.py
Normal file
156
lib/ai_web_researcher/tavily_ai_search.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""
|
||||
This Python script uses the Tavily AI service to perform advanced searches based on specified keywords and options. It retrieves Tavily AI search results, pretty-prints them using Rich and Tabulate, and provides additional information such as the answer to the search query and follow-up questions.
|
||||
|
||||
Features:
|
||||
- Utilizes the Tavily AI service for advanced searches.
|
||||
- Retrieves API keys from the environment variables loaded from a .env file.
|
||||
- Configures logging with Loguru for informative messages.
|
||||
- Implements a retry mechanism using Tenacity to handle transient failures during Tavily searches.
|
||||
- Displays search results, including titles, snippets, and links, in a visually appealing table using Tabulate and Rich.
|
||||
|
||||
Usage:
|
||||
- Ensure the necessary API keys are set in the .env file.
|
||||
- Run the script to perform a Tavily AI search with specified keywords and options.
|
||||
- The search results, including titles, snippets, and links, are displayed in a formatted table.
|
||||
- Additional information, such as the answer to the search query and follow-up questions, is presented in separate tables.
|
||||
|
||||
Modifications:
|
||||
- To modify the script, update the environment variables in the .env file with the required API keys.
|
||||
- Adjust the search parameters, such as keywords and search depth, in the `get_tavilyai_results` function as needed.
|
||||
- Customize logging configurations and table formatting according to preferences.
|
||||
|
||||
To-Do (TBD):
|
||||
- Consider adding further enhancements or customization based on specific use cases.
|
||||
|
||||
Note: This script depends on external libraries such as Tavily, python-dotenv, Rich, Tabulate, Loguru, and Tenacity. Install them using 'pip install tavily-python python-dotenv rich tabulate loguru tenacity' if not already installed.
|
||||
"""
|
||||
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
from tavily import TavilyClient
|
||||
from rich import print
|
||||
from tabulate import tabulate
|
||||
# Load environment variables from .env file
|
||||
load_dotenv(Path('../../.env'))
|
||||
from rich import print
|
||||
|
||||
# Configure logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
from tenacity import retry, stop_after_attempt, wait_random_exponential
|
||||
|
||||
from .gpt_titles_faq import gpt_titles_faqs_google_search
|
||||
|
||||
def get_tavilyai_results(keywords, include_urls, search_depth="advanced"):
    """
    Get Tavily AI search results based on specified keywords and options.

    Only the search request is retried with exponential back-off. The former
    function-level ``@retry`` never saw search failures, because they were
    caught and logged, and only retried the missing-key error.

    Args:
        keywords (str): Keywords for Tavily AI search.
        include_urls: Domains to restrict the search to; forwarded as
            ``include_domains`` when truthy.
            NOTE(review): presumably a list of domains rather than a
            comma-separated string -- confirm against callers.
        search_depth (str, optional): Search depth option (default is "advanced").

    Returns:
        dict: Tavily AI search results, or None if searching or printing the
        results failed (the error is logged).

    Raises:
        ValueError: If the TAVILY_API_KEY environment variable is not set.
        SystemExit: If the Tavily client cannot be created.
    """
    # Validate configuration first so a missing key fails immediately.
    api_key = os.getenv('TAVILY_API_KEY')
    if not api_key:
        raise ValueError("TAVILY_API_KEY environment variable is not set.")

    logger.info(f"Running Tavily search on: {keywords}")

    try:
        client = TavilyClient(api_key=api_key)
    except Exception as err:
        logger.error(f"Failed to create Tavily client. Check TAVILY_API_KEY: {err}")
        sys.exit(1)

    search_options = {"include_answer": True}
    if include_urls:
        search_options["include_domains"] = include_urls

    # Retry transient search failures; reraise=True surfaces the last real
    # error instead of a RetryError.
    search_with_retry = retry(
        wait=wait_random_exponential(min=1, max=60),
        stop=stop_after_attempt(6),
        reraise=True,
    )(client.search)

    try:
        tavily_search_result = search_with_retry(keywords, search_depth, **search_options)
        print_result_table(tavily_search_result)
        return tavily_search_result
    except Exception as err:
        logger.error(f"Failed to do Tavily Research: {err}")
        return None
|
||||
|
||||
|
||||
def print_result_table(output_data):
    """Pretty print the Tavily AI search result and append each table to the save file.

    Three tables are produced: the result list (title, snippet, link), the
    answer to the query, and the follow-up questions.

    Args:
        output_data (dict): Tavily response; the keys read are "results",
            "query", "answer" and "follow_up_questions".
    """
    def _show_and_save(table):
        # Print a table, then persist it; a save failure is only logged.
        print(table)
        try:
            save_in_file(table)
        except Exception as save_results_err:
            logger.error(f"Failed to save search results: {save_results_err}")

    # A missing or null "results" entry is treated as no results.
    table_data = [
        [item.get("title", ""), item.get("content", ""), item.get("url", "")]
        for item in output_data.get("results") or []
    ]
    if table_data:
        _show_and_save(tabulate(table_data,
                                headers=["Title", "Snippet", "Link"],
                                tablefmt="fancy_grid",
                                colalign=["left", "left", "left"],
                                maxcolwidths=[30, 60, 30]))
    else:
        logger.warning("Tavily search returned no results to display.")

    # Display the 'answer' in a table
    _show_and_save(tabulate([[output_data.get("answer")]],
                            headers=[f"The answer to search query: {output_data.get('query')}"],
                            tablefmt="fancy_grid",
                            maxcolwidths=[80]))

    # Display the 'follow_up_questions' in a table
    _show_and_save(tabulate([[output_data.get("follow_up_questions")]],
                            headers=[f"Search Engine follow up questions for query: {output_data.get('query')}"],
                            tablefmt="fancy_grid",
                            maxcolwidths=[80]))
|
||||
|
||||
|
||||
def save_in_file(table_content):
    """Append search analysis text to the file named by SEARCH_SAVE_FILE.

    Args:
        table_content (str): Text to append; three newlines are written after it.

    Errors are logged and never raised, so saving stays best-effort.
    """
    file_path = os.environ.get('SEARCH_SAVE_FILE')
    if not file_path:
        logger.error("SEARCH_SAVE_FILE environment variable not set; search content was not saved.")
        return
    try:
        # fancy_grid tables contain box-drawing characters, so the encoding is
        # pinned to UTF-8 instead of relying on the platform default.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(table_content)
            file.write("\n" * 3)  # Add three newlines at the end
        logger.info(f"Search content saved to {file_path}")
    except Exception as e:
        logger.error(f"Error occurred while writing to the file: {e}")
|
||||
23
lib/ai_web_researcher/web_research_report.py
Normal file
23
lib/ai_web_researcher/web_research_report.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from langchain.adapters.openai import convert_openai_messages
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
|
||||
from ..gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
|
||||
def write_web_research_report(web_research, faq_questions, gpt_provider="gemini"):
    """Write a research report from web research data and related FAQ questions.

    Args:
        web_research: Web research results (described to the model as JSON);
            interpolated into the prompt.
        faq_questions: Related FAQ questions; interpolated into the prompt.
        gpt_provider (str): Provider selector, matched by substring. Only
            "gemini" is implemented.

    Returns:
        The value returned by ``gemini_text_response`` for the prompt.

    Raises:
        NotImplementedError: For "openai". The previous branch referenced an
            undefined prompt and an undefined ``openai_research_report``, so
            it could never succeed.
        ValueError: For any other provider (previously an unbound-variable error).
    """
    if "gemini" in gpt_provider:
        # One implicitly concatenated string wrapped in a list, as before;
        # each fragment now ends with a separator so sentences no longer run
        # together.
        prompt = ["You are an SEO and marketing expert, who writes unique, factual and comprehensive research reports. "
                  "I will provide you web research report as json data and a list of related FAQ questions. "
                  "Use given json as context for writing your research report. "
                  "Your sole purpose is to write well written, critically acclaimed, objective and structured research report. "
                  "Use the urls from json content to provide cititations and include it in referances section of your report. "
                  "Include appropriate emojis in your research report. "
                  "Format your report in MLA format and markdown style, with special focus on readibility. "
                  f"Do not provide explanations for your response.\nWeb research Report: \"\"\" {web_research} \"\"\"\n "
                  f"\nList of FAQ questions: \"\"\" {faq_questions} \"\"\"\n"]
        return gemini_text_response(prompt)

    if "openai" in gpt_provider:
        raise NotImplementedError(
            "OpenAI research report generation is not implemented yet; use gpt_provider='gemini'."
        )

    raise ValueError(
        f"Unsupported gpt_provider value: {gpt_provider!r} (expected 'gemini' or 'openai')"
    )
|
||||
137
lib/ai_web_researcher/you_web_reseacher.py
Normal file
137
lib/ai_web_researcher/you_web_reseacher.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import requests
|
||||
from clint.textui import progress
|
||||
from loguru import logger
|
||||
|
||||
|
||||
|
||||
def search_ydc_index(search_query, num_web_results=10, country="IN", api_key="<api-key>", timeout=30):
    """
    Search YDC Index API and retrieve results.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: The response from the YDC Index API in JSON format, or
        ``{"error": "<message>"}`` if the request or its processing failed.
    """
    url = "https://api.ydc-index.io/search"
    querystring = {
        "query": search_query,
        "num_web_results": str(num_web_results),
        "country": country
    }
    headers = {"X-API-Key": api_key}

    try:
        with progress.Bar(expected_size=num_web_results, label="Searching YDC Index") as bar:
            response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            result_json = response.json()
            # Bar.show() expects a number. Passing the result list raised a
            # TypeError, which turned every successful call into an error dict.
            received = len(result_json.get("web_results") or [])
            bar.show(min(received, num_web_results))

        return result_json

    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
|
||||
|
||||
def get_rag_results(search_query, num_web_results=10, country="IN", api_key="<api-key>", timeout=60):
    """
    Retrieve RAG (retrieval-augmented generation) results from YDC Index API.

    Args:
        search_query (str): The search query.
        num_web_results (int): Number of web results to retrieve.
        country (str): Country code.
        api_key (str): YDC Index API key.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: The response from the YDC Index API in JSON format, or
        ``{"error": "<message>"}`` if the request or its processing failed.
    """
    url = "https://api.ydc-index.io/rag"
    querystring = {
        "query": search_query,
        "num_web_results": str(num_web_results),
        "country": country
    }
    headers = {"X-API-Key": api_key}

    try:
        with progress.Bar(expected_size=num_web_results, label="Fetching RAG Results") as bar:
            response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            result_json = response.json()
            # Bar.show() expects a number. Passing the result list raised a
            # TypeError, which turned every successful call into an error dict.
            received = len(result_json.get("web_results") or [])
            bar.show(min(received, num_web_results))

        return result_json

    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
|
||||
|
||||
|
||||
def get_news_results(query, spellcheck=True, api_key="<api-key>", timeout=30):
    """
    Retrieve news results from YDC Index API.

    Args:
        query (str): The search query.
        spellcheck (bool): Whether to enable spellcheck; sent as "true"/"false".
        api_key (str): YDC Index API key.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: The response from the YDC Index API in JSON format, or
        ``{"error": "<message>"}`` if the request or its processing failed.
    """
    url = "https://api.ydc-index.io/news"
    querystring = {
        "q": query,
        "spellcheck": str(spellcheck).lower()
    }
    headers = {"X-API-Key": api_key}

    try:
        with progress.Bar(expected_size=1, label="Fetching News Results") as bar:
            response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            result_json = response.json()
            # Bar.show() requires the progress value. Calling it with no
            # argument raised a TypeError and discarded the fetched data.
            bar.show(1)

        return result_json

    except requests.exceptions.RequestException as req_exc:
        logger.error(f"Request to YDC Index API failed: {req_exc}")
        return {"error": str(req_exc)}

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        return {"error": str(e)}
|
||||
|
||||
|
||||
# Example usage: guarded so that importing this module never fires network
# requests (previously these calls ran on import with the placeholder API key).
if __name__ == "__main__":
    search_query = "Getting started with llamaindex"
    result = get_news_results(search_query)
    print(result)
    result = get_rag_results(search_query)
    print(result)
    result = search_ydc_index(search_query)
    print(result)
|
||||
Reference in New Issue
Block a user