AI Blogen - WIP - 0.0.00000.001
This commit is contained in:
323
lib/arxiv_schlorly_research.py
Normal file
323
lib/arxiv_schlorly_research.py
Normal file
@@ -0,0 +1,323 @@
|
||||
####################################################
|
||||
#
|
||||
# FIXME: Gotta use this lib: https://github.com/monk1337/resp/tree/main
|
||||
# https://github.com/danielnsilva/semanticscholar
|
||||
# https://github.com/shauryr/S2QA
|
||||
#
|
||||
####################################################
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
import arxiv
|
||||
import PyPDF2
|
||||
import io
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib.parse
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def fetch_arxiv_data(query, max_results=10):
    """
    Search arXiv and collect basic metadata for the matching papers.

    Args:
        query (str): The arXiv search query.
        max_results (int): Maximum number of papers to request.

    Returns:
        list: One ``[title, published, entry_id, summary, pdf_url]`` list per
        result, in the order returned by the client (sorted by submission
        date).

    Raises:
        Exception: Any error raised by the arXiv client is logged and
        re-raised unchanged.
    """
    try:
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        return [
            [result.title, result.published, result.entry_id, result.summary, result.pdf_url]
            for result in client.results(search)
        ]
    except Exception as e:
        # Report through the module logger like the rest of the file, then
        # re-raise with the original traceback intact.
        logger.error(f"An error occurred while fetching data from arXiv: {e}")
        raise
|
||||
|
||||
|
||||
def create_dataframe(data, column_names):
    """
    Build a pandas DataFrame from row data.

    Args:
        data: Row data accepted by ``pd.DataFrame`` (e.g. a list of lists).
        column_names (list): Column labels for the frame.

    Returns:
        pd.DataFrame: The populated frame, or an empty DataFrame if
        construction fails (the error is logged, not raised).
    """
    try:
        return pd.DataFrame(data, columns=column_names)
    except Exception as e:
        # Best-effort by design: callers get an empty frame instead of an error.
        logger.error(f"An error occurred while creating DataFrame: {e}")
        return pd.DataFrame()
|
||||
|
||||
|
||||
# Seconds to wait for the arXiv HTML page before giving up.
_ARXIV_REQUEST_TIMEOUT = 30

# A "References" / "Bibliography" heading standing alone on its line
# (optionally numbered). Anchoring to a whole line keeps an in-sentence
# mention such as "see references therein" from truncating the paper.
_REFERENCES_HEADING_RE = re.compile(
    r'^[ \t]*(?:\d+\.?[ \t]+)?(?:references|bibliography)[ \t]*$',
    re.IGNORECASE | re.MULTILINE,
)

# An "Acknowledgements" / "Acknowledgments" heading line plus the text that
# follows it, up to the next line starting with two capital letters or the end
# of the text. Only the heading word is case-insensitive; the look-ahead must
# stay case-sensitive or every line would count as the "next big title".
_ACKNOWLEDGEMENTS_SECTION_RE = re.compile(
    r'^[ \t]*(?:\d+\.?[ \t]+)?(?i:acknowledge?ments?)[ \t]*$(?s:.*?)(?=\n[A-Z]{2,}|\Z)',
    re.MULTILINE,
)


def _strip_back_matter(text):
    """Drop the references/bibliography tail and the acknowledgements section."""
    heading = _REFERENCES_HEADING_RE.search(text)
    if heading:
        text = text[:heading.start()]
    return _ACKNOWLEDGEMENTS_SECTION_RE.sub('', text)


def _extract_html_content(url):
    """Return the paper text from the arXiv HTML rendering at ``url``."""
    response = requests.get(url, timeout=_ARXIV_REQUEST_TIMEOUT)
    response.raise_for_status()  # Raise an exception for HTTP errors

    soup = BeautifulSoup(response.content, "html.parser")
    main_content = soup.find('div', class_='ltx_page_content')
    if not main_content:
        logger.warning("Main content not found in the page.")
        return "Main content not found."

    # Remove the package-alerts section if present.
    alert_section = main_content.find('div', class_='package-alerts ltx_document')
    if alert_section:
        alert_section.decompose()

    # Remove abstract and authors if present.
    for element_id in ("abs", "authors"):
        element = main_content.find(id=element_id)
        if element:
            element.decompose()

    return main_content.text.strip()


def _extract_pdf_content(url):
    """Download the paper's PDF, extract its text and strip the back matter."""
    arxiv_id = url.split('/')[-1]
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id])))
    pdf_filename = paper.download_pdf(filename=f"downloaded-paper-{arxiv_id}.pdf")

    page_texts = []
    try:
        with open(pdf_filename, 'rb') as f:
            for page in PyPDF2.PdfReader(f).pages:
                try:
                    page_text = page.extract_text()
                except UnicodeDecodeError as err:
                    # Skip only the page that cannot be decoded.
                    logger.error(f"UnicodeDecodeError that arises during text extraction: {err}")
                    continue
                if page_text:
                    page_texts.append(page_text + '\n')
    finally:
        # Always remove the temporary download, even when parsing fails.
        os.remove(pdf_filename)

    return _strip_back_matter(''.join(page_texts))


def get_arxiv_main_content(url):
    """
    Returns the main content of an arXiv paper.

    The HTML rendering at ``url`` is tried first. If fetching or parsing it
    raises, the PDF is downloaded through the arXiv API and its text is
    extracted with the references, bibliography and acknowledgements removed.

    Args:
        url (str): The URL of the arXiv paper; its last path segment is used
            as the arXiv ID for the PDF fallback.

    Returns:
        str: The main content of the paper. Returns the literal
        "Main content not found." when the HTML page has no
        ``ltx_page_content`` div, and "Failed to retrieve content." when the
        PDF fallback also fails. No exception is propagated.
    """
    try:
        return _extract_html_content(url)
    except Exception as html_error:
        # Could not access the arXiv HTML content; fall back to the PDF.
        logger.warning(f"HTML content not accessible, trying PDF: {html_error}")

    try:
        return _extract_pdf_content(url)
    except Exception as pdf_error:
        logger.error(f"Failed to process PDF: {pdf_error}")
        return "Failed to retrieve content."
|
||||
|
||||
|
||||
def download_image(image_url, base_url, folder="images"):
    """
    Download a single image into ``folder``.

    Args:
        image_url (str): Absolute or relative image URL. ``data:image`` URIs
            are skipped.
        base_url (str): Page URL used to resolve a relative ``image_url``.
        folder (str): Destination directory; created if missing.

    Returns:
        bool: True if the image was saved, False if it was skipped or the
        request failed.
    """
    # Skip downloading if the image URL is a data URI
    if image_url.startswith('data:image'):
        logger.info(f"Skipping download of data URI image: {image_url}")
        return False

    os.makedirs(folder, exist_ok=True)

    # Resolve scheme-less URLs against the page. base_url is treated as a
    # directory so plain relative paths resolve beneath it; urljoin also handles
    # root-relative ("/x.png") and protocol-relative ("//host/x.png") forms,
    # which plain concatenation turned into invalid URLs.
    if not urllib.parse.urlparse(image_url).scheme:
        if not base_url.endswith('/'):
            base_url += '/'
        image_url = urllib.parse.urljoin(base_url, image_url)

    try:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error downloading {image_url}: {str(e)}")
        return False

    # Name the file from the URL path only, so a query string or fragment does
    # not end up in the file name.
    image_name = os.path.basename(urllib.parse.urlparse(image_url).path) or "image"
    with open(os.path.join(folder, image_name), 'wb') as file:
        file.write(response.content)
    return True
|
||||
|
||||
|
||||
def scrape_images_from_arxiv(url):
    """
    Collect the ``src`` of every ``<img>`` tag on a page.

    Args:
        url (str): The page to fetch.

    Returns:
        list: The ``src`` attribute values in document order, or an empty
        list if the page could not be fetched.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error fetching page {url}: {str(e)}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # src=True keeps only <img> tags that carry a src attribute.
    return [img['src'] for img in soup.find_all('img', src=True)]
|
||||
|
||||
|
||||
def arxiv_bibtex(arxiv_id):
    """
    Get the BibTeX entry for an arXiv paper.

    The metadata is fetched from the arXiv export API and formatted as a
    ``@MISC`` entry keyed by the first author's surname plus the two-digit
    year taken from the entry's ``updated`` date.

    Args:
        arxiv_id: The arXiv ID of the paper.

    Returns:
        A string containing the BibTeX entry.

    Raises:
        ValueError: If the API response contains no entry for ``arxiv_id``.
        Exception: Network or XML parsing errors propagate unchanged.
    """
    import urllib.request
    import xml.dom.minidom

    # Download and parse the XML; the context manager closes the connection
    # even if parsing fails.
    with urllib.request.urlopen(f'http://export.arxiv.org/api/query?id_list={arxiv_id}') as usock:
        xmldoc = xml.dom.minidom.parse(usock)

    entries = xmldoc.getElementsByTagName("entry")
    if not entries:
        raise ValueError(f"No arXiv entry found for ID: {arxiv_id}")
    entry = entries[0]

    text_year = entry.getElementsByTagName("updated")[0].firstChild.data[:4]

    # Collapse internal whitespace so a line-wrapped title becomes one line.
    text_title = ' '.join(entry.getElementsByTagName("title")[0].firstChild.data.split())

    authorlist = []
    first_author_surname = None
    for person_name in entry.getElementsByTagName("author"):
        name_parts = person_name.getElementsByTagName("name")[0].firstChild.data.split()
        if not name_parts:
            continue
        surname = name_parts[-1]
        given_name = ' '.join(name_parts[:-1])
        # Single-word names have no given part; avoid emitting "Surname, ".
        authorlist.append(f"{surname}, {given_name}" if given_name else surname)
        if first_author_surname is None:
            first_author_surname = surname

    if first_author_surname is None:
        # No usable author: still produce a citation key instead of crashing.
        first_author_surname = "Unknown"

    authors = ' and '.join(authorlist)

    # Construct the BibTeX entry. Every field value is wrapped in braces,
    # including the author list.
    bibtex = f"@MISC{{{first_author_surname}{text_year[-2:]},\n"
    bibtex += f"  author = {{{authors}}},\n"
    bibtex += f"  title = {{{text_title}}},\n"
    bibtex += f"  year = {{{text_year}}},\n"
    bibtex += f"  eprint = {{{arxiv_id}}},\n"
    bibtex += f"  url = {{http://arxiv.org/abs/{arxiv_id}}}\n"
    bibtex += "}"

    return bibtex
|
||||
|
||||
|
||||
#from serpapi import GoogleSearch
|
||||
#params = {
|
||||
# "api_key": "os.getenv(SERPER_API_KEY)",
|
||||
# "engine": "google_scholar",
|
||||
# "q": "llm",
|
||||
# "hl": "en",
|
||||
# "as_ylo": "2023",
|
||||
# "as_yhi": "2024"
|
||||
#}
|
||||
#search = GoogleSearch(params)
|
||||
#results = search.get_dict()
|
||||
|
||||
|
||||
# Matches new-style arXiv IDs (e.g. 2401.01234 or 2401.01234v2) inside
# abs, pdf and html arXiv URLs. Compiled once at import time.
_ARXIV_URL_ID_RE = re.compile(r'arxiv\.org/(?:abs|pdf|html)/(\d+\.\d+)(v\d+)?')


def extract_arxiv_ids_from_line(line):
    """
    Extract the arXiv ID from a given line of text.

    Recognises ``arxiv.org/abs/...``, ``arxiv.org/pdf/...`` and
    ``arxiv.org/html/...`` URLs. Only the first match in the line is used.

    Args:
        line (str): A line of text potentially containing an arXiv URL.

    Returns:
        str: The extracted arXiv ID, including the version suffix (``vN``)
        when present, or None if not found.
    """
    match = _ARXIV_URL_ID_RE.search(line)
    if match is None:
        return None
    return match.group(1) + (match.group(2) or '')
|
||||
|
||||
|
||||
def read_written_ids(file_path):
    """
    Read already written arXiv IDs from a file.

    Each non-blank line is treated as one ID, with surrounding whitespace
    stripped. Blank lines are ignored so they never register as an
    empty-string ID.

    Args:
        file_path (str): Path to the file containing written IDs.

    Returns:
        set: A set of arXiv IDs. If the file is missing or unreadable the
        error is logged and the IDs collected so far (possibly none) are
        returned.
    """
    written_ids = set()
    try:
        with open(file_path, 'r') as file:
            for line in file:
                arxiv_id = line.strip()
                if arxiv_id:
                    written_ids.add(arxiv_id)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
    except Exception as e:
        logger.error(f"Error while reading the file: {e}")
    return written_ids
|
||||
|
||||
|
||||
def append_id_to_file(arxiv_id, output_file_path):
    """
    Append one arXiv ID, followed by a newline, to a file.

    The file is opened in append mode, which creates it when it does not
    exist yet; the existence check only selects the log message. Any error
    is logged and not re-raised.

    Args:
        arxiv_id (str): The arXiv ID to append.
        output_file_path (str): Path to the output file.
    """
    try:
        if os.path.exists(output_file_path):
            logger.info(f"Appending to existing file: {output_file_path}")
        else:
            logger.info(f"File does not exist. Creating new file: {output_file_path}")

        with open(output_file_path, 'a') as outfile:
            outfile.write(arxiv_id + '\n')
    except Exception as e:
        logger.error(f"Error while appending to file: {e}")
|
||||
@@ -1,13 +1,23 @@
|
||||
import sys
|
||||
|
||||
from .gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def blog_with_research(report, blog):
|
||||
def blog_with_research(report, blog, gpt_providers="openai"):
|
||||
"""Combine the given online research and gpt blog content"""
|
||||
|
||||
prompt = f"""
|
||||
You are an expert copywriter specializing in content optimization for SEO.
|
||||
I will provide you with a 'research report' and a 'blog content' on the same topic.
|
||||
Your task is to transform and combine the given research and blog content into a well-structured, unique
|
||||
Your task is to transform and combine the given research and blog content into a well-structured markdown, unique
|
||||
and engaging blog article.
|
||||
|
||||
Your objectives include:
|
||||
@@ -27,18 +37,40 @@ def blog_with_research(report, blog):
|
||||
11. Ensure Uniqueness: Guarantee the article is plagiarism-free. Write in unique, informative style.
|
||||
12. Punctuation: Use appropriate question marks at the end of questions.
|
||||
13. Pass AI Detection Tools: Create content that easily passes AI plagiarism detection tools.
|
||||
14. REMEMBER: Use the formatting style of given research report and include citations, referances in combined article.
|
||||
14. REMEMBER: Use the formatting style of given research report and include highlights, citations, referances in combined article.
|
||||
|
||||
Follow these guidelines to combine and write a new, unique, and informative blog article
|
||||
Follow these guidelines to combine and write a new, unique, and informative blog article
|
||||
that will rank well in search engine results and engage readers effectively.
|
||||
|
||||
Create a blog post from the given research report and blog content below.
|
||||
Create a blog post, in markdown, from the given research report and blog content below.
|
||||
Research report: {report}
|
||||
Blog content: {blog}
|
||||
"""
|
||||
try:
|
||||
# TBD: Add logic for which_provider and which_model
|
||||
response = openai_chatgpt(prompt)
|
||||
return response
|
||||
except Exception as err:
|
||||
SystemError(f"Error in combining blog and research report.")
|
||||
|
||||
if 'gemini' in gpt_providers:
|
||||
prompt = f"""You are an expert copywriter specializing in content optimization for SEO.
|
||||
You are world famous writer, known for your originality and engaging content.
|
||||
I will provide you with a 'research report' and a 'blog content' on the same topic.
|
||||
Your task is to transform and combine the given research and blog content into a blog article.
|
||||
Your blog should be highly detailed and well formatted.
|
||||
Include a section in your blog on the highlights section of blog content.
|
||||
Do not miss out any details from provided content. Always, include figures, data, results from given content.
|
||||
It is important that your blog is original and unique. It should be highly readable and SEO optimized.
|
||||
|
||||
Research report: '{report}'
|
||||
Blog content: '{blog}'
|
||||
"""
|
||||
try:
|
||||
response = gemini_text_response(prompt)
|
||||
return response
|
||||
except Exception as err:
|
||||
logger.error(f"Failed to get response from gemini: {err}")
|
||||
raise err
|
||||
elif 'openai' in gpt_providers:
|
||||
try:
|
||||
logger.info("Calling OpenAI LLM.")
|
||||
response = openai_chatgpt(prompt)
|
||||
return response
|
||||
except Exception as err:
|
||||
logger.error(f"failed to get response from Openai: {err}")
|
||||
raise err
|
||||
|
||||
@@ -4,12 +4,13 @@ from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
def convert_tomarkdown_format(blog_content, gpt_provider="openai"):
|
||||
""" Helper for converting content to markdown format for static sites. """
|
||||
prompt = f"""
|
||||
As an expert in markdown language format and font matter,
|
||||
|
||||
prompt = f"""As an expert in markdown language format and font matter,
|
||||
I will provide you with a blog post.
|
||||
Your task is to improve formatting of given blog post.
|
||||
Your task is to only Improve the formatting and structure of a blog post to enhance readability, visual appeal, and overall user experience. Do not alter the content of the provided blog. Modify only for the formatting.
|
||||
Dont provide explanations, just your final response.
|
||||
|
||||
Use below guidelines to do formatting, structuring to make it highly readable:
|
||||
Guidelines to do formatting:
|
||||
1. **Headings for Structure:**
|
||||
- Use # for the main title of the blog post.
|
||||
- Use ## for subheadings that divide the post into clear sections.
|
||||
@@ -54,8 +55,6 @@ def convert_tomarkdown_format(blog_content, gpt_provider="openai"):
|
||||
- Keep the blog post organized and easy to navigate.
|
||||
- Use a consistent formatting style throughout the post.
|
||||
|
||||
Dont provide explanations, just your final response.
|
||||
Convert the given blog post in well organised markdown content:\n
|
||||
Blog Post: '{blog_content}'"""
|
||||
|
||||
if 'openai' in gpt_provider:
|
||||
@@ -65,6 +64,10 @@ def convert_tomarkdown_format(blog_content, gpt_provider="openai"):
|
||||
except Exception as err:
|
||||
SystemError(f"Openai Error in converting to Markdown format.")
|
||||
elif 'gemini' in gpt_provider:
|
||||
|
||||
prompt = f""" Convert the given blog post into well structured MARKDOWN content.
|
||||
Do not alter the given blog post.
|
||||
blog post: "{blog_content}" """
|
||||
try:
|
||||
response = gemini_text_response(prompt)
|
||||
return response
|
||||
|
||||
26
lib/get_blog_metadata.py
Normal file
26
lib/get_blog_metadata.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import sys
|
||||
|
||||
from .get_blog_meta_desc import generate_blog_description
|
||||
from .get_tags import get_blog_tags
|
||||
from .get_blog_category import get_blog_categories
|
||||
from .get_blog_title import generate_blog_title
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def blog_metadata(blog_content, gpt_providers="openai"):
    """
    Generate the title, meta description, tags and categories for a blog.

    Each piece is produced by its own helper, called in that order with the
    same content and provider. Everything except the title is logged.

    Args:
        blog_content: The blog text passed to each helper.
        gpt_providers (str): Provider selector forwarded to each helper.

    Returns:
        tuple: ``(blog_title, blog_meta_desc, blog_tags, blog_categories)``.
    """
    title = generate_blog_title(blog_content, gpt_providers)

    meta_description = generate_blog_description(blog_content, gpt_providers)
    logger.info(f"The blog meta description is: {meta_description}\n")

    tags = get_blog_tags(blog_content, gpt_providers)
    logger.info(f"Blog tags for generated content: {tags}")

    categories = get_blog_categories(blog_content, gpt_providers)
    logger.info(f"Generated blog categories: {categories}\n")

    return title, meta_description, tags, categories
|
||||
@@ -36,4 +36,4 @@ def generate_blog_title(blog_article, gpt_providers="openai"):
|
||||
response = openai_chatgpt(prompt)
|
||||
return response
|
||||
except Exception as err:
|
||||
SystemError(f"Error in generating blog summary: {err}")
|
||||
SystemError(f"Failed to get response from Openai: {err}")
|
||||
|
||||
79
lib/gpt_providers/gemini_arvix_image_details.py
Normal file
79
lib/gpt_providers/gemini_arvix_image_details.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import google.generativeai as genai
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../../.env'))
|
||||
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_random_exponential,
|
||||
) # for exponential backoff
|
||||
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def gemini_arxiv_img_info(img_path):
    """
    Get a plain-language explanation of an image from an arXiv paper.

    The image is sent to the ``gemini-pro-vision`` model together with a
    fixed prompt asking for its key findings.

    Args:
        img_path: Path to the image file.

    Returns:
        str: The text of the model response.

    Raises:
        FileNotFoundError: If ``img_path`` does not exist.
        Exception: Configuration, model creation and generation errors are
            logged and re-raised.

    NOTE(review): the retry decorator has no exception filter, so a missing
    file is presumably retried as well before the error surfaces. Confirm
    against tenacity's defaults.
    """
    import mimetypes

    # Validate that an image is present before doing any API work.
    if not (img := Path(img_path)).exists():
        raise FileNotFoundError(f"Could not find image: {img}")

    try:
        # Prefer the variable name the text provider uses; keep the legacy
        # name as a fallback so existing environments continue to work.
        genai.configure(api_key=os.getenv("GEMINI_API_KEY") or os.getenv("API_KEY"))
    except Exception as e:
        logging.error(f"Could not load gemini API key: {e}")
        raise

    # Set up the model
    generation_config = {
        "temperature": 0.9,
        "top_p": 1,
        "top_k": 1,
        "max_output_tokens": 1096,
    }

    safety_settings = [
        {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
        for category in (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
    ]

    try:
        model = genai.GenerativeModel(model_name="gemini-pro-vision",
                                      generation_config=generation_config,
                                      safety_settings=safety_settings)
    except Exception as e:
        logging.error(f"Could not create GenerativeModel: {e}")
        raise

    # Declare the real image type where it can be guessed from the file name;
    # fall back to PNG, which was previously always assumed.
    mime_type, _ = mimetypes.guess_type(str(img))
    image_part = {
        "mime_type": mime_type or "image/png",
        "data": img.read_bytes(),
    }

    prompt_parts = [
        "As scholar on evaluating research papers, I will provide you with an image from a research paper. Your task is to explain the image in details so that I can use it in a blog article. Explain the key findings and conclusions from the image. Your description should be in simple terms to explain to a wider audience. Explain key findings from the given image.",
        image_part,
    ]

    try:
        response = model.generate_content(prompt_parts)
        return response.text
    except Exception as e:
        logging.error(f"Could not generate gemini content: {e}")
        raise
|
||||
@@ -7,6 +7,7 @@ import google.generativeai as genai
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../../.env'))
|
||||
from .mistral_chat_completion import mistral_text_response
|
||||
|
||||
from tenacity import (
|
||||
retry,
|
||||
@@ -17,7 +18,7 @@ from tenacity import (
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
|
||||
def gemini_text_response(prompt):
|
||||
""" Provide a programming blog and get code exmaples."""
|
||||
""" Common functiont to get response from gemini pro Text. """
|
||||
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Set up the model
|
||||
@@ -25,14 +26,17 @@ def gemini_text_response(prompt):
|
||||
"temperature": 1,
|
||||
"top_p": 1,
|
||||
"top_k": 1,
|
||||
"max_output_tokens": 4096,
|
||||
"max_output_tokens": 6096,
|
||||
}
|
||||
|
||||
model = genai.GenerativeModel(model_name="gemini-pro", generation_config=generation_config)
|
||||
try:
|
||||
response = model.generate_content(prompt)
|
||||
except Exception as err:
|
||||
logger.error(f"Failed to get response from Gemini: {err}. Retrying..")
|
||||
gemini_research_report(query)
|
||||
|
||||
except Exception as err:
|
||||
logger.error(f"Failed to get response from Gemini: {err}. Retrying.")
|
||||
# Try with minstral.
|
||||
print(f"\n\n\n--MINSTRAL--\n\n\n\n")
|
||||
response = mistral_text_response(prompt)
|
||||
return response
|
||||
return response.text
|
||||
|
||||
40
lib/gpt_providers/mistral_chat_completion.py
Normal file
40
lib/gpt_providers/mistral_chat_completion.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from mistralai.client import MistralClient
|
||||
from mistralai.models.chat_completion import ChatMessage
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(module)s-%(lineno)d-%(message)s')
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path('../../.env'))
|
||||
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_random_exponential,
|
||||
) # for exponential backoff
|
||||
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def mistral_text_response(prompt):
    """
    Common function to get a text response from Mistral.

    Sends ``prompt`` as a single user message to the ``mistral-medium`` model
    and returns the generated text. Failures are retried by the decorator.

    Args:
        prompt (str): The prompt to send.

    Returns:
        str: The content of the first completion choice.

    Raises:
        KeyError: If the ``MISTRAL_API_KEY`` environment variable is not set.
    """
    api_key = os.environ["MISTRAL_API_KEY"]
    model = "mistral-medium"

    client = MistralClient(api_key=api_key)
    messages = [ChatMessage(role="user", content=prompt)]

    # One non-streaming request. The extra streaming request that only printed
    # chunks is dropped because it doubled the API usage for no result.
    chat_response = client.chat(model=model, messages=messages)

    # Callers use the return value as the generated text.
    return chat_response.choices[0].message.content
|
||||
209
lib/main_arxiv_to_blog.py
Normal file
209
lib/main_arxiv_to_blog.py
Normal file
@@ -0,0 +1,209 @@
|
||||
import sys
|
||||
import os
|
||||
import datetime
|
||||
|
||||
import tiktoken
|
||||
|
||||
from .arxiv_schlorly_research import fetch_arxiv_data, create_dataframe, get_arxiv_main_content
|
||||
from .arxiv_schlorly_research import arxiv_bibtex, scrape_images_from_arxiv, download_image
|
||||
from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
|
||||
from .write_research_review_blog import review_research_paper
|
||||
from .combine_research_and_blog import blog_with_research
|
||||
from .write_blog_scholar_paper import write_blog_from_paper
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
from .generate_image_from_prompt import generate_image
|
||||
from .convert_content_to_markdown import convert_tomarkdown_format
|
||||
from .get_blog_metadata import blog_metadata
|
||||
from .get_code_examples import gemini_get_code_samples
|
||||
from .save_blog_to_file import save_blog_to_file
|
||||
from .take_url_screenshot import screenshot_api
|
||||
|
||||
from loguru import logger
|
||||
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def blog_arxiv_keyword(query):
    """
    Write a blog on the first suitable arXiv paper found for ``query``.

    Papers returned by the search are tried in order. The first one whose
    extracted text is between 1000 and 30000 tokens (exclusive) is reviewed,
    gets its BibTeX entry appended, is converted to markdown and is handed to
    blog post-processing.

    Args:
        query (str): The arXiv search query.

    Returns:
        None. If no paper fits the token window, a warning is logged and
        nothing is written. If post-processing fails the process exits with
        status 1.
    """
    arxiv_id = None
    research_review = None
    papers = fetch_arxiv_data(query)

    for paper in papers:
        # paper is [title, published, entry_id, summary, pdf_url]; the arXiv
        # ID is the last path segment of the entry ID.
        candidate_id = paper[2].split('/')[-1]
        arxiv_url = "https://browse.arxiv.org/html/" + candidate_id
        logger.info(f"Get research paper text from the url: {arxiv_url}")
        research_content = get_arxiv_main_content(arxiv_url)

        num_tokens = num_tokens_from_string(research_content, "cl100k_base")
        logger.info(f"Number of tokens sent: {num_tokens}")
        if not 1000 < num_tokens < 30000:
            # Too short or too long to review; try the next paper.
            continue

        logger.info(f"Writing research review on {paper[0]}")
        # Fetch the citation only for the paper that is actually used.
        bibtex = arxiv_bibtex(candidate_id)
        research_review = review_research_paper(research_content)
        research_review = f"\n{research_review}\n\n" + f"```{bibtex}```"
        research_review = convert_tomarkdown_format(research_review, "gemini")
        arxiv_id = candidate_id
        break

    if research_review is None:
        # Nothing qualified: do not record an ID or save an empty blog.
        logger.warning(f"No suitable arXiv paper found for query: {query}")
        return

    logger.info(f"Final scholar article: \n\n{research_review}\n")

    # TBD: Scrape images from research reports and pass to vision to get conclusions out of it.
    try:
        blog_postprocessing(arxiv_id, research_review)
    except Exception as err:
        logger.error(f"Failed in blog post processing: {err}")
        sys.exit(1)

    logger.info("\n\n ################ Finished writing Blog for : #################### \n")
|
||||
|
||||
|
||||
def blog_arxiv_url_list(file_path):
    """
    Write blogs on all the arXiv links given in a file.

    Each line of ``file_path`` is scanned for an arXiv URL. IDs already
    listed in 'papers_already_written_on.txt' are skipped. For every other
    paper whose text is between 1000 and 30000 tokens (exclusive), a review is
    written, converted to markdown, suffixed with the BibTeX entry and passed
    to blog post-processing.

    Args:
        file_path (str): Path to a text file containing arXiv URLs.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: Any other error while reading ``file_path``.

    A failure in token counting or post-processing exits the process with
    status 1. A failure to get the BibTeX entry or the review skips that paper.
    """
    extracted_ids = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                arxiv_id = extract_arxiv_ids_from_line(line)
                if arxiv_id:
                    extracted_ids.append(arxiv_id)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logger.error(f"Error while reading the file: {e}")
        raise

    # All arXiv IDs already written on are kept in 'papers_already_written_on.txt'.
    # YUP, use a DB. KISS for now.
    written_ids = read_written_ids('papers_already_written_on.txt')

    for arxiv_id in extracted_ids:
        if arxiv_id in written_ids:
            logger.warning(f"Already written, skip writing on Arxiv paper ID: {arxiv_id}")
            continue

        arxiv_url = "https://browse.arxiv.org/html/" + arxiv_id
        logger.info(f"Get research paper text from the url: {arxiv_url}")
        research_content = get_arxiv_main_content(arxiv_url)
        try:
            num_tokens = num_tokens_from_string(research_content, "cl100k_base")
        except Exception as err:
            logger.error(f"Failed in counting tokens: {err}")
            sys.exit(1)
        logger.info(f"Number of tokens sent: {num_tokens}")

        # FIXME: Docs over 30k tokens, need to be chunked and summarized.
        if not 1000 < num_tokens < 30000:
            logger.error(
                f"Skipping {arxiv_id}: {num_tokens} tokens is outside the supported range "
                "(1000-30000). FIXME: Docs over 30k tokens, need to be chunked and summarized."
            )
            continue

        try:
            logger.info(f"Getting bibtex for arxiv ID: {arxiv_id}")
            bibtex = arxiv_bibtex(arxiv_id)
        except Exception as err:
            # Without a citation the blog would be published with a missing
            # (or a previous paper's) BibTeX block, so skip this paper.
            logger.error(f"Failed to get Bibtex: {err}")
            continue

        try:
            logger.info("Writing a research review..")
            research_review = review_research_paper(research_content, "gemini")
            logger.info(f"Research Review: \n{research_review}\n\n")
        except Exception as err:
            logger.error(f"Failed to write review on research paper: {arxiv_id}{err}")
            continue

        # NOTE(review): the generated blog is only logged for now; combining
        # it with the review (blog_with_research) is not wired in yet.
        research_blog = write_blog_from_paper(research_content, "gemini")
        logger.info(f"\n\nResearch Blog: {research_blog}\n\n")

        research_review = convert_tomarkdown_format(research_review, "gemini")
        research_review = f"\n{research_review}\n\n" + f"```{bibtex}```"
        logger.info(f"Final blog from research paper: \n\n{research_review}\n\n\n")

        try:
            blog_postprocessing(arxiv_id, research_review)
        except Exception as err:
            logger.error(f"Failed in blog post processing: {err}")
            sys.exit(1)

        # Remember the ID so a duplicate URL later in the same file is skipped.
        written_ids.add(arxiv_id)
        logger.info("\n\n ################ Finished writing Blog for : #################### \n")
|
||||
|
||||
|
||||
def blog_postprocessing(arxiv_id, research_review):
    """
    Common function to do blog postprocessing.

    Generates the blog metadata, takes a screenshot of the paper's abstract
    page, saves the blog to a file and finally records ``arxiv_id`` in
    'papers_already_written_on.txt'.

    Args:
        arxiv_id (str): The arXiv ID of the paper the blog is about.
        research_review (str): The final blog content.

    Raises:
        Exception: Any failure in metadata generation, screenshot or saving is
            logged and re-raised.
    """
    try:
        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(research_review, "gemini")
    except Exception as err:
        logger.error(f"Failed to get blog metadata: {err}")
        raise

    try:
        arxiv_url_scrnsht = f"https://arxiv.org/abs/{arxiv_id}"
        generated_image_filepath = take_paper_screenshot(arxiv_url_scrnsht)
    except Exception as err:
        logger.error(f"Failed to take paper screenshot: {err}")
        raise

    try:
        save_blog_to_file(research_review, blog_title, blog_meta_desc, blog_tags,
                          blog_categories, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to save blog to a file: {err}")
        raise

    # Record the ID only after the blog was saved, so a failed run does not
    # mark the paper as already written.
    try:
        append_id_to_file(arxiv_id, "papers_already_written_on.txt")
    except Exception as err:
        logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
        raise
|
||||
|
||||
|
||||
def take_paper_screenshot(arxiv_url):
    """Screenshot the given arXiv page and return the image file path.

    The target path is ``<cwd>/blog_images/generated_image_<timestamp>.png``.
    When ``arxiv_url`` is falsy no screenshot is attempted and that path is
    returned as is. A screenshot failure is logged, not raised, and the
    target path is still returned.
    """
    # fixme: Remove the hardcoding, need add another option OR in config ?
    timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}"
    generated_image_filepath = os.path.join(
        os.getcwd(), "blog_images", f"generated_image_{timestamp}.png"
    )

    if not arxiv_url:
        return generated_image_filepath

    try:
        generated_image_filepath = screenshot_api(arxiv_url, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed in taking url screenshot: {err}")

    return generated_image_filepath
|
||||
|
||||
|
||||
def num_tokens_from_string(string, encoding_name):
    """Count the tokens in ``string`` using the named tiktoken encoding.

    On any failure the error is logged and the process exits with status 1.
    """
    try:
        tokens = tiktoken.get_encoding(encoding_name).encode(string)
    except Exception as err:
        logger.error(f"Failed to count tokens: {err}")
        sys.exit(1)
    return len(tokens)
|
||||
@@ -2,6 +2,8 @@ import sys
|
||||
import os
|
||||
import re
|
||||
import datetime
|
||||
import random
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from textwrap import dedent
|
||||
import logging
|
||||
from zoneinfo import ZoneInfo
|
||||
@@ -22,6 +24,18 @@ output_path = "blogs"
|
||||
output_path = os.path.join(os.getcwd(), output_path)
|
||||
|
||||
|
||||
def random_date_last_three_months():
    """Return a random timestamp from the last three months as a string.

    The window runs from three calendar months before now (Asia/Kolkata time)
    up to now; the result is formatted as ``%Y-%m-%d %H:%M:%S %z``.
    """
    now = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
    window_start = now - relativedelta(months=3)

    # Pick a whole-second offset anywhere inside the window (both ends included).
    window_seconds = int((now - window_start).total_seconds())
    offset = datetime.timedelta(seconds=random.randint(0, window_seconds))

    return (window_start + offset).strftime('%Y-%m-%d %H:%M:%S %z')
|
||||
|
||||
|
||||
def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_categories, main_img_path=None, file_type="md"):
|
||||
"""
|
||||
Saves the provided blog content to a file in the specified format.
|
||||
@@ -60,9 +74,11 @@ def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_
|
||||
# Handle Markdown file type
|
||||
if file_type == "md":
|
||||
logger.info("Writing/Saving the resultant blog content in Markdown format.")
|
||||
dtobj = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
|
||||
formatted_date = dtobj.strftime('%Y-%m-%d %H:%M:%S %z')
|
||||
blog_title = blog_title.replace(":", "-").replace('"', '')
|
||||
# Hmmmm, bulk generation will benefit from randomizing publishing dates.
|
||||
#dtobj = datetime.datetime.now(ZoneInfo('Asia/Kolkata'))
|
||||
#formatted_date = dtobj.strftime('%Y-%m-%d %H:%M:%S %z')
|
||||
formatted_date = random_date_last_three_months()
|
||||
blog_title = blog_title.replace(":", "-").replace('"', '').replace('**', '')
|
||||
if main_img_path:
|
||||
blog_frontmatter = dedent(f"""\
|
||||
---
|
||||
@@ -70,7 +86,7 @@ def save_blog_to_file(blog_content, blog_title, blog_meta_desc, blog_tags, blog_
|
||||
date: {formatted_date}
|
||||
categories: [{blog_categories}]
|
||||
tags: [{blog_tags}]
|
||||
description: {blog_meta_desc.replace(":", "-")}
|
||||
description: {blog_meta_desc.replace(":", "-").replace('**', '')}
|
||||
img_path: '/assets/'
|
||||
image:
|
||||
path: {os.path.basename(main_img_path)}
|
||||
|
||||
@@ -44,27 +44,15 @@ def screenshot_api(url, generated_image_filepath):
|
||||
|
||||
return generated_image_filepath
|
||||
|
||||
|
||||
def take_screenshot(url, generated_image_filepath, full_screenshot):
|
||||
def take_screenshot(url, generated_image_filepath):
|
||||
# Create a webdriver instance
|
||||
driver = webdriver.Chrome()
|
||||
|
||||
# Navigate to the given url
|
||||
driver.get(url)
|
||||
|
||||
# Get the height of the webpage
|
||||
page_height = driver.execute_script("return document.body.scrollHeight")
|
||||
|
||||
# Scroll down to the bottom of the webpage
|
||||
for i in range(0, page_height, 100):
|
||||
driver.execute_script(f"window.scrollTo(0, {i})")
|
||||
|
||||
# Get the total height of the webpage
|
||||
total_height = driver.execute_script("return document.body.scrollHeight")
|
||||
|
||||
# Resize the webdriver window to the height of the webpage
|
||||
if full_screenshot:
|
||||
driver.set_window_size(800, total_height)
|
||||
# Set a fixed window size (you can adjust this as needed)
|
||||
driver.set_window_size(800, 600)
|
||||
|
||||
# Take a screenshot of the webpage
|
||||
screenshot = driver.get_screenshot_as_png()
|
||||
|
||||
49
lib/write_blog_scholar_paper.py
Normal file
49
lib/write_blog_scholar_paper.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import sys
|
||||
|
||||
from .gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
|
||||
from loguru import logger
|
||||
# Reset loguru: remove the existing handlers, then log to stdout with colors
# using a "LEVEL|file:line:function| message" layout.
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def write_blog_from_paper(paper_content, gpt_providers="openai"):
    """Write a detailed blog post from the content of a research paper.

    Args:
        paper_content: Text of the research paper; it is embedded in the
            prompt sent to the LLM.
        gpt_providers: LLM provider selector. A value containing ``gemini``
            selects Gemini; otherwise a value containing ``openai`` selects
            OpenAI.

    Returns:
        The response returned by the selected provider helper.

    Raises:
        ValueError: If ``gpt_providers`` names neither supported provider.
        Exception: Any provider failure, re-raised after being logged.
    """
    # Fail fast: previously an unsupported provider fell through both branches
    # and the function silently returned None.
    if 'gemini' not in gpt_providers and 'openai' not in gpt_providers:
        raise ValueError(
            f"Unsupported gpt_providers: {gpt_providers!r}; expected 'gemini' or 'openai'."
        )

    prompt = f"""As an expert in NLP and AI, I will provide you with a content of a research paper.
Your task is to write a highly detailed blog(at least 2000 words), breaking down complex concepts for beginners.
Take your time and do not rush to respond.
Do not provide explanations, suggestions in your response.

Include the below section in your blog:
Highlights: Include a list of 5 most important and unique claims of the given research paper.
Abstract: Start by reading the abstract, which provides a concise summary of the research, including its purpose, methodology, and key findings.
Introduction: This section will give you background information and set the context for the research. It often ends with a statement of the research question or hypothesis.
Methodology: Include description of how authors conducted the research. This can include data sources, experimental setup, analytical techniques, etc.
Results: This section presents the data or findings of the research. Pay attention to figures, tables, and any statistical analysis provided.
Discussion/Analysis: In this section, Explain how research paper answers the research questions or how they fit with existing knowledge.
Conclusion: This part summarizes the main findings and their implications. It might also suggest areas for further research.
References: The cited works can provide additional context or background reading.
Remember, Please use MLA format and markdown syntax.
Do not provide description, explanations for your response.
Take your time in crafting your blog content, do not rush to give the response.
Using the blog structure above, please write a detailed and original blog on given research paper: \n'{paper_content}'\n\n"""

    # Gemini takes precedence when both provider names are present.
    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            raise

    try:
        logger.info("Calling OpenAI LLM.")
        return openai_chatgpt(prompt)
    except Exception as err:
        logger.error(f"failed to get response from Openai: {err}")
        raise
|
||||
89
lib/write_research_review_blog.py
Normal file
89
lib/write_research_review_blog.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import sys
|
||||
|
||||
from .gpt_providers.openai_chat_completion import openai_chatgpt
|
||||
from .gpt_providers.gemini_pro_text import gemini_text_response
|
||||
from .gpt_providers.mistral_chat_completion import mistral_text_response
|
||||
|
||||
from loguru import logger
|
||||
# Reset loguru: remove the existing handlers, then log to stdout with colors
# using a "LEVEL|file:line:function| message" layout.
logger.remove()
|
||||
logger.add(sys.stdout,
|
||||
colorize=True,
|
||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||
)
|
||||
|
||||
|
||||
def review_research_paper(research_blog, gpt_providers="openai"):
    """Ask an LLM to write a detailed review report of a research paper.

    Args:
        research_blog: Text of the research paper to review; it is embedded
            in the prompt sent to the LLM.
        gpt_providers: LLM provider selector. A value containing ``gemini``
            selects Gemini, falling back to Mistral if the Gemini call fails;
            otherwise a value containing ``openai`` selects OpenAI.

    Returns:
        The response returned by the provider helper that answered.

    Raises:
        ValueError: If ``gpt_providers`` names neither supported provider.
        SystemError: If the OpenAI call fails (chained from the original error).
        Exception: Whatever the Mistral fallback raises if it also fails.
    """
    # Fail fast: previously an unsupported provider fell through both branches
    # and the function silently returned None.
    if 'gemini' not in gpt_providers and 'openai' not in gpt_providers:
        raise ValueError(
            f"Unsupported gpt_providers: {gpt_providers!r}; expected 'gemini' or 'openai'."
        )

    prompt = f"""As world's top researcher and academician, I will provide you with research paper.
Your task is to write a highly detailed review report.
Important, your report should be factual, original and demostrate your expertise.

Review guidelines:
1). Read the Abstract and Introduction Carefully:
Begin by thoroughly reading the abstract and introduction of the paper.
Try to understand the research question, the objectives, and the background information.
Identify the central argument or hypothesis that the study is examining.

2). Examine the Methodology and Methods:
Read closely at the research design, whether it is experimental, observational, qualitative, or a combination of methods.
Check the sampling strategy and the size of the sample.
Review the methods of data collection and the instruments used for this purpose.
Think about any ethical issues and possible biases in the study.

3). Analyze the Results and Discussion:
Review how the results are presented, including any tables, graphs, and statistical analysis.
Evaluate the findings' validity and reliability.
Analyze whether the results support or contradict the research question and hypothesis.
Read the discussion section where the authors interpret their findings and their significance.

4). Consider the Limitations and Strengths:
Spot any limitations or potential weaknesses in the study.
Evaluate the strengths and contributions that the research makes.
Think about how generalizable the findings are to other populations or situations.

5). Assess the Writing and Organization:
Judge the clarity and structure of the report.
Consider the use of language, grammar, and the overall formatting.
Assess how well the arguments are logically organized and how coherent the report is.

6). Evaluate the Literature Review:
Examine how comprehensive and relevant the literature review is.
Consider how the study adds to or builds upon existing research.
Evaluate the timeliness and quality of the sources cited in the research.

7). Review the Conclusion and Implications:
Look at the conclusions drawn from the study and how well they align with the findings.
Think about the practical implications and potential applications of the research.
Evaluate the suggestions for further research or policy actions.

8). Overall Assessment:
Formulate an overall opinion about the research report's quality and thoroughness.
Consider the significance and impact of the findings.
Evaluate how the study contributes to its field of research.

9). Provide Constructive Feedback:
Offer constructive criticism and suggestions for improvement, where necessary.
Think about possible biases or alternative ways to interpret the findings.
Suggest ideas for future research or for replicating the study.

Do not provide description, explanations for your response.
Using the above review guidelines, write a detailed review report on the below research paper.
Research Paper: '{research_blog}'
"""

    # Gemini takes precedence when both provider names are present.
    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            # Best-effort fallback: retry the same prompt with Mistral.
            return mistral_text_response(prompt)

    try:
        logger.info("Calling OpenAI LLM.")
        return openai_chatgpt(prompt)
    except Exception as err:
        # The exception used to be constructed but never raised, so the failure
        # was swallowed and None was returned.
        logger.error(f"Failed to get response from Openai: {err}")
        raise SystemError(f"Failed to get response from Openai: {err}") from err
|
||||
Reference in New Issue
Block a user