Features: AI Rich snippet from url, AI product description writer

This commit is contained in:
ajaysi
2024-07-17 12:00:27 +05:30
parent c923435be2
commit 44d83e2b81
19 changed files with 136 additions and 130 deletions

View File

@@ -0,0 +1,39 @@
import sys
from .gpt_providers.openai_chat_completion import openai_chatgpt
from .gpt_providers.gemini_pro_text import gemini_text_response
from loguru import logger
# Remove any previously registered loguru handlers, then log to stdout using a
# compact "LEVEL|file:line:function| message" layout with colors enabled.
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def github_readme_blog(readme_content, gpt_providers="gemini"):
    """Write a how-to blog in markdown from a GitHub README using an LLM.

    Args:
        readme_content: README text that is embedded verbatim into the prompt.
        gpt_providers: Name of the LLM backend. A value containing 'gemini'
            selects Gemini, otherwise one containing 'openai' selects OpenAI.
            Defaults to 'gemini' so single-argument calls keep working.

    Returns:
        Whatever the selected provider function returns for the prompt.

    Raises:
        SystemError: If the OpenAI call fails.
        ValueError: If ``gpt_providers`` names no supported provider.

    Note:
        A failing Gemini call is logged and terminates the process with
        exit status 1.
    """
    prompt = (
        "As an expert programmer and teacher, Write an original, detailed and "
        "step-by-step guide, from the provided Text below.\n"
        "Your guide should be original, engaging and help beginners get started easily.\n"
        "Write new example codes and detailed comments on how to run them. "
        "Include appropriate emoji where applicable.\n"
        "Include a referances section that links to more code examples.\n"
        "Your response MUST be a how-to blog in markdown format.\n"
        "Respond ONLY with your blog content.\n"
        f"Text: '{readme_content}'\n"
    )

    if 'gemini' in gpt_providers:
        try:
            return gemini_text_response(prompt)
        except Exception as err:
            logger.error(f"Failed to get response from gemini: {err}")
            sys.exit(1)
    elif 'openai' in gpt_providers:
        try:
            logger.info("Calling OpenAI LLM.")
            return openai_chatgpt(prompt)
        except Exception as err:
            # The exception used to be constructed but never raised, which
            # made a failed call look like a successful ``None`` result.
            raise SystemError(f"Failed to get response from Openai: {err}") from err

    raise ValueError(f"Unsupported gpt provider: {gpt_providers}")

View File

@@ -0,0 +1,140 @@
""" Package for writing getting-started and how to guides. """
import os
import sys
import datetime
import json
from loguru import logger
# Remove any previously registered loguru handlers, then log to stdout using a
# compact "LEVEL|file:line:function| message" layout with colors enabled.
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from .scrape_github_readme import get_gh_details_vision, get_readme_content
from .scrape_github_readme import research_github_topics, check_if_already_written
from .github_getting_started import github_readme_blog
from .gpt_online_researcher import do_online_research
from .faqs_generator_blog import generate_blog_faq
from .get_blog_metadata import blog_metadata
from .save_blog_to_file import save_blog_to_file
from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
def blog_from_github(github_opts, flag):
    """Write getting-started blogs from one GitHub URL or from a file of URLs.

    Args:
        github_opts: A GitHub repository URL when ``flag`` contains 'url', or
            the path of a text file holding one URL per line when ``flag``
            contains 'csv'.
        flag: Mode selector; tested for the substring 'url' first, then 'csv'.

    Note:
        In 'url' mode a failure is logged and the process exits with status 1.
        In 'csv' mode a failure on one URL is logged and the remaining URLs
        are still processed.
    """
    if 'url' in flag:
        try:
            write_from_url(github_opts)
        except Exception as err:
            logger.error(f"Failed to write from github url: {github_opts}: {err}")
            sys.exit(1)
    elif 'csv' in flag:
        gh_urls = []
        try:
            with open(github_opts, 'r', encoding="utf-8") as file:
                for line in file:
                    gh_url = line.strip()
                    # Blank lines are not URLs; skip them instead of
                    # attempting to write a blog for an empty string.
                    if gh_url:
                        gh_urls.append(gh_url)
        except FileNotFoundError:
            # The handler used to reference an undefined name (file_path).
            logger.error(f"CSV File not found: {github_opts}")
        except Exception as e:
            logger.error(f"CSV: An error occurred: {str(e)}")

        for gh_url in gh_urls:
            try:
                write_from_url(gh_url)
            except Exception as err:
                logger.error(f"Failed to write blog from github: {err}")
def write_from_url(gh_url):
    """Generate, enrich and save a getting-started blog for one GitHub repo.

    Steps, in order: skip URLs already recorded as written, collect repo
    details through ``get_gh_details_vision``, download the README (branch
    reported in the details, else 'main', then 'master'), turn it into a blog,
    append FAQs built from online research, derive blog metadata, save the
    blog and finally record the URL in 'papers_already_written_on.txt'.

    Args:
        gh_url: URL of the GitHub repository to write about.

    Raises:
        Exception: Re-raised when fetching blog metadata or recording the URL
            fails.

    Note:
        The process exits with status 1 when repo details cannot be obtained,
        when no README can be downloaded, or when saving the blog fails.
        Failures in online research or FAQ generation are only logged.
    """
    if check_if_already_written(gh_url):
        logger.error(f"Skipping, already written on url: {gh_url}")
        return
    logger.info(f"Writing getting started from url: {gh_url}")

    # fixme: Remove the hardcoding, need add another option OR in config ?
    image_dir = os.path.join(os.getcwd(), "blog_images")
    generated_image_name = f"screenshot_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
    generated_image_filepath = os.path.join(image_dir, generated_image_name)

    try:
        logger.info(f"Getting github repo details from vision model: {generated_image_filepath}")
        gh_json = get_gh_details_vision(gh_url, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to get gemini vision details from GH repo image: {err}")
        sys.exit(1)

    # A trailing slash would otherwise leave an empty last path segment and
    # produce a broken raw-content URL and an empty repo name.
    url_parts = gh_url.rstrip("/").split("/")
    repo_name = url_parts[-1]

    howto_blog = "```" + f"\nGithub URL:{gh_url}\nStars:{gh_json.get('stars')}\n"
    howto_blog += f"Forks:{gh_json.get('forks')}\n"
    howto_blog += f"Description:{gh_json.get('about')}\nBranch:{gh_json.get('branch_name')}\n" + "```\n\n"

    # Direct link to the raw content of the README file.
    raw_readme_url_base = "https://raw.githubusercontent.com/" + "/".join(url_parts[-2:])
    branch_name = gh_json.get('branch_name') or "main"
    raw_readme_url = f"{raw_readme_url_base}/{branch_name}/README.md"
    logger.info(f"Using this url to fetch the README file: {raw_readme_url}")

    # Pre-bind so the fallback check below works even if the fetch raises.
    readme_content = None
    try:
        readme_content = get_readme_content(raw_readme_url)
    except Exception as err:
        logger.error(f"Failed to get README from URL: {raw_readme_url}: {err}")

    # If the readme is still None, try with master branch.
    if not readme_content:
        raw_readme_url = f"{raw_readme_url_base}/master/README.md"
        logger.warning(f"Trying with master branch: {raw_readme_url}")
        readme_content = get_readme_content(raw_readme_url)
        if not readme_content:
            logger.error(f"Still failed to get the README: {readme_content}")
            sys.exit(1)

    # Create a getting-started blog, adapted from the GH url README.
    howto_blog += github_readme_blog(readme_content, "gemini")

    # Do online research for faqs on the github url.
    # Repo names alone can be ambiguous search terms; adding the description
    # was tried and skewed results towards the project's home/paid pages.
    research_report = None
    try:
        online_query = f"{repo_name} "
        logger.info("Do web research with Tavily & Metaphor AI.")
        research_report = do_online_research(online_query, "gemini", gh_url)
    except Exception as err:
        logger.error(f"failed to do online research: {err}")

    # Generate FAQs from the online research report, when there is one.
    if research_report is None:
        logger.warning("Skipping FAQs, no online research report available.")
    else:
        try:
            blog_faqs = generate_blog_faq(research_report, "gemini")
            howto_blog += f"\n\n## {repo_name} FAQs\n\n" + blog_faqs
        except Exception as err:
            logger.error(f"Failed to generate FAQs from web research_report: {err}")

    logger.info(f"\n\nFinal Blog Content: {howto_blog}\n\n")

    try:
        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(howto_blog, "gemini")
    except Exception as err:
        logger.error(f"Failed to get blog metadata: {err}")
        raise

    try:
        save_blog_to_file(howto_blog, blog_title, blog_meta_desc, blog_tags,
                          blog_categories, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to save blog to a file: {err}")
        sys.exit(1)

    try:
        append_id_to_file(gh_url, "papers_already_written_on.txt")
    except Exception as err:
        logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
        raise

View File

@@ -0,0 +1,297 @@
import os
import sys
import datetime
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from loguru import logger
# Remove any previously registered loguru handlers, then log to stdout using a
# compact "LEVEL|file:line:function| message" layout with colors enabled.
logger.remove()
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
from .take_url_screenshot import take_screenshot
from .gpt_providers.gemini_image_details import gemini_get_img_info
def get_readme_content(url):
    """Download the raw README text from ``url``.

    Args:
        url: Direct link to the raw README file.

    Returns:
        The response body when the server answers with HTTP 200, otherwise
        ``None``.

    Note:
        If the request itself fails, the error is logged and the process
        exits with status 1.
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except Exception as err:
        # ``response`` is unbound when the request raises, so it must not be
        # referenced here (the old message did and masked the real error).
        logger.error(f"Failed to fetch raw readme from {url}: {err}")
        sys.exit(1)

    if response.status_code == 200:
        logger.debug("Successfully fetched the README.md")
        return response.text

    logger.debug(f"README request returned HTTP {response.status_code}: {url}")
    return None
def get_gh_repo_metadata(github_url):
    """Scrape basic repository details from a GitHub repository page.

    Args:
        github_url: URL of the repository page to download and parse.

    Returns:
        dict with the keys 'name', 'about', 'stars', 'watchers', 'forks',
        'topics' and 'branch_name'. A value is ``None`` when its element is
        not found on the page; 'topics' is then an empty list. 'branch_name'
        is always ``None`` (see FIXME below). Counts are strings with the
        thousands separators removed.
    """
    logger.info("Scraping github with BS4 and requests.")
    # A timeout keeps a stalled connection from hanging the whole run.
    page = requests.get(github_url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    def element_text(element):
        """Return the stripped text of ``element`` or None when it is missing."""
        return element.get_text().strip() if element is not None else None

    def sibling_text(scope, selector, sibling_tag):
        """Return the text of the ``sibling_tag`` following ``selector`` in ``scope``."""
        anchor = scope.select_one(selector) if scope is not None else None
        if anchor is None:
            logger.warning(f"Github page element not found: {selector}")
            return None
        return element_text(anchor.find_next_sibling(sibling_tag))

    def sibling_count(scope, selector):
        """Like ``sibling_text`` for <strong> counters, without commas."""
        value = sibling_text(scope, selector, 'strong')
        return value.replace(',', '') if value is not None else None

    # The repo details live in the right-hand box of the page.
    bordergrid_html_element = soup.select_one('.BorderGrid')

    repo = {
        'name': element_text(soup.select_one('[itemprop="name"]')),
        'about': sibling_text(bordergrid_html_element, 'h2', 'p'),
        'stars': sibling_count(bordergrid_html_element, '.octicon-star'),
        'watchers': sibling_count(bordergrid_html_element, '.octicon-eye'),
        'forks': sibling_count(bordergrid_html_element, '.octicon-repo-forked'),
        'topics': [],
        # FIXME: Unable to scrape branch name.
        'branch_name': None,
    }

    # Topic links are looked up inside the first div with class "f6".
    topic_div = soup.find('div', class_='f6')
    if topic_div:
        topic_links = topic_div.find_all('a', class_='topic-tag-link')
        repo['topics'] = [link.text.strip() for link in topic_links]

    logger.info(f"Github Repo Details: {repo}")
    return repo
def _parse_json_reply(reply):
    """Decode a JSON reply that may be wrapped in a markdown code fence."""
    payload = reply.strip()
    if payload.startswith("```"):
        # Drop the opening fence line and, when present, the closing one.
        body_lines = payload.split('\n')[1:]
        if body_lines and body_lines[-1].strip().startswith("```"):
            body_lines = body_lines[:-1]
        payload = '\n'.join(body_lines)
    return json.loads(payload)


def get_gh_details_vision(github_url, generated_image_filepath):
    """Take a screenshot of the url and feed to vision models for scraping details.

    Args:
        github_url: URL of the GitHub repository page.
        generated_image_filepath: Path handed to ``take_screenshot`` for the
            screenshot image.

    Returns:
        The repo details decoded from the vision model's JSON reply. When the
        model call fails or its reply cannot be decoded, the result of
        ``get_gh_repo_metadata(github_url)`` is returned instead.
    """
    logger.info(f"Take screenshot and pass it to gemini for repo details of {github_url}")
    generated_image_filepath = take_screenshot(github_url, generated_image_filepath)
    prompt = """From the given image of a github page, find out the number of stars, about, forks, last commit days, link url, topics and branch name. Return the result as json."""

    try:
        gh_details = gemini_get_img_info(prompt, generated_image_filepath)
        logger.info(f"Github Repo details, from vision model: {gh_details}")
    except Exception as err:
        logger.error(f"Failed to get gh images details: {err}")
        return get_gh_repo_metadata(github_url)

    try:
        # Previously the first and last lines were always discarded, which
        # destroyed replies that were not wrapped in a code fence.
        return _parse_json_reply(gh_details)
    except (AttributeError, TypeError, ValueError) as err:
        logger.error(f"Failed to parse gh details from vision model as json: {err}")
        return get_gh_repo_metadata(github_url)
def research_github_topics(topics):
    """Scrape github topics of interest for top repos to write on.

    For each topic in a fixed list, the topic page is downloaded and every
    listed repository with more than 5000 stars is collected once. The
    collected repository URLs are appended, one per line, to
    'github_url_to_write.csv'.

    Approach adapted from:
    https://www.kaggle.com/code/subhaskumarray/scraping-github-topics-with-their-repositories

    Args:
        topics: Currently unused; the topics scraped are the hard-coded list
            below. NOTE(review): confirm what callers pass before wiring this in.
    """
    gh_topics = ['ai', 'ai-tools', 'ai-assistant', 'ai-agents-framework', 'llm', 'multi-agent', 'fine-tuning', 'rag', 'generative', 'prompt-engineering', 'generative-ai', 'text-to-image-generation', 'llm-ops', 'retrieval-augmented-generation', 'langchain', 'gemini-api', 'vertex-ai', 'huggingface', 'auto-gpt', 'llmops', 'ai-toolkit', 'chatbot', 'chatgpt', 'code-assistant', 'text-to-video', 'llms', 'gpt-4']

    repo_info_dict = {
        'username': [],
        'repo_name': [],
        'stars': [],
        'repo_url': []
    }
    # URLs already stored, kept as a set for constant-time duplicate checks.
    stored_urls = set()

    for agh_topic in gh_topics:
        topic_url = f"https://github.com/topics/{agh_topic}"
        first_topic_repo_page = download_repo_page(topic_url)
        logger.info(f"Get details on github topic: {topic_url}")

        repo_tags = first_topic_repo_page.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
        star_tags = first_topic_repo_page.find_all('span', {'class': 'Counter js-social-count'})

        # zip() stops at the shorter list, so a page with fewer star counters
        # than repo headings no longer raises IndexError.
        for repo_tag, star_tag in zip(repo_tags, star_tags):
            try:
                username, repo_name, stars, repo_url = get_repo_info(repo_tag, star_tag)
            except (IndexError, KeyError, ValueError) as err:
                logger.warning(f"Skipping unparsable repo entry on {topic_url}: {err}")
                continue

            # Store each repo once, and only repos with more than 5000 stars.
            if repo_url in stored_urls or stars <= 5000:
                continue
            stored_urls.add(repo_url)
            repo_info_dict['username'].append(username)
            repo_info_dict['repo_name'].append(repo_name)
            repo_info_dict['stars'].append(stars)
            repo_info_dict['repo_url'].append(repo_url)

    # Only the URLs are written out.
    df_repo_info = pd.DataFrame(repo_info_dict['repo_url'])

    csv_filename = 'github_url_to_write.csv'
    file_existed = os.path.isfile(csv_filename)
    # Never write a header: the file is consumed line by line as URLs, and the
    # default header of this frame would be the bogus line "0". Append mode
    # also creates the file when it does not exist yet.
    df_repo_info.to_csv(csv_filename, mode='a', header=False, index=False)
    if file_existed:
        logger.info(f"Data appended to existing file: {csv_filename}")
def get_topic_titles(parsed_content):
    """Return the text of every topic-title paragraph in ``parsed_content``.

    Args:
        parsed_content: Parsed page offering ``find_all(name, attrs)``.

    Returns:
        list of title strings, or ``None`` when an error occurred (the error
        is logged).
    """
    try:
        title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
        title_paragraphs = parsed_content.find_all('p', {'class': title_class})
        return [paragraph.text for paragraph in title_paragraphs]
    except Exception as err:
        logger.error(f"Failed to get github topic titles: {err}")
def get_topic_desc(parsed_contents):
    """Return the trimmed text of every topic-description paragraph.

    Args:
        parsed_contents: Parsed page offering ``find_all(name, attrs)``.

    Returns:
        list of description strings with surrounding whitespace removed, or
        ``None`` when an error occurred (the error is logged).
    """
    try:
        desc_selector = 'f5 color-fg-muted mb-0 mt-1'
        topic_desc_tags = parsed_contents.find_all('p', {'class': desc_selector})
        # Leftover debug prints were removed; only the result is produced.
        return [desc.text.strip() for desc in topic_desc_tags]
    except Exception as err:
        logger.error(f"Failed to get github topic desc: {err}")
def get_topic_url(parsed_contents):
    """Return the absolute URL of every topic link in ``parsed_contents``.

    Args:
        parsed_contents: Parsed page offering ``find_all(name, attrs)``.

    Returns:
        list of URLs built by prefixing each link's href with the GitHub base
        URL, or ``None`` when an error occurred (the error is logged).
    """
    try:
        link_class = 'no-underline flex-1 d-flex flex-column'
        anchors = parsed_contents.find_all('a', {'class': link_class})
        base_url = 'http://github.com'
        return [base_url + anchor['href'] for anchor in anchors]
    except Exception as err:
        logger.error(f"Failed to get github topic urls: {err}")
def download_repo_page(topic_url):
    """Download ``topic_url`` and return it parsed with BeautifulSoup.

    Args:
        topic_url: URL of the page to download.

    Returns:
        The parsed page. A non-200 answer is logged as an error, but the
        response body is still parsed and returned.
    """
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(topic_url, timeout=30)
    if response.status_code != 200:
        # Reported through the module logger like every other failure here.
        logger.error('There is some error in {}'.format(topic_url))
    return BeautifulSoup(response.text, 'html.parser')
def get_repo_info(repo_tags, star_tags):
    """Extract owner, name, star count and URL of one listed repository.

    Args:
        repo_tags: Element whose first two <a> children are the owner link and
            the repository link.
        star_tags: Element whose text is the star counter, e.g. '987',
            '1,234' or '12.5k'.

    Returns:
        tuple ``(username, repo_name, star_counts, repo_url)`` where
        ``star_counts`` is an int.
    """
    a_tags = repo_tags.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()

    base_url = 'http://github.com/'
    # The base already ends with '/', so drop the href's leading slash to
    # avoid building 'http://github.com//owner/repo'.
    repo_url = base_url + a_tags[1]['href'].strip().lstrip('/')

    def star_counts_converter(stars):
        """Convert counter text such as '987', '1,234' or '12.5k' to an int."""
        stars = stars.strip().replace(',', '').lower()
        if not stars:
            # An empty counter used to raise IndexError on ``stars[-1]``.
            return 0
        multipliers = {'k': 1000, 'm': 1000000}
        suffix = stars[-1]
        if suffix in multipliers:
            return int(round(float(stars[:-1]) * multipliers[suffix]))
        return int(stars)

    star_counts = star_counts_converter(star_tags.text.strip())
    return username, repo_name, star_counts, repo_url
def save_to_csv(topic_url, topic_name):
    """Scrape one topic's repositories into '<topic_name>.csv'.

    Nothing is scraped or written when that file already exists.

    Args:
        topic_url: URL of the topic page, passed to ``topic_repo_details``.
        topic_name: Topic name used as the CSV file's base name.
    """
    file_name = topic_name + '.csv'
    if os.path.exists(file_name):
        logger.debug(f"The file {file_name} already exists. Skipping.")
        # Actually skip: without this return the topic was re-scraped and the
        # existing file overwritten despite the message above.
        return

    # NOTE(review): topic_repo_details is not defined in the visible part of
    # this file - confirm it exists before relying on this function.
    topics_df = topic_repo_details(topic_url)
    topics_df.to_csv(file_name, index=None)
    logger.info(f"Successfully scraped topic {topic_name}")
def check_if_already_written(github_url, file_path='papers_already_written_on.txt'):
    """
    Tell whether a GitHub URL is already listed in a file of written URLs.

    The URL and every line of the file are compared after stripping
    surrounding whitespace; only a whole-line match counts.

    Args:
        github_url (str): GitHub URL string to look for.
        file_path (str): Path of the file to search, one entry per line.
            Default is 'papers_already_written_on.txt'.

    Returns:
        bool: True if some line equals the URL, False otherwise. False is
        also returned, after printing a message, when the file is missing or
        any other error occurs.
    """
    already_written = False
    try:
        with open(file_path, 'r', encoding="utf-8") as handle:
            # any() stops reading at the first matching line.
            already_written = any(
                github_url.strip() == entry.strip() for entry in handle
            )
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return already_written