AI FAQ Generator & github blogs

This commit is contained in:
ajaysi
2025-05-04 17:04:44 +05:30
parent c51e355d26
commit 26b02b9719
9 changed files with 1810 additions and 463 deletions

View File

@@ -1,140 +1,157 @@
""" Package for writing getting-started and how to guides. """
"""
Enhanced GitHub Blog Generator
This module provides comprehensive content generation from GitHub repositories,
including technical documentation, tutorials, case studies, and more.
"""
import os
import sys
import datetime
import json
from typing import Dict, List, Optional
from pathlib import Path
from loguru import logger
# Reset loguru's handlers and log to stdout with a compact, colourised
# "LEVEL|file:line:function| message" layout.
# The diff rendering had left both the old and the new argument lines of this
# call in place; they are collapsed here into a single valid call.
logger.remove()
logger.add(
    sys.stdout,
    colorize=True,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
)
from .scrape_github_readme import GitHubScraper, GitHubContent
from .scrape_github_readme import get_gh_details_vision, get_readme_content
from .scrape_github_readme import research_github_topics, check_if_already_written
from .github_getting_started import github_readme_blog
from .gpt_online_researcher import do_online_research
from .faqs_generator_blog import generate_blog_faq
from .get_blog_metadata import blog_metadata
from .save_blog_to_file import save_blog_to_file
from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file
from .github_getting_started import (
generate_technical_documentation,
generate_getting_started_guide,
generate_tutorial_series,
generate_comparison_analysis,
generate_case_studies,
generate_contribution_guide,
generate_security_guide,
generate_performance_guide
)
def blog_from_github(github_opts, flag):
    """Write blog posts from a single GitHub URL or from a file of URLs.

    Args:
        github_opts: The repository URL when ``flag`` contains ``'url'``;
            the path of a UTF-8 text file with one URL per line when ``flag``
            contains ``'csv'``.
        flag: Mode selector. ``'url'`` is checked first, then ``'csv'``; any
            other value makes this function a no-op.

    In ``'url'`` mode a failure is logged and the process exits with status 1.
    In ``'csv'`` mode a problem reading the file is logged, and each URL that
    fails is logged individually so the remaining URLs are still processed.
    """
    if 'url' in flag:
        try:
            write_from_url(github_opts)
        except Exception as err:
            # Include the cause; the original message dropped it.
            logger.error(f"Failed to write from github url: {github_opts}: {err}")
            sys.exit(1)
    elif 'csv' in flag:
        # Defined before the try so the loop below is safe whatever happens
        # while opening or reading the file.
        gh_urls = []
        try:
            with open(github_opts, 'r', encoding="utf-8") as file:
                for line in file:
                    gh_url = line.strip()
                    # Skip blank lines instead of handing "" to write_from_url.
                    if gh_url:
                        gh_urls.append(gh_url)
        except FileNotFoundError:
            # The original referenced an undefined name (file_path) here,
            # which raised NameError inside the handler.
            logger.error(f"CSV File not found: {github_opts}")
        except Exception as e:
            logger.error(f"CSV: An error occurred: {str(e)}")

        for gh_url in gh_urls:
            try:
                write_from_url(gh_url)
            except Exception as err:
                logger.error(f"Failed to write blog from github: {err}")
def write_from_url(gh_url):
    """Gather repository details and the README for a GitHub repository URL.

    Steps visible in this span:
      1. Return early when ``check_if_already_written`` reports the URL as done.
      2. Ask ``get_gh_details_vision`` for repo details; the result is read
         with ``.get`` for ``stars``, ``forks``, ``about`` and ``branch_name``.
      3. Start the blog text with a fenced summary of those details.
      4. Fetch the raw README from the reported branch (``main`` when none is
         reported), falling back to ``master``.

    Args:
        gh_url: GitHub repository URL; its last two path segments are used as
            ``owner/repo`` when building the raw README URL.

    The process exits with status 1 when the repo details cannot be obtained
    or when no README content is found on either branch.
    """
    # String to store the blog content.
    howto_blog = ''

    if check_if_already_written(gh_url):
        logger.error(f"Skipping, already written on url: {gh_url}")
        return
    logger.info(f"Writing getting started from url: {gh_url}")

    # FIXME: the image directory is hard-coded; move it to an option or config.
    image_dir = os.path.join(os.getcwd(), "blog_images")
    generated_image_name = f"screenshot_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
    generated_image_filepath = os.path.join(image_dir, generated_image_name)

    try:
        logger.info(f"Getting github repo details from vision model: {generated_image_filepath}")
        gh_json = get_gh_details_vision(gh_url, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to get gemini vision details from GH repo image: {err}")
        sys.exit(1)

    howto_blog = "```" + f"\nGithub URL:{gh_url}\nStars:{gh_json.get('stars')}\n"
    howto_blog += f"Forks:{gh_json.get('forks')}\n"
    howto_blog += f"Description:{gh_json.get('about')}\nBranch:{gh_json.get('branch_name')}\n" + "```\n\n"

    # Strip a trailing slash so ".../owner/repo/" still yields "owner/repo".
    repo_path = "/".join(gh_url.rstrip("/").split("/")[-2:])
    raw_readme_url_base = "https://raw.githubusercontent.com/" + repo_path
    branch_name = gh_json.get('branch_name') or "main"
    raw_readme_url = f"{raw_readme_url_base}/{branch_name}/README.md"
    logger.info(f"Using this url to fetch the README file: {raw_readme_url}")

    # Pre-bind the name: if the first fetch raises, the fallback check below
    # must still run instead of failing on an unbound local.
    readme_content = None
    try:
        readme_content = get_readme_content(raw_readme_url)
    except Exception as err:
        logger.error(f"Failed to get README from URL: {raw_readme_url}: {err}")

    # If the readme is still empty, try with the master branch.
    if not readme_content:
        raw_readme_url = raw_readme_url_base + "/master/" + "README.md"
        logger.warning(f"Trying with master branch: {raw_readme_url}")
        readme_content = get_readme_content(raw_readme_url)
        if not readme_content:
            logger.error(f"Still failed to get the README: {readme_content}")
            sys.exit(1)

    # NOTE(review): in this diff view the later steps of this function (blog
    # generation, FAQ research, metadata, saving) appear as separate
    # removed-line fragments further down; they use howto_blog, readme_content
    # and generated_image_filepath from above.
class GitHubBlogGenerator:
"""Generator for various types of GitHub-related content."""
# Create a getting-started blog, adapted from the GH url README.
howto_blog += github_readme_blog(readme_content, "gemini")
def __init__(self, cache_dir: str = ".github_cache", ttl_hours: int = 24):
    """Set up the scraper and make sure the output directory exists.

    Args:
        cache_dir: Directory handed to ``GitHubScraper``; also kept on the
            instance as a ``Path``.
        ttl_hours: Second argument passed to ``GitHubScraper``
            (presumably the cache time-to-live in hours — confirm against
            the scraper).
    """
    output_root = Path("generated_content")

    self.cache_dir = Path(cache_dir)
    self.scraper = GitHubScraper(cache_dir, ttl_hours)
    self.output_dir = output_root
    # Single-level directory under the current working directory;
    # already existing is fine.
    self.output_dir.mkdir(exist_ok=True)
async def generate_content(self, github_url: str, content_types: List[str] = None) -> Dict[str, str]:
    """Build one document per requested content type for a GitHub repository.

    Args:
        github_url: Repository URL passed to the scraper and to the online
            research step.
        content_types: Names of the documents to produce. Defaults to
            ``["getting_started", "technical_docs", "tutorials"]`` when
            ``None``. Unknown names are logged with a warning and skipped.

    Returns:
        A dict mapping each produced content type to its generated text, plus
        a ``"faqs"`` entry when the FAQ step succeeds.

    Raises:
        Exception: Any error from scraping or from a content generator is
            logged and re-raised. A failure in the FAQ step is only logged.
    """
    requested = content_types
    if requested is None:
        requested = ["getting_started", "technical_docs", "tutorials"]

    # Content type name -> generator function.
    builders = {
        "getting_started": generate_getting_started_guide,
        "technical_docs": generate_technical_documentation,
        "tutorials": generate_tutorial_series,
        "comparison": generate_comparison_analysis,
        "case_studies": generate_case_studies,
        "contribution": generate_contribution_guide,
        "security": generate_security_guide,
        "performance": generate_performance_guide,
    }

    try:
        # Scrape the repository once; every generator works from this result.
        repo_content = await self.scraper.scrape_github_content(github_url)

        results = {}
        for kind in requested:
            builder = builders.get(kind)
            if builder is None:
                logger.warning(f"Unknown content type: {kind}")
                continue
            # Each generator receives its own fresh dict view of the scrape.
            results[kind] = builder(repo_content.dict())

        # FAQs are best-effort: a failure here must not lose the documents
        # that were already generated.
        try:
            report = do_online_research(repo_content.title, "gemini", github_url)
            results["faqs"] = generate_blog_faq(report, "gemini")
        except Exception as err:
            logger.error(f"Failed to generate FAQs: {err}")

        return results
    except Exception as err:
        logger.error(f"Failed to generate content: {err}")
        raise
def save_content(self, content: Dict[str, str], base_filename: str):
    """Persist every generated document via ``save_blog_to_file``.

    Args:
        content: Mapping of content type to generated text.
        base_filename: Prefix used to build the per-type name
            ``<base_filename>_<content_type>.md``.

    Raises:
        Exception: Any failure while building metadata or saving is logged
            and re-raised; documents after the failing one are not saved.
    """
    image_path = None  # No image path for now

    try:
        for kind, body in content.items():
            # Metadata is derived from the document text itself.
            title, meta_desc, tags, categories = blog_metadata(body, "gemini")

            # NOTE(review): this name only appears in the log line below; it
            # is not passed to save_blog_to_file, so the file actually
            # written may be named differently — confirm.
            target_name = f"{base_filename}_{kind}.md"

            save_blog_to_file(body, title, meta_desc, tags, categories, image_path)
            logger.info(f"Saved {kind} content to {target_name}")
    except Exception as err:
        logger.error(f"Failed to save content: {err}")
        raise
# Do online research for faqs on the github url.
try:
# Repo names can be misleading as search terms; including the description too
# skews the results favourably towards the project's home/paid pages.
#online_query = f"{''.join(gh_url.split('/')[-1:])} " + gh_json.get('about')
online_query = f"{''.join(gh_url.split('/')[-1:])} "
logger.info("Do web research with Tavily & Metaphor AI.")
research_report = do_online_research(online_query, "gemini", gh_url)
except Exception as err:
logger.error(f"failed to do online research: {err}")
async def main():
    """Demonstrate the generator on a couple of placeholder repository URLs.

    For each URL, every supported content type is generated and then saved
    under a base name taken from the last path segment of the URL. A failure
    for one URL is logged and does not stop the remaining URLs.
    """
    blog_generator = GitHubBlogGenerator()

    # Example GitHub URLs
    repo_urls = [
        "https://github.com/owner/repo",
        "https://github.com/owner/another-repo"
    ]
    wanted_types = [
        "getting_started",
        "technical_docs",
        "tutorials",
        "comparison",
        "case_studies",
        "contribution",
        "security",
        "performance"
    ]

    for repo_url in repo_urls:
        try:
            generated = await blog_generator.generate_content(repo_url, wanted_types)
            # Text after the last "/" becomes the base filename.
            _, _, repo_name = repo_url.rpartition("/")
            blog_generator.save_content(generated, repo_name)
        except Exception as e:
            logger.error(f"Error processing {repo_url}: {e}")
# Generate FAQs from the online research report.
try:
blog_faqs = generate_blog_faq(research_report, "gemini")
howto_blog += f"\n\n## {''.join(gh_url.split('/')[-1:])} FAQs\n\n" + blog_faqs
except Exception as err:
logger.error(f"Failed to generate FAQs from web research_report: {err}")
logger.info(f"\n\nFinal Blog Content: {howto_blog}\n\n")
try:
blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(howto_blog, "gemini")
except Exception as err:
logger.error(f"Failed to get blog metadata: {err}")
raise err
try:
save_blog_to_file(howto_blog, blog_title, blog_meta_desc, blog_tags,\
blog_categories, generated_image_filepath)
except Exception as err:
logger.error(f"Failed to save blog to a file: {err}")
sys.exit(1)
try:
append_id_to_file(gh_url, "papers_already_written_on.txt")
except Exception as err:
logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
raise err
if __name__ == "__main__":
    # asyncio is not among the imports visible at the top of this file, so it
    # is imported here to keep the script entry point from failing with
    # NameError.
    import asyncio

    asyncio.run(main())