# ALwrity/lib/github_blogs/main_getting_started_blogs.py

""" Module for writing getting-started and how-to guides from GitHub repositories. """

import os
import sys
import datetime
import json

from loguru import logger
# Drop the handlers loguru has registered so far, then install a single
# colourised stdout sink with a compact "level|file:line:function| message" layout.
logger.remove()
logger.add(
    sys.stdout,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
    colorize=True,
)

from .scrape_github_readme import get_gh_details_vision, get_readme_content
from .scrape_github_readme import research_github_topics, check_if_already_written
from .github_getting_started import github_readme_blog
from .gpt_online_researcher import do_online_research
from .faqs_generator_blog import generate_blog_faq
from .get_blog_metadata import blog_metadata
from .save_blog_to_file import save_blog_to_file
from .arxiv_schlorly_research import read_written_ids, extract_arxiv_ids_from_line, append_id_to_file


def blog_from_github(github_opts, flag):
    """Write getting-started blogs from a single GitHub URL or a file of URLs.

    Args:
        github_opts: When ``flag`` contains ``'url'``, the GitHub repository
            URL to write about. When ``flag`` contains ``'csv'``, the path of
            a text file that is read line by line, one repository URL per line.
        flag: Mode selector. It is matched by substring; ``'url'`` is checked
            first, then ``'csv'``. Any other value does nothing.

    Raises:
        SystemExit: With exit code 1 when the single-URL mode fails.
    """
    if 'url' in flag:
        try:
            write_from_url(github_opts)
        except Exception as err:
            logger.error(f"Failed to write from github url: {github_opts}: {err}")
            sys.exit(1)
    elif 'csv' in flag:
        # Bound before the try so the loop below always has a list to walk,
        # even when the file cannot be opened.
        gh_urls = []
        try:
            with open(github_opts, 'r') as file:
                # Read each line in the file, ignoring blank lines.
                for line in file:
                    gh_url = line.strip()
                    if gh_url:
                        gh_urls.append(gh_url)
        except FileNotFoundError:
            logger.error(f"CSV File not found: {github_opts}")
        except Exception as e:
            logger.error(f"CSV: An error occurred: {str(e)}")

        # Best effort: a failure on one URL is logged and the next one is tried.
        # NOTE: write_from_url() calls sys.exit(1) on some failures; SystemExit
        # is not an Exception subclass, so those still abort the whole batch.
        for gh_url in gh_urls:
            try:
                write_from_url(gh_url)
            except Exception as err:
                logger.error(f"Failed to write blog from github: {err}")


def _fetch_readme(raw_readme_url_base, branch_name):
    """Fetch a repository's README.md, falling back to the ``master`` branch.

    Args:
        raw_readme_url_base: ``https://raw.githubusercontent.com/<owner>/<repo>``.
        branch_name: Branch to try first; ``main`` is used when it is falsy.

    Returns:
        The README content returned by ``get_readme_content``.

    Raises:
        SystemExit: With exit code 1 when the ``master`` fallback also
            returns nothing.
    """
    raw_readme_url = f"{raw_readme_url_base}/{branch_name or 'main'}/README.md"
    logger.info(f"Using this url to fetch the README file: {raw_readme_url}")

    # Pre-bind so a failed first attempt falls through to the master-branch
    # retry instead of hitting an unbound local.
    readme_content = None
    try:
        readme_content = get_readme_content(raw_readme_url)
    except Exception as err:
        logger.error(f"Failed to get README from URL: {raw_readme_url}: {err}")

    # If the readme is still empty, try with master branch.
    if not readme_content:
        raw_readme_url = f"{raw_readme_url_base}/master/README.md"
        logger.warning(f"Trying with master branch: {raw_readme_url}")
        readme_content = get_readme_content(raw_readme_url)
        if not readme_content:
            logger.error(f"Still failed to get the README: {raw_readme_url}")
            sys.exit(1)
    return readme_content


def _research_faq_section(gh_url, repo_name):
    """Build the FAQ section of the blog from online research (best effort).

    Args:
        gh_url: GitHub repository URL, passed through to ``do_online_research``.
        repo_name: Repository name, used as the search query and FAQ heading.

    Returns:
        The markdown FAQ section, or ``''`` when the research or the FAQ
        generation fails (the failure is logged, not raised).
    """
    # Only the repo name is used as the query. Original author's note: repo
    # names are misnomers for a web search, and also including the repo
    # description skews the results towards the project's home/paid pages.
    online_query = f"{repo_name} "
    try:
        logger.info("Do web research with Tavily & Metaphor AI.")
        research_report = do_online_research(online_query, "gemini", gh_url)
    except Exception as err:
        logger.error(f"failed to do online research: {err}")
        # Without a report there is nothing to generate FAQs from.
        return ""

    try:
        blog_faqs = generate_blog_faq(research_report, "gemini")
        return f"\n\n## {repo_name} FAQs\n\n" + blog_faqs
    except Exception as err:
        logger.error(f"Failed to generate FAQs from web research_report: {err}")
        return ""


def write_from_url(gh_url):
    """Write a getting-started blog for one GitHub repository and save it.

    Steps, in order: skip URLs that ``check_if_already_written`` reports,
    get the repo details (``stars``, ``forks``, ``about``, ``branch_name``)
    from ``get_gh_details_vision``, fetch the README, turn it into a blog
    with ``github_readme_blog``, append an FAQ section from online research
    (best effort), derive the blog metadata, save the blog to a file and
    finally record the URL as written.

    Args:
        gh_url: URL of the GitHub repository. The last two path segments are
            taken as ``<owner>/<repo>``; a trailing slash is tolerated.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 1 when the repo details cannot be
            fetched, no README can be found, or saving the blog fails.
        Exception: Re-raised when ``blog_metadata`` or ``append_id_to_file``
            fails; errors from the master-branch README fetch and from
            ``github_readme_blog`` also propagate.
    """
    if check_if_already_written(gh_url):
        logger.error(f"Skipping, already written on url: {gh_url}")
        return
    logger.info(f"Writing getting started from url: {gh_url}")

    # fixme: Remove the hardcoding, need add another option OR in config ?
    image_dir = os.path.join(os.getcwd(), "blog_images")
    generated_image_name = f"screenshot_image_{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.png"
    generated_image_filepath = os.path.join(image_dir, generated_image_name)
    try:
        logger.info(f"Getting github repo details from vision model: {generated_image_filepath}")
        gh_json = get_gh_details_vision(gh_url, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to get gemini vision details from GH repo image: {err}")
        sys.exit(1)

    # The blog opens with a fenced block summarising the repository.
    howto_blog = "```" + f"\nGithub URL:{gh_url}\nStars:{gh_json.get('stars')}\n"
    howto_blog += f"Forks:{gh_json.get('forks')}\n"
    howto_blog += f"Description:{gh_json.get('about')}\nBranch:{gh_json.get('branch_name')}\n" + "```\n\n"

    # Strip a trailing slash so the last two segments really are owner/repo.
    url_parts = gh_url.rstrip("/").split("/")
    repo_name = url_parts[-1]
    # Direct link to the raw content of README file.
    raw_readme_url_base = "https://raw.githubusercontent.com/" + "/".join(url_parts[-2:])
    readme_content = _fetch_readme(raw_readme_url_base, gh_json.get('branch_name'))

    # Create a getting-started blog, adapted from the GH url README.
    howto_blog += github_readme_blog(readme_content, "gemini")

    # Do online research for faqs on the github url; empty string on failure.
    howto_blog += _research_faq_section(gh_url, repo_name)

    logger.info(f"\n\nFinal Blog Content: {howto_blog}\n\n")

    try:
        blog_title, blog_meta_desc, blog_tags, blog_categories = blog_metadata(howto_blog, "gemini")
    except Exception as err:
        logger.error(f"Failed to get blog metadata: {err}")
        raise

    try:
        save_blog_to_file(howto_blog, blog_title, blog_meta_desc, blog_tags,
                          blog_categories, generated_image_filepath)
    except Exception as err:
        logger.error(f"Failed to save blog to a file: {err}")
        sys.exit(1)

    # Record the URL so later runs skip it.
    try:
        append_id_to_file(gh_url, "papers_already_written_on.txt")
    except Exception as err:
        logger.error(f"Failed to write/append ID to papers_already_written_on.txt: {err}")
        raise