main_config changes - WIP

2024-03-27 22:19:16 +05:30
parent e5a5372a29
commit 74b7bc3cbe
16 changed files with 63 additions and 543 deletions
--- a/lib/speech_to_blog/main_audio_to_blog.py
+++ b/lib/speech_to_blog/main_audio_to_blog.py
@@ -0,0 +1,114 @@
+import json
+import os
+import datetime #I wish
+import sys
+
+import openai
+from tqdm import tqdm, trange
+import time
+import re
+from textwrap import dedent
+import nltk
+nltk.download('punkt', quiet=True)
+from nltk.corpus import stopwords
+nltk.download('stopwords', quiet=True)
+
+from .write_blogs_from_youtube_videos import youtube_to_blog
+from .wordpress_blog_uploader import compress_image, upload_blog_post, upload_media
+from .gpt_online_researcher import do_online_research
+
+from loguru import logger
+# Drop loguru's default sink, then log to stdout with a compact coloured
+# "LEVEL|file:line:function| message" layout.
+logger.remove()
+logger.add(
+    sys.stdout,
+    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
+    colorize=True,
+)
+
+
+def generate_youtube_blog(yt_url_list, output_format="markdown"):
+    """Generate, proofread and save a blog post for each YouTube URL.
+
+    Each URL is converted to a draft with ``youtube_to_blog``; online research
+    is run on it; then a meta description, title, tags, categories,
+    introduction, image and conclusion are generated, the text is proofread,
+    converted to the requested format and handed to ``save_blog_to_file``.
+
+    Args:
+        yt_url_list: Iterable of YouTube video URLs.
+        output_format: ``"markdown"`` (default) or ``"html"``; selects the
+            converter applied to the finished blog before saving.
+
+    Note:
+        A failure in any generation stage is logged and ends the process with
+        exit status 1. A failure while saving is only logged.
+    """
+    # NOTE(review): blog_with_research, generate_blog_description,
+    # generate_blog_title, get_blog_tags, get_blog_categories, get_blog_intro,
+    # generate_image, get_blog_conclusion, blog_proof_editor, the two
+    # converters and save_blog_to_file are not imported by this module as
+    # shown here - confirm they are brought into scope before merging.
+
+    # Same location as IMAGE_DIR in main_youtube_research_blog.py.
+    image_dir = os.path.join(os.getcwd(), "blog_images")
+
+    for a_yt_url in yt_url_list:
+        try:
+            logger.info(f"Starting to write blog on URL: {a_yt_url}")
+            # youtube_to_blog returns a (blog_content, video_title) pair.
+            yt_blog, _yt_title = youtube_to_blog(a_yt_url)
+        except Exception as e:
+            logger.error(f"Error in youtube_to_blog: {e}")
+            sys.exit(1)
+
+        try:
+            logger.info("Starting with online research for URL title.")
+            # NOTE(review): the log line mentions the title, but the draft
+            # text is what is passed - confirm which one is intended.
+            research_report = do_online_research(yt_blog)
+        except Exception as e:
+            logger.error(f"Error in do_online_research: {e}")
+            sys.exit(1)
+
+        try:
+            # Note: Check if the order of input matters for your function
+            logger.info("Preparing a blog content from audio script and online research content...")
+            # NOTE(review): the return value is discarded - confirm whether
+            # it is meant to replace yt_blog.
+            blog_with_research(research_report, yt_blog)
+        except Exception as e:
+            logger.error(f"Error in blog_with_research: {e}")
+            sys.exit(1)
+
+        try:
+            # Get the title and meta description of the blog.
+            blog_meta_desc = generate_blog_description(yt_blog)
+            title = generate_blog_title(blog_meta_desc)
+            logger.info(f"Title is {title} and description is {blog_meta_desc}")
+            blog_markdown_str = "# " + title.replace('"', '') + "\n\n"
+
+            # Get blog tags and categories.
+            blog_tags = get_blog_tags(blog_meta_desc)
+            logger.info(f"Blog tags are: {blog_tags}")
+            blog_categories = get_blog_categories(blog_meta_desc)
+            logger.info(f"Blog categories are: {blog_categories}")
+
+            # Generate an introduction for the blog.
+            blog_intro = get_blog_intro(title, yt_blog)
+            logger.info(f"The Blog intro is:\n {blog_intro}")
+            blog_markdown_str = blog_markdown_str + "\n\n" + f"{blog_intro}" + "\n\n"
+
+            # Generate an image based on the meta description.
+            logger.info(f"Calling Image generation with prompt: {blog_meta_desc}")
+            main_img_path = generate_image(blog_meta_desc, image_dir, "dalle3")
+
+            # TODO: embed a variation / stable-diffusion rendering of the
+            # video screenshot once a screenshot path is available.
+
+            # Add the body of the blog content.
+            blog_markdown_str = blog_markdown_str + "\n\n" + f'{yt_blog}' + "\n\n"
+
+            # Get the conclusion of the blog, by passing the generated blog.
+            blog_conclusion = get_blog_conclusion(blog_markdown_str)
+            # TBD: Add another image.
+            blog_markdown_str = blog_markdown_str + "### Conclusion" + "\n\n" + f"{blog_conclusion}" + "\n"
+
+            # Proofread the blog, edit and remove duplicates and refine it further.
+            # Presently, fixing the blog keywords to be tags and categories.
+            blog_keywords = f"{blog_tags} + {blog_categories}"
+            blog_markdown_str = blog_proof_editor(blog_markdown_str, blog_keywords)
+
+            # Convert to the format requested by the caller.
+            if 'html' in output_format:
+                blog_markdown_str = convert_markdown_to_html(blog_markdown_str)
+            elif 'markdown' in output_format:
+                blog_markdown_str = convert_tomarkdown_format(blog_markdown_str)
+
+            # Saving is best-effort: log the failure together with the
+            # content so the generated text is not lost.
+            try:
+                save_blog_to_file(blog_markdown_str, title, blog_meta_desc, blog_tags, blog_categories, main_img_path)
+            except Exception as err:
+                logger.error(f"Failed to Save blog content: {err}\n{blog_markdown_str}")
+
+        except Exception as e:
+            logger.error(f"Error: Failed to generate_youtube_blog: {e}")
+            sys.exit(1)
--- a/lib/speech_to_blog/main_youtube_research_blog.py
+++ b/lib/speech_to_blog/main_youtube_research_blog.py
@@ -0,0 +1,150 @@
+import json
+import os
+import sys
+from loguru import logger
+
+# Import from local packages
+from .gpt_providers.openai_chat_completion import openai_chatgpt
+from .gpt_providers.gpt_vision_img_details import analyze_and_extract_details_from_image
+from .generate_image_from_prompt import generate_image
+from .write_blogs_from_youtube_videos import youtube_to_blog
+from .wordpress_blog_uploader import compress_image, upload_blog_post, upload_media
+from .gpt_online_researcher import do_online_research
+from .save_blog_to_file import save_blog_to_file
+from .optimize_images_for_upload import optimize_image
+from .combine_research_and_blog import blog_with_research
+from .get_blog_meta_desc import generate_blog_description
+from .get_blog_title import generate_blog_title
+from .get_tags import get_blog_tags
+from .get_blog_category import get_blog_categories
+from .convert_content_to_markdown import convert_tomarkdown_format
+from .convert_markdown_to_html import convert_markdown_to_html
+from .utils.youtube_keyword_research import research_yt
+
+# Replace loguru's default sink with a coloured stdout sink.
+logger.remove()
+logger.add(
+    sys.stdout,
+    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
+    colorize=True,
+)
+
+# Directories (relative to the current working directory) for generated
+# images and finished blogs.
+IMAGE_DIR = os.path.join(os.getcwd(), "blog_images")
+OUTPUT_PATH = os.path.join(os.getcwd(), "blogs")
+
+
+def generate_youtube_research_blog(yt_keywords):
+    """Research YouTube for each keyword and draft a blog from every video found.
+
+    For each keyword ``research_yt`` is queried (with one retry on an empty
+    or failed response), the response is parsed with ``load_response_json``
+    and, for every video that has a URL, ``youtube_to_blog`` produces a first
+    draft which is logged. The details of each video are printed afterwards.
+
+    Args:
+        yt_keywords: Iterable of keyword strings to research.
+
+    Note:
+        A keyword whose research stays empty is skipped with a warning.
+        A response that cannot be parsed, or a video that cannot be turned
+        into a draft, is logged and ends the process with exit status 1.
+    """
+    for ayt_keyword in yt_keywords:
+        logger.info(f"Researching YouTube top videos for: {ayt_keyword}")
+
+        yt_research_response = ''
+        # The research call can fail or come back empty; allow one retry.
+        for _attempt in range(2):
+            try:
+                yt_research_response = research_yt(ayt_keyword)
+            except Exception as err:
+                logger.error(f"Failed to do YouTube Research: {err}")
+                yt_research_response = ''
+            if yt_research_response and yt_research_response.strip():
+                break
+
+        if not yt_research_response or not yt_research_response.strip():
+            logger.warning("Error: JSON data is empty.")
+            continue
+
+        try:
+            aggregated_data = load_response_json(yt_research_response, ayt_keyword)
+        except Exception as err:
+            logger.error(f"Failed to load json response: {err}")
+            sys.exit(1)
+
+        for title, a_yt_url, views, references, quickstart_code in zip(
+                aggregated_data["titles"], aggregated_data["urls"], aggregated_data["views"],
+                aggregated_data["references"], aggregated_data["quickstart_codes"]):
+            if a_yt_url != "No URL Provided":
+                # Transcribe the audio and write the first draft.
+                try:
+                    logger.info(f"Starting to write blog on URL: {a_yt_url}")
+                    blog_markdown_str, yt_title = youtube_to_blog(a_yt_url)
+                except Exception as e:
+                    logger.error(f"Error in youtube_to_blog: {e}")
+                    sys.exit(1)
+
+                logger.warning("\n\n--------------- First Draft of the Blog: --------\n\n")
+                logger.info(f"{blog_markdown_str}\n")
+                logger.warning("--------------------END of First draft----------\n\n")
+                if not yt_title or not blog_markdown_str:
+                    logger.error("No content or title for audio to proceed.")
+                    sys.exit(1)
+
+            # Print whichever details are real values rather than the
+            # placeholders filled in by process_results.
+            if title != "Unknown Title":
+                print(f"Title: {title}")
+            if a_yt_url != "No URL Provided":
+                print(f"URL: {a_yt_url}")
+            if views != "No View Count":
+                print(f"Views: {views}")
+            if references:  # Skips an empty references list
+                if isinstance(references, str):
+                    references = [references]
+                print(f"References: {', '.join(str(ref) for ref in references)}")
+            if quickstart_code != "Code coming soon":
+                print(f"Quickstart Code: {quickstart_code}")
+            print()  # Adds a newline for separation between entries
+
+
+
+def load_response_json(yt_research_response, yt_keyword):
+    """Parse the YouTube research response and aggregate its video entries.
+
+    Args:
+        yt_research_response (str): Raw JSON text. Backticks are removed
+            before parsing, so a response wrapped in a Markdown code fence
+            is accepted.
+        yt_keyword (str): Keyword the research was run for; used in error
+            messages only.
+
+    Returns:
+        dict: The aggregated data returned by ``process_results``.
+
+    Raises:
+        ValueError: If the text is not valid JSON, or the parsed data is
+            neither a list nor a dict with a key starting with "result".
+    """
+    logger.info(f"Loading the JSON data for parsing: {yt_research_response}")
+    cleaned = yt_research_response.replace('`', '').strip()
+    # Stripping the backticks of a ```json fence leaves the language tag
+    # behind; valid JSON never starts with it, so it is safe to drop.
+    if cleaned[:4].lower() == "json":
+        cleaned = cleaned[4:].lstrip()
+
+    try:
+        data = json.loads(cleaned)
+    except json.JSONDecodeError as e:
+        logger.error(f"load_response_json: Failed to parse JSON data: {e}")
+        raise ValueError(
+            f"YouTube research response for '{yt_keyword}' is not valid JSON"
+        ) from e
+
+    if isinstance(data, list):
+        return process_results(data)
+
+    if isinstance(data, dict):
+        # Accept "results", "Result", "results_list", ... as the payload key.
+        results_key = next((key for key in data if key.lower().startswith("result")), None)
+        if results_key:
+            return process_results(data[results_key])
+
+    raise ValueError(
+        f"YouTube research response for '{yt_keyword}' has no usable results"
+    )
+
+
+def process_results(results):
+    """
+    Aggregate the entries of the YouTube research JSON into parallel lists.
+
+    Missing fields are filled with placeholders ("Unknown Title",
+    "No URL Provided", "No View Count", an empty list, "Code coming soon").
+    Entries that are not dictionaries are logged and skipped, so the five
+    lists always have the same length.
+
+    Args:
+        results (list): List of dictionaries containing YouTube video details.
+            ``None`` is treated as an empty list.
+
+    Returns:
+        dict: A dictionary containing lists of titles, URLs, views, references, and quickstart codes.
+    """
+    aggregated = {
+        "titles": [],
+        "urls": [],
+        "views": [],
+        "references": [],
+        "quickstart_codes": [],
+    }
+
+    for entry in results or []:
+        if not isinstance(entry, dict):
+            logger.error(
+                f"Error processing yt result entry: expected a dict, got {type(entry).__name__}"
+            )
+            continue
+
+        aggregated["titles"].append(entry.get("Title", "Unknown Title"))
+        aggregated["urls"].append(entry.get("URL", "No URL Provided"))
+        aggregated["views"].append(entry.get("Views", "No View Count"))
+        aggregated["references"].append(entry.get("References", []))
+        aggregated["quickstart_codes"].append(entry.get("Quickstart_Code", "Code coming soon"))
+
+    return aggregated
--- a/lib/speech_to_blog/write_blogs_from_youtube_videos.py
+++ b/lib/speech_to_blog/write_blogs_from_youtube_videos.py
@@ -0,0 +1,97 @@
+import os
+import time
+import sys
+
+from pytube import YouTube
+import tempfile
+import openai
+from html2image import Html2Image
+from tqdm import tqdm, trange
+import google.generativeai as genai
+
+from loguru import logger
+# Drop loguru's default sink, then log to stdout with a compact coloured
+# "LEVEL|file:line:function| message" layout.
+logger.remove()
+logger.add(
+    sys.stdout,
+    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
+    colorize=True,
+)
+
+from .gpt_providers.stt_audio_blog import speech_to_text
+from .gpt_providers.openai_chat_completion import openai_chatgpt
+
+
+def youtube_to_blog(video_url):
+    """Transcribe a YouTube video and turn the transcript into blog content.
+
+    Args:
+        video_url (str): URL of the YouTube video.
+
+    Returns:
+        tuple: ``(blog_content, video_title)`` - the result of
+        ``summarize_youtube_video`` and the title reported by
+        ``speech_to_text``.
+
+    Note:
+        An exception in either step is logged and ends the process with
+        exit status 1.
+    """
+    # TODO: taking a screenshot of the video page (Html2Image) is disabled;
+    # when re-enabled, extract the video id in a way that also handles URLs
+    # without a "v=" query parameter.
+
+    try:
+        # Starting the speech-to-text process
+        logger.info("Starting with Speech to Text.")
+        audio_text, audio_title = speech_to_text(video_url)
+    except Exception as e:
+        logger.error(f"Error in speech_to_text: {e}")
+        sys.exit(1)  # Exit the program due to error in speech_to_text
+
+    try:
+        # Summarizing the content of the YouTube video
+        audio_blog_content = summarize_youtube_video(audio_text, "gemini")
+    except Exception as e:
+        logger.error(f"Error in summarize_youtube_video: {e}")
+        sys.exit(1)  # Exit the program due to error in summarize_youtube_video
+
+    logger.info("Successfully converted given URL to blog article.")
+    return audio_blog_content, audio_title
+
+
+def summarize_youtube_video(user_content, gpt_providers):
+    """Turn a video transcript into a blog article using the selected LLM provider.
+
+    Args:
+      user_content: The transcript text to rewrite as a blog article.
+      gpt_providers: Provider selector; a value containing 'gemini' uses the
+        Gemini 'gemini-pro' model, one containing 'openai' uses
+        ``openai_chatgpt``.
+    Returns:
+      The generated article text, or None when the Gemini request fails or
+      the provider is not recognised.
+    Raises:
+      SystemError: If the OpenAI request fails.
+    """
+
+    logger.info("Start summarize_youtube_video..")
+    prompt = f"""
+        You are an expert copywriter specializing in digital content writing. I will provide you with a transcript. 
+        Your task is to transform a given transcript into a well-structured and informative blog article. 
+        Please follow the below objectives:
+
+        1. Master the Transcript: Understand main ideas, key points, and the core message.
+        2. Sentence Structure: Rephrase while preserving logical flow and coherence. Dont quote anyone from video.
+        3. Note: Check if the transcript is about programming, then include code examples and snippets in your article.
+        4. Write Unique Content: Avoid direct copying; rewrite in your own words. 
+        5. REMEMBER to avoid direct quoting and maintain uniqueness.
+        6. Proofread: Check for grammar, spelling, and punctuation errors.
+        7. Use Creative and Human-like Style: Incorporate contractions, idioms, transitional phrases, interjections, and colloquialisms.        8. Avoid repetitive phrases and unnatural sentence structures.
+        9. Ensure Uniqueness: Guarantee the article is plagiarism-free.
+        10. Punctuation: Use appropriate question marks at the end of questions.
+        11. Pass AI Detection Tools: Create content that easily passes AI plagiarism detection tools.
+        12. Rephrase words like 'video, youtube, channel' with 'article, blog' and such suitable words.
+
+        Follow the above guidelines to create a well-optimized, unique, and informative article,
+        that will rank well in search engine results and engage readers effectively.
+        Follow above guidelines to craft a blog content from the following transcript:\n{user_content}
+        """
+    if 'gemini' in gpt_providers:
+        try:
+            genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
+        except Exception as err:
+            logger.error(f"Failed in getting GEMINI_API_KEY: {err}")
+        # Use gemini-pro model for text and image.
+        model = genai.GenerativeModel('gemini-pro')
+        try:
+            response = model.generate_content(prompt)
+            return response.text
+        except Exception as err:
+            # Best-effort: the caller receives None and decides what to do.
+            logger.error(f"Failed to get response from gemini: {err}")
+            return None
+    elif 'openai' in gpt_providers:
+        try:
+            return openai_chatgpt(prompt)
+        except Exception as err:
+            # The exception was previously constructed but never raised,
+            # which hid the failure from the caller.
+            raise SystemError(f"Error in generating blog summary: {err}") from err
+
+    logger.error(f"Unsupported gpt provider: {gpt_providers}")
+    return None