YT to blog, bug fixes - WIP
This commit is contained in:
@@ -2,10 +2,13 @@ import os
|
|||||||
import datetime #I wish
|
import datetime #I wish
|
||||||
import sys
|
import sys
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
import openai
|
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from pytubefix import YouTube
|
||||||
|
import tempfile
|
||||||
|
from html2image import Html2Image
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
logger.remove()
|
logger.remove()
|
||||||
logger.add(sys.stdout,
|
logger.add(sys.stdout,
|
||||||
@@ -13,11 +16,70 @@ logger.add(sys.stdout,
|
|||||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||||
)
|
)
|
||||||
|
|
||||||
from .write_blogs_from_youtube_videos import youtube_to_blog
|
|
||||||
from ...ai_web_researcher.gpt_online_researcher import do_google_serp_search
|
from ...ai_web_researcher.gpt_online_researcher import do_google_serp_search
|
||||||
from ..blog_from_google_serp import blog_with_research
|
from ..blog_from_google_serp import blog_with_research
|
||||||
from ...blog_metadata.get_blog_metadata import blog_metadata
|
from ...blog_metadata.get_blog_metadata import blog_metadata
|
||||||
from ...blog_postprocessing.save_blog_to_file import save_blog_to_file
|
from ...blog_postprocessing.save_blog_to_file import save_blog_to_file
|
||||||
|
from ...gpt_providers.audio_to_text_generation.stt_audio_blog import speech_to_text
|
||||||
|
from ...gpt_providers.text_generation.main_text_generation import llm_text_gen
|
||||||
|
|
||||||
|
|
||||||
|
def youtube_to_blog(video_url):
|
||||||
|
"""Function to transcribe a given youtube url """
|
||||||
|
try:
|
||||||
|
# Starting the speech-to-text process
|
||||||
|
logger.info("Starting with Speech to Text.")
|
||||||
|
audio_text, audio_title = speech_to_text(video_url)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in speech_to_text: {e}")
|
||||||
|
sys.exit(1) # Exit the program due to error in speech_to_text
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Summarizing the content of the YouTube video
|
||||||
|
audio_blog_content = summarize_youtube_video(audio_text)
|
||||||
|
logger.info("Successfully converted given URL to blog article.")
|
||||||
|
return audio_blog_content, audio_title
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in summarize_youtube_video: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def summarize_youtube_video(user_content):
|
||||||
|
"""Generates a summary of a YouTube video using OpenAI GPT-3 and displays a progress bar.
|
||||||
|
Args:
|
||||||
|
video_link: The URL of the YouTube video to summarize.
|
||||||
|
Returns:
|
||||||
|
A string containing the summary of the video.
|
||||||
|
"""
|
||||||
|
|
||||||
|
logger.info("Start summarize_youtube_video..")
|
||||||
|
prompt = f"""
|
||||||
|
You are an expert copywriter specializing in digital content writing. I will provide you with a transcript.
|
||||||
|
Your task is to transform a given transcript into a well-structured and informative blog article.
|
||||||
|
Please follow the below objectives:
|
||||||
|
|
||||||
|
1. Master the Transcript: Understand main ideas, key points, and the core message.
|
||||||
|
2. Sentence Structure: Rephrase while preserving logical flow and coherence. Dont quote anyone from video.
|
||||||
|
3. Note: Check if the transcript is about programming, then include code examples and snippets in your article.
|
||||||
|
4. Write Unique Content: Avoid direct copying; rewrite in your own words.
|
||||||
|
5. REMEMBER to avoid direct quoting and maintain uniqueness.
|
||||||
|
6. Proofread: Check for grammar, spelling, and punctuation errors.
|
||||||
|
7. Use Creative and Human-like Style: Incorporate contractions, idioms, transitional phrases, interjections, and colloquialisms. 8. Avoid repetitive phrases and unnatural sentence structures.
|
||||||
|
9. Ensure Uniqueness: Guarantee the article is plagiarism-free.
|
||||||
|
10. Punctuation: Use appropriate question marks at the end of questions.
|
||||||
|
11. Pass AI Detection Tools: Create content that easily passes AI plagiarism detection tools.
|
||||||
|
12. Rephrase words like 'video, youtube, channel' with 'article, blog' and such suitable words.
|
||||||
|
|
||||||
|
Follow the above guidelines to create a well-optimized, unique, and informative article,
|
||||||
|
that will rank well in search engine results and engage readers effectively.
|
||||||
|
Follow above guidelines to craft a blog content from the following transcript:\n{user_content}
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
response = llm_text_gen(prompt)
|
||||||
|
return response
|
||||||
|
except Exception as err:
|
||||||
|
logger.error(f"Failed to summarize_youtube_video: {err}")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
def generate_audio_blog(audio_input):
|
def generate_audio_blog(audio_input):
|
||||||
|
|||||||
@@ -1,82 +0,0 @@
|
|||||||
import os
|
|
||||||
import time
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from pytube import YouTube
|
|
||||||
import tempfile
|
|
||||||
import openai
|
|
||||||
from html2image import Html2Image
|
|
||||||
from tqdm import tqdm, trange
|
|
||||||
import google.generativeai as genai
|
|
||||||
|
|
||||||
from loguru import logger
|
|
||||||
logger.remove()
|
|
||||||
logger.add(sys.stdout,
|
|
||||||
colorize=True,
|
|
||||||
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
from ...gpt_providers.audio_to_text_generation.stt_audio_blog import speech_to_text
|
|
||||||
from ...gpt_providers.text_generation.main_text_generation import llm_text_gen
|
|
||||||
|
|
||||||
|
|
||||||
def youtube_to_blog(video_url):
|
|
||||||
"""Function to transcribe a given youtube url """
|
|
||||||
# fixme: Doesnt work all types of yt urls.
|
|
||||||
vid_id = video_url.split("=")[1]
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Starting the speech-to-text process
|
|
||||||
logger.info("Starting with Speech to Text.")
|
|
||||||
audio_text, audio_title = speech_to_text(video_url)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in speech_to_text: {e}")
|
|
||||||
sys.exit(1) # Exit the program due to error in speech_to_text
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Summarizing the content of the YouTube video
|
|
||||||
audio_blog_content = summarize_youtube_video(audio_text)
|
|
||||||
logger.info("Successfully converted given URL to blog article.")
|
|
||||||
return audio_blog_content, audio_title
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in summarize_youtube_video: {e}")
|
|
||||||
sys.exit(1) # Exit the program due to error in summarize_youtube_video
|
|
||||||
|
|
||||||
|
|
||||||
def summarize_youtube_video(user_content):
|
|
||||||
"""Generates a summary of a YouTube video using OpenAI GPT-3 and displays a progress bar.
|
|
||||||
Args:
|
|
||||||
video_link: The URL of the YouTube video to summarize.
|
|
||||||
Returns:
|
|
||||||
A string containing the summary of the video.
|
|
||||||
"""
|
|
||||||
|
|
||||||
logger.info("Start summarize_youtube_video..")
|
|
||||||
prompt = f"""
|
|
||||||
You are an expert copywriter specializing in digital content writing. I will provide you with a transcript.
|
|
||||||
Your task is to transform a given transcript into a well-structured and informative blog article.
|
|
||||||
Please follow the below objectives:
|
|
||||||
|
|
||||||
1. Master the Transcript: Understand main ideas, key points, and the core message.
|
|
||||||
2. Sentence Structure: Rephrase while preserving logical flow and coherence. Dont quote anyone from video.
|
|
||||||
3. Note: Check if the transcript is about programming, then include code examples and snippets in your article.
|
|
||||||
4. Write Unique Content: Avoid direct copying; rewrite in your own words.
|
|
||||||
5. REMEMBER to avoid direct quoting and maintain uniqueness.
|
|
||||||
6. Proofread: Check for grammar, spelling, and punctuation errors.
|
|
||||||
7. Use Creative and Human-like Style: Incorporate contractions, idioms, transitional phrases, interjections, and colloquialisms. 8. Avoid repetitive phrases and unnatural sentence structures.
|
|
||||||
9. Ensure Uniqueness: Guarantee the article is plagiarism-free.
|
|
||||||
10. Punctuation: Use appropriate question marks at the end of questions.
|
|
||||||
11. Pass AI Detection Tools: Create content that easily passes AI plagiarism detection tools.
|
|
||||||
12. Rephrase words like 'video, youtube, channel' with 'article, blog' and such suitable words.
|
|
||||||
|
|
||||||
Follow the above guidelines to create a well-optimized, unique, and informative article,
|
|
||||||
that will rank well in search engine results and engage readers effectively.
|
|
||||||
Follow above guidelines to craft a blog content from the following transcript:\n{user_content}
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
response = llm_text_gen(prompt)
|
|
||||||
return response
|
|
||||||
except Exception as err:
|
|
||||||
logger.error(f"Failed to summarize_youtube_video: {err}")
|
|
||||||
exit(1)
|
|
||||||
@@ -1,61 +1,101 @@
|
|||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
import google.generativeai as genai
|
import google.generativeai as genai
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
logger.remove()
|
||||||
|
logger.add(sys.stdout,
|
||||||
|
colorize=True,
|
||||||
|
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_environment():
|
def load_environment():
|
||||||
"""Load environment variables from a .env file."""
|
"""Loads environment variables from a .env file."""
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
logger.info("Environment variables loaded successfully.")
|
||||||
|
|
||||||
|
|
||||||
def configure_google_api():
|
def configure_google_api():
|
||||||
"""Configure the Google API for audio summarization."""
|
"""Configures the Google Gemini API for audio transcription.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the GEMINI_API_KEY environment variable is not set.
|
||||||
|
"""
|
||||||
api_key = os.getenv("GEMINI_API_KEY")
|
api_key = os.getenv("GEMINI_API_KEY")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
raise ValueError("Google API key not found. Please set the GEMINI_API_KEY environment variable.")
|
error_message = "Google API key not found. Please set the GEMINI_API_KEY environment variable."
|
||||||
|
logger.error(error_message)
|
||||||
|
raise ValueError(error_message)
|
||||||
|
|
||||||
genai.configure(api_key=api_key)
|
genai.configure(api_key=api_key)
|
||||||
|
logger.info("Google Gemini API configured successfully.")
|
||||||
|
|
||||||
|
|
||||||
def transcribe_audio(audio_file_path):
|
def transcribe_audio(audio_file_path):
|
||||||
"""Summarize the audio using Google's Generative API.
|
"""
|
||||||
|
Transcribes audio using Google's Gemini Pro model.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
audio_file_path (str): The path to the audio file to be summarized.
|
audio_file_path (str): The path to the audio file to be transcribed.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: The summary text of the audio.
|
str: The transcribed text from the audio.
|
||||||
|
Returns None if transcription fails.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: If the audio file path is invalid or the API response is not successful.
|
FileNotFoundError: If the audio file is not found.
|
||||||
Exception: For any other errors that occur during the process.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Load environment variables and configure API
|
# Load environment variables and configure the Google API
|
||||||
load_environment()
|
load_environment()
|
||||||
configure_google_api()
|
configure_google_api()
|
||||||
|
|
||||||
# Create generative model instance
|
logger.info(f"Attempting to transcribe audio file: {audio_file_path}")
|
||||||
model = genai.GenerativeModel("models/gemini-1.5-pro-latest")
|
|
||||||
audio_file = None
|
# Check if file exists
|
||||||
try:
|
if not os.path.exists(audio_file_path):
|
||||||
# Upload the audio file
|
error_message = f"FileNotFoundError: The audio file at {audio_file_path} does not exist."
|
||||||
audio_file = genai.upload_file(path=audio_file_path)
|
logger.error(error_message)
|
||||||
except Exception as err:
|
raise FileNotFoundError(error_message)
|
||||||
print(err)
|
|
||||||
# Generate the summary
|
# Initialize a Gemini model appropriate for your use case.
|
||||||
response = model.generate_content(
|
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
|
||||||
[
|
|
||||||
"Listen carefully to the given following audio file. Transcribe the following given audio.",
|
# Upload the audio file
|
||||||
audio_file
|
try:
|
||||||
]
|
audio_file = genai.upload_file(audio_file_path)
|
||||||
)
|
logger.info(f"Audio file uploaded successfully: {audio_file=}")
|
||||||
|
except FileNotFoundError:
|
||||||
# Check if the response contains text
|
error_message = f"FileNotFoundError: The audio file at {audio_file_path} does not exist."
|
||||||
if not hasattr(response, 'text'):
|
logger.error(error_message)
|
||||||
raise ValueError("The API response does not contain text.")
|
raise FileNotFoundError(error_message)
|
||||||
|
except Exception as e:
|
||||||
return response.text
|
logger.error(f"Error uploading audio file: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Generate the transcription
|
||||||
|
try:
|
||||||
|
response = model.generate_content([
|
||||||
|
"Transcribe the following audio:",
|
||||||
|
audio_file
|
||||||
|
])
|
||||||
|
|
||||||
|
# Check for valid response and extract text
|
||||||
|
if response and hasattr(response, 'text'):
|
||||||
|
transcript = response.text
|
||||||
|
logger.info(f"Transcription successful:\n{transcript}")
|
||||||
|
return transcript
|
||||||
|
else:
|
||||||
|
logger.warning("Transcription failed: Invalid or empty response from API.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error during transcription: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
except ValueError as ve:
|
|
||||||
print(f"ValueError: {ve}")
|
|
||||||
except FileNotFoundError:
|
|
||||||
print(f"FileNotFoundError: The audio file at {audio_file_path} does not exist.")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"An error occurred: {e}")
|
logger.error(f"An unexpected error occurred: {e}")
|
||||||
|
return None
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from pytube import YouTube
|
from pytubefix import YouTube
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
@@ -22,6 +22,7 @@ def progress_function(stream, chunk, bytes_remaining):
|
|||||||
current = ((stream.filesize - bytes_remaining) / stream.filesize)
|
current = ((stream.filesize - bytes_remaining) / stream.filesize)
|
||||||
progress_bar.update(current - progress_bar.n) # Update the progress bar
|
progress_bar.update(current - progress_bar.n) # Update the progress bar
|
||||||
|
|
||||||
|
|
||||||
def rename_file_with_underscores(file_path):
|
def rename_file_with_underscores(file_path):
|
||||||
"""Rename a file by replacing spaces and special characters with underscores.
|
"""Rename a file by replacing spaces and special characters with underscores.
|
||||||
|
|
||||||
@@ -62,22 +63,32 @@ def speech_to_text(video_url):
|
|||||||
SystemExit: If a critical error occurs that prevents successful execution.
|
SystemExit: If a critical error occurs that prevents successful execution.
|
||||||
"""
|
"""
|
||||||
output_path = os.getenv("CONTENT_SAVE_DIR")
|
output_path = os.getenv("CONTENT_SAVE_DIR")
|
||||||
|
yt = None
|
||||||
|
audio_file = None
|
||||||
with st.status("Started Writing..", expanded=False) as status:
|
with st.status("Started Writing..", expanded=False) as status:
|
||||||
try:
|
try:
|
||||||
audio_file = None
|
|
||||||
if video_url.startswith("https://www.youtube.com/") or video_url.startswith("http://www.youtube.com/"):
|
if video_url.startswith("https://www.youtube.com/") or video_url.startswith("http://www.youtube.com/"):
|
||||||
logger.info(f"Accessing YouTube URL: {video_url}")
|
logger.info(f"Accessing YouTube URL: {video_url}")
|
||||||
status.update(label=f"Accessing YouTube URL: {video_url}")
|
status.update(label=f"Accessing YouTube URL: {video_url}")
|
||||||
yt = YouTube(video_url, on_progress_callback=progress_function)
|
try:
|
||||||
|
vid_id = video_url.split("=")[1]
|
||||||
logger.info("Fetching the highest quality audio stream")
|
yt = YouTube(video_url, on_progress_callback=progress_function)
|
||||||
status.update(label="Fetching the highest quality audio stream")
|
except Exception as err:
|
||||||
audio_stream = yt.streams.filter(only_audio=True).first()
|
logger.error(f"Failed to get pytube stream object: {err}")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
logger.info(f"Fetching the highest quality audio stream:{yt.title}")
|
||||||
|
status.update(label=f"Fetching the highest quality audio stream: {yt.title}")
|
||||||
|
try:
|
||||||
|
audio_stream = yt.streams.filter(only_audio=True).first()
|
||||||
|
except Exception as err:
|
||||||
|
logger.error(f"Failed to Download Youtube Audio: {err}")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
if audio_stream is None:
|
if audio_stream is None:
|
||||||
logger.warning("No audio stream found for this video.")
|
logger.warning("No audio stream found for this video.")
|
||||||
st.warning("No audio stream found for this video.")
|
st.warning("No audio stream found for this video.")
|
||||||
return None
|
st.stop()
|
||||||
|
|
||||||
logger.info(f"Downloading audio for: {yt.title}")
|
logger.info(f"Downloading audio for: {yt.title}")
|
||||||
status.update(label=f"Downloading audio for: {yt.title}")
|
status.update(label=f"Downloading audio for: {yt.title}")
|
||||||
@@ -113,9 +124,13 @@ def speech_to_text(video_url):
|
|||||||
# FIXME: We can chunk hour long videos, the code is not tested.
|
# FIXME: We can chunk hour long videos, the code is not tested.
|
||||||
#long_video(audio_file)
|
#long_video(audio_file)
|
||||||
sys.exit("File size limit exceeded.")
|
sys.exit("File size limit exceeded.")
|
||||||
st.error("Audio File size limit exceeded.")
|
st.error("Audio File size limit exceeded. File a fixme/issues at ALwrity github.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
print(f"Audio File: {audio_file}")
|
||||||
|
transcript = transcribe_audio(audio_file)
|
||||||
|
print(f"\n\n\n--- Tracribe: {transcript} ----\n\n\n")
|
||||||
|
exit(1)
|
||||||
status.update(label=f"Initializing OpenAI client for transcription: {audio_file}")
|
status.update(label=f"Initializing OpenAI client for transcription: {audio_file}")
|
||||||
logger.info(f"Initializing OpenAI client for transcription: {audio_file}")
|
logger.info(f"Initializing OpenAI client for transcription: {audio_file}")
|
||||||
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
||||||
@@ -170,7 +185,7 @@ def long_video(temp_file_name):
|
|||||||
video_url (str): URL of the YouTube video to be transcribed.
|
video_url (str): URL of the YouTube video to be transcribed.
|
||||||
"""
|
"""
|
||||||
# Extract audio and split into chunks
|
# Extract audio and split into chunks
|
||||||
app.logger.info(f"Processing the YT video: {temp_file_name}")
|
logger.info(f"Processing the YT video: {temp_file_name}")
|
||||||
full_audio = mp.AudioFileClip(temp_file_name)
|
full_audio = mp.AudioFileClip(temp_file_name)
|
||||||
duration = full_audio.duration
|
duration = full_audio.duration
|
||||||
chunk_length = 600 # 10 minutes in seconds
|
chunk_length = 600 # 10 minutes in seconds
|
||||||
|
|||||||
@@ -169,7 +169,8 @@ def blog_from_keyword():
|
|||||||
st.error('🚫 Blog keywords should be at least two words long. Please try again.')
|
st.error('🚫 Blog keywords should be at least two words long. Please try again.')
|
||||||
|
|
||||||
elif input_type == "youtube_url" or input_type == "audio_file":
|
elif input_type == "youtube_url" or input_type == "audio_file":
|
||||||
generate_audio_blog(user_input)
|
if not generate_audio_blog(user_input):
|
||||||
|
st.stop()
|
||||||
|
|
||||||
elif input_type == "web_url":
|
elif input_type == "web_url":
|
||||||
blog_from_url(user_input)
|
blog_from_url(user_input)
|
||||||
|
|||||||
@@ -38,4 +38,5 @@ streamlit-mic-recorder
|
|||||||
tinify
|
tinify
|
||||||
cloudscraper
|
cloudscraper
|
||||||
xmlschema
|
xmlschema
|
||||||
|
moviepy
|
||||||
googlesearch-python
|
googlesearch-python
|
||||||
|
|||||||
Reference in New Issue
Block a user