ALwrity Version 0.5.0 (Fastapi + React )
This commit is contained in:
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
Gemini Audio Text Generation Module
|
||||
|
||||
This module provides a comprehensive interface for working with audio files using Google's Gemini API.
|
||||
It supports various audio processing capabilities including transcription, summarization, and analysis.
|
||||
|
||||
Key Features:
|
||||
------------
|
||||
1. Audio Transcription: Convert speech in audio files to text
|
||||
2. Audio Summarization: Generate concise summaries of audio content
|
||||
3. Segment Analysis: Analyze specific time segments of audio files
|
||||
4. Timestamped Transcription: Generate transcriptions with timestamps
|
||||
5. Token Counting: Count tokens in audio files
|
||||
6. Format Support: Information about supported audio formats
|
||||
|
||||
Supported Audio Formats:
|
||||
----------------------
|
||||
- WAV (audio/wav)
|
||||
- MP3 (audio/mp3)
|
||||
- AIFF (audio/aiff)
|
||||
- AAC (audio/aac)
|
||||
- OGG Vorbis (audio/ogg)
|
||||
- FLAC (audio/flac)
|
||||
|
||||
Technical Details:
|
||||
----------------
|
||||
- Each second of audio is represented as 32 tokens
|
||||
- Maximum supported length of audio data in a single prompt is 9.5 hours
|
||||
- Audio files are downsampled to 16 Kbps data resolution
|
||||
- Multi-channel audio is combined into a single channel
|
||||
|
||||
Usage:
|
||||
------
|
||||
```python
|
||||
from lib.gpt_providers.audio_to_text_generation.gemini_audio_text import transcribe_audio, summarize_audio, analyze_audio_segment
|
||||
|
||||
# Basic transcription
|
||||
transcript = transcribe_audio("path/to/audio.mp3")
|
||||
print(transcript)
|
||||
|
||||
# Summarization
|
||||
summary = summarize_audio("path/to/audio.mp3")
|
||||
print(summary)
|
||||
|
||||
# Analyze specific segment
|
||||
segment_analysis = analyze_audio_segment("path/to/audio.mp3", "02:30", "03:29")
|
||||
print(segment_analysis)
|
||||
```
|
||||
|
||||
Requirements:
|
||||
------------
|
||||
- GEMINI_API_KEY environment variable must be set
|
||||
- google-genai Python package (imported as google.genai)
|
||||
- python-dotenv for environment variable management
|
||||
- loguru for logging
|
||||
|
||||
Dependencies:
|
||||
------------
|
||||
- google.genai
|
||||
- dotenv
|
||||
- loguru
|
||||
- os, sys, base64, typing
|
||||
"""
|
||||
|
||||
import os
import sys
from pathlib import Path
from typing import List, Optional

import google.genai as genai
from google.genai import types
from loguru import logger
|
||||
# Drop every sink loguru has registered so far, then log to stdout only,
# using a compact colorized "LEVEL|file:line:function| message" layout.
logger.remove()
logger.add(
    sys.stdout,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
    colorize=True,
)
|
||||
|
||||
|
||||
def load_environment():
    """
    Loads environment variables from a .env file.

    Uses python-dotenv's ``load_dotenv``; the import is done here because the
    module never imports it at top level, which made every call fail with a
    NameError.
    """
    from dotenv import load_dotenv

    load_dotenv()
    logger.info("Environment variables loaded successfully.")
|
||||
|
||||
|
||||
def configure_google_api():
    """
    Builds a Gemini API client using the key stored in APIKeyManager.

    The ``google.genai`` SDK imported by this module has no module-level
    ``configure()``; credentials are bound to a ``genai.Client`` instance
    instead, so the client is returned for callers to use.

    Returns:
        genai.Client: A client authenticated with the stored Gemini API key.

    Raises:
        ValueError: If no Gemini API key is available.
    """
    # Imported lazily: the module does not import APIKeyManager at top level.
    # NOTE(review): relative path assumed from the package layout - confirm.
    from ...api_key_manager import APIKeyManager

    api_key = APIKeyManager().get_api_key("gemini")
    if not api_key:
        error_message = "Gemini API key not found. Please configure it in the onboarding process."
        logger.error(error_message)
        raise ValueError(error_message)

    client = genai.Client(api_key=api_key)
    logger.info("Google Gemini API configured successfully.")
    return client


# Model used when the caller does not choose one explicitly.
DEFAULT_GEMINI_AUDIO_MODEL = "gemini-1.5-flash"


def _upload_audio(audio_file_path: str):
    """
    Prepares a Gemini client and uploads the audio file through it.

    Args:
        audio_file_path (str): The path to the audio file to upload.

    Returns:
        tuple: ``(client, uploaded_file)`` on success, or None when the file
            does not exist or the upload fails (the reason is logged).

    Raises:
        ValueError: Propagated from configure_google_api() when no API key is
            configured.
    """
    load_environment()
    client = configure_google_api()

    if not os.path.exists(audio_file_path):
        logger.error(f"FileNotFoundError: The audio file at {audio_file_path} does not exist.")
        return None

    try:
        audio_file = client.files.upload(file=audio_file_path)
    except Exception as e:
        logger.error(f"Error uploading audio file: {e}")
        return None

    logger.info(f"Audio file uploaded successfully: {audio_file=}")
    return client, audio_file


def transcribe_audio(
    audio_file_path: str,
    prompt: str = "Transcribe the following audio:",
    model_name: str = DEFAULT_GEMINI_AUDIO_MODEL,
) -> Optional[str]:
    """
    Runs a prompt against an audio file using Google's Gemini model.

    Args:
        audio_file_path (str): The path to the audio file to be processed.
        prompt (str, optional): The instruction sent alongside the audio.
            Defaults to "Transcribe the following audio:".
        model_name (str, optional): Gemini model to use.
            Defaults to DEFAULT_GEMINI_AUDIO_MODEL.

    Returns:
        str: The text produced by the model.
        Returns None if the file is missing, the API key is not configured,
        the upload fails, or the model call fails. No exception is raised to
        the caller; every failure is logged instead.
    """
    try:
        logger.info(f"Attempting to transcribe audio file: {audio_file_path}")

        session = _upload_audio(audio_file_path)
        if session is None:
            return None
        client, audio_file = session

        try:
            response = client.models.generate_content(
                model=model_name,
                contents=[prompt, audio_file],
            )
        except Exception as e:
            logger.error(f"Error during transcription: {e}")
            return None

        transcript = getattr(response, "text", None)
        if transcript is None:
            logger.warning("Transcription failed: Invalid or empty response from API.")
            return None

        logger.info(f"Transcription successful:\n{transcript}")
        return transcript

    except Exception as e:
        # Boundary handler: callers rely on None rather than an exception.
        logger.error(f"An unexpected error occurred: {e}")
        return None


def summarize_audio(audio_file_path: str) -> Optional[str]:
    """
    Summarizes the content of an audio file using Google's Gemini model.

    Args:
        audio_file_path (str): The path to the audio file to be summarized.

    Returns:
        str: A summary of the audio content.
        Returns None if summarization fails.
    """
    return transcribe_audio(audio_file_path, prompt="Please summarize the audio content:")


def analyze_audio_segment(audio_file_path: str, start_time: str, end_time: str) -> Optional[str]:
    """
    Analyzes a specific segment of an audio file using timestamps.

    The timestamps are only interpolated into the prompt text; the whole
    file is still uploaded.

    Args:
        audio_file_path (str): The path to the audio file.
        start_time (str): Start time in MM:SS format.
        end_time (str): End time in MM:SS format.

    Returns:
        str: Analysis of the specified audio segment.
        Returns None if analysis fails.
    """
    prompt = f"Analyze the audio content from {start_time} to {end_time}."
    return transcribe_audio(audio_file_path, prompt=prompt)


def transcribe_with_timestamps(audio_file_path: str) -> Optional[str]:
    """
    Transcribes audio with timestamps for each segment.

    Args:
        audio_file_path (str): The path to the audio file.

    Returns:
        str: Transcription with timestamps.
        Returns None if transcription fails.
    """
    return transcribe_audio(audio_file_path, prompt="Transcribe the audio with timestamps for each segment:")


def count_tokens(audio_file_path: str, model_name: str = DEFAULT_GEMINI_AUDIO_MODEL) -> Optional[int]:
    """
    Counts the number of tokens in an audio file.

    Args:
        audio_file_path (str): The path to the audio file.
        model_name (str, optional): Gemini model whose tokenizer is used.
            Defaults to DEFAULT_GEMINI_AUDIO_MODEL.

    Returns:
        int: Number of tokens in the audio file.
        Returns None if counting fails for any reason (failures are logged).
    """
    try:
        logger.info(f"Attempting to count tokens in audio file: {audio_file_path}")

        session = _upload_audio(audio_file_path)
        if session is None:
            return None
        client, audio_file = session

        try:
            response = client.models.count_tokens(model=model_name, contents=[audio_file])
            token_count = response.total_tokens
        except Exception as e:
            logger.error(f"Error counting tokens: {e}")
            return None

        logger.info(f"Token count: {token_count}")
        return token_count

    except Exception as e:
        # Boundary handler: callers rely on None rather than an exception.
        logger.error(f"An unexpected error occurred: {e}")
        return None
|
||||
|
||||
|
||||
def get_supported_formats() -> List[str]:
    """
    Lists the audio MIME types this module advertises as supported.

    Returns:
        List[str]: A new list of MIME type strings on every call.
    """
    mime_types = (
        "audio/wav",
        "audio/mp3",
        "audio/aiff",
        "audio/aac",
        "audio/ogg",
        "audio/flac",
    )
    return list(mime_types)
|
||||
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
    # Demo run: replace the placeholder path with a real audio file first.
    audio_path = "path/to/your/audio.mp3"

    # 1. Plain transcription with the default prompt.
    transcript = transcribe_audio(audio_path)
    print(f"Transcript: {transcript}")

    # 2. Summary of the whole recording.
    summary = summarize_audio(audio_path)
    print(f"Summary: {summary}")

    # 3. Analysis limited to the 02:30 - 03:29 window.
    segment_analysis = analyze_audio_segment(audio_path, "02:30", "03:29")
    print(f"Segment Analysis: {segment_analysis}")

    # 4. Transcription annotated with timestamps.
    timestamped_transcript = transcribe_with_timestamps(audio_path)
    print(f"Timestamped Transcript: {timestamped_transcript}")

    # 5. Token count for the audio file.
    token_count = count_tokens(audio_path)
    print(f"Token Count: {token_count}")

    # 6. MIME types advertised as supported.
    formats = get_supported_formats()
    print(f"Supported Formats: {formats}")
|
||||
@@ -0,0 +1,218 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from pytubefix import YouTube
|
||||
from loguru import logger
|
||||
from openai import OpenAI
|
||||
from tqdm import tqdm
|
||||
import streamlit as st
|
||||
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_random_exponential,
|
||||
) # for exponential backoff
|
||||
|
||||
from .gemini_audio_text import transcribe_audio
|
||||
|
||||
# Import APIKeyManager
|
||||
from ...api_key_manager import APIKeyManager
|
||||
|
||||
|
||||
def progress_function(stream, chunk, bytes_remaining):
    """Download progress callback that advances the module-level progress bar.

    Args:
        stream: Stream being downloaded; only its ``filesize`` is read.
        chunk: Most recently received data (unused).
        bytes_remaining (int): Bytes still to be downloaded.
    """
    total_bytes = stream.filesize
    fraction_done = (total_bytes - bytes_remaining) / total_bytes
    # The bar tracks a 0..1 fraction, so advance it by the delta since the
    # last update rather than by an absolute value.
    progress_bar.update(fraction_done - progress_bar.n)
|
||||
|
||||
|
||||
def rename_file_with_underscores(file_path):
    """Rename a file so its name contains only word characters, '-', '_' and '.'.

    Every other character in the file name (spaces, brackets, ...) is replaced
    by an underscore. The directory part of the path is left untouched.

    Args:
        file_path (str): Path of the existing file to rename.

    Returns:
        str: Path of the file after renaming.
    """
    folder = os.path.dirname(file_path)
    unsafe_chars = re.compile(r'[^\w\-_\.]')
    safe_name = unsafe_chars.sub('_', os.path.basename(file_path))

    target_path = os.path.join(folder, safe_name)
    os.rename(file_path, target_path)
    return target_path
|
||||
|
||||
|
||||
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def speech_to_text(video_url):
    """
    Transcribes a YouTube video, or a local audio file, to text.

    For a ``www.youtube.com`` URL the audio stream is downloaded into the
    directory named by the CONTENT_SAVE_DIR environment variable and removed
    again afterwards. Any other value is treated as a path to an existing
    local audio file, which is never deleted.

    Transcription is attempted with OpenAI's Whisper model first; if that
    fails for any reason, the Gemini-based transcribe_audio() is used instead.

    Args:
        video_url (str): YouTube video URL, or path to a local audio file.

    Returns:
        tuple: ``(transcript, title)`` where title is the YouTube title, or
            the file name for local input. Returns None when processing
            fails; the error is shown through Streamlit. Because failures are
            reported rather than re-raised, the retry decorator only applies
            to exceptions that escape this handler.

    Raises:
        SystemExit: If the audio file is larger than the 24MB upload limit.
    """
    global progress_bar

    output_path = os.getenv("CONTENT_SAVE_DIR")
    yt = None
    audio_file = None
    # Only files this function downloaded may be removed during cleanup.
    downloaded = False

    with st.status("Started Writing..", expanded=False) as status:
        try:
            if video_url.startswith(("https://www.youtube.com/", "http://www.youtube.com/")):
                logger.info(f"Accessing YouTube URL: {video_url}")
                status.update(label=f"Accessing YouTube URL: {video_url}")
                try:
                    yt = YouTube(video_url, on_progress_callback=progress_function)
                except Exception as err:
                    logger.error(f"Failed to get pytube stream object: {err}")
                    st.stop()

                logger.info(f"Fetching the highest quality audio stream:{yt.title}")
                status.update(label=f"Fetching the highest quality audio stream: {yt.title}")
                try:
                    audio_stream = yt.streams.filter(only_audio=True).first()
                except Exception as err:
                    logger.error(f"Failed to Download Youtube Audio: {err}")
                    st.stop()

                if audio_stream is None:
                    logger.warning("No audio stream found for this video.")
                    st.warning("No audio stream found for this video.")
                    st.stop()

                logger.info(f"Downloading audio for: {yt.title}")
                status.update(label=f"Downloading audio for: {yt.title}")
                progress_bar = tqdm(total=1.0, unit='iB', unit_scale=True, desc=yt.title)
                try:
                    audio_filename = re.sub(r'[^\w\-_\.]', '_', yt.title) + '.mp4'
                    audio_file = audio_stream.download(
                        output_path=output_path,
                        filename=audio_filename)
                    downloaded = True
                except Exception as err:
                    # Without a file there is nothing to transcribe; stop
                    # here instead of failing later on a None path.
                    logger.error(f"Failed to download audio file: {err}")
                    st.error(f"Failed to download audio file: {err}")
                    st.stop()
                finally:
                    progress_bar.close()

                title = yt.title
                logger.info(f"Audio downloaded: {yt.title} to {audio_file}")
                status.update(label=f"Audio downloaded: {yt.title} to {output_path}")
            elif os.path.exists(video_url):
                # Audio filepath from local directory.
                audio_file = video_url
                title = os.path.basename(audio_file)
            else:
                raise ValueError(f"Not a YouTube URL or an existing audio file: {video_url}")

            # Checking file size
            max_file_size = 24 * 1024 * 1024  # 24MB
            file_size = os.path.getsize(audio_file)
            file_size_MB = file_size / (1024 * 1024)  # Convert bytes to MB

            logger.info(f"Downloaded Audio Size is: {file_size_MB:.2f} MB")
            status.update(label=f"Downloaded Audio Size is: {file_size_MB:.2f} MB")

            if file_size > max_file_size:
                logger.error("File size exceeds 24MB limit.")
                # FIXME: We can chunk hour long videos, the code is not tested.
                # long_video(audio_file)
                # Show the message before exiting; sys.exit() does not return.
                st.error("Audio File size limit exceeded. File a fixme/issues at ALwrity github.")
                sys.exit("File size limit exceeded.")

            try:
                status.update(label=f"Initializing OpenAI client for transcription: {audio_file}")
                logger.info(f"Initializing OpenAI client for transcription: {audio_file}")

                # Use APIKeyManager instead of direct environment variable access
                api_key = APIKeyManager().get_api_key("openai")
                if not api_key:
                    raise ValueError("OpenAI API key not found. Please configure it in the onboarding process.")

                client = OpenAI(api_key=api_key)

                logger.info("Transcribing using OpenAI's Whisper model.")
                with open(audio_file, "rb") as audio_handle:
                    transcript = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_handle,
                        response_format="text"
                    )
                logger.info(f"\nYouTube video transcription:\n{title}\n{transcript}\n")
                status.update(label=f"\nYouTube video transcription:\n{title}\n{transcript}\n")
                return transcript, title

            except Exception as e:
                # Whisper is the primary engine; fall back to Gemini on failure.
                logger.error(f"Failed in Whisper transcription: {e}")
                st.warning(f"Failed in Openai Whisper transcription: {e}")
                transcript = transcribe_audio(audio_file)
                logger.info(f"Gemini fallback transcription:\n{transcript}")
                return transcript, title

        except Exception as e:
            st.error(f"An error occurred during YouTube video processing: {e}")

        finally:
            if downloaded and audio_file:
                try:
                    if os.path.exists(audio_file):
                        os.remove(audio_file)
                        logger.info("Temporary audio file removed.")
                except PermissionError:
                    st.error(f"Permission error: Cannot remove '{audio_file}'. Please make sure of necessary permissions.")
                except Exception as e:
                    st.error(f"An error occurred removing audio file: {e}")
|
||||
|
||||
|
||||
def long_video(temp_file_name):
    """
    Transcribes a long audio file with OpenAI's Whisper API, chunk by chunk.

    The audio is split into 10-minute segments with moviepy. Each segment is
    written to a temporary MP3 file, transcribed on its own, and the temporary
    file is removed again even if transcription fails.

    Args:
        temp_file_name (str): Path to the audio file to transcribe.

    Returns:
        str: The chunk transcripts in order, each followed by a blank line.

    Raises:
        ValueError: If no OpenAI API key is configured.
    """
    # The module references moviepy as ``mp`` but never imports it; import it
    # here so the dependency is only needed when this function is used.
    # NOTE(review): ``subclip`` implies the moviepy 1.x API - confirm version.
    import moviepy.editor as mp

    api_key = APIKeyManager().get_api_key("openai")
    if not api_key:
        raise ValueError("OpenAI API key not found. Please configure it in the onboarding process.")
    client = OpenAI(api_key=api_key)

    # Extract audio and split into chunks
    logger.info(f"Processing the YT video: {temp_file_name}")
    full_audio = mp.AudioFileClip(temp_file_name)
    transcripts = []
    try:
        duration = full_audio.duration
        chunk_length = 600  # 10 minutes in seconds
        chunk_starts = range(0, int(duration), chunk_length)

        for i, start in enumerate(chunk_starts, start=1):
            chunk = full_audio.subclip(start, min(start + chunk_length, duration))

            # Reserve a path, then close the handle so moviepy can write to it.
            chunk_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
            chunk_file.close()
            try:
                chunk.write_audiofile(chunk_file.name, codec="mp3")

                # Transcribe each chunk using OpenAI's Whisper API
                logger.info(f"Transcribing chunk {i}/{len(chunk_starts)}")
                with open(chunk_file.name, "rb") as audio_file:
                    text = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="text"
                    )
                transcripts.append(text)
            finally:
                # Remove the chunk audio file
                os.remove(chunk_file.name)
    finally:
        full_audio.close()

    return "".join(f"{text}\n\n" for text in transcripts)
|
||||
|
||||
Reference in New Issue
Block a user