ALwrity Version 0.5.0 (Fastapi + React )

This commit is contained in:
ajaysi
2025-08-06 12:48:02 +05:30
parent f28a919caa
commit 32f97fa6b3
476 changed files with 115544 additions and 28747 deletions

View File

@@ -0,0 +1,311 @@
"""
Gemini Audio Text Generation Module
This module provides a comprehensive interface for working with audio files using Google's Gemini API.
It supports various audio processing capabilities including transcription, summarization, and analysis.
Key Features:
------------
1. Audio Transcription: Convert speech in audio files to text
2. Audio Summarization: Generate concise summaries of audio content
3. Segment Analysis: Analyze specific time segments of audio files
4. Timestamped Transcription: Generate transcriptions with timestamps
5. Token Counting: Count tokens in audio files
6. Format Support: Information about supported audio formats
Supported Audio Formats:
----------------------
- WAV (audio/wav)
- MP3 (audio/mp3)
- AIFF (audio/aiff)
- AAC (audio/aac)
- OGG Vorbis (audio/ogg)
- FLAC (audio/flac)
Technical Details:
----------------
- Each second of audio is represented as 32 tokens
- Maximum supported length of audio data in a single prompt is 9.5 hours
- Audio files are downsampled to 16 Kbps data resolution
- Multi-channel audio is combined into a single channel
Usage:
------
```python
from lib.gpt_providers.audio_to_text_generation.gemini_audio_text import transcribe_audio, summarize_audio, analyze_audio_segment
# Basic transcription
transcript = transcribe_audio("path/to/audio.mp3")
print(transcript)
# Summarization
summary = summarize_audio("path/to/audio.mp3")
print(summary)
# Analyze specific segment
segment_analysis = analyze_audio_segment("path/to/audio.mp3", "02:30", "03:29")
print(segment_analysis)
```
Requirements:
------------
- GEMINI_API_KEY environment variable must be set
- google-genai Python package (provides the `google.genai` module imported below)
- python-dotenv for environment variable management
- loguru for logging
Dependencies:
------------
- google.genai
- dotenv
- loguru
- os, sys, base64, typing
"""
import os
import sys
from pathlib import Path
from typing import List, Optional

import google.genai as genai
from google.genai import types
from loguru import logger
# Drop loguru's default handler, then register a single colourised stdout sink
# so every message from this module shares one "<level>|<file:line:function>| msg" layout.
logger.remove()
logger.add(
    sys.stdout,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
    colorize=True,
)
def load_environment():
    """Load environment variables from a .env file using python-dotenv."""
    # `load_dotenv` is called here but is not imported at module level, so the
    # original call raised NameError. Import it at function scope to keep the
    # fix local to this helper.
    from dotenv import load_dotenv

    load_dotenv()
    logger.info("Environment variables loaded successfully.")
def configure_google_api():
    """
    Configure the Google Gemini SDK with the API key stored for the "gemini" provider.

    The key is obtained through ``APIKeyManager`` rather than being read directly
    from an environment variable.

    Raises:
        ValueError: If the key manager returns no Gemini API key.
    """
    # `APIKeyManager` is used here but is not imported at module level, so the
    # original raised NameError. The relative path matches the one used by the
    # YouTube speech-to-text module that lives in this same package.
    from ...api_key_manager import APIKeyManager

    api_key = APIKeyManager().get_api_key("gemini")
    if not api_key:
        error_message = "Gemini API key not found. Please configure it in the onboarding process."
        logger.error(error_message)
        raise ValueError(error_message)

    # NOTE(review): `configure` is the module-level entry point of the older
    # `google.generativeai` SDK; confirm it is available on the `google.genai`
    # module this file imports (that SDK is normally driven via `genai.Client`).
    genai.configure(api_key=api_key)
    logger.info("Google Gemini API configured successfully.")
def transcribe_audio(audio_file_path: str, prompt: str = "Transcribe the following audio:") -> Optional[str]:
    """
    Upload an audio file to Gemini together with a text prompt and return the reply text.

    Args:
        audio_file_path (str): Path of the audio file to upload.
        prompt (str, optional): Instruction sent alongside the audio.
            Defaults to "Transcribe the following audio:".

    Returns:
        Optional[str]: ``response.text`` from the model, or None when any step fails.

    Note:
        No exception leaves this function. The FileNotFoundError raised below for
        a missing file is caught by the outermost handler, logged as an unexpected
        error, and converted into a None return value.
    """
    try:
        # Environment loading and SDK configuration are repeated on every call.
        load_environment()
        configure_google_api()
        logger.info(f"Attempting to transcribe audio file: {audio_file_path}")

        file_is_missing = not os.path.exists(audio_file_path)
        if file_is_missing:
            error_message = f"FileNotFoundError: The audio file at {audio_file_path} does not exist."
            logger.error(error_message)
            raise FileNotFoundError(error_message)

        # NOTE(review): `GenerativeModel` / `upload_file` are module-level helpers of
        # the older `google.generativeai` SDK; confirm they exist on the imported
        # `google.genai` module.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        # Step 1: upload. A missing file is re-raised (and ends up in the outer
        # handler); every other upload problem ends the call with None.
        try:
            audio_file = genai.upload_file(audio_file_path)
            logger.info(f"Audio file uploaded successfully: {audio_file=}")
        except FileNotFoundError:
            error_message = f"FileNotFoundError: The audio file at {audio_file_path} does not exist."
            logger.error(error_message)
            raise FileNotFoundError(error_message)
        except Exception as upload_error:
            logger.error(f"Error uploading audio file: {upload_error}")
            return None

        # Step 2: generate. The prompt goes first, followed by the uploaded file handle.
        try:
            response = model.generate_content([prompt, audio_file])
            if not (response and hasattr(response, 'text')):
                logger.warning("Transcription failed: Invalid or empty response from API.")
                return None
            transcript = response.text
            logger.info(f"Transcription successful:\n{transcript}")
            return transcript
        except Exception as generation_error:
            logger.error(f"Error during transcription: {generation_error}")
            return None
    except Exception as unexpected_error:
        logger.error(f"An unexpected error occurred: {unexpected_error}")
        return None
def summarize_audio(audio_file_path: str) -> Optional[str]:
    """
    Ask Gemini for a summary of an audio file.

    Thin wrapper around ``transcribe_audio`` that swaps in a summarisation prompt.

    Args:
        audio_file_path (str): Path of the audio file to summarise.

    Returns:
        Optional[str]: Whatever ``transcribe_audio`` returns for the summary
        prompt (None when it fails).
    """
    summary_prompt = "Please summarize the audio content:"
    return transcribe_audio(audio_file_path, prompt=summary_prompt)
def analyze_audio_segment(audio_file_path: str, start_time: str, end_time: str) -> Optional[str]:
    """
    Ask Gemini to analyse one time range of an audio file.

    The file is not trimmed locally: the whole file is handed to
    ``transcribe_audio`` and the time range is only expressed in the prompt.
    Both timestamps are interpolated into the prompt verbatim, without validation.

    Args:
        audio_file_path (str): Path of the audio file.
        start_time (str): Start of the segment (MM:SS is the intended format).
        end_time (str): End of the segment (MM:SS is the intended format).

    Returns:
        Optional[str]: Whatever ``transcribe_audio`` returns for the segment
        prompt (None when it fails).
    """
    return transcribe_audio(
        audio_file_path,
        prompt=f"Analyze the audio content from {start_time} to {end_time}.",
    )
def transcribe_with_timestamps(audio_file_path: str) -> Optional[str]:
    """
    Ask Gemini for a transcription that carries a timestamp per segment.

    Thin wrapper around ``transcribe_audio`` with a timestamp-requesting prompt.

    Args:
        audio_file_path (str): Path of the audio file.

    Returns:
        Optional[str]: Whatever ``transcribe_audio`` returns for the timestamp
        prompt (None when it fails).
    """
    timestamp_prompt = "Transcribe the audio with timestamps for each segment:"
    return transcribe_audio(audio_file_path, prompt=timestamp_prompt)
def count_tokens(audio_file_path: str) -> Optional[int]:
    """
    Upload an audio file to Gemini and return the token count the model reports for it.

    Args:
        audio_file_path (str): Path of the audio file.

    Returns:
        Optional[int]: ``total_tokens`` from the count-tokens response, or None
        when any step fails.

    Note:
        No exception leaves this function. The FileNotFoundError raised below for
        a missing file is caught by the outermost handler and turned into None.
    """
    try:
        # Environment loading and SDK configuration are repeated on every call.
        load_environment()
        configure_google_api()
        logger.info(f"Attempting to count tokens in audio file: {audio_file_path}")

        file_is_missing = not os.path.exists(audio_file_path)
        if file_is_missing:
            error_message = f"FileNotFoundError: The audio file at {audio_file_path} does not exist."
            logger.error(error_message)
            raise FileNotFoundError(error_message)

        # NOTE(review): `GenerativeModel` / `upload_file` are module-level helpers of
        # the older `google.generativeai` SDK; confirm they exist on the imported
        # `google.genai` module.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        # Step 1: upload the file; any failure here ends the call with None.
        try:
            audio_file = genai.upload_file(audio_file_path)
            logger.info(f"Audio file uploaded successfully: {audio_file=}")
        except Exception as upload_error:
            logger.error(f"Error uploading audio file: {upload_error}")
            return None

        # Step 2: ask the model how many tokens the uploaded file accounts for.
        try:
            usage = model.count_tokens([audio_file])
            total = usage.total_tokens
            logger.info(f"Token count: {total}")
            return total
        except Exception as count_error:
            logger.error(f"Error counting tokens: {count_error}")
            return None
    except Exception as unexpected_error:
        logger.error(f"An unexpected error occurred: {unexpected_error}")
        return None
def get_supported_formats() -> List[str]:
    """
    List the audio MIME types this module advertises as supported.

    Returns:
        List[str]: A new list on every call, in the order
        wav, mp3, aiff, aac, ogg, flac, each prefixed with "audio/".
    """
    subtypes = ("wav", "mp3", "aiff", "aac", "ogg", "flac")
    return [f"audio/{subtype}" for subtype in subtypes]
# Example usage
if __name__ == "__main__":
    # Placeholder path: point this at a real audio file before running the demo.
    demo_audio = "path/to/your/audio.mp3"

    # Example 1: Basic transcription
    print(f"Transcript: {transcribe_audio(demo_audio)}")

    # Example 2: Summarization
    print(f"Summary: {summarize_audio(demo_audio)}")

    # Example 3: Analyze specific segment
    print(f"Segment Analysis: {analyze_audio_segment(demo_audio, '02:30', '03:29')}")

    # Example 4: Transcription with timestamps
    print(f"Timestamped Transcript: {transcribe_with_timestamps(demo_audio)}")

    # Example 5: Count tokens
    print(f"Token Count: {count_tokens(demo_audio)}")

    # Example 6: Get supported formats
    print(f"Supported Formats: {get_supported_formats()}")

View File

@@ -0,0 +1,218 @@
import os
import re
import sys
import tempfile
from pytubefix import YouTube
from loguru import logger
from openai import OpenAI
from tqdm import tqdm
import streamlit as st
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
) # for exponential backoff
from .gemini_audio_text import transcribe_audio
# Import APIKeyManager
from ...api_key_manager import APIKeyManager
def progress_function(stream, chunk, bytes_remaining):
    """
    Download-progress callback: advance the module-level ``progress_bar``.

    Args:
        stream: Stream being downloaded; only its ``filesize`` attribute is read.
        chunk: Unused; accepted because the callback is invoked with three arguments.
        bytes_remaining: Number of bytes still to be downloaded.
    """
    total_bytes = stream.filesize
    # Completed fraction in the range 0..1 (the bar is created with total=1.0).
    fraction_done = (total_bytes - bytes_remaining) / total_bytes
    # tqdm.update() takes an increment, so subtract what the bar already shows.
    progress_bar.update(fraction_done - progress_bar.n)
def rename_file_with_underscores(file_path):
    """Rename a file on disk so its name contains only safe characters.

    Every character of the file name that is not a word character, hyphen,
    underscore or dot is replaced by an underscore. The directory part of the
    path is left untouched.

    Args:
        file_path (str): Path of the existing file.

    Returns:
        str: Path of the file after renaming.
    """
    directory = os.path.dirname(file_path)
    sanitized_name = re.sub(r'[^\w\-_\.]', '_', os.path.basename(file_path))
    target_path = os.path.join(directory, sanitized_name)
    os.rename(file_path, target_path)
    return target_path
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def speech_to_text(video_url):
    """
    Transcribe the audio of a YouTube video, or of a local audio file, to text.

    A ``www.youtube.com`` URL is downloaded (audio only) into the directory named
    by the ``CONTENT_SAVE_DIR`` environment variable. Any other value is treated
    as the path of an existing local audio file. Transcription is attempted with
    OpenAI's ``whisper-1`` model first; if that attempt raises, the function falls
    back to ``transcribe_audio`` (Gemini).

    Args:
        video_url (str): YouTube video URL, or path to a local audio file.

    Returns:
        tuple: ``(transcript, title)`` where ``title`` is the YouTube title, or the
        file name for a local file. ``None`` is returned when processing fails;
        the error is then reported through ``st.error``.

    Raises:
        SystemExit: If the audio file is larger than 24 MB.
    """
    # Shared with progress_function(), which is the download-progress callback.
    global progress_bar

    output_path = os.getenv("CONTENT_SAVE_DIR")
    yt = None
    audio_file = None
    # Only a file downloaded by this call may be deleted in the cleanup step;
    # a caller-supplied local file must be left in place.
    downloaded = False

    with st.status("Started Writing..", expanded=False) as status:
        try:
            if video_url.startswith(("https://www.youtube.com/", "http://www.youtube.com/")):
                logger.info(f"Accessing YouTube URL: {video_url}")
                status.update(label=f"Accessing YouTube URL: {video_url}")
                try:
                    yt = YouTube(video_url, on_progress_callback=progress_function)
                except Exception as err:
                    logger.error(f"Failed to get pytube stream object: {err}")
                    st.stop()

                logger.info(f"Fetching the highest quality audio stream:{yt.title}")
                status.update(label=f"Fetching the highest quality audio stream: {yt.title}")
                try:
                    audio_stream = yt.streams.filter(only_audio=True).first()
                except Exception as err:
                    logger.error(f"Failed to Download Youtube Audio: {err}")
                    st.stop()

                if audio_stream is None:
                    logger.warning("No audio stream found for this video.")
                    st.warning("No audio stream found for this video.")
                    st.stop()

                logger.info(f"Downloading audio for: {yt.title}")
                status.update(label=f"Downloading audio for: {yt.title}")
                # total=1.0 because progress_function reports a 0..1 fraction.
                progress_bar = tqdm(total=1.0, unit='iB', unit_scale=True, desc=yt.title)
                try:
                    audio_filename = re.sub(r'[^\w\-_\.]', '_', yt.title) + '.mp4'
                    audio_file = audio_stream.download(
                        output_path=output_path,
                        filename=audio_filename)
                    downloaded = True
                except Exception as err:
                    # Stop here like the other download-stage failures: without a
                    # file there is nothing to size-check or transcribe.
                    logger.error(f"Failed to download audio file: {err}")
                    st.stop()
                finally:
                    progress_bar.close()

                logger.info(f"Audio downloaded: {yt.title} to {audio_file}")
                status.update(label=f"Audio downloaded: {yt.title} to {output_path}")
                title = yt.title
            elif os.path.exists(video_url):
                # Audio file path from the local directory.
                audio_file = video_url
                title = os.path.basename(video_url)
            else:
                raise ValueError(
                    f"Input is neither a YouTube URL nor an existing audio file: {video_url}")

            # Whisper rejects large uploads, so enforce a 24MB ceiling up front.
            max_file_size = 24 * 1024 * 1024  # 24MB
            file_size = os.path.getsize(audio_file)
            file_size_MB = file_size / (1024 * 1024)  # bytes -> MB, for logging only
            logger.info(f"Downloaded Audio Size is: {file_size_MB:.2f} MB")
            status.update(label=f"Downloaded Audio Size is: {file_size_MB:.2f} MB")

            if file_size > max_file_size:
                logger.error("File size exceeds 24MB limit.")
                # FIXME: hour-long audio could be chunked with long_video(), but
                # that code path is untested.
                # Show the message before exiting; the original placed it after
                # sys.exit(), where it could never run.
                st.error("Audio File size limit exceeded. File a fixme/issues at ALwrity github.")
                sys.exit("File size limit exceeded.")

            try:
                status.update(label=f"Initializing OpenAI client for transcription: {audio_file}")
                logger.info(f"Initializing OpenAI client for transcription: {audio_file}")
                # The key comes from APIKeyManager rather than the environment.
                api_key = APIKeyManager().get_api_key("openai")
                if not api_key:
                    raise ValueError("OpenAI API key not found. Please configure it in the onboarding process.")
                client = OpenAI(api_key=api_key)

                logger.info("Transcribing using OpenAI's Whisper model.")
                # Context manager so the file handle is closed even if the call fails.
                with open(audio_file, "rb") as audio_handle:
                    transcript = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_handle,
                        response_format="text"
                    )
                logger.info(f"\nYouTube video transcription:\n{title}\n{transcript}\n")
                status.update(label=f"\nYouTube video transcription:\n{title}\n{transcript}\n")
                return transcript, title
            except Exception as e:
                # Whisper failed (including a missing key): fall back to Gemini.
                logger.error(f"Failed in Whisper transcription: {e}")
                st.warning(f"Failed in Openai Whisper transcription: {e}")
                transcript = transcribe_audio(audio_file)
                return transcript, title
        except Exception as e:
            logger.error(f"An error occurred during YouTube video processing: {e}")
            st.error(f"An error occurred during YouTube video processing: {e}")
            return None
        finally:
            if downloaded and audio_file:
                try:
                    if os.path.exists(audio_file):
                        os.remove(audio_file)
                        logger.info("Temporary audio file removed.")
                except PermissionError:
                    st.error(f"Permission error: Cannot remove '{audio_file}'. Please make sure of necessary permissions.")
                except Exception as e:
                    st.error(f"An error occurred removing audio file: {e}")
def long_video(temp_file_name):
    """
    Transcribe a long audio/video file with OpenAI's Whisper API, chunk by chunk.

    The audio is cut into 10-minute segments. Each segment is written to its own
    temporary .mp3 file, transcribed separately, and the temporary file is then
    removed. The per-chunk transcripts are joined in order.

    Args:
        temp_file_name (str): Path of the local media file to transcribe.

    Returns:
        str: The chunk transcripts in order, separated by blank lines. Empty
        string when the clip is shorter than one second (no chunks produced).

    Raises:
        ValueError: If no OpenAI API key is configured.
    """
    # NOTE(review): the original referenced `mp` without importing it. The
    # `subclip` call below matches the moviepy 1.x API, hence `moviepy.editor`;
    # confirm against the installed moviepy version.
    import moviepy.editor as mp

    # Same client setup as speech_to_text(); the original called the legacy
    # `openai.Audio.transcribe` through a module name that is not imported here.
    api_key = APIKeyManager().get_api_key("openai")
    if not api_key:
        raise ValueError("OpenAI API key not found. Please configure it in the onboarding process.")
    client = OpenAI(api_key=api_key)

    logger.info(f"Processing the YT video: {temp_file_name}")
    chunk_length = 600  # 10 minutes in seconds
    transcripts = []

    full_audio = mp.AudioFileClip(temp_file_name)
    try:
        duration = full_audio.duration
        chunk_starts = range(0, int(duration), chunk_length)
        for index, start in enumerate(chunk_starts, start=1):
            chunk = full_audio.subclip(start, min(start + chunk_length, duration))

            # Reserve a temp path and close the handle before the clip is
            # written, so nothing else holds the file open during the write.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as reserved:
                chunk_path = reserved.name
            try:
                chunk.write_audiofile(chunk_path, codec="mp3")
                logger.info(f"Transcribing chunk {index}/{len(chunk_starts)}")
                # Binary mode takes no `encoding` argument; passing one raised
                # ValueError in the original.
                with open(chunk_path, "rb") as audio_file:
                    transcript = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="text"
                    )
                transcripts.append(transcript)
            finally:
                # Remove the chunk file even when writing or transcription fails.
                os.remove(chunk_path)
    finally:
        full_audio.close()

    return "\n\n".join(transcripts)