Gemini AI common code and utils

This commit is contained in:
ajaysi
2025-04-11 17:47:55 +05:30
parent b556edb989
commit e41be5789a
18 changed files with 590 additions and 262 deletions

View File

@@ -1,8 +1,75 @@
"""
Gemini Audio Text Generation Module
This module provides a comprehensive interface for working with audio files using Google's Gemini API.
It supports various audio processing capabilities including transcription, summarization, and analysis.
Key Features:
------------
1. Audio Transcription: Convert speech in audio files to text
2. Audio Summarization: Generate concise summaries of audio content
3. Segment Analysis: Analyze specific time segments of audio files
4. Timestamped Transcription: Generate transcriptions with timestamps
5. Token Counting: Count tokens in audio files
6. Format Support: Information about supported audio formats
Supported Audio Formats:
----------------------
- WAV (audio/wav)
- MP3 (audio/mp3)
- AIFF (audio/aiff)
- AAC (audio/aac)
- OGG Vorbis (audio/ogg)
- FLAC (audio/flac)
Technical Details:
----------------
- Each second of audio is represented as 32 tokens
- Maximum supported length of audio data in a single prompt is 9.5 hours
- Audio files are downsampled to 16 Kbps data resolution
- Multi-channel audio is combined into a single channel
Usage:
------
```python
from lib.gpt_providers.audio_to_text_generation.gemini_audio_text import transcribe_audio, summarize_audio, analyze_audio_segment
# Basic transcription
transcript = transcribe_audio("path/to/audio.mp3")
print(transcript)
# Summarization
summary = summarize_audio("path/to/audio.mp3")
print(summary)
# Analyze specific segment
segment_analysis = analyze_audio_segment("path/to/audio.mp3", "02:30", "03:29")
print(segment_analysis)
```
Requirements:
------------
- GEMINI_API_KEY environment variable must be set
- google-generativeai Python package
- python-dotenv for environment variable management
- loguru for logging
Dependencies:
------------
- google.genai
- dotenv
- loguru
- os, sys, base64, typing
"""
import os
import sys
import google.generativeai as genai
import base64
from typing import Optional, Dict, Any, List, Union
from dotenv import load_dotenv
from google import genai
from google.genai import types
from loguru import logger
logger.remove()
@@ -34,12 +101,13 @@ def configure_google_api():
logger.info("Google Gemini API configured successfully.")
def transcribe_audio(audio_file_path):
def transcribe_audio(audio_file_path: str, prompt: str = "Transcribe the following audio:") -> Optional[str]:
"""
Transcribes audio using Google's Gemini Pro model.
Transcribes audio using Google's Gemini model.
Args:
audio_file_path (str): The path to the audio file to be transcribed.
prompt (str, optional): The prompt to guide the transcription. Defaults to "Transcribe the following audio:".
Returns:
str: The transcribed text from the audio.
@@ -61,7 +129,7 @@ def transcribe_audio(audio_file_path):
logger.error(error_message)
raise FileNotFoundError(error_message)
# Initialize a Gemini model appropriate for your use case.
# Initialize a Gemini model appropriate for audio understanding
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
# Upload the audio file
@@ -79,7 +147,7 @@ def transcribe_audio(audio_file_path):
# Generate the transcription
try:
response = model.generate_content([
"Transcribe the following audio:",
prompt,
audio_file
])
@@ -99,3 +167,143 @@ def transcribe_audio(audio_file_path):
except Exception as e:
logger.error(f"An unexpected error occurred: {e}")
return None
def summarize_audio(audio_file_path: str) -> Optional[str]:
    """
    Produce a summary of an audio file with Google's Gemini model.

    This is a thin wrapper: it forwards the file to ``transcribe_audio`` with
    a summarization prompt in place of the default transcription prompt.

    Args:
        audio_file_path (str): Path of the audio file to summarize.

    Returns:
        Optional[str]: The text returned by ``transcribe_audio`` for the
        summarization prompt, or None when that call reports a failure.
    """
    summary_prompt = "Please summarize the audio content:"
    return transcribe_audio(audio_file_path, prompt=summary_prompt)
def analyze_audio_segment(audio_file_path: str, start_time: str, end_time: str) -> Optional[str]:
    """
    Ask Gemini to analyze one time range of an audio file.

    The two timestamps are only interpolated into the prompt text; this
    function neither validates them nor trims the audio itself. The whole
    file is handed to ``transcribe_audio`` together with that prompt.

    Args:
        audio_file_path (str): Path of the audio file.
        start_time (str): Start of the segment, in MM:SS format.
        end_time (str): End of the segment, in MM:SS format.

    Returns:
        Optional[str]: The text returned by ``transcribe_audio`` for the
        segment prompt, or None when that call reports a failure.
    """
    return transcribe_audio(
        audio_file_path,
        prompt=f"Analyze the audio content from {start_time} to {end_time}.",
    )
def transcribe_with_timestamps(audio_file_path: str) -> Optional[str]:
    """
    Transcribe an audio file, asking the model to timestamp each segment.

    Delegates to ``transcribe_audio`` with a prompt that requests timestamps;
    the layout of the timestamps is whatever the model chooses to emit.

    Args:
        audio_file_path (str): Path of the audio file.

    Returns:
        Optional[str]: The text returned by ``transcribe_audio`` for the
        timestamp prompt, or None when that call reports a failure.
    """
    timestamp_prompt = "Transcribe the audio with timestamps for each segment:"
    return transcribe_audio(audio_file_path, prompt=timestamp_prompt)
def count_tokens(audio_file_path: str) -> Optional[int]:
    """
    Count the tokens Gemini attributes to an audio file.

    The file is uploaded through the legacy ``google.generativeai`` SDK and
    the uploaded handle is passed to ``GenerativeModel.count_tokens``.

    Args:
        audio_file_path (str): The path to the audio file.

    Returns:
        Optional[int]: The ``total_tokens`` value reported by the API, or
        None when the file does not exist, the upload fails, or counting
        fails. Every failure is logged; no exception reaches the caller.
    """
    try:
        # Import the legacy SDK under its own alias. At module level the name
        # ``genai`` is bound twice (``import google.generativeai as genai`` and,
        # later, ``from google import genai``); the later binding wins and does
        # not provide ``GenerativeModel`` / ``upload_file``, so using the
        # module-level name here fails with AttributeError on every call.
        import google.generativeai as legacy_genai

        # Load environment variables and configure the Google API.
        # NOTE(review): both helpers are defined outside this block; confirm
        # that configure_google_api() configures the google.generativeai SDK.
        load_environment()
        configure_google_api()

        logger.info(f"Attempting to count tokens in audio file: {audio_file_path}")

        # A missing file is an expected failure: log it and report None
        # directly instead of raising only to be caught a few lines below.
        if not os.path.exists(audio_file_path):
            error_message = f"FileNotFoundError: The audio file at {audio_file_path} does not exist."
            logger.error(error_message)
            return None

        # Initialize a Gemini model
        model = legacy_genai.GenerativeModel(model_name="gemini-1.5-flash")

        # Upload the audio file
        try:
            audio_file = legacy_genai.upload_file(audio_file_path)
            logger.info(f"Audio file uploaded successfully: {audio_file=}")
        except Exception as e:
            logger.error(f"Error uploading audio file: {e}")
            return None

        # Count tokens
        try:
            response = model.count_tokens([audio_file])
            token_count = response.total_tokens
            logger.info(f"Token count: {token_count}")
            return token_count
        except Exception as e:
            logger.error(f"Error counting tokens: {e}")
            return None

    except Exception as e:
        # Last-resort boundary: keep the "returns None on failure" contract.
        logger.error(f"An unexpected error occurred: {e}")
        return None
def get_supported_formats() -> List[str]:
    """
    List the audio MIME types this module advertises as supported.

    Returns:
        List[str]: A freshly built list of MIME type strings, in a fixed
        order; mutating it does not affect later calls.
    """
    mime_types = (
        "audio/wav",
        "audio/mp3",
        "audio/aiff",
        "audio/aac",
        "audio/ogg",
        "audio/flac",
    )
    return list(mime_types)
# Example usage: run each public helper once against a placeholder path.
if __name__ == "__main__":
    audio_path = "path/to/your/audio.mp3"

    # Example 1: Basic transcription
    print(f"Transcript: {transcribe_audio(audio_path)}")

    # Example 2: Summarization
    print(f"Summary: {summarize_audio(audio_path)}")

    # Example 3: Analyze specific segment
    print(f"Segment Analysis: {analyze_audio_segment(audio_path, '02:30', '03:29')}")

    # Example 4: Transcription with timestamps
    print(f"Timestamped Transcript: {transcribe_with_timestamps(audio_path)}")

    # Example 5: Count tokens
    print(f"Token Count: {count_tokens(audio_path)}")

    # Example 6: Get supported formats
    print(f"Supported Formats: {get_supported_formats()}")

View File

@@ -0,0 +1,116 @@
"""
Gemini Image Description Module
This module provides functionality to generate text descriptions of images using Google's Gemini API.
"""
import os
import sys
from typing import Optional, Union, List
from google import genai
from PIL import Image
from dotenv import load_dotenv
from loguru import logger
# Remove the handlers loguru has registered so far, so the sink added below
# is configured from a clean state.
logger.remove()
# Send log records to stdout, colourised, as: LEVEL|file:line:function| message
logger.add(sys.stdout,
colorize=True,
format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}"
)
def describe_image(image_path: str, prompt: str = "Describe this image in detail:") -> Optional[str]:
    """
    Generate a text description of an image using Google's Gemini API.

    Parameters:
        image_path (str): Path to the image file.
        prompt (str, optional): Custom prompt to guide the image description.
            Defaults to "Describe this image in detail:".

    Returns:
        Optional[str]: The generated description of the image, or None if an
        error occurs. A missing GEMINI_API_KEY, a missing image file, an
        unreadable image and an API failure are all logged and reported as
        None; this function does not raise to its caller.
    """
    try:
        # Load environment variables
        load_dotenv()

        # Check if API key is set
        api_key = os.getenv('GEMINI_API_KEY')
        if not api_key:
            logger.error("GEMINI_API_KEY environment variable is not set")
            return None

        # Check if image file exists
        if not os.path.exists(image_path):
            logger.error(f"Image file not found: {image_path}")
            return None

        # Initialize the Gemini client
        client = genai.Client(api_key=api_key)

        # Open the image
        try:
            image = Image.open(image_path)
        except Exception as e:
            logger.error(f"Failed to open image: {e}")
            return None
        logger.info(f"Successfully opened image: {image_path}")

        # Image.open keeps the underlying file open; the context manager
        # closes it once the request is done, whether it succeeded or not.
        with image:
            try:
                response = client.models.generate_content(
                    model='gemini-2.0-flash',
                    contents=[
                        prompt,
                        image
                    ]
                )

                # Extract and return the text
                description = response.text
                logger.info(f"Successfully generated description for image: {image_path}")
                return description
            except Exception as e:
                logger.error(f"Failed to generate content: {e}")
                return None

    except Exception as e:
        # Last-resort boundary: keep the "returns None on failure" contract.
        logger.error(f"An unexpected error occurred: {e}")
        return None
def analyze_image_with_prompt(image_path: str, prompt: str) -> Optional[str]:
    """
    Run a caller-supplied prompt against an image via Google's Gemini API.

    Equivalent to calling ``describe_image`` with an explicit prompt; the
    only difference is that the prompt is required here.

    Parameters:
        image_path (str): Path to the image file.
        prompt (str): Prompt text to send along with the image.

    Returns:
        Optional[str]: Whatever ``describe_image`` returns for this image and
        prompt (the generated text, or None if an error occurs).
    """
    analysis = describe_image(image_path, prompt)
    return analysis
# Example usage: describe one placeholder image and report the outcome.
if __name__ == "__main__":
    image_path = "path/to/your/image.jpg"
    description = describe_image(image_path)

    # describe_image yields None (falsy) on failure, so handle that case first.
    if not description:
        print("Failed to generate image description")
    else:
        print(f"Image description: {description}")

View File

@@ -1,94 +0,0 @@
import requests
import re
import base64
import os
import sys
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
) # for exponential backoff
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def analyze_and_extract_details_from_image(image_path):
    """
    Analyzes an image using OpenAI's Vision API to extract Alt Text, Description, Title, and Caption.

    The image is base64-encoded and sent to the chat completions endpoint as a
    data URL. The reply text is then searched with regular expressions for the
    four labelled fields.

    Args:
        image_path (str): Path to the image file.

    Returns:
        dict: Keys 'alt_text', 'description', 'title' and 'caption'. A field
        whose pattern is not found in the reply is set to "N/A".

    Raises:
        SystemExit: On any request failure or other unexpected error, after
        the error has been logged. The function never returns None.
    """
    # NOTE(review): `logger` is assumed to exist at module scope; it is not
    # among the imports shown with this function. TODO confirm.
    try:
        logger.info("Starting image analysis using OpenAI's Vision API.")

        def encode_image(path):
            """ Encodes an image to a base64 string. """
            # Binary mode takes no encoding argument; passing one makes
            # open() raise ValueError before any byte is read.
            with open(path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')

        base64_image = encode_image(image_path)
        logger.info("Image encoded to base64 successfully.")

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Analyze the given image and suggest the following: Alternative text(Alt Text), description, title, caption."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }

        # (connect, read) timeout: without one, a stalled connection would
        # block this call indefinitely.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=(10, 120),
        )
        response.raise_for_status()

        assistant_message = response.json()['choices'][0]['message']['content']
        logger.info("Received response from OpenAI API.")

        # Extracting details using regular expressions
        alt_text_match = re.search(r'Alt Text: "(.*?)"', assistant_message)
        description_match = re.search(r'Description: (.*?)\n\n', assistant_message)
        title_match = re.search(r'Title: "(.*?)"', assistant_message)
        caption_match = re.search(r'Caption: "(.*?)"', assistant_message)

        image_details = {
            'alt_text': alt_text_match.group(1) if alt_text_match else "N/A",
            'description': description_match.group(1) if description_match else "N/A",
            'title': title_match.group(1) if title_match else "N/A",
            'caption': caption_match.group(1) if caption_match else "N/A"
        }

        logger.info("Image analysis completed successfully.")
        return image_details

    except requests.RequestException as e:
        logger.error(f"GPT-Vision API communication failure. Error: {e}")
        sys.exit(f"Exiting due to GPT-Vision API communication failure: {e}")
    except Exception as e:
        logger.error(f"Unexpected error occurred during image analysis: {e}")
        sys.exit(f"Exiting due to an unexpected error: {e}")