Gemini AI common code and utils

This commit is contained in:
ajaysi
2025-04-11 17:47:55 +05:30
parent b556edb989
commit e41be5789a
18 changed files with 590 additions and 262 deletions

View File

@@ -0,0 +1,116 @@
"""
Gemini Image Description Module
This module provides functionality to generate text descriptions of images using Google's Gemini API.
"""
import os
import sys
from typing import Optional, Union, List
from google import genai
from PIL import Image
from dotenv import load_dotenv
from loguru import logger
# Drop loguru's default handler so the stdout sink below is the only one attached.
logger.remove()
# Route all log records to stdout, colorized, with level plus file:line:function context.
logger.add(
    sys.stdout,
    format="<level>{level}</level>|<green>{file}:{line}:{function}</green>| {message}",
    colorize=True,
)
def describe_image(image_path: str, prompt: str = "Describe this image in detail:") -> Optional[str]:
    """
    Generate a text description of an image using Google's Gemini API.

    Every failure path (missing API key, missing file, unreadable image, API
    error, or any other exception) is logged and reported to the caller as
    ``None``; no exception propagates out of this function.

    Parameters:
        image_path (str): Path to the image file.
        prompt (str, optional): Custom prompt to guide the image description.
            Defaults to "Describe this image in detail:".

    Returns:
        Optional[str]: The generated description of the image, or None if the
        API key is not set, the file does not exist, the image cannot be
        opened, or content generation fails.
    """
    try:
        # Load environment variables (picks up GEMINI_API_KEY from a .env file if present)
        load_dotenv()

        # Check if API key is set
        api_key = os.getenv('GEMINI_API_KEY')
        if not api_key:
            logger.error("GEMINI_API_KEY environment variable is not set")
            return None

        # Check if image file exists
        if not os.path.exists(image_path):
            logger.error(f"Image file not found: {image_path}")
            return None

        # Initialize the Gemini client
        client = genai.Client(api_key=api_key)

        # Open the image; failures here are reported separately from API failures
        try:
            image = Image.open(image_path)
        except Exception as e:
            logger.error(f"Failed to open image: {e}")
            return None
        logger.info(f"Successfully opened image: {image_path}")

        # Generate content description. The context manager closes the image's
        # underlying file handle whether or not the request succeeds.
        try:
            with image:
                response = client.models.generate_content(
                    model='gemini-2.0-flash',
                    contents=[
                        prompt,
                        image,
                    ],
                )
                # Extract the text while the image is still open
                description = response.text
        except Exception as e:
            logger.error(f"Failed to generate content: {e}")
            return None

        logger.info(f"Successfully generated description for image: {image_path}")
        return description
    except Exception as e:
        # Top-level boundary: callers rely on None (not an exception) to signal failure
        logger.error(f"An unexpected error occurred: {e}")
        return None
def analyze_image_with_prompt(image_path: str, prompt: str) -> Optional[str]:
    """
    Run a caller-supplied prompt against an image via Google's Gemini API.

    This is a thin wrapper that forwards both arguments to ``describe_image``;
    unlike that function, the prompt here is required.

    Parameters:
        image_path (str): Path to the image file.
        prompt (str): Custom prompt for analyzing the image.

    Returns:
        Optional[str]: Whatever ``describe_image`` returns for the same
        arguments (the generated text, or None on failure).
    """
    analysis = describe_image(image_path, prompt)
    return analysis
# Example usage
if __name__ == "__main__":
    # Demonstrates describe_image with the default prompt on a placeholder path
    image_path = "path/to/your/image.jpg"
    description = describe_image(image_path)
    # A falsy result (None or empty text) is reported as a failure
    outcome = (
        f"Image description: {description}"
        if description
        else "Failed to generate image description"
    )
    print(outcome)

View File

@@ -1,94 +0,0 @@
import requests
import re
import base64
import os
import sys
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
) # for exponential backoff
# NOTE(review): both handlers inside the function convert failures into
# sys.exit(); SystemExit is not an Exception subclass, so this decorator
# presumably never gets a retryable exception -- confirm the intended behavior.
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def analyze_and_extract_details_from_image(image_path):
    """
    Analyzes an image using OpenAI's Vision API to extract Alt Text, Description, Title, and Caption.

    The image file is read, base64-encoded, and sent as a data URL in a chat
    completions request. The assistant's reply is then scanned with regular
    expressions for the four labelled fields.

    Args:
        image_path (str): Path to the image file.

    Returns:
        dict: A dictionary with the keys 'alt_text', 'description', 'title'
        and 'caption'. Any field not found in the reply is set to "N/A".

    Raises:
        SystemExit: On any failure (file read error, HTTP/API error, or an
        unexpected response shape); the error is logged before exiting.
    """
    try:
        logger.info("Starting image analysis using OpenAI's Vision API.")

        # Read the raw bytes. Binary mode must not be given an encoding:
        # open(..., "rb", encoding=...) raises ValueError before reading anything.
        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')
        logger.info("Image encoded to base64 successfully.")

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Analyze the given image and suggest the following: Alternative text(Alt Text), description, title, caption."
                        },
                        {
                            "type": "image_url",
                            # The data URL always declares image/jpeg, whatever the file type
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            "max_tokens": 300
        }

        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        # Turn HTTP 4xx/5xx into requests.HTTPError so it is handled below
        response.raise_for_status()
        assistant_message = response.json()['choices'][0]['message']['content']
        logger.info("Received response from OpenAI API.")

        # Extracting details using regular expressions. Alt Text, Title and
        # Caption must be double-quoted in the reply; Description must be
        # followed by a blank line to match.
        alt_text_match = re.search(r'Alt Text: "(.*?)"', assistant_message)
        description_match = re.search(r'Description: (.*?)\n\n', assistant_message)
        title_match = re.search(r'Title: "(.*?)"', assistant_message)
        caption_match = re.search(r'Caption: "(.*?)"', assistant_message)

        image_details = {
            'alt_text': alt_text_match.group(1) if alt_text_match else "N/A",
            'description': description_match.group(1) if description_match else "N/A",
            'title': title_match.group(1) if title_match else "N/A",
            'caption': caption_match.group(1) if caption_match else "N/A"
        }
        logger.info("Image analysis completed successfully.")
        return image_details
    except requests.RequestException as e:
        logger.error(f"GPT-Vision API communication failure. Error: {e}")
        sys.exit(f"Exiting due to GPT-Vision API communication failure: {e}")
    except Exception as e:
        logger.error(f"Unexpected error occurred during image analysis: {e}")
        sys.exit(f"Exiting due to an unexpected error: {e}")