Youtube AI Writer Tools
This commit is contained in:
377
lib/gpt_providers/text_to_image_generation/gen_gemini_images.py
Normal file
377
lib/gpt_providers/text_to_image_generation/gen_gemini_images.py
Normal file
@@ -0,0 +1,377 @@
|
||||
import os
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
import PIL
|
||||
import streamlit as st
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
import logging
|
||||
import datetime
|
||||
import base64
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
from .save_image import save_generated_image
|
||||
|
||||
# Module-wide logging setup: INFO threshold, timestamped records that carry
# the logger name and severity ahead of the message text.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('gemini_image_generator')
|
||||
|
||||
# With image generation in Gemini, your imagination is the limit.
|
||||
# If what you see doesn't quite match what you had in mind, try adding more details to the prompt.
|
||||
# The more specific you are, the better Gemini can create images that reflect your vision.
|
||||
|
||||
# Generate images using Gemini
|
||||
# Gemini 2.0 Flash Experimental supports the ability to output text and inline images.
|
||||
# This lets you use Gemini to conversationally edit images or generate outputs with interwoven text (for example, generating a blog post with text and images in a single turn).
|
||||
# Note: Make sure to include responseModalities: ["Text", "Image"] in your generation configuration for text and image output with gemini-2.0-flash-exp-image-generation. Image only is not allowed.
|
||||
|
||||
|
||||
class AIPromptGenerator:
    """
    Generates enhanced AI image prompts based on user keywords,
    following the guidelines of the Imagen documentation.

    Every attribute assigned in ``__init__`` is a pool of prompt fragments.
    The generator methods randomly sample from these pools and must never
    modify them, so that repeated calls on one instance behave the same.
    """

    def __init__(self):
        self.photography_styles = ["photo", "photograph"]
        self.art_styles = ["painting", "sketch", "drawing", "illustration", "digital art", "render"]
        self.art_techniques = ["technical pencil drawing", "charcoal drawing", "color pencil drawing", "pastel painting", "digital art", "art deco (poster)", "impressionist painting", "renaissance painting", "pop art"]
        self.camera_proximity = ["close-up", "zoomed out", "taken from far away"]
        self.camera_position = ["aerial", "from below"]
        self.lighting = ["natural lighting", "dramatic lighting", "warm lighting", "cold lighting", "studio lighting", "golden hour lighting"]
        self.camera_settings = ["motion blur", "soft focus", "bokeh", "portrait"]
        self.lens_types = ["35mm lens", "50mm lens", "fisheye lens", "wide angle lens", "macro lens", "telephoto lens"]
        self.film_types = ["black and white film", "polaroid"]
        self.materials = ["made of cheese", "made of paper", "made of neon tubes", "metallic", "glass", "wooden", "stone"]
        self.shapes = ["in the shape of a bird", "angular", "curved", "geometric"]
        self.quality_modifiers_general = ["high-quality", "beautiful", "stylized", "detailed", "epic", "grand"]
        self.quality_modifiers_photo = ["4K", "HDR", "studio photo", "professional photo", "photorealistic"]
        self.quality_modifiers_art = ["by a professional artist", "intricate details", "masterpiece"]
        self.aspect_ratios = ["1:1 aspect ratio", "4:3 aspect ratio", "3:4 aspect ratio", "16:9 aspect ratio", "9:16 aspect ratio"]
        # Keyed by the ``focus`` argument of generate_photorealistic_prompt().
        self.photorealistic_modifiers = {
            "portraits": ["prime lens", "zoom lens", "24-35mm", "black and white film", "film noir", "shallow depth of field", "duotone (mention two colors)"],
            "objects": ["macro lens", "60-105mm", "high detail", "precise focusing", "controlled lighting"],
            "motion": ["telephoto zoom lens", "100-400mm", "fast shutter speed", "action shot", "movement tracking"],
            "wide-angle": ["wide-angle lens", "10-24mm", "long exposure", "sharp focus", "smooth water or clouds", "astro photography"]
        }

    def generate_prompt(self, keywords):
        """
        Generates an enhanced AI image prompt based on user-provided keywords.

        The keywords are joined into a subject, then optional fragments
        (context, style, camera/lighting details, material, shape, quality,
        aspect ratio) are each added with a fixed probability. The result is
        therefore non-deterministic unless the ``random`` module is seeded.

        Args:
            keywords (list): A list of keywords describing the desired image.

        Returns:
            str: An enhanced AI image prompt (comma-separated fragments), or
            "A beautiful image." when ``keywords`` is empty.
        """
        if not keywords:
            return "A beautiful image."

        prompt_parts = [" ".join(keywords)]

        # Add context and background (optional)
        context_options = ["in a detailed background", "outdoors", "indoors", "in a studio", "with a blurred background"]
        if random.random() < 0.6:  # Add context with a probability
            prompt_parts.append(random.choice(context_options))

        # Add style (optional); the style always becomes the leading fragment.
        style_options = self.photography_styles + [f"{art} of" for art in self.art_styles]
        if random.random() < 0.7:
            prompt_parts.insert(0, random.choice(style_options))
            # Hand-made art styles may additionally name a concrete technique.
            if prompt_parts[0].startswith(("painting of", "sketch of", "drawing of")):
                if random.random() < 0.5:
                    prompt_parts.append(f"in the style of {random.choice(self.art_techniques)}")

        # The leading fragment is either the chosen style or, when no style
        # was inserted, the subject itself; a photography word in either one
        # switches on the camera-related modifiers below.
        is_photo = any(style in prompt_parts[0] for style in self.photography_styles)

        # Add photography modifiers (if photography style is chosen)
        if is_photo:
            if random.random() < 0.4:
                prompt_parts.append(random.choice(self.camera_proximity))
            if random.random() < 0.3:
                prompt_parts.append(random.choice(self.camera_position))
            if random.random() < 0.5:
                prompt_parts.append(random.choice(self.lighting))
            if random.random() < 0.3:
                prompt_parts.append(random.choice(self.camera_settings))
            if random.random() < 0.2:
                prompt_parts.append(random.choice(self.lens_types))
            if random.random() < 0.1:
                prompt_parts.append(random.choice(self.film_types))

        # Add shapes and materials (optional)
        if random.random() < 0.3:
            prompt_parts.append(random.choice(self.materials))
        if random.random() < 0.2:
            prompt_parts.append(random.choice(self.shapes))

        # Add quality modifiers (optional)
        if random.random() < 0.6:
            # Build a NEW list: using ``+=`` on the instance attribute would
            # extend self.quality_modifiers_general in place and leak
            # photo/art modifiers into every later call.
            specific = self.quality_modifiers_photo if is_photo else self.quality_modifiers_art
            quality_options = self.quality_modifiers_general + specific
            # dict.fromkeys drops duplicates while keeping a stable order, so
            # a seeded ``random`` yields reproducible prompts.
            prompt_parts.append(random.choice(list(dict.fromkeys(quality_options))))

        # Add aspect ratio (optional)
        if random.random() < 0.2:
            prompt_parts.append(random.choice(self.aspect_ratios))

        return ", ".join(prompt_parts)

    def generate_photorealistic_prompt(self, keywords, focus=""):
        """
        Generates an enhanced AI image prompt specifically for photorealistic images.

        Args:
            keywords (list): A list of keywords describing the desired image.
            focus (str, optional): The focus of the photorealistic image (e.g., "portraits", "objects", "motion", "wide-angle"). Unknown or empty values add no focus-specific modifiers. Defaults to "".

        Returns:
            str: An enhanced photorealistic AI image prompt, or
            "A photorealistic image." when ``keywords`` is empty.
        """
        if not keywords:
            return "A photorealistic image."

        prompt_parts = ["A photo of", "photorealistic", " ".join(keywords)]

        if focus and focus in self.photorealistic_modifiers:
            modifiers = self.photorealistic_modifiers[focus]
            if modifiers:
                # Pick between one and three distinct focus-specific modifiers.
                num_modifiers = random.randint(1, min(3, len(modifiers)))
                prompt_parts.extend(random.sample(modifiers, num_modifiers))

        # Add general quality modifiers
        if random.random() < 0.5:
            prompt_parts.append(random.choice(self.quality_modifiers_photo))

        # Add lighting
        if random.random() < 0.4:
            prompt_parts.append(random.choice(self.lighting))

        return ", ".join(prompt_parts)
|
||||
|
||||
|
||||
def generate_gemini_image(prompt, keywords=None, style=None, focus=None, enhance_prompt=True, max_retries=3, initial_retry_delay=2):
    """
    Generate images using Gemini

    Depending on the prompt and context, Gemini will generate content in different modes (text to image, text to image and text, etc.).
    Here are some examples:

    1). Text to image
    Example prompt: "Generate an image of the Eiffel tower with fireworks in the background."
    2). Text to image(s) and text (interleaved)
    Example prompt: "Generate an illustrated recipe for a paella."

    Image generation may not always trigger:
    - The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image").
    - The model may stop generating partway through. Try again or try a different prompt.

    Args:
        prompt (str): The prompt to generate the image from.
        keywords (list, optional): Keywords to enhance the prompt. Defaults to None.
        style (str, optional): The style of the image. Only "photorealistic" (together with ``focus``) selects a different prompt generator. Defaults to None.
        focus (str, optional): The focus of the image (e.g., "portraits", "objects", "motion", "wide-angle"). Defaults to None.
        enhance_prompt (bool, optional): Whether to enhance the prompt using AIPromptGenerator. Has no effect without ``keywords``. Defaults to True.
        max_retries (int, optional): Maximum number of retry attempts for handling 503 errors. Defaults to 3.
        initial_retry_delay (int, optional): Initial delay in seconds before retrying; doubled after every retry. Defaults to 2.

    Returns:
        str: The path to the generated image (the last one, if the response
        carried several). None if the request failed, retries were exhausted,
        or the model returned no image part.
    """
    logger.info(f"Generating image with prompt: '{prompt[:100]}...'")

    # Enhance the prompt if requested
    if enhance_prompt and keywords:
        prompt_generator = AIPromptGenerator()
        if style == "photorealistic" and focus:
            logger.info(f"Generating photorealistic prompt with focus: {focus}")
            enhanced_prompt = prompt_generator.generate_photorealistic_prompt(keywords, focus)
        else:
            logger.info("Generating enhanced prompt")
            enhanced_prompt = prompt_generator.generate_prompt(keywords)

        # Combine the enhanced prompt with the original prompt
        prompt = f"{prompt}\n\nEnhanced prompt: {enhanced_prompt}"
        logger.info(f"Final prompt: '{prompt[:100]}...'")

    retry_count = 0
    retry_delay = initial_retry_delay

    while retry_count <= max_retries:
        try:
            client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

            logger.info("Sending request to Gemini API")
            response = client.models.generate_content(
                model="gemini-2.0-flash-exp-image-generation",
                contents=prompt,
                config=types.GenerateContentConfig(
                    response_modalities=['Text', 'Image']
                )
            )
            logger.info("Received response from Gemini API")

            img_name = None
            for part in response.candidates[0].content.parts:
                if part.text is not None:
                    logger.info(f"Received text response: '{part.text[:100]}...'")
                    print(part.text)
                elif part.inline_data is not None:
                    logger.info("Received image data from Gemini")
                    image = Image.open(BytesIO(part.inline_data.data))
                    # NOTE(review): Image.show() launches an external viewer;
                    # kept for parity with existing behaviour, but consider
                    # removing it for server deployments.
                    image.show()
                    # part.text is always None on this branch, so the file
                    # name is derived from the current timestamp only.
                    img_name = f'gemini-native-image-{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.png'
                    try:
                        logger.info(f"Saving image to: {img_name}")
                        image.save(img_name)

                        # Re-read the saved file so the bytes handed on are
                        # exactly what is on disk; ``with`` closes the handle.
                        with open(img_name, "rb") as saved_file:
                            encoded_image = base64.b64encode(saved_file.read()).decode('utf-8')

                        # Create a dictionary with the expected format for save_generated_image
                        img_response = {
                            "artifacts": [
                                {
                                    "base64": encoded_image
                                }
                            ]
                        }

                        # Call save_generated_image with the correct format
                        save_generated_image(img_response)
                    except Exception as err:
                        # Best effort: report the failure but keep processing
                        # the remaining response parts.
                        logger.error(f"Failed to save image: {err}")
                        st.error(f"Failed to save image: {err}")

            logger.info(f"Image generation completed. Image name: {img_name}")
            return img_name
        except Exception as err:
            logger.error(f"Error in generate_gemini_image: {err}")

            # Anything other than a 503 UNAVAILABLE error is not retried.
            if "503 UNAVAILABLE" not in str(err):
                st.error(f"Error generating image: {err}")
                return None

            # Still overloaded after the last permitted retry: fall through
            # to the "service unavailable" message below.
            if retry_count >= max_retries:
                break

            retry_count += 1
            logger.info(f"Model is overloaded. Retrying in {retry_delay} seconds (attempt {retry_count}/{max_retries})")
            st.warning(f"The image generation service is currently busy. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            # Exponential backoff
            retry_delay *= 2

    # If we've exhausted all retries
    st.error("The image generation service is currently unavailable. Please try again later.")
    return None
|
||||
|
||||
|
||||
def edit_image(image_path, prompt, max_retries=3, initial_retry_delay=2):
    """
    - Image editing (text and image to image)
    Example prompt: "Edit this image to make it look like a cartoon"
    Example prompt: [image of a cat] + [image of a pillow] + "Create a cross stitch of my cat on this pillow."

    - Multi-turn image editing (chat)
    Example prompts: [upload an image of a blue car.] "Turn this car into a convertible." "Now change the color to yellow."

    Image editing with Gemini
    To perform image editing, add an image as input.
    This function sends the image at ``image_path`` together with the text
    prompt in a single request.
    For multiple images and larger payloads, check the image input section.

    Args:
        image_path (str): The path to the image to edit.
        prompt (str): The prompt to edit the image with.
        max_retries (int, optional): Maximum number of retry attempts for handling 503 errors. Defaults to 3.
        initial_retry_delay (int, optional): Initial delay in seconds before retrying; doubled after every retry. Defaults to 2.

    Returns:
        str: The path to the edited image, saved as ``edited-<basename>`` in
        the current working directory. None if the request failed, retries
        were exhausted, or the model returned no image part.

    Raises:
        Exception: whatever ``PIL.Image.open`` raises when ``image_path``
        cannot be opened (the open happens outside the retry handling).
    """
    retry_count = 0
    retry_delay = initial_retry_delay

    # The context manager guarantees the source image's file handle is
    # released on every exit path (return, break or exception).
    with Image.open(image_path) as image:
        while retry_count <= max_retries:
            try:
                # Same key lookup as generate_gemini_image; when the variable
                # is unset this passes None and leaves the choice of
                # credentials to the client, exactly as a bare Client() did.
                client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

                logger.info("Sending request to Gemini API for image editing")
                response = client.models.generate_content(
                    model="gemini-2.0-flash-exp-image-generation",
                    contents=[prompt, image],
                    config=types.GenerateContentConfig(
                        response_modalities=['Text', 'Image']
                    )
                )
                logger.info("Received response from Gemini API for image editing")

                edited_img_name = None
                for part in response.candidates[0].content.parts:
                    if part.text is not None:
                        logger.info(f"Received text response: '{part.text[:100]}...'")
                        st.write(part.text)
                    elif part.inline_data is not None:
                        logger.info("Received edited image data from Gemini")
                        edited_image = Image.open(BytesIO(part.inline_data.data))
                        # NOTE(review): Image.show() launches an external
                        # viewer; kept for parity with existing behaviour, but
                        # consider removing it for server deployments.
                        edited_image.show()

                        # Save the edited image
                        edited_img_name = f'edited-{os.path.basename(image_path)}'
                        try:
                            logger.info(f"Saving edited image to: {edited_img_name}")
                            edited_image.save(edited_img_name)

                            # Re-read the saved file so the bytes handed on
                            # are exactly what is on disk; ``with`` closes
                            # the handle.
                            with open(edited_img_name, "rb") as saved_file:
                                encoded_image = base64.b64encode(saved_file.read()).decode('utf-8')

                            # Create a dictionary with the expected format for save_generated_image
                            img_response = {
                                "artifacts": [
                                    {
                                        "base64": encoded_image
                                    }
                                ]
                            }

                            # Call save_generated_image with the correct format
                            save_generated_image(img_response)
                        except Exception as err:
                            # Best effort: report the failure but keep
                            # processing the remaining response parts.
                            logger.error(f"Failed to save edited image: {err}")
                            st.error(f"Failed to save edited image: {err}")

                logger.info(f"Image editing completed. Edited image name: {edited_img_name}")
                return edited_img_name
            except Exception as err:
                logger.error(f"Error in edit_image: {err}")

                # Anything other than a 503 UNAVAILABLE error is not retried.
                if "503 UNAVAILABLE" not in str(err):
                    st.error(f"Error editing image: {err}")
                    return None

                # Still overloaded after the last permitted retry: fall
                # through to the "service unavailable" message below.
                if retry_count >= max_retries:
                    break

                retry_count += 1
                logger.info(f"Model is overloaded. Retrying in {retry_delay} seconds (attempt {retry_count}/{max_retries})")
                st.warning(f"The image editing service is currently busy. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                # Exponential backoff
                retry_delay *= 2

    # If we've exhausted all retries
    st.error("The image editing service is currently unavailable. Please try again later.")
    return None
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ logger.add(sys.stdout,
|
||||
from .gen_dali3_images import generate_dalle3_images
|
||||
from .gen_stabl_diff_img import generate_stable_diffusion_image
|
||||
from ..text_generation.main_text_generation import llm_text_gen
|
||||
|
||||
from .gen_gemini_images import generate_gemini_image
|
||||
|
||||
def generate_image(user_prompt):
|
||||
"""
|
||||
@@ -44,7 +44,7 @@ def generate_image(user_prompt):
|
||||
--> user (str): A unique identifier representing your end-user, which will help OpenAI to monitor and detect abuse.
|
||||
"""
|
||||
# FIXME: Need to remove default value to match sidebar input.
|
||||
image_engine = 'Stability-AI'
|
||||
image_engine = 'Gemini-AI'
|
||||
image_stored_at = None
|
||||
|
||||
if user_prompt:
|
||||
@@ -57,6 +57,9 @@ def generate_image(user_prompt):
|
||||
logger.info(f"Calling Stable diffusion text-to-image with prompt: \n{img_prompt}")
|
||||
print("\n\n")
|
||||
image_stored_at = generate_stable_diffusion_image(img_prompt)
|
||||
elif 'Gemini-AI' in image_engine:
|
||||
logger.info(f"Calling Gemini text-to-image with prompt: \n{img_prompt}")
|
||||
image_stored_at = generate_gemini_image(img_prompt)
|
||||
return image_stored_at
|
||||
except Exception as err:
|
||||
logger.error(f"Failed to generate Image: {err}")
|
||||
|
||||
Reference in New Issue
Block a user